In [130]:
!pip install -q transformers datasets

In [116]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.model_selection import train_test_split

# Load data

In [117]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Preprocess the data

In [118]:
class TextCleaner():
    def __init__(self):
        pass

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

cleaner = TextCleaner()
df['cleaned_text'] = df['Message'].apply(cleaner.clean_text)

In [119]:
df.head()

Unnamed: 0,Category,Message,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


# Label Encoding

In [120]:
# model = LabelEncoder()
# model.fit(df['Category'])
# df['label'] = model.transform(df['Category'])

# Label encoding
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'].tolist())
print(label_encoder.classes_)  # Display original classes

['ham' 'spam']


# Train test split

In [121]:
X_train, X_test = train_test_split(df, test_size=0.2)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df[['cleaned_text']], df['Category'], test_size=0.2, random_state=42)

In [122]:
X_train.shape, X_test.shape

((4457, 1), (1115, 1))

In [123]:
# Convert datasets to tokenized format
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_train['cleaned_text'], 'label': y_train}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_test['cleaned_text'], 'label': y_test}))


# Model training

In [125]:
from datasets import Dataset
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)


Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [126]:
for i in range(5):
  print(tokenized_train[i])

{'text': 'reply to win 100 weekly where will the 2006 fifa world cup be held send stop to 87239 to end service', 'label': 1, '__index_level_0__': 1978, 'input_ids': [101, 7514, 2000, 2663, 2531, 4882, 2073, 2097, 1996, 2294, 5713, 2088, 2452, 2022, 2218, 4604, 2644, 2000, 6584, 21926, 2683, 2000, 2203, 2326, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'text': 'hello sort of out in town already that so dont rush home i am eating nachos will let you know eta', 'label': 0, '__index_level_0__': 3989, 'input_ids': [101, 7592, 4066, 1997, 2041, 1999, 2237, 2525, 2008, 2061, 2123, 2102, 5481, 2188, 1045, 2572, 5983, 6583, 9905, 2015, 2097, 2292, 2017, 2113, 27859, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'text': 'how come guoyang go n tell her then u told her', 'label': 0, '__index_level_0__': 3935, 'input_ids': [101, 2129, 2272, 22720, 12198, 2175, 1050, 2425, 2014, 2059,

In [128]:
from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args= TrainingArguments(
    output_dir="model_dir",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch"
    )

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

model_out = trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.132,0.077893


AttributeError: 'TrainOutput' object has no attribute 'save_pretrained'

In [129]:
# Save the model and tokenizer
model.save_pretrained("model_dir3")  # Save the trained model
tokenizer.save_pretrained("model_dir3")  # Save the tokenizer

('model_dir3/tokenizer_config.json',
 'model_dir3/special_tokens_map.json',
 'model_dir3/vocab.txt',
 'model_dir3/added_tokens.json',
 'model_dir3/tokenizer.json')

# Full code

In [97]:
import pandas as pd
import re
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load dataset
df = pd.read_csv('spam.csv')
print(df.head())  # Display the first few rows

# Define the TextCleaner class
class TextCleaner():
    def __init__(self):
        pass

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

# Apply text cleaning
cleaner = TextCleaner()
df['cleaned_text'] = df['Message'].apply(cleaner.clean_text)

# Assuming the target label is in a column named 'label', convert to numerical labels
df['label'] = df['Category'].map({'spam': 1, 'ham': 0})  # Adjust according to your label values

# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['cleaned_text']], df['label'], test_size=0.2, random_state=42)

# Convert datasets to the Hugging Face Dataset format
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_train['cleaned_text'], 'label': y_train}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_test['cleaned_text'], 'label': y_test}))

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize data
def tokenize_data(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

# Prepare model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments
args = TrainingArguments(
    output_dir="model_dir",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the trained model
model.save_pretrained("model_dir")


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...




Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1217,0.076652


# Save model

In [138]:
# After training your model
model.save_pretrained("distilbert-spam_classification")
tokenizer.save_pretrained("distilbert-spam_classification")  # Save the tokenizer

('distilbert-spam_classification/tokenizer_config.json',
 'distilbert-spam_classification/special_tokens_map.json',
 'distilbert-spam_classification/vocab.txt',
 'distilbert-spam_classification/added_tokens.json',
 'distilbert-spam_classification/tokenizer.json')

# Prediction

In [142]:
import re
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import LabelEncoder

# Function to clean input text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Example input text
#new_text = "Congratulations! You've won a $1000 cash prize. Click here to claim."
new_text1 = "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
new_text2 = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
new_text3 = "reply to win 100 weekly where will the 2006 fifa world cup be held send stop to 87239 to end service"

# Clean the input text
cleaned_text = clean_text(new_text2)
print(cleaned_text)

# Load the model and tokenizer
#/content/model_dir
model = AutoModelForSequenceClassification.from_pretrained("model_dir3")
tokenizer = AutoTokenizer.from_pretrained("model_dir3")

# Tokenize the input text
inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True)

# Make predictions
with torch.no_grad():  # Disable gradient calculation
    outputs = model(**inputs)
    print(outputs)
    logits = outputs.logits  # Get the raw logits
    print(logits)
    predicted_class = logits.argmax(dim=-1).item()  # Get the predicted class index
    print(predicted_class)

# Assuming you have the label encoder from before
predicted_label = label_encoder.inverse_transform([predicted_class])
print(f"Predicted label: {predicted_label[0]}")

free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s
SequenceClassifierOutput(loss=None, logits=tensor([[-2.1985,  1.8397]]), hidden_states=None, attentions=None)
tensor([[-2.1985,  1.8397]])
1
Predicted label: spam


# Without torch

In [145]:
import re
from transformers import pipeline
from sklearn.preprocessing import LabelEncoder

# Function to clean input text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Example input text
new_text2 = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

# Clean the input text
cleaned_text = clean_text(new_text2)
print(cleaned_text)

# Load the model using pipeline
classifier = pipeline("text-classification", model="model_dir3", tokenizer="model_dir3")

# Make predictions
prediction = classifier(cleaned_text)

# Get the predicted label and score
predicted_label = prediction[0]['label']
if predicted_label == 'LABEL_0':
    predicted_label = 'ham'
else:
    predicted_label = 'spam'
predicted_score = prediction[0]['score']

# Print the actual predicted label
print(f"Predicted label: {predicted_label} with score: {predicted_score:.4f}")


free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s
Predicted label: spam with score: 0.9827


In [133]:
!pip install huggingface-hub



In [136]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/commands/huggingface_cli.py", line 52, in main
  File "/usr/local/lib/python3.10/dist-packages/