# 1. Install requirements

In [1]:
!pip install transformers



In [2]:
!pip install datasets



In [3]:
!pip install --upgrade pandas



In [4]:
! pip install evaluate



#Configs

In [5]:
from google.colab import drive
drive.mount('/content/drive')
data_path = "/content/drive/My Drive/dataset.csv" #@param {type:"string"}
text_column_name = "email" #@param {type:"string"}
label_column_name = "category" #@param {type:"string"}

model_name = "distilbert-base-uncased" #@param {type:"string"}
test_size = 0.2 #@param {type:"number"}
num_labels = 2 #@param {type:"number"}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 2. Read and Prepare the Dataset

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv(data_path)

In [8]:
df.head()

Unnamed: 0,email,category
0,"URL: http://www.newsisfree.com/click/-1,817167...",not-spam
1,"On Thu, 19 Sep 2002, Bill Stoddard wrote:\n\n-...",not-spam
2,Dan Kohn <dan@dankohn.com> writes:\n\n\n\n> Gu...,not-spam
3,wintermute wrote:\n\n>>Anyone know where in Ir...,not-spam
4,"I attended the same conference, and was impres...",not-spam


### Clean Dataset

In [9]:
from bs4 import BeautifulSoup

In [10]:
class Cleaner():
  def __init__(self):
    pass
  def put_line_breaks(self,text):
    text = text.replace('</p>','</p>\n')
    return text
  def remove_html_tags(self,text):
    cleantext = BeautifulSoup(text, "lxml").text
    return cleantext
  def clean(self,text):
    text = self.put_line_breaks(text)
    text = self.remove_html_tags(text)
    return text

In [11]:
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  cleantext = BeautifulSoup(text, "lxml").text


In [12]:
df.head()

Unnamed: 0,email,category,text_cleaned
0,"URL: http://www.newsisfree.com/click/-1,817167...",not-spam,"URL: http://www.newsisfree.com/click/-1,817167..."
1,"On Thu, 19 Sep 2002, Bill Stoddard wrote:\n\n-...",not-spam,"On Thu, 19 Sep 2002, Bill Stoddard wrote:\n\n-..."
2,Dan Kohn <dan@dankohn.com> writes:\n\n\n\n> Gu...,not-spam,"Dan Kohn writes:\n\n\n\n> Guys, the Habeas In..."
3,wintermute wrote:\n\n>>Anyone know where in Ir...,not-spam,wintermute wrote:\n\n>>Anyone know where in Ir...
4,"I attended the same conference, and was impres...",not-spam,"I attended the same conference, and was impres..."


### Label Encoder

In [13]:
from sklearn import preprocessing

In [14]:
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())
df['label'] = le.transform(df[label_column_name].tolist())

In [15]:
df.head()

Unnamed: 0,email,category,text_cleaned,label
0,"URL: http://www.newsisfree.com/click/-1,817167...",not-spam,"URL: http://www.newsisfree.com/click/-1,817167...",0
1,"On Thu, 19 Sep 2002, Bill Stoddard wrote:\n\n-...",not-spam,"On Thu, 19 Sep 2002, Bill Stoddard wrote:\n\n-...",0
2,Dan Kohn <dan@dankohn.com> writes:\n\n\n\n> Gu...,not-spam,"Dan Kohn writes:\n\n\n\n> Guys, the Habeas In...",0
3,wintermute wrote:\n\n>>Anyone know where in Ir...,not-spam,wintermute wrote:\n\n>>Anyone know where in Ir...,0
4,"I attended the same conference, and was impres...",not-spam,"I attended the same conference, and was impres...",0


### Train/Test Split

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
df_train,df_test = train_test_split(df,test_size=test_size)

### Convert to Huggingface Dataset

In [18]:
from datasets import Dataset

In [19]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

### Tokenizer

In [20]:
from transformers import AutoTokenizer

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples["text_cleaned"], truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [22]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3036 [00:00<?, ? examples/s]

In [23]:
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/760 [00:00<?, ? examples/s]

# 3. Initialize Model

In [24]:
from transformers import AutoModelForSequenceClassification

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 4. Train model

In [26]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

In [27]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [28]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [29]:
!pip install transformers[torch]



In [30]:
!pip install transformers torch torchvision torchaudio




In [31]:
!pip install transformers
!pip install accelerate==0.27.2



In [32]:
import accelerate

In [33]:
from transformers import TrainingArguments, Trainer


In [37]:

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [38]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2048,0.131637,0.956579
2,0.1461,0.200269,0.957895
3,0.1133,0.194268,0.944737
4,0.1777,0.199186,0.953947
5,0.1259,0.120301,0.977632


TrainOutput(global_step=1900, training_loss=0.15355087681820517, metrics={'train_runtime': 830.7604, 'train_samples_per_second': 18.272, 'train_steps_per_second': 2.287, 'total_flos': 1986333119323440.0, 'train_loss': 0.15355087681820517, 'epoch': 5.0})

In [39]:
trainer.save_model('spam_model')

# 5. Evaluate Model

In [40]:
from sklearn.metrics import classification_report

In [41]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_train['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1521
           1       0.98      0.99      0.98      1515

    accuracy                           0.98      3036
   macro avg       0.98      0.98      0.98      3036
weighted avg       0.98      0.98      0.98      3036



In [42]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1) #preds[:3][1]
GT = df_test['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       379
           1       0.98      0.98      0.98       381

    accuracy                           0.98       760
   macro avg       0.98      0.98      0.98       760
weighted avg       0.98      0.98      0.98       760

