## Fine-Tuning DistilBERT on SMS Spam Dataset

This notebook loads a pre-trained DistilBERT model and fine-tunes it on the classic SMS spam dataset (ham/spam). It includes text preprocessing, tokenization, training with Hugging Face Trainer, evaluation, and saving the final model.


## 📦 1. Imports and Setup

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    PreTrainedModel,
    PreTrainedTokenizer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import os
from datetime import datetime

# Force usage of CPU
# NOTE(review): `device` is not referenced by any other cell visible in this notebook,
# so this assignment alone does not move the model or the Trainer to the CPU — confirm
# whether CPU-only execution is actually enforced elsewhere.
device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


## 📂 2. Load SMS Spam Dataset

In [2]:
# Load the raw SMS spam CSV.
# The path uses forward slashes only: backslash pairs such as `\S` and `\e` are
# invalid escape sequences inside a normal string literal, which Python warns
# about today and documents as a future SyntaxError.
df = pd.read_csv('D:/Sajid/email-spam-detection/data/spam.csv', encoding='latin-1')

# Keep only the two columns used below and give them descriptive names
# (v1 -> label, v2 -> text)
df = df[['v1', 'v2']]
df.columns = ['label', 'text']

# Check again
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## 📂 2.1. Basic Cleaning of the Dataset

In [3]:
# Report the number of missing values per column
print("\nMissing values:\n")
missing_per_column = df.isnull().sum()
print(missing_per_column, '\n')

# Size of the data before any rows are removed
raw_shape = df.shape
print(f"Shape Raw data: {raw_shape}")

# Count rows that are exact copies of an earlier row
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")

# Remove the duplicates (leaves the frame unchanged when there are none)
df = df.drop_duplicates()

deduplicated_shape = df.shape
print(f"Shape after removing duplicates: {deduplicated_shape}")



Missing values:

label    0
text     0
dtype: int64 

Shape Raw data: (5572, 2)

Number of duplicate rows: 403
Shape after removing duplicates: (5169, 2)


In [4]:
# Encode the labels: spam = 1, ham = 0
label_map = {'ham': 0, 'spam': 1}

# Only encode while the column still holds the raw strings: mapping an already
# encoded column would turn every value into NaN if this cell is run a second time.
if not pd.api.types.is_numeric_dtype(df['label']):
    encoded_labels = df['label'].map(label_map)

    # .map() yields NaN for any value that is not a key of label_map; fail loudly
    # here rather than handing NaN labels to the split and training cells.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown_values = sorted(df.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label values: {unknown_values}")

    df['label'] = encoded_labels.astype('int64')

# Check distribution
df['label'].value_counts()


label
0    4516
1     653
Name: count, dtype: int64

## 🧪 3. Train-Test Split

In [8]:
# Plain Python lists of messages and their encoded labels
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()

# 80/20 split with a fixed seed; stratifying on the label column keeps the
# class proportions the same in both sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

## 🔠 4. Tokenize the Training and Validation Sets with the DistilBERT Tokenizer

In [None]:
# Load the tokenizer that matches the pre-trained checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Options shared by both splits: truncate over-long inputs, pad each batch of
# texts to a common length, and return PyTorch tensors
encode_options = dict(truncation=True, padding=True, return_tensors='pt')

# Tokenize the train and validation texts
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

## 📦 5. Create PyTorch Dataset Wrapper

In [None]:
# Custom dataset class to handle our tokenized data
class SMSDataset(Dataset):
    """Pairs tokenizer output with labels so samples can be fetched by index.

    Args:
        encodings: mapping of field name to an indexable collection holding one
            entry per sample.
        labels: sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every encoding field
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        # The label is stored as a tensor under the 'labels' key
        item['labels'] = torch.tensor(self.labels[idx])
        return item



# Wrap each split's encodings and labels in an SMSDataset
train_dataset, val_dataset = (
    SMSDataset(train_encodings, train_labels),
    SMSDataset(val_encodings, val_labels),
)

## 🧠 6. Load Pre-trained DistilBERT Model

In [12]:
# DistilBertForSequenceClassification is the Hugging Face DistilBERT variant with a
# sequence-classification head on top.

# Class-index <-> class-name mapping, matching the ham = 0 / spam = 1 label encoding.
# Storing it in the model config keeps the class names with the saved model
# instead of the generic LABEL_0 / LABEL_1 placeholders.
id2label = {0: 'ham', 1: 'spam'}
label2id = {name: idx for idx, name in id2label.items()}

# Load DistilBERT for binary classification (num_labels=2)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## ⚙️ 7. Set TrainingArguments

In [None]:
# Define training arguments for the Trainer API
#
# NOTE(review): the Trainer picks its device from these arguments, not from the
# notebook-level `device` variable — confirm whether CPU-only training is intended
# (TrainingArguments exposes a `use_cpu` flag for that).

training_args = TrainingArguments(
    output_dir="./results",              # Where to save model checkpoints; set explicitly because many
                                         # transformers releases require it and the per-epoch checkpoints
                                         # used by load_best_model_at_end are written here
    num_train_epochs=3,                  # Number of training epochs
    per_device_train_batch_size=16,      # Batch size for training
    per_device_eval_batch_size=16,       # Batch size for evaluation
    eval_strategy="epoch",               # Evaluate model at end of each epoch
    save_strategy="epoch",               # Save model at end of each epoch (must match eval_strategy
                                         # when load_best_model_at_end=True)
    logging_dir="./logs",                # Log directory for TensorBoard
    logging_steps=10,                    # Log every 10 steps
    load_best_model_at_end=True,         # Automatically load best model
    metric_for_best_model="f1",          # Metric to decide best model (the 'f1' key of compute_metrics)
    save_total_limit=1                   # Limit how many checkpoints are kept on disk
)


## 📏 8. Define Metrics Function

In [None]:
# Function to compute accuracy, precision, recall, and F1 score

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: prediction object exposing `predictions` (array of class scores;
            the argmax over the last axis is taken as the predicted class) and
            `label_ids` (the true class ids).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 are computed with average='binary'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # zero_division=0 keeps the score at 0.0 when a pass contains no positive
    # predictions (or no positive labels), without emitting UndefinedMetricWarning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )

    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## 🚂 9. Train the Model

In [None]:
# Collect everything the Hugging Face Trainer needs: the model, its training
# configuration, both datasets and the metric callback
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()

## 10. Evaluate the Model

In [None]:
# Compute the evaluation metrics on the validation set
trainer.evaluate(eval_dataset=val_dataset)



{'eval_loss': 0.03423997014760971,
 'eval_accuracy': 0.9922630560928434,
 'eval_f1': 0.9692307692307692,
 'eval_precision': 0.9767441860465116,
 'eval_recall': 0.9618320610687023,
 'eval_runtime': 275.08,
 'eval_samples_per_second': 3.759,
 'eval_steps_per_second': 0.236,
 'epoch': 3.0}

In [None]:
# Detailed class-wise metrics on the validation set

# Run the model over the validation set once and keep the full output
predictions = trainer.predict(val_dataset)

# Reference labels, and the highest-scoring class for every sample
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)

# Display names for class ids 0 and 1
class_names = ['ham', 'spam']
print(classification_report(y_true, y_pred, target_names=class_names))




              precision    recall  f1-score   support

         ham       0.99      1.00      1.00       903
        spam       0.98      0.96      0.97       131

    accuracy                           0.99      1034
   macro avg       0.99      0.98      0.98      1034
weighted avg       0.99      0.99      0.99      1034



## 💾 11. Save Final Model and Tokenizer

In [None]:


# Directory that receives both the model and the tokenizer files
save_path = "D:/Sajid/email-spam-detection/models/bert-sms-spam"

# Model first, then tokenizer, both into the same folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model and tokenizer saved to: {save_path}")
