### Fine-tuning BERT SMS Model on Email Spam Dataset

This notebook loads a pre-trained DistilBERT model trained on SMS spam data and fine-tunes it on a phishing email dataset. 

# 🧾 1. Library Imports and Setup

In [None]:

import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset
import re
import random
import torch

# Ensure we use CPU only
device = torch.device('cpu')

  from .autonotebook import tqdm as notebook_tqdm


# 🧾 2. Load Pre-trained SMS Model from Disk

In [2]:
# Path to your saved model
model_path = "D:/Sajid/email-spam-detection/models/bert-sms-spam/version_20250508_185703"

# Load tokenizer and model from disk
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path).to(device)


# 🧾 3. Load and Preprocess Phishing Email Dataset

In [None]:
# Load the phishing email dataset
email_df = pd.read_csv("D:\Sajid\email-spam-detection\data\Phishing Email Dataset/phishing_email.csv")

email_df.head()



Unnamed: 0,text_combined,label
0,hpl nom may 25 2001 see attached file hplno 52...,0
1,nom actual vols 24 th forwarded sabrae zajac h...,0
2,enron actuals march 30 april 1 201 estimated a...,0
3,hpl nom may 30 2001 see attached file hplno 53...,0
4,hpl nom june 1 2001 see attached file hplno 60...,0


In [5]:
# Keep only the relevant columns and rename for consistency
email_df = email_df[['text_combined', 'label']].dropna()
email_df = email_df.rename(columns={'text_combined': 'text', 'label': 'label'})

# Remove duplicate messages based on the cleaned text
email_df = email_df.drop_duplicates(subset='text')

email_df.head()

Unnamed: 0,text,label
0,hpl nom may 25 2001 see attached file hplno 52...,0
1,nom actual vols 24 th forwarded sabrae zajac h...,0
2,enron actuals march 30 april 1 201 estimated a...,0
3,hpl nom may 30 2001 see attached file hplno 53...,0
4,hpl nom june 1 2001 see attached file hplno 60...,0


In [6]:
# Clean the text by removing URLs and special characters, and lowercasing
def clean_email_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    return text

In [None]:
email_df['text'] = email_df['text'].apply(clean_email_text)

# saving the copy of the complete data set
email_df_full = email_df.copy()

## 🧾 4. Randomly Sample Emails for Training + Validation
- since using CPU so sampling a subset of data to speed up the training

In [None]:
#  Save the original row indices so we can later use them if required
original_indices_used = email_df.sample(n=3000, random_state=42).index

# Now slice the actual training data
email_df = email_df.loc[original_indices_used].reset_index(drop=True)

#to check the dist of both classes in the sampled data
email_df['label'].value_counts()

## 🧾 5. Train-Test Split

In [11]:
# Split into training and validation sets with stratification to preserve label distribution
train_texts, val_texts, train_labels, val_labels = train_test_split(
    email_df['text'].tolist(),
    email_df['label'].tolist(),
    test_size=0.2,
    stratify=email_df['label'],
    random_state=42
)

## 🧾 6. Tokenization

In [12]:
# Convert text into token IDs with padding and truncation to max_length
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

## 🧾 7. Wrap Tokenized Data in PyTorch Dataset

In [13]:
# Custom Dataset class to use with HuggingFace Trainer
class EmailDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Create datasets
train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

## 🧾 8. Define TrainingArguments

In [14]:
# Set up the training configuration
training_args = TrainingArguments(
   # output_dir="outputs/bert_email_model",         # Save directory
    num_train_epochs=3,                             # Number of training epochs
    per_device_train_batch_size=4,                  # Batch size for training
    per_device_eval_batch_size=4,                   # Batch size for evaluation
    gradient_accumulation_steps=2,
    eval_strategy="epoch",                   # Evaluate after each epoch
    save_strategy="epoch",                         # Save model after each epoch
    logging_dir="logs",                            # Logging directory
    load_best_model_at_end=True,                    # Load best model by selected metric
    metric_for_best_model="f1",                    # Best model based on F1 score
    greater_is_better=True,                         # Higher F1 is better
    no_cuda=True                                    # Use CPU
)



## 🧾 9. Define Custom Evaluation Metrics

In [15]:

# Define how metrics will be computed during evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

## 🧾 10. Initialize Trainer and Fine-tune Model

In [16]:

# Initialize Trainer with model, args, data, and metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [17]:
# Train the model on email dataset
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.33242,0.915,0.992248,0.839344,0.909414
2,0.277600,0.142995,0.963333,0.952077,0.977049,0.964401
3,0.277600,0.122552,0.973333,0.97377,0.97377,0.97377


TrainOutput(global_step=900, training_loss=0.17937433666653102, metrics={'train_runtime': 13840.9678, 'train_samples_per_second': 0.52, 'train_steps_per_second': 0.065, 'total_flos': 953765270323200.0, 'train_loss': 0.17937433666653102, 'epoch': 3.0})

## 🧾 11. Evaluate on Validation Set

In [18]:

# Evaluate the model using built-in evaluate function
eval_result = trainer.evaluate()
print(eval_result)


{'eval_loss': 0.12255200743675232, 'eval_accuracy': 0.9733333333333334, 'eval_precision': 0.9737704918032787, 'eval_recall': 0.9737704918032787, 'eval_f1': 0.9737704918032787, 'eval_runtime': 203.7895, 'eval_samples_per_second': 2.944, 'eval_steps_per_second': 0.736, 'epoch': 3.0}


In [19]:
# Print detailed classification report using scikit-learn
preds = trainer.predict(val_dataset)
y_true = val_labels
y_pred = np.argmax(preds.predictions, axis=1)
print(classification_report(y_true, y_pred, target_names=["ham", "spam"]))

              precision    recall  f1-score   support

         ham       0.97      0.97      0.97       295
        spam       0.97      0.97      0.97       305

    accuracy                           0.97       600
   macro avg       0.97      0.97      0.97       600
weighted avg       0.97      0.97      0.97       600



# 🧾 12. Evaluate on Unseen  Emails / Test Data
- testing data on randomly chosen 2000 rows (excluding 10000 rows used in training and model validation)

In [20]:
# Select rows NOT used in original 1000
remaining_df = email_df_full.drop(index=original_indices_used)

#Randomly sample 2000 rows from the remaining data
test_df = remaining_df.sample(n=2000, random_state=7).reset_index(drop=True)



In [21]:
# Clean and tokenize (reuse cleaning function if needed)
test_df['text'] = test_df['text'].apply(clean_email_text)

# Tokenize
test_encodings = tokenizer(
    test_df['text'].tolist(),
    truncation=True,
    padding=True,
    max_length=512
)

# Wrap in PyTorch Dataset
test_dataset = EmailDataset(test_encodings, test_df['label'].tolist())

# Evaluate
test_preds = trainer.predict(test_dataset)
y_test_true = test_df['label'].tolist()
y_test_pred = np.argmax(test_preds.predictions, axis=1)

# Detailed classification report
from sklearn.metrics import classification_report
print("\n📊 Classification Report (on 2000 unseen emails):")
print(classification_report(y_test_true, y_test_pred, target_names=["ham (0)", "spam (1)"]))


📊 Classification Report (on 2000 unseen emails):
              precision    recall  f1-score   support

     ham (0)       0.97      0.96      0.97       970
    spam (1)       0.97      0.97      0.97      1030

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000



## Saving the Model

In [22]:
# Save Fine-tuned Model to specified directory
save_path = "D:/Sajid/email-spam-detection/models/bert-email-spam"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to: {save_path}")


Model and tokenizer saved to: D:/Sajid/email-spam-detection/models/bert-email-spam
