#### Environment Setup

In [1]:
import os

workspace_dir = '/content/spam-detection'
branch = 'master'
current_dir = os.getcwd()
if not os.path.exists(workspace_dir) and current_dir != workspace_dir:
    !git clone https://github.com/RationalEar/spam-detection.git
    os.chdir(workspace_dir)
    !git checkout $branch
    !ls -al
    !pip install -q transformers==4.48.0 scikit-learn pandas numpy
    !pip install -q torch --index-url https://download.pytorch.org/whl/cu126
    !pip install captum --no-deps --ignore-installed
else:
    os.chdir(workspace_dir)
    !git pull origin $branch

remote: Enumerating objects: 11, done.[K
remote: Counting objects:   9% (1/11)[Kremote: Counting objects:  18% (2/11)[Kremote: Counting objects:  27% (3/11)[Kremote: Counting objects:  36% (4/11)[Kremote: Counting objects:  45% (5/11)[Kremote: Counting objects:  54% (6/11)[Kremote: Counting objects:  63% (7/11)[Kremote: Counting objects:  72% (8/11)[Kremote: Counting objects:  81% (9/11)[Kremote: Counting objects:  90% (10/11)[Kremote: Counting objects: 100% (11/11)[Kremote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects:  50% (1/2)[Kremote: Compressing objects: 100% (2/2)[Kremote: Compressing objects: 100% (2/2), done.[K
remote: Total 6 (delta 4), reused 6 (delta 4), pack-reused 0 (from 0)[K
Unpacking objects:  16% (1/6)Unpacking objects:  33% (2/6)Unpacking objects:  50% (3/6)Unpacking objects:  66% (4/6)Unpacking objects:  83% (5/6)Unpacking objects: 100% (6/6)Unpacking objects: 100% (6/6), 1.30 KiB | 664.00 KiB/s, done.
From 

In [2]:
### If running on Google Colab, mount Google Drive
try:
    from google.colab import drive
except ImportError:
    # The google.colab package is supplied by the Colab runtime. When it is
    # missing there is no Drive to mount, so report the skip instead of
    # aborting the notebook with an ImportError.
    drive = None
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
from utils.functions import set_seed
from utils.constants import DATA_PATH, MODEL_SAVE_PATH

# Seed the run through the project helper `set_seed` (its implementation is
# not shown in this notebook); the value is named so it is easy to find and change.
SEED = 42
set_seed(SEED)

#### Load the preprocessed data

In [4]:
# Load the preprocessed train/validation/test splits.
# The path is handed to pandas directly (instead of an `open(...)` handle that
# was never closed), so pandas opens and closes each file itself.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted location.
train_df = pd.read_pickle(f"{DATA_PATH}/data/processed/train.pkl")
val_df = pd.read_pickle(f"{DATA_PATH}/data/processed/val.pkl")
test_df = pd.read_pickle(f"{DATA_PATH}/data/processed/test.pkl")
# The last expression is rendered as the cell output: a preview of the training split.
train_df.head()

Unnamed: 0,subject,text,label,source,sender_hash,reply_to_hash,date
0,Personal Finance: Resolutions You Can Keep,personal finance resolutions you keep motley f...,0,hard_ham,bb339a04eb35de16f6386c5ca0d57fd88b20916663bd84...,3d0448fc6a4d02914e3adf6812ede7310a82838909afac...,"Wed, 02 Jan 2002 13:55:00 -0500"
1,Please help a newbie compile mplayer :-),please help newbie compile mplayer hello i jus...,0,easy_ham,2f890790e67625bdfd8e3c7cca018bf511c2cbca431554...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Thu, 31 Jan 2002 22:44:14 -0700"
2,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer make sur...,0,easy_ham,d83f5738686fa88436e12f3710c15b270666e3061ba627...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 00:53:41 -0600"
3,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer lance wr...,0,easy_ham,2f890790e67625bdfd8e3c7cca018bf511c2cbca431554...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 02:01:44 -0700"
4,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer once upo...,0,easy_ham,f9579e33dbc2d625e2ba35d53c611b8c3bd09cca4c7760...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 10:29:23 +0100"


In [5]:
# Build the vocabulary from the training split's text column only.
from utils.functions import build_vocab

train_texts = train_df['text']
# build_vocab returns a pair, bound here as word2idx / idx2word.
word2idx, idx2word = build_vocab(train_texts)

#### BERT Model Configuration Verification

The BERT model is configured with the following specifications:
- **Base Model**: bert-base-uncased (12 layers, 768 hidden dim)
- **Classification Head**: Added linear layer for spam detection
- **Explainability**: Integrated gradients and attention analysis
- **Regularization**:
  - Dropout: p=0.2
  - Label smoothing: ε=0.1
  - Gradient clipping: max norm=1.0
- **Optimization**:
  - Layer-wise learning rate decay (lower layers get lower LR)
  - Base learning rate: 2e-5
  - AdamW optimizer with weight decay
- **Training**: End-to-end fine-tuning on spam detection task

In [6]:
# Parameters for the BERT training run.
# BERT ships with its own pre-trained embeddings, so no GloVe matrix is supplied.
pretrained_embeddings = None  # intentionally unset for BERT
max_len = 200
embedding_dim = 768  # hidden size of BERT

#### Train the BERT model

In [7]:
from training.trainer import train_model

# Timestamps bracket the training call so the elapsed time can be derived afterwards.
start_time = pd.Timestamp.now()

# Keyword settings forwarded unchanged to the trainer for the BERT run.
bert_train_kwargs = dict(
    embedding_dim=embedding_dim,
    pretrained_embeddings=pretrained_embeddings,
    model_save_path=MODEL_SAVE_PATH,
    max_len=max_len,
    evaluate=True,
)
model = train_model('bert', train_df, val_df, test_df, **bert_train_kwargs)

end_time = pd.Timestamp.now()

Training BERT with enhanced regularization:
- Dropout: 0.2
- Label smoothing: 0.1
- Gradient clipping: 1.0
- Layer-wise learning rate decay
- Learning rate: 2e-05
Epoch 1/10 - Train Loss: 0.3273, Val Loss: 0.0995
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_bert_model.pt
Epoch 2/10 - Train Loss: 0.2436, Val Loss: 0.1248
Epoch 3/10 - Train Loss: 0.2241, Val Loss: 0.1123
Epoch 4/10 - Train Loss: 0.2097, Val Loss: 0.1116
Epoch 5/10 - Train Loss: 0.2068, Val Loss: 0.1157
Epoch 6/10 - Train Loss: 0.2054, Val Loss: 0.1131
Early stopping triggered after 6 epochs
Final model saved to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/spam_bert_final.pt

Evaluating on test set:
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       415
         1.0       0.95      0.96      0.96       191

    accuracy                           0.97       606
   macro avg       0.97      0.97      0.

In [8]:
# Elapsed wall-clock time of the training cell: the difference between the two
# pd.Timestamp values captured around train_model, i.e. a pandas Timedelta.
training_time = end_time - start_time
print(f"Training completed in: {training_time}")

Training completed in: 0 days 00:05:07.849667
