#### Google Colab: Environment Setup

In [1]:
# clone the repository if directory `/content/spam_detection2` does not exist and we're not already in it
import os

workspace_dir = '/content/spam_detection2'
current_dir = os.getcwd()
if not os.path.exists(workspace_dir) and current_dir != workspace_dir:
    !git clone https://github.com/RationalEar/spam_detection2.git
    os.chdir(workspace_dir)
    !ls -al
    !pip install -q transformers==4.48.0 scikit-learn pandas numpy matplotlib mlflow beautifulsoup4 shap lime
    !pip install -q torch --index-url https://download.pytorch.org/whl/cu126
else:
    os.chdir(workspace_dir)
    !git pull origin feature/bilstm-evaluation

remote: Enumerating objects: 5, done.[K
remote: Counting objects:  20% (1/5)[Kremote: Counting objects:  40% (2/5)[Kremote: Counting objects:  60% (3/5)[Kremote: Counting objects:  80% (4/5)[Kremote: Counting objects: 100% (5/5)[Kremote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (1/1)[Kremote: Compressing objects: 100% (1/1), done.[K
remote: Total 3 (delta 2), reused 3 (delta 2), pack-reused 0 (from 0)[K
Unpacking objects:  33% (1/3)Unpacking objects:  66% (2/3)Unpacking objects: 100% (3/3)Unpacking objects: 100% (3/3), 1.38 KiB | 1.38 MiB/s, done.
From https://github.com/RationalEar/spam_detection2
 * branch            feature/bilstm-evaluation -> FETCH_HEAD
   40d4950..0828e24  feature/bilstm-evaluation -> origin/feature/bilstm-evaluation
Updating 40d4950..0828e24
Fast-forward
 train.py | 26 [32m+++++++++++++++++++++++[m[31m---[m
 1 file changed, 23 insertions(+), 3 deletions(-)


In [2]:
from google.colab import drive
import pandas as pd
from utils.preprocessor import load_glove_embeddings
from train import train_model

In [3]:
# Mount Google Drive for saving models
# (`os` is imported by the environment-setup cell; `drive` comes from google.colab.)
drive.mount('/content/drive')
# Root of the project folder on the mounted Drive; data and model paths are derived from it.
ROOT_PATH = '/content/drive/MyDrive/Projects/spam_detection2/'
# Directory handed to train_model() as `model_save_path`.
MODEL_SAVE_PATH = os.path.join(ROOT_PATH, 'models')
# Create the directory on first use; a no-op when it already exists.
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from train import set_seed

# Fix the seed before any data is loaded or any model is trained.
# NOTE(review): presumably seeds the Python/NumPy/PyTorch RNGs — confirm in train.py.
set_seed(42)

#### Load preprocessed data (assumes `train.pkl` and `test.pkl` are available under `data/processed/` in the Drive project folder)

In [5]:
# Load the preprocessed train/test splits from the Drive project folder.
# Paths are built with os.path.join (as for MODEL_SAVE_PATH and GLOVE_PATH) so
# they do not depend on ROOT_PATH ending with a slash.
# NOTE: read_pickle can execute arbitrary code while unpickling — only load
# files that this project produced itself.
train_df = pd.read_pickle(os.path.join(ROOT_PATH, 'data/processed/train.pkl'))
test_df = pd.read_pickle(os.path.join(ROOT_PATH, 'data/processed/test.pkl'))

In [6]:
# Build vocabulary from training data
from utils.functions import build_vocab

# Only the training split's `text` column is passed in, so the test split does
# not contribute to the vocabulary. `word2idx` is later handed to the GloVe loader.
# NOTE(review): presumably word2idx / idx2word are token->index and index->token
# mappings — confirm against utils.functions.build_vocab.
word2idx, idx2word = build_vocab(train_df['text'])

In [7]:
# Load GloVe embeddings
# `embedding_dim` must be one of the dimensions shipped with GloVe 6B (50, 100, 200, 300).
embedding_dim = 300
# Passed to train_model() as `max_len` by the training cells.
max_len = 200
# Derive the file name from `embedding_dim` so the vector file and the declared
# dimension cannot drift apart (for 300 this is .../glove.6B/glove.6B.300d.txt).
GLOVE_PATH = os.path.join(ROOT_PATH, f'data/raw/glove.6B/glove.6B.{embedding_dim}d.txt')
# NOTE(review): presumably returns an embedding matrix aligned with `word2idx` —
# confirm against utils.preprocessor.load_glove_embeddings.
pretrained_embeddings = load_glove_embeddings(GLOVE_PATH, word2idx, embedding_dim)

#### Train CNN Model

In [None]:
# Train the CNN variant on the train split; the checkpoint directory is
# MODEL_SAVE_PATH and evaluation is requested via `evaluate=True`.
train_model(
    'cnn',
    train_df,
    test_df,
    embedding_dim=embedding_dim,
    pretrained_embeddings=pretrained_embeddings,
    model_save_path=MODEL_SAVE_PATH,
    max_len=max_len,
    evaluate=True,
)

Epoch 1/50 - Loss: 0.3674
Epoch 2/50 - Loss: 0.1603
Epoch 3/50 - Loss: 0.1075
Epoch 4/50 - Loss: 0.0653
Epoch 5/50 - Loss: 0.0533
Epoch 6/50 - Loss: 0.0426
Epoch 7/50 - Loss: 0.0336
Epoch 8/50 - Loss: 0.0319
Epoch 9/50 - Loss: 0.0288
Epoch 10/50 - Loss: 0.0261
Epoch 11/50 - Loss: 0.0215
Epoch 12/50 - Loss: 0.0272
Epoch 13/50 - Loss: 0.0143
Epoch 14/50 - Loss: 0.0193
Epoch 15/50 - Loss: 0.0170
Epoch 16/50 - Loss: 0.0135
Epoch 17/50 - Loss: 0.0126
Epoch 18/50 - Loss: 0.0125
Epoch 19/50 - Loss: 0.0115
Epoch 20/50 - Loss: 0.0308
Epoch 21/50 - Loss: 0.0318
Epoch 22/50 - Loss: 0.0212
Epoch 23/50 - Loss: 0.0184
Epoch 24/50 - Loss: 0.0137
Epoch 25/50 - Loss: 0.0169
Epoch 26/50 - Loss: 0.0134
Epoch 27/50 - Loss: 0.0114
Epoch 28/50 - Loss: 0.0107
Epoch 29/50 - Loss: 0.0104
Epoch 30/50 - Loss: 0.0105
Epoch 31/50 - Loss: 0.0099
Epoch 32/50 - Loss: 0.0101
Epoch 33/50 - Loss: 0.0102
Epoch 34/50 - Loss: 0.0114
Epoch 35/50 - Loss: 0.0102
Epoch 36/50 - Loss: 0.0105
Epoch 37/50 - Loss: 0.0104
Epoch 38/5

SpamCNN(
  (embedding): Embedding(25373, 300)
  (conv1): Conv1d(300, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(128, 64, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv3): Conv1d(64, 32, kernel_size=(7,), stride=(1,), padding=(3,))
  (global_max_pool): AdaptiveMaxPool1d(output_size=1)
  (fc1): Linear(in_features=32, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

#### Train BiLSTM Model

In [9]:
# Train the BiLSTM variant with the same data, embeddings and settings as the
# other models; only the model name differs.
train_model(
    'bilstm',
    train_df,
    test_df,
    embedding_dim=embedding_dim,
    pretrained_embeddings=pretrained_embeddings,
    model_save_path=MODEL_SAVE_PATH,
    max_len=max_len,
    evaluate=True,
)

Epoch 1/40:
Train Loss: 0.6237, Train Acc: 0.6884
Val Loss: 0.5116, Val Acc: 0.6479
Saved best model to /content/drive/MyDrive/Projects/spam_detection2/models/best_bilstm_model.pt
Epoch 2/40:
Train Loss: 0.5184, Train Acc: 0.6955
Val Loss: 0.5198, Val Acc: 0.7256
Epoch 3/40:
Train Loss: 0.4839, Train Acc: 0.7389
Val Loss: 0.4666, Val Acc: 0.7488
Saved best model to /content/drive/MyDrive/Projects/spam_detection2/models/best_bilstm_model.pt
Epoch 4/40:
Train Loss: 0.4115, Train Acc: 0.7610
Val Loss: 0.4624, Val Acc: 0.7479
Saved best model to /content/drive/MyDrive/Projects/spam_detection2/models/best_bilstm_model.pt
Epoch 5/40:
Train Loss: 0.4026, Train Acc: 0.7627
Val Loss: 0.4201, Val Acc: 0.7587
Saved best model to /content/drive/MyDrive/Projects/spam_detection2/models/best_bilstm_model.pt
Epoch 6/40:
Train Loss: 0.3857, Train Acc: 0.7835
Val Loss: 0.4266, Val Acc: 0.7579
Epoch 7/40:
Train Loss: 0.3694, Train Acc: 0.7819
Val Loss: 0.3833, Val Acc: 0.8000
Saved best model to /content

BiLSTMSpam(
  (embedding): Embedding(25373, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (attention): Attention(
    (attn): Linear(in_features=256, out_features=1, bias=True)
  )
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

#### Train BERT Model

In [None]:
# Train the BERT variant through the same entry point as the other models.
# NOTE(review): `embedding_dim` and `pretrained_embeddings` come from GloVe;
# whether train_model() uses them for 'bert' is not visible here — confirm in train.py.
train_model(
    'bert',
    train_df,
    test_df,
    embedding_dim=embedding_dim,
    pretrained_embeddings=pretrained_embeddings,
    model_save_path=MODEL_SAVE_PATH,
    max_len=max_len,
    evaluate=True,
)

Epoch 1/10 - Loss: 0.1961
Epoch 2/10 - Loss: 0.0694
Epoch 3/10 - Loss: 0.0415
Epoch 4/10 - Loss: 0.0269
Epoch 5/10 - Loss: 0.0239
Epoch 6/10 - Loss: 0.0220
Epoch 7/10 - Loss: 0.0179
Epoch 8/10 - Loss: 0.0147
Epoch 9/10 - Loss: 0.0117
Epoch 10/10 - Loss: 0.0115
Model saved to /content/drive/MyDrive/Projects/spam_detection2/models/spam_bert.pt
              precision    recall  f1-score   support

         0.0       0.99      0.98      0.98       830
         1.0       0.95      0.98      0.97       380

    accuracy                           0.98      1210
   macro avg       0.97      0.98      0.97      1210
weighted avg       0.98      0.98      0.98      1210

Confusion Matrix:
 [[810  20]
 [  7 373]]


SpamBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi