#### Environment Setup

In [1]:
import os

workspace_dir = '/content/spam-detection'
branch = 'feature/bilstm-training'
current_dir = os.getcwd()
if not os.path.exists(workspace_dir) and current_dir != workspace_dir:
    !git clone https://github.com/RationalEar/spam-detection.git
    os.chdir(workspace_dir)
    !git checkout $branch
    !ls -al
    !pip install -q transformers==4.48.0 scikit-learn pandas numpy
    !pip install -q torch --index-url https://download.pytorch.org/whl/cu126
else:
    os.chdir(workspace_dir)
    !git pull origin $branch

Cloning into 'spam-detection'...
remote: Enumerating objects: 221, done.[K
remote: Counting objects: 100% (221/221), done.[K
remote: Compressing objects: 100% (147/147), done.[K
remote: Total 221 (delta 99), reused 177 (delta 61), pack-reused 0 (from 0)[K
Receiving objects: 100% (221/221), 2.04 MiB | 21.35 MiB/s, done.
Resolving deltas: 100% (99/99), done.
Branch 'feature/bilstm-training' set up to track remote branch 'feature/bilstm-training' from 'origin'.
Switched to a new branch 'feature/bilstm-training'
total 68
drwxr-xr-x 11 root root 4096 Jun 27 07:33 .
drwxr-xr-x  1 root root 4096 Jun 27 07:33 ..
-rw-r--r--  1 root root  584 Jun 27 07:33 docker-compose.yml
-rw-r--r--  1 root root  879 Jun 27 07:33 Dockerfile
-rw-r--r--  1 root root   92 Jun 27 07:33 .dockerignore
drwxr-xr-x  2 root root 4096 Jun 27 07:33 docs
drwxr-xr-x  2 root root 4096 Jun 27 07:33 explainability
drwxr-xr-x  8 root root 4096 Jun 27 07:33 .git
-rw-r--r--  1 root root   38 Jun 27 07:33 .gitignore
drwxr-xr-x

In [2]:
# Google Colab only: attach Google Drive to the VM's filesystem at /content/drive.
# The import below is unconditional, so this cell fails outside Colab.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# pandas plus project-local helpers and path constants.
# NOTE(review): `utils` is presumably a package inside the cloned repository, so these
# imports rely on the setup cell having changed into the checkout — confirm.
import pandas as pd
from utils.functions import set_seed
from utils.constants import DATA_PATH, MODEL_SAVE_PATH

# Seed once, before any data loading or training.
# NOTE(review): presumably for run-to-run repeatability; which RNGs set_seed covers
# (random / numpy / torch) is decided in utils.functions — confirm there.
set_seed(42)

#### Load the preprocessed data

In [4]:
# Load the preprocessed train / validation / test splits.
# The path is passed directly to pandas, which opens and closes the file itself,
# so no file handles are left dangling.
# NOTE(review): unpickling can execute arbitrary code — only load files you produced yourself.
train_df = pd.read_pickle(f"{DATA_PATH}/data/processed/train.pkl")
val_df = pd.read_pickle(f"{DATA_PATH}/data/processed/val.pkl")
test_df = pd.read_pickle(f"{DATA_PATH}/data/processed/test.pkl")
# Preview the first rows of the training split as the cell's output.
train_df.head()

Unnamed: 0,subject,text,label,source,sender_hash,reply_to_hash,date
0,Personal Finance: Resolutions You Can Keep,personal finance resolutions you keep motley f...,0,hard_ham,bb339a04eb35de16f6386c5ca0d57fd88b20916663bd84...,3d0448fc6a4d02914e3adf6812ede7310a82838909afac...,"Wed, 02 Jan 2002 13:55:00 -0500"
1,Please help a newbie compile mplayer :-),please help newbie compile mplayer hello i jus...,0,easy_ham,2f890790e67625bdfd8e3c7cca018bf511c2cbca431554...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Thu, 31 Jan 2002 22:44:14 -0700"
2,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer make sur...,0,easy_ham,d83f5738686fa88436e12f3710c15b270666e3061ba627...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 00:53:41 -0600"
3,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer lance wr...,0,easy_ham,2f890790e67625bdfd8e3c7cca018bf511c2cbca431554...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 02:01:44 -0700"
4,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer once upo...,0,easy_ham,f9579e33dbc2d625e2ba35d53c611b8c3bd09cca4c7760...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 10:29:23 +0100"


In [5]:
# Build vocabulary from training data
# Only the training split's 'text' column is passed in; validation and test text are not used here.
# NOTE(review): word2idx / idx2word look like token->index and index->token mappings —
# confirm against build_vocab in utils.functions.
from utils.functions import build_vocab

word2idx, idx2word = build_vocab(train_df['text'])

In [6]:
from preprocess.data_loader import load_glove_embeddings

# Embedding width; also selects which GloVe file is read below.
embedding_dim = 300
# Passed to train_model as max_len further down.
# NOTE(review): presumably the maximum sequence length in tokens — confirm in the trainer.
max_len = 200

# Load GloVe embeddings
# The file name is derived from embedding_dim so the vectors on disk and the dimension
# handed to the loader cannot drift apart when the value above is changed.
GLOVE_PATH = os.path.join(DATA_PATH, f'data/raw/glove.6B/glove.6B.{embedding_dim}d.txt')
pretrained_embeddings = load_glove_embeddings(GLOVE_PATH, word2idx, embedding_dim)

#### Train the BiLSTM model

In [7]:
from training.trainer import train_model

# Timestamps taken immediately before and after the call; the following cell subtracts
# them to report the duration. The interval covers everything train_model does,
# including whatever evaluate=True adds.
start_time = pd.Timestamp.now()
model = train_model(
    'bilstm',
    train_df,
    val_df,
    test_df,
    embedding_dim=embedding_dim,
    pretrained_embeddings=pretrained_embeddings,
    model_save_path=MODEL_SAVE_PATH,
    max_len=max_len,
    evaluate=True,
)
end_time = pd.Timestamp.now()

Epoch 1/40:
Train Loss: 0.3078, Train Acc: 0.8656
Val Loss: 0.1545, Val Acc: 0.9470
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_bilstm_model.pt
Epoch 2/40:
Train Loss: 0.1295, Train Acc: 0.9595
Val Loss: 0.1257, Val Acc: 0.9503
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_bilstm_model.pt
Epoch 3/40:
Train Loss: 0.0915, Train Acc: 0.9721
Val Loss: 0.1026, Val Acc: 0.9636
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_bilstm_model.pt
Epoch 4/40:
Train Loss: 0.0648, Train Acc: 0.9775
Val Loss: 0.1256, Val Acc: 0.9619
Epoch 5/40:
Train Loss: 0.0454, Train Acc: 0.9868
Val Loss: 0.1717, Val Acc: 0.9520
Epoch 6/40:
Train Loss: 0.0345, Train Acc: 0.9899
Val Loss: 0.1732, Val Acc: 0.9536
Epoch 7/40:
Train Loss: 0.0259, Train Acc: 0.9926
Val Loss: 0.1471, Val Acc: 0.9520
Epoch 8/40:
Train Loss: 0.0200, Train Acc: 0.9921
Val Loss: 0.1874, Val Acc: 0.9503
Epoc

In [8]:
# Elapsed wall-clock time between the two pd.Timestamp.now() readings taken around
# the train_model call; the difference of two Timestamps prints as "D days HH:MM:SS.ffffff".
training_time = end_time - start_time
print(f"Training completed in: {training_time}")

Training completed in: 0 days 00:00:46.961394
