#### Environment Setup

In [1]:
import os

workspace_dir = '/content/spam-detection'
branch = 'master'
current_dir = os.getcwd()
if not os.path.exists(workspace_dir) and current_dir != workspace_dir:
    !git clone https://github.com/RationalEar/spam-detection.git
    os.chdir(workspace_dir)
    !git checkout $branch
    !ls -al
    !pip install -q transformers==4.48.0 scikit-learn pandas numpy
    !pip install -q torch --index-url https://download.pytorch.org/whl/cu126
    !pip install captum --no-deps --ignore-installed
else:
    os.chdir(workspace_dir)
    !git pull origin $branch

Cloning into 'spam-detection'...
remote: Enumerating objects: 328, done.[K
remote: Counting objects: 100% (328/328), done.[K
remote: Compressing objects: 100% (220/220), done.[K
remote: Total 328 (delta 153), reused 253 (delta 88), pack-reused 0 (from 0)[K
Receiving objects: 100% (328/328), 3.48 MiB | 12.73 MiB/s, done.
Resolving deltas: 100% (153/153), done.
Branch 'feature/cnn-model' set up to track remote branch 'feature/cnn-model' from 'origin'.
Switched to a new branch 'feature/cnn-model'
total 48
drwxr-xr-x 10 root root 4096 Jul 19 09:34 .
drwxr-xr-x  1 root root 4096 Jul 19 09:34 ..
drwxr-xr-x  2 root root 4096 Jul 19 09:34 docs
drwxr-xr-x  8 root root 4096 Jul 19 09:34 .git
-rw-r--r--  1 root root   30 Jul 19 09:34 .gitignore
drwxr-xr-x  2 root root 4096 Jul 19 09:34 integrations
drwxr-xr-x  2 root root 4096 Jul 19 09:34 metrics
drwxr-xr-x  2 root root 4096 Jul 19 09:34 models
drwxr-xr-x  2 root root 4096 Jul 19 09:34 preprocess
-rw-r--r--  1 root root 2441 Jul 19 09:34 req

In [2]:
### If running on Google Colab, mount Google Drive
# The import is guarded so that "Run All" outside Colab skips the mount
# instead of aborting the notebook with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available; skipping Google Drive mount")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
from utils.functions import set_seed
# NOTE(review): this cell's output shows DATA_PATH / WORKSPACE_DIR lines even
# though nothing here prints; presumably utils.constants prints them on
# import — confirm.
from utils.constants import DATA_PATH, MODEL_SAVE_PATH

# Seed once, before any data loading or training cell runs.
# NOTE(review): which random number generators set_seed covers is not visible
# from this notebook — confirm in utils.functions.
set_seed(42)

DATA_PATH: /content/drive/MyDrive/Projects/spam-detection-data
WORKSPACE_DIR: /content/spam-detection


#### Load the preprocessed data

In [4]:
# Load the preprocessed train / validation / test splits.
# The path is handed to pandas directly (instead of an `open(...)` handle that
# was never closed) so pandas opens and closes each file itself.
# NOTE: unpickling can execute arbitrary code — only load pickles produced by
# this project's own preprocessing step.
train_df = pd.read_pickle(f"{DATA_PATH}/data/processed/train.pkl")
val_df = pd.read_pickle(f"{DATA_PATH}/data/processed/val.pkl")
test_df = pd.read_pickle(f"{DATA_PATH}/data/processed/test.pkl")
train_df.head()

Unnamed: 0,subject,text,label,source,sender_hash,reply_to_hash,date
0,Personal Finance: Resolutions You Can Keep,personal finance resolutions you keep motley f...,0,hard_ham,bb339a04eb35de16f6386c5ca0d57fd88b20916663bd84...,3d0448fc6a4d02914e3adf6812ede7310a82838909afac...,"Wed, 02 Jan 2002 13:55:00 -0500"
1,Please help a newbie compile mplayer :-),please help newbie compile mplayer hello i jus...,0,easy_ham,2f890790e67625bdfd8e3c7cca018bf511c2cbca431554...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Thu, 31 Jan 2002 22:44:14 -0700"
2,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer make sur...,0,easy_ham,d83f5738686fa88436e12f3710c15b270666e3061ba627...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 00:53:41 -0600"
3,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer lance wr...,0,easy_ham,2f890790e67625bdfd8e3c7cca018bf511c2cbca431554...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 02:01:44 -0700"
4,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer once upo...,0,easy_ham,f9579e33dbc2d625e2ba35d53c611b8c3bd09cca4c7760...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 10:29:23 +0100"


In [5]:
from utils.functions import build_vocab

# The vocabulary is built from the training split's `text` column only;
# the validation and test frames are not passed to build_vocab.
train_texts = train_df['text']
word2idx, idx2word = build_vocab(train_texts)

In [6]:
from preprocess.data_loader import load_glove_embeddings

# Embedding width and maximum sequence length used for this run.
embedding_dim = 300
max_len = 200

# Load GloVe embeddings.
# The file name is derived from embedding_dim so the vectors read from disk
# cannot silently disagree with the dimension passed to the loader
# (GloVe 6B files are named glove.6B.<dim>d.txt).
GLOVE_PATH = os.path.join(DATA_PATH, f'data/raw/glove.6B/glove.6B.{embedding_dim}d.txt')
pretrained_embeddings = load_glove_embeddings(GLOVE_PATH, word2idx, embedding_dim)

#### Train the CNN model

In [7]:
from training.trainer import train_model

# Keyword settings for the CNN run, gathered in one place for readability.
cnn_train_options = dict(
    embedding_dim=embedding_dim,
    pretrained_embeddings=pretrained_embeddings,
    model_save_path=MODEL_SAVE_PATH,
    max_len=max_len,
    evaluate=True,
)

# Bracket the whole train_model call with wall-clock timestamps; the
# difference is reported in a later cell.
start_time = pd.Timestamp.now()
model = train_model('cnn', train_df, val_df, test_df, **cnn_train_options)
end_time = pd.Timestamp.now()

Epoch 1/50 - Train Loss: 0.3521, Val Loss: 0.1854
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_cnn_model.pt
Epoch 2/50 - Train Loss: 0.1344, Val Loss: 0.1480
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_cnn_model.pt
Epoch 3/50 - Train Loss: 0.0892, Val Loss: 0.1377
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_cnn_model.pt
Epoch 4/50 - Train Loss: 0.0615, Val Loss: 0.1669
Epoch 5/50 - Train Loss: 0.0466, Val Loss: 0.1692
Epoch 6/50 - Train Loss: 0.0343, Val Loss: 0.2322
Epoch 7/50 - Train Loss: 0.0310, Val Loss: 0.2303
Epoch 8/50 - Train Loss: 0.0152, Val Loss: 0.2248
Early stopping triggered after 8 epochs
Final model saved to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/spam_cnn_final.pt

Evaluating on test set:
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97       415
     

In [8]:
# Elapsed wall-clock time between the two timestamps taken around the
# train_model call. That call was made with evaluate=True, so the span covers
# everything train_model did, not only the training epochs.
training_time = end_time - start_time
print(f"Training completed in: {training_time}")

Training completed in: 0 days 00:00:13.512176
