#### Environment Setup

In [1]:
import os

workspace_dir = '/content/spam-detection'
branch = 'feature/cnn-model'
current_dir = os.getcwd()
if not os.path.exists(workspace_dir) and current_dir != workspace_dir:
    !git clone https://github.com/RationalEar/spam-detection.git
    os.chdir(workspace_dir)
    !git checkout $branch
    !ls -al
    !pip install -q transformers==4.48.0 scikit-learn pandas numpy
    !pip install -q torch --index-url https://download.pytorch.org/whl/cu126
else:
    os.chdir(workspace_dir)
    !git pull origin $branch

Cloning into 'spam-detection'...
remote: Enumerating objects: 116, done.[K
remote: Counting objects: 100% (116/116), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 116 (delta 43), reused 101 (delta 29), pack-reused 0 (from 0)[K
Receiving objects: 100% (116/116), 135.78 KiB | 739.00 KiB/s, done.
Resolving deltas: 100% (43/43), done.
Branch 'feature/cnn-model' set up to track remote branch 'feature/cnn-model' from 'origin'.
Switched to a new branch 'feature/cnn-model'
total 48
drwxr-xr-x 10 root root 4096 Jun  8 16:26 .
drwxr-xr-x  1 root root 4096 Jun  8 16:26 ..
drwxr-xr-x  2 root root 4096 Jun  8 16:26 docs
drwxr-xr-x  8 root root 4096 Jun  8 16:26 .git
-rw-r--r--  1 root root   30 Jun  8 16:26 .gitignore
drwxr-xr-x  2 root root 4096 Jun  8 16:26 integrations
drwxr-xr-x  2 root root 4096 Jun  8 16:26 metrics
drwxr-xr-x  2 root root 4096 Jun  8 16:26 models
drwxr-xr-x  2 root root 4096 Jun  8 16:26 preprocess
-rw-r--r--  1 root root 2441 Jun  8 16:26 requi

In [2]:
# Google Colab only: attach Google Drive so that files stored under
# /content/drive become visible to the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import pandas as pd
from utils.functions import set_seed
from utils.constants import DATA_PATH, MODEL_SAVE_PATH

# Single named seed for the whole notebook; passed to the project's set_seed
# helper (presumably seeds the RNGs used during training -- see utils.functions).
SEED = 42
set_seed(SEED)

DATA_PATH: /content/drive/MyDrive/Projects/spam-detection-data
WORKSPACE_DIR: /content/spam-detection


#### Load the preprocessed data

In [4]:
# Load the pickled train / validation / test DataFrames from DATA_PATH.
# The paths are handed straight to pandas so it opens *and closes* each file
# itself; wrapping them in a bare open(...) left three file handles unclosed.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project's own preprocessing step.
processed_dir = f"{DATA_PATH}/data/processed"
train_df = pd.read_pickle(f"{processed_dir}/train.pkl")
val_df = pd.read_pickle(f"{processed_dir}/val.pkl")
test_df = pd.read_pickle(f"{processed_dir}/test.pkl")
train_df.head()

Unnamed: 0,subject,text,label,source,sender_hash,reply_to_hash,date
0,Personal Finance: Resolutions You Can Keep,personal finance resolutions you keep motley f...,0,hard_ham,bb339a04eb35de16f6386c5ca0d57fd88b20916663bd84...,3d0448fc6a4d02914e3adf6812ede7310a82838909afac...,"Wed, 02 Jan 2002 13:55:00 -0500"
1,Please help a newbie compile mplayer :-),please help newbie compile mplayer hello i jus...,0,easy_ham,2f890790e67625bdfd8e3c7cca018bf511c2cbca431554...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Thu, 31 Jan 2002 22:44:14 -0700"
2,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer make sur...,0,easy_ham,d83f5738686fa88436e12f3710c15b270666e3061ba627...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 00:53:41 -0600"
3,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer lance wr...,0,easy_ham,2f890790e67625bdfd8e3c7cca018bf511c2cbca431554...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 02:01:44 -0700"
4,Re: Please help a newbie compile mplayer :-),re please help newbie compile mplayer once upo...,0,easy_ham,f9579e33dbc2d625e2ba35d53c611b8c3bd09cca4c7760...,492368811b79453838d5e7e3692f607adee8d7e71ddd2e...,"Fri, 01 Feb 2002 10:29:23 +0100"


In [5]:
from utils.functions import build_vocab

# The vocabulary is built from the training split's `text` column only;
# validation and test text are not passed to build_vocab.
train_texts = train_df['text']
word2idx, idx2word = build_vocab(train_texts)

In [6]:
from preprocess.data_loader import load_glove_embeddings

# Load GloVe embeddings
embedding_dim = 300  # size of each GloVe vector; also passed to the model below
max_len = 200        # forwarded to train_model as max_len in the training cell
# The GloVe 6B files are named by vector size (glove.6B.<dim>d.txt). Deriving the
# file name from embedding_dim keeps the file on disk and the dimension handed to
# the loader from drifting apart when embedding_dim is changed.
GLOVE_PATH = os.path.join(DATA_PATH, f'data/raw/glove.6B/glove.6B.{embedding_dim}d.txt')
pretrained_embeddings = load_glove_embeddings(GLOVE_PATH, word2idx, embedding_dim)

#### Train the CNN model

In [11]:
from training.trainer import train_model

# Train the 'cnn' model on the three splits using the pretrained embeddings.
# evaluate=True is passed so the run also reports on the test split (see the
# "Evaluating on test set" section of the cell output). The call is left as the
# cell's last expression so its return value is displayed.
train_model(
    'cnn',
    train_df,
    val_df,
    test_df,
    embedding_dim=embedding_dim,
    pretrained_embeddings=pretrained_embeddings,
    model_save_path=MODEL_SAVE_PATH,
    max_len=max_len,
    evaluate=True,
)



Epoch 1/50 - Train Loss: 0.3816, Val Loss: 0.2301
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_cnn_model.pt
Epoch 2/50 - Train Loss: 0.1406, Val Loss: 0.1357
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_cnn_model.pt
Epoch 3/50 - Train Loss: 0.0864, Val Loss: 0.1366
Epoch 4/50 - Train Loss: 0.0620, Val Loss: 0.1597
Epoch 5/50 - Train Loss: 0.0499, Val Loss: 0.1130
Saved best model to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/best_cnn_model.pt
Epoch 6/50 - Train Loss: 0.0318, Val Loss: 0.1517
Epoch 7/50 - Train Loss: 0.0413, Val Loss: 0.1466
Epoch 8/50 - Train Loss: 0.0227, Val Loss: 0.2477
Epoch 9/50 - Train Loss: 0.0193, Val Loss: 0.1844
Epoch 10/50 - Train Loss: 0.0132, Val Loss: 0.2053
Early stopping triggered after 10 epochs
Final model saved to /content/drive/MyDrive/Projects/spam-detection-data/trained-models/spam_cnn_final.pt

Evaluating on test set:
            

SpamCNN(
  (embedding): Embedding(25245, 300)
  (conv1): Conv1d(300, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(128, 64, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv3): Conv1d(64, 32, kernel_size=(7,), stride=(1,), padding=(3,))
  (global_max_pool): AdaptiveMaxPool1d(output_size=1)
  (fc1): Linear(in_features=32, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)