In [2]:
!git clone https://github.com/RationalEar/spam_detection2.git

Cloning into 'spam_detection2'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 22 (delta 3), reused 22 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (22/22), 39.61 KiB | 19.80 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [4]:
import os
os.chdir('/content/spam_detection2')
!ls -al

total 196
drwxr-xr-x 6 root root   4096 May  3 14:06 .
drwxr-xr-x 1 root root   4096 May  3 14:06 ..
-rw-r--r-- 1 root root    637 May  3 14:06 changelog.md
drwxr-xr-x 8 root root   4096 May  3 14:06 .git
drwxr-xr-x 2 root root   4096 May  3 14:06 .github
-rw-r--r-- 1 root root     61 May  3 14:06 .gitignore
-rw-r--r-- 1 root root   5754 May  3 14:06 implementation-plan.md
-rw-r--r-- 1 root root   1015 May  3 14:06 local.ipynb
drwxr-xr-x 2 root root   4096 May  3 14:06 models
-rw-r--r-- 1 root root   1754 May  3 14:06 requirements.txt
-rw-r--r-- 1 root root   9128 May  3 14:06 SpamDetection.ipynb
-rw-r--r-- 1 root root 136084 May  3 14:06 thesis.md
drwxr-xr-x 2 root root   4096 May  3 14:06 utils


In [5]:
# Google Colab: Environment Setup
!pip install -q transformers==4.48.0 scikit-learn pandas numpy matplotlib mlflow beautifulsoup4 shap lime
!pip install -q torch --index-url https://download.pytorch.org/whl/cu126

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m112.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.0/29.0 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m12.4 MB/s[0m eta 

In [6]:
from google.colab import drive
import random
import numpy as np
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from utils.preprocessor import preprocess_text, load_glove_embeddings
from models.cnn import SpamCNN
from models.bilstm import BiLSTMSpam
from models.bert import SpamBERT
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

In [7]:
# Mount Google Drive for saving models
drive.mount('/content/drive')
ROOT_PATH = '/content/drive/MyDrive/Projects/spam_detection2/'
MODEL_SAVE_PATH = os.path.join(ROOT_PATH, 'models')
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

Mounted at /content/drive


In [8]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_seed(42)

In [10]:
# Load preprocessed data (assumes PKLs/CSVs are available in data/processed/)
train_df = pd.read_pickle(ROOT_PATH + 'data/processed/train.pkl')
test_df = pd.read_pickle(ROOT_PATH + 'data/processed/test.pkl')

In [11]:
# Build vocabulary from training data
def build_vocab(texts, min_freq=2):
    from collections import Counter
    counter = Counter()
    for text in texts:
        counter.update(text.split())
    vocab = {word for word, freq in counter.items() if freq >= min_freq}
    word2idx = {word: idx+2 for idx, word in enumerate(sorted(vocab))}
    word2idx['<PAD>'] = 0
    word2idx['<UNK>'] = 1
    return word2idx

word2idx = build_vocab(train_df['text'])

In [12]:
# Tokenize and numericalize
max_len = 200
def encode(text, word2idx, max_len=200):
    tokens = text.split()
    idxs = [word2idx.get(token, word2idx['<UNK>']) for token in tokens]
    if len(idxs) < max_len:
        idxs += [word2idx['<PAD>']] * (max_len - len(idxs))
    else:
        idxs = idxs[:max_len]
    return idxs

In [13]:
X_train = torch.tensor([encode(t, word2idx, max_len) for t in train_df['text']])
y_train = torch.tensor(train_df['label'].values, dtype=torch.float32)
X_test = torch.tensor([encode(t, word2idx, max_len) for t in test_df['text']])
y_test = torch.tensor(test_df['label'].values, dtype=torch.float32)

In [14]:
# Load GloVe embeddings
GLOVE_PATH = os.path.join(ROOT_PATH, 'data/raw/glove.6B/glove.6B.300d.txt')
embedding_dim = 300
pretrained_embeddings = load_glove_embeddings(GLOVE_PATH, word2idx, embedding_dim)

In [21]:
# Choose model: 'cnn', 'bilstm', or 'bert'
model_type = 'bert'  # Change to 'bilstm' or 'bert' as needed

if model_type == 'cnn':
    model = SpamCNN(vocab_size=len(word2idx), embedding_dim=embedding_dim, pretrained_embeddings=pretrained_embeddings)
    train_inputs, train_labels = X_train, y_train
    test_inputs, test_labels = X_test, y_test
elif model_type == 'bilstm':
    model = BiLSTMSpam(vocab_size=len(word2idx), embedding_dim=embedding_dim, pretrained_embeddings=pretrained_embeddings)
    train_inputs, train_labels = X_train, y_train
    test_inputs, test_labels = X_test, y_test
elif model_type == 'bert':
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    def bert_encode(texts, tokenizer, max_len=200):
        return tokenizer(texts.tolist(), padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')
    train_encodings = bert_encode(train_df['text'], tokenizer, max_len)
    test_encodings = bert_encode(test_df['text'], tokenizer, max_len)
    model = SpamBERT()
    train_inputs, train_labels = train_encodings, y_train
    test_inputs, test_labels = test_encodings, y_test
else:
    raise ValueError('Invalid model_type')

In [22]:
# Move model to GPU if available
model = model.cuda() if torch.cuda.is_available() else model

# Training Loop
batch_size = 32
epochs = 10
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-4)

if model_type in ['cnn', 'bilstm']:
    train_dataset = TensorDataset(train_inputs, train_labels)
    test_dataset = TensorDataset(test_inputs, test_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
else:  # BERT
    train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
    test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        if model_type == 'bert':
            input_ids, attention_mask, labels = [b.cuda() if torch.cuda.is_available() else b for b in batch]
            outputs, _ = model(input_ids=input_ids, attention_mask=attention_mask)
        else:
            inputs, labels = [b.cuda() if torch.cuda.is_available() else b for b in batch]
            outputs = model(inputs)
            if isinstance(outputs, tuple):
                outputs = outputs[0]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(train_loader):.4f}")

# Save model to Google Drive
model_save_file = os.path.join(MODEL_SAVE_PATH, f'spam_{model_type}.pt')
model.save(model_save_file)
print(f"Model saved to {model_save_file}")

Epoch 1/10 - Loss: 0.2237
Epoch 2/10 - Loss: 0.2631
Epoch 3/10 - Loss: 0.6303
Epoch 4/10 - Loss: 0.6259
Epoch 5/10 - Loss: 0.6305
Epoch 6/10 - Loss: 0.6299
Epoch 7/10 - Loss: 0.6259
Epoch 8/10 - Loss: 0.6270
Epoch 9/10 - Loss: 0.6285
Epoch 10/10 - Loss: 0.6328
Model saved to /content/drive/MyDrive/Projects/spam_detection2/models/spam_bert.pt
