In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('./spam_ham_dataset.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [5]:
df1 = df.drop('Unnamed: 0', axis=1)

In [6]:
df1.text[1]

'Subject: hpl nom for january 9 , 2001\r\n( see attached file : hplnol 09 . xls )\r\n- hplnol 09 . xls'

In [7]:
df1.text = df.text.apply(lambda x: x.replace('\r\n', ' '))
df1.text[1]

'Subject: hpl nom for january 9 , 2001 ( see attached file : hplnol 09 . xls ) - hplnol 09 . xls'

In [8]:
df2 = df1.drop('label', axis=1)
len(df2.text)

5171

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

X = df2.text
y = df2.label_num
print(len(X),len(y))

5171 5171


In [10]:
vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X)
labels = df2.label_num.values

In [11]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_vec,labels, test_size=0.2, random_state=42, stratify=labels)

In [12]:
y_test.shape

(1035,)

In [13]:
from torch.utils.data import TensorDataset, DataLoader

In [14]:
X_train_tensor = torch.tensor(X_train.toarray(),dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.toarray(),dtype=torch.float32)
y_train_tensor = torch.tensor(y_train,dtype=torch.long)
y_test_tensor = torch.tensor(y_test,dtype=torch.long)


In [15]:
len(X_train_tensor), len(X_test_tensor)

(4136, 1035)

In [16]:
train_dataset = TensorDataset(X_train_tensor,y_train_tensor)
test_dataset = TensorDataset(X_test_tensor,y_test_tensor)

In [17]:
for i in range(3):
    X, y = train_dataset[i]
    print(f"Sample {i} - Label: {y}, Vector shape: {X.shape}")


Sample 0 - Label: 1, Vector shape: torch.Size([5000])
Sample 1 - Label: 1, Vector shape: torch.Size([5000])
Sample 2 - Label: 0, Vector shape: torch.Size([5000])


In [18]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [19]:
class  SpamClassifier(nn.Module):
    def __init__(self, input_size, out_put, layers, drop_out=0.5):
        super().__init__()

        layerlist = []

        for i in layers:
            layerlist.append(nn.Linear(input_size,i))
            layerlist.append(nn.ReLU())
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(drop_out))
            input_size = i

        layerlist.append(nn.Linear(layers[-1], out_put))

        self.layers = nn.Sequential(*layerlist)

    def forward(self,X):
        X = self.layers(X)
        return X
        

        

In [20]:
torch.manual_seed(42)
model = SpamClassifier(5000, 2, [256,128,64], drop_out=0.5)
model

SpamClassifier(
  (layers): Sequential(
    (0): Linear(in_features=5000, out_features=256, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): ReLU()
    (10): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.5, inplace=False)
    (12): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [21]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

In [22]:
import time
start_time = time.time()


epochs = 10


for i  in range(epochs):
    
    for n,(X_train,y_train) in enumerate(train_loader):
        y_pred = model(X_train)
        loss = criterion(y_pred,y_train)

        if i%2 == 1:
            print(f'epoch: {i}  loss: {loss.item():10.8f}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        




total_time = time.time() - start_time
print(f'Total training time: {total_time}')

epoch: 1  loss: 0.07798538
epoch: 1  loss: 0.06813315
epoch: 1  loss: 0.09583057
epoch: 1  loss: 0.08445978
epoch: 1  loss: 0.10837626
epoch: 1  loss: 0.08653607
epoch: 1  loss: 0.06846983
epoch: 1  loss: 0.07409866
epoch: 1  loss: 0.05839927
epoch: 1  loss: 0.13366187
epoch: 1  loss: 0.05178288
epoch: 1  loss: 0.07308362
epoch: 1  loss: 0.09390484
epoch: 1  loss: 0.03595007
epoch: 1  loss: 0.07145340
epoch: 1  loss: 0.06756314
epoch: 1  loss: 0.09032404
epoch: 1  loss: 0.02488624
epoch: 1  loss: 0.08295932
epoch: 1  loss: 0.10214652
epoch: 1  loss: 0.05881514
epoch: 1  loss: 0.02904821
epoch: 1  loss: 0.06298947
epoch: 1  loss: 0.03316285
epoch: 1  loss: 0.09046593
epoch: 1  loss: 0.10113331
epoch: 1  loss: 0.04589811
epoch: 1  loss: 0.04235240
epoch: 1  loss: 0.03519171
epoch: 1  loss: 0.07403626
epoch: 1  loss: 0.04594018
epoch: 1  loss: 0.02662151
epoch: 1  loss: 0.02990822
epoch: 1  loss: 0.05408601
epoch: 1  loss: 0.03507973
epoch: 1  loss: 0.04869901
epoch: 1  loss: 0.06321923
e

In [26]:
def test(test_loader): 
    model.eval() 
    correct = 0 
    counter = 0 
    with torch.no_grad(): 
        for X_test,y_test in test_loader:
            y_val = model(X_test) 
            _,pred = torch.max(y_val.data,1) 
            counter += y_test.size(0) 
            correct += (pred==y_test).sum().item() 
            
    print(100*correct/counter)

        

In [27]:
test(test_loader)

98.26086956521739


In [28]:
# TEST MODEL

def predict(email):
    model.eval()
    X_new = vectorizer.transform(email)
    X_new_tensor = torch.tensor(X_new.toarray(), dtype=torch.float32)  # shape [1, 5000]

    with torch.no_grad():
        y_pred = model(X_new_tensor)
        predicted_class = torch.argmax(y_pred, dim=1).item()

    label_map = {0: "ham", 1: "spam"}
    print(f"The model predicts: {label_map[predicted_class]}")

In [35]:
predict(["Click promotion for discount!"])

The model predicts: spam
