In [1]:
import pandas as pd
import torch
import numpy as np

from tqdm import tqdm
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics.classification import BinaryF1Score

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn import preprocessing

In [2]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [3]:
device

'cuda:0'

In [4]:
torch.set_default_device(device)

In [5]:
if device == 'cuda':
    torch.cuda.empty_cache()

In [6]:
df = pd.read_csv('emails.csv')

In [7]:
df

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [8]:
#Email No. column replicates index
df = df.drop(columns=['Email No.'])

In [9]:
df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,1,0,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,1,0,1


In [10]:
#Imbalanced data - oversampling may be needed?
df['Prediction'].value_counts()

Prediction
0    3672
1    1500
Name: count, dtype: int64

In [11]:
#Outlier or spam email?
df.max(axis=None)

2327

In [12]:
df[['Is_Legit', 'Is_Spam']] = pd.get_dummies(df['Prediction'])
df.drop(columns='Prediction', inplace=True)

In [13]:
df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Is_Legit,Is_Spam
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,True,False
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,1,0,True,False
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,True,False
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,True,False
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,1,0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,True,False
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,1,0,True,False
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,False,True
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,1,0,False,True


In [14]:
validation_df = df.sample(frac=0.1)

In [15]:
df = df[~df.isin(validation_df)].dropna().map(lambda x: int(x))

In [16]:
df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Is_Legit,Is_Spam
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,1,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,1,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,1,0,1,0
5,4,5,1,4,2,3,45,1,0,16,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5165,1,0,1,0,3,1,12,1,0,2,...,0,0,1,0,0,0,0,0,1,0
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,1,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,1,0,1,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [17]:
validation_df = validation_df.sort_index()
validation_df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Is_Legit,Is_Spam
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,1,0,True,False
13,4,5,7,1,5,1,37,1,3,8,...,0,0,0,0,0,0,2,0,True,False
18,1,3,1,0,2,0,14,0,0,1,...,0,0,0,0,0,0,0,0,True,False
21,5,1,13,2,3,1,36,2,5,5,...,0,0,0,0,0,0,1,0,True,False
24,0,0,1,0,4,0,10,0,0,1,...,0,0,0,0,0,0,0,0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5123,3,9,3,1,0,0,75,2,2,14,...,3,0,0,0,0,0,0,0,True,False
5142,2,1,1,0,1,1,20,0,0,0,...,0,0,0,0,0,0,2,0,False,True
5163,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,False,True
5166,1,0,1,1,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,False,True


In [18]:
NUM_CLASSES = 2
HIDDEN_LAYERS = 6000
NUM_EPOCHS = 8
BATCH_SIZE = 128

In [19]:
#Classic DNN
class DNNClassifier(nn.Module):
    def __init__(self, input_layers, hidden_layers, output_layers):
        super().__init__()
        self.sequential_ = nn.Sequential(
            nn.Linear(in_features = input_layers, out_features = hidden_layers),
            #nn.GELU(),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(in_features = hidden_layers, out_features = hidden_layers),
            #nn.GELU(),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(in_features = hidden_layers, out_features = output_layers),
            nn.Sigmoid(),
            #nn.LogSoftmax(dim=1),
        )
    def forward(self, x):
        return self.sequential_(x)

In [20]:
model_ = DNNClassifier(3000, HIDDEN_LAYERS, NUM_CLASSES)
model_ = model_.to(device)

In [21]:
torch.compile(model_)

OptimizedModule(
  (_orig_mod): DNNClassifier(
    (sequential_): Sequential(
      (0): Linear(in_features=3000, out_features=6000, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=6000, out_features=6000, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.2, inplace=False)
      (6): Linear(in_features=6000, out_features=2, bias=True)
      (7): Sigmoid()
    )
  )
)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, ~df.columns.isin(['Is_Spam', 'Is_Legit'])], df[['Is_Spam', 'Is_Legit']], test_size = 0.2)

In [23]:
#Minmax scaler is required as a normalization tool to remove outlier effect
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train.values)
X_test = scaler.fit_transform(X_test.values)

In [24]:
loss_fn = nn.BCELoss()
optimizer = Adam(params = model_.parameters(), lr=1e-4, weight_decay=1e-5)

In [25]:
metric = BinaryF1Score()

In [26]:
X_train_tensor =  torch.from_numpy(X_train).float()
y_train_tensor =  torch.from_numpy(y_train.values).float()
X_test_tensor =  torch.from_numpy(X_test).float()
y_test_tensor =  torch.from_numpy(y_test.values).float()

In [27]:
train_ds = TensorDataset(X_train_tensor, y_train_tensor)
test_ds = TensorDataset(X_test_tensor, y_test_tensor)

In [28]:
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [29]:
for epoch in tqdm(range(NUM_EPOCHS)):
    train_loss, train_f1 = 0, 0
    for batch, (X, y) in enumerate(train_dl):
        X = X.to(device)
        y = y.to(device)
        y_pred = model_(X)
        loss = loss_fn(y_pred, y)        
        train_loss += loss
        train_f1 += metric(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss /= len(train_dl)
    train_f1 /= len(train_dl)    
    print(f"Train loss: {train_loss:.5f} | Train F1: {train_f1*100:.2f}%")
    
    test_loss, test_f1 = 0, 0
    with torch.inference_mode():         
        for X, y in test_dl:
            X = X.to(device)
            y = y.to(device)
            test_pred = model_(X)     
            test_loss += loss_fn(test_pred, y)
            test_f1 += metric(test_pred, y)
        test_loss /= len(test_dl)
        test_f1 /= len(test_dl)        
        print(f"Test loss: {test_loss:.5f} | Test F1: {test_f1*100:.2f}%\n")

 12%|█▎        | 1/8 [00:00<00:05,  1.34it/s]

Train loss: 0.51120 | Train F1: 73.69%
Test loss: 0.29568 | Test F1: 82.40%



 25%|██▌       | 2/8 [00:01<00:03,  1.79it/s]

Train loss: 0.19881 | Train F1: 89.98%
Test loss: 0.12772 | Test F1: 95.66%



 38%|███▊      | 3/8 [00:01<00:02,  2.00it/s]

Train loss: 0.07178 | Train F1: 98.50%
Test loss: 0.14480 | Test F1: 97.96%



 50%|█████     | 4/8 [00:02<00:01,  2.12it/s]

Train loss: 0.03484 | Train F1: 99.51%
Test loss: 0.10038 | Test F1: 97.90%



 62%|██████▎   | 5/8 [00:02<00:01,  2.19it/s]

Train loss: 0.02257 | Train F1: 99.56%
Test loss: 0.15517 | Test F1: 97.85%



 75%|███████▌  | 6/8 [00:02<00:00,  2.24it/s]

Train loss: 0.01657 | Train F1: 99.64%
Test loss: 0.16241 | Test F1: 97.75%



 88%|████████▊ | 7/8 [00:03<00:00,  2.27it/s]

Train loss: 0.01279 | Train F1: 99.64%
Test loss: 0.16405 | Test F1: 97.75%



100%|██████████| 8/8 [00:03<00:00,  2.13it/s]

Train loss: 0.01025 | Train F1: 99.71%
Test loss: 0.20913 | Test F1: 97.60%






In [30]:
y_pred_f1 = np.around(test_pred.cpu().numpy()[:, 0], 0)

In [31]:
y_pred_f1

array([0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
       0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0.,
       0.], dtype=float32)

In [32]:
y_test_f1 = y.cpu().numpy()[:, 0]

In [33]:
y_test_f1

array([0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
       0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0.,
       0.], dtype=float32)

In [34]:
f1_score(y_pred_f1, y_test_f1)

1.0

In [35]:
#Validation
validation_features = validation_df.loc[:, ~validation_df.columns.isin(['Is_Legit', 'Is_Spam'])]

In [36]:
scaler = preprocessing.MinMaxScaler()
validation_features = scaler.fit_transform(validation_features.values)

In [37]:
validation_pred = model_(torch.Tensor(validation_features).to(device)).cpu().detach().numpy()

In [38]:
validation_df['pred'] = np.around(validation_pred[:, 0], 0)
validation_df.drop(columns='Is_Legit', inplace=True)

In [39]:
validation_df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Is_Spam,pred
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,1,0,False,0.0
13,4,5,7,1,5,1,37,1,3,8,...,0,0,0,0,0,0,2,0,False,0.0
18,1,3,1,0,2,0,14,0,0,1,...,0,0,0,0,0,0,0,0,False,0.0
21,5,1,13,2,3,1,36,2,5,5,...,0,0,0,0,0,0,1,0,False,0.0
24,0,0,1,0,4,0,10,0,0,1,...,0,0,0,0,0,0,0,0,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5123,3,9,3,1,0,0,75,2,2,14,...,3,0,0,0,0,0,0,0,False,0.0
5142,2,1,1,0,1,1,20,0,0,0,...,0,0,0,0,0,0,2,0,True,1.0
5163,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,True,1.0
5166,1,0,1,1,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,True,1.0


In [40]:
f1_score(validation_df['pred'], validation_df['Is_Spam'])

0.9672727272727273

In [41]:
#1D convolutional NN
#Using the same dataloader and batch settings

In [42]:
z

NameError: name 'z' is not defined

In [None]:
class CNN1DClassifier(nn.Module):
    def __init__(self, input_layers, hidden_layers, output_layers, k_size = 3):
        super().__init__()
        self.sequential_ = nn.Sequential(
            nn.Conv1d(in_channels = input_layers, out_channels = hidden_layers, kernel_size = 3),
            nn.GELU(),
            nn.MaxPool1d(kernel_size = 3),
            nn.Conv1d(in_channels = hidden_layers, out_channels = hidden_layers, kernel_size = 3),
            nn.GELU(),
            nn.MaxPool1d(kernel_size = 3),
            nn.Flatten(),
            nn.LazyLinear(out_features = output_layers),
            nn.Sigmoid(),
        )
    def forward(self, x):
        return self.sequential_(x)

In [None]:
model_2 = CNN1DClassifier(3000, BATCH_SIZE, 2, k_size = 3)

In [None]:
torch.compile(model_2)

In [None]:
for epoch in tqdm(range(NUM_EPOCHS)):
    train_loss, train_f1 = 0, 0
    for batch, (X, y) in enumerate(train_dl):
        X = X.T
        y = y.T
        y_pred = model_2(X)
        loss = loss_fn(y_pred.T, y)        
        train_loss += loss
        train_f1 += metric(y_pred.T, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss /= len(train_dl)
    train_f1 /= len(train_dl)    
    print(f"Train loss: {train_loss:.5f} | Train F1: {train_f1*100:.2f}%")
    
    test_loss, test_f1 = 0, 0
    with torch.inference_mode():         
        for X, y in test_dl:
            X = X.T
            y = y.T
            test_pred = model_2(X)     
            test_loss += loss_fn(test_pred.T, y)
            test_f1 += metric(test_pred.T, y)
        test_loss /= len(test_dl)
        test_f1 /= len(test_dl)        
        print(f"Test loss: {test_loss:.5f} | Test F1: {test_f1*100:.2f}%\n")