In [1]:
import pandas as pd
import torch
import numpy as np
import os

from tqdm import tqdm
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics.classification import BinaryF1Score

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn import preprocessing

In [2]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [3]:
device

'cuda:0'

In [4]:
torch.set_default_device(device)

In [5]:
if device == 'cuda':
    torch.cuda.empty_cache()

In [6]:
df = pd.read_csv('emails.csv')

In [7]:
df

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [8]:
#Email No. column replicates index
df = df.drop(columns=['Email No.'])
#Remove duplicate rows
df = df.drop_duplicates()

In [9]:
df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,1,0,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,1,0,1


In [10]:
#Imbalanced data - oversampling may be needed?
df['Prediction'].value_counts()

Prediction
0    3170
1    1461
Name: count, dtype: int64

In [11]:
#Outlier or spam email?
df.max(axis=None)

2327

In [12]:
df[['Is_Legit', 'Is_Spam']] = pd.get_dummies(df['Prediction'])
df.drop(columns='Prediction', inplace=True)

In [13]:
df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Is_Legit,Is_Spam
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,True,False
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,1,0,True,False
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,True,False
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,True,False
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,1,0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,True,False
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,1,0,True,False
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,False,True
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,1,0,False,True


In [14]:
validation_df = df.sample(frac=0.1)

In [15]:
#Remove validation data from train df
df = df[~df.isin(validation_df)].dropna().map(lambda x: int(x))

In [16]:
df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Is_Legit,Is_Spam
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,1,0,1,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,1,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,1,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5166,1,0,1,1,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,1,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,1,0,0,1


In [17]:
validation_df = validation_df.sort_index()
validation_df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Is_Legit,Is_Spam
5,4,5,1,4,2,3,45,1,0,16,...,0,0,0,0,0,0,0,0,False,True
15,6,2,1,0,2,0,36,3,1,8,...,0,0,0,0,0,0,0,0,True,False
17,36,21,6,14,7,17,194,25,5,59,...,0,0,0,0,0,0,3,0,False,True
20,0,0,1,1,0,0,15,1,0,2,...,0,0,0,0,0,0,0,0,True,False
22,0,3,6,0,5,0,30,0,2,6,...,0,0,0,0,0,0,0,0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5128,5,8,10,1,0,0,40,3,1,14,...,0,0,0,0,0,0,0,0,True,False
5136,1,4,1,2,0,1,15,1,0,1,...,0,0,0,0,0,0,1,0,True,False
5146,0,3,2,0,0,0,7,0,0,1,...,0,0,0,0,0,0,0,0,True,False
5158,2,1,1,0,1,1,16,0,1,2,...,0,0,0,0,0,0,1,0,True,False


In [18]:
NUM_CLASSES = 2
HIDDEN_LAYERS = 6000
NUM_EPOCHS = 8
BATCH_SIZE = 128

In [19]:
#Classic DNN
class DNNClassifier(nn.Module):
    def __init__(self, input_layers, hidden_layers, output_layers):
        super().__init__()
        self.sequential_ = nn.Sequential(
            nn.Linear(in_features = input_layers, out_features = hidden_layers),
            #nn.GELU(),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(in_features = hidden_layers, out_features = hidden_layers),
            #nn.GELU(),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(in_features = hidden_layers, out_features = output_layers),
            nn.Sigmoid(),
            #nn.LogSoftmax(dim=1),
        )
    def forward(self, x):
        return self.sequential_(x)

In [20]:
model_ = DNNClassifier(3000, HIDDEN_LAYERS, NUM_CLASSES)
model_ = model_.to(device)

In [21]:
torch.compile(model_)

OptimizedModule(
  (_orig_mod): DNNClassifier(
    (sequential_): Sequential(
      (0): Linear(in_features=3000, out_features=6000, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=6000, out_features=6000, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.2, inplace=False)
      (6): Linear(in_features=6000, out_features=2, bias=True)
      (7): Sigmoid()
    )
  )
)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, ~df.columns.isin(['Is_Spam', 'Is_Legit'])], df[['Is_Spam', 'Is_Legit']], test_size = 0.2)

In [23]:
#Minmax scaler is required as a normalization tool to remove outlier effect
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train.values)
X_test = scaler.fit_transform(X_test.values)

In [24]:
loss_fn = nn.BCELoss()
optimizer = Adam(params = model_.parameters(), lr=1e-4, weight_decay=1e-5)

In [25]:
metric = BinaryF1Score()

In [26]:
X_train_tensor =  torch.from_numpy(X_train).float()
y_train_tensor =  torch.from_numpy(y_train.values).float()
X_test_tensor =  torch.from_numpy(X_test).float()
y_test_tensor =  torch.from_numpy(y_test.values).float()

In [27]:
train_ds = TensorDataset(X_train_tensor, y_train_tensor)
test_ds = TensorDataset(X_test_tensor, y_test_tensor)

In [28]:
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [29]:
for epoch in tqdm(range(NUM_EPOCHS)):
    train_loss, train_f1 = 0, 0
    for batch, (X, y) in enumerate(train_dl):
        X = X.to(device)
        y = y.to(device)
        y_pred = model_(X)
        loss = loss_fn(y_pred, y)        
        train_loss += loss
        train_f1 += metric(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss /= len(train_dl)
    train_f1 /= len(train_dl)    
    print(f"Train loss: {train_loss:.5f} | Train F1: {train_f1*100:.2f}%")
    
    test_loss, test_f1 = 0, 0
    with torch.inference_mode():         
        for X, y in test_dl:
            X = X.to(device)
            y = y.to(device)
            test_pred = model_(X)     
            test_loss += loss_fn(test_pred, y)
            test_f1 += metric(test_pred, y)
        test_loss /= len(test_dl)
        test_f1 /= len(test_dl)        
        print(f"Test loss: {test_loss:.5f} | Test F1: {test_f1*100:.2f}%\n")


 12%|█▎        | 1/8 [00:00<00:04,  1.46it/s]

Train loss: 0.55834 | Train F1: 69.21%
Test loss: 0.36631 | Test F1: 76.68%



 25%|██▌       | 2/8 [00:01<00:03,  1.97it/s]

Train loss: 0.27987 | Train F1: 84.71%
Test loss: 0.19961 | Test F1: 92.96%



 38%|███▊      | 3/8 [00:01<00:02,  2.21it/s]

Train loss: 0.10285 | Train F1: 97.63%
Test loss: 0.15002 | Test F1: 96.51%



 50%|█████     | 4/8 [00:01<00:01,  2.33it/s]

Train loss: 0.04189 | Train F1: 99.33%
Test loss: 0.39746 | Test F1: 96.46%



 62%|██████▎   | 5/8 [00:02<00:01,  2.42it/s]

Train loss: 0.02478 | Train F1: 99.55%
Test loss: 0.49998 | Test F1: 96.90%



 75%|███████▌  | 6/8 [00:02<00:00,  2.47it/s]

Train loss: 0.01737 | Train F1: 99.65%
Test loss: 0.65367 | Test F1: 96.40%



 88%|████████▊ | 7/8 [00:03<00:00,  2.50it/s]

Train loss: 0.01322 | Train F1: 99.71%
Test loss: 0.69093 | Test F1: 96.24%



100%|██████████| 8/8 [00:03<00:00,  2.35it/s]

Train loss: 0.01063 | Train F1: 99.72%
Test loss: 0.70683 | Test F1: 96.46%






In [30]:
y_pred_f1 = np.around(test_pred.cpu().numpy()[:, 0], 0)

In [31]:
y_test_f1 = y.cpu().numpy()[:, 0]

In [32]:
f1_score(y_pred_f1, y_test_f1)

0.8717948717948718

In [33]:
#Validation
validation_features = validation_df.loc[:, ~validation_df.columns.isin(['Is_Legit', 'Is_Spam'])]

In [34]:
scaler = preprocessing.MinMaxScaler()
validation_features = scaler.fit_transform(validation_features.values)

In [35]:
validation_pred = model_(torch.Tensor(validation_features).to(device)).cpu().detach().numpy()

In [36]:
validation_df['pred'] = np.around(validation_pred[:, 0], 0)
validation_df.drop(columns='Is_Legit', inplace=True)

In [37]:
validation_df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Is_Spam,pred
5,4,5,1,4,2,3,45,1,0,16,...,0,0,0,0,0,0,0,0,True,1.0
15,6,2,1,0,2,0,36,3,1,8,...,0,0,0,0,0,0,0,0,False,0.0
17,36,21,6,14,7,17,194,25,5,59,...,0,0,0,0,0,0,3,0,True,1.0
20,0,0,1,1,0,0,15,1,0,2,...,0,0,0,0,0,0,0,0,False,0.0
22,0,3,6,0,5,0,30,0,2,6,...,0,0,0,0,0,0,0,0,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5128,5,8,10,1,0,0,40,3,1,14,...,0,0,0,0,0,0,0,0,False,0.0
5136,1,4,1,2,0,1,15,1,0,1,...,0,0,0,0,0,0,1,0,False,0.0
5146,0,3,2,0,0,0,7,0,0,1,...,0,0,0,0,0,0,0,0,False,0.0
5158,2,1,1,0,1,1,16,0,1,2,...,0,0,0,0,0,0,1,0,False,0.0


In [38]:
f1_score(validation_df['pred'], validation_df['Is_Spam'])

0.9611307420494699

In [39]:
#Great F1 score, save model
torch.jit.script(model_).save(os.path.join(os.getcwd(), 'linear_model.pt'))

In [40]:
#1D convolutional NN
#Using the same dataloader and batch settings

In [41]:
NUM_CLASSES = 2
NUM_EPOCHS = 64
BATCH_SIZE = 128

In [42]:
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [43]:
if device == 'cuda':
    torch.cuda.empty_cache()

In [44]:
class CNN1DClassifier(nn.Module):
    def __init__(self, input_channels, length, output_channels, k_size = 3, k_size_pooling = 2):
        super().__init__()
        self.sequential_ = nn.Sequential(
            #Layer1
            nn.Conv1d(in_channels = input_channels, out_channels = length, kernel_size = (k_size), padding=2),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.MaxPool1d(kernel_size = (k_size_pooling), stride=k_size_pooling),
            #Layer2
            nn.Conv1d(in_channels = length, out_channels = int(length*2), kernel_size = (k_size), padding=2),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.MaxPool1d(kernel_size = (k_size_pooling), stride=k_size_pooling),
            #Dense layer
            nn.Flatten(),
            nn.LazyLinear(out_features = output_channels),#binary
            nn.Sigmoid(),
        )
    def forward(self, x):
        return self.sequential_(x)

In [45]:
model_2 = CNN1DClassifier(input_channels = 1, length = 32, output_channels = 2, k_size = 3, k_size_pooling = 2)



In [46]:
loss_fn = nn.BCELoss()
optimizer = Adam(params = model_2.parameters(), lr=1e-4, weight_decay=1e-5)

In [47]:
metric = BinaryF1Score()

In [48]:
torch.compile(model_2)

OptimizedModule(
  (_orig_mod): CNN1DClassifier(
    (sequential_): Sequential(
      (0): Conv1d(1, 32, kernel_size=(3,), stride=(1,), padding=(2,))
      (1): GELU(approximate='none')
      (2): Dropout(p=0.2, inplace=False)
      (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(2,))
      (5): GELU(approximate='none')
      (6): Dropout(p=0.2, inplace=False)
      (7): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (8): Flatten(start_dim=1, end_dim=-1)
      (9): LazyLinear(in_features=0, out_features=2, bias=True)
      (10): Sigmoid()
    )
  )
)

In [49]:
for epoch in tqdm(range(NUM_EPOCHS)):
    train_loss, train_f1 = 0, 0
    for batch, (X, y) in enumerate(train_dl):
        X = X.unsqueeze(dim=1).to(device)
        y = y.unsqueeze(dim=1).to(device)
        y_pred = model_2(X).squeeze(dim=1)
        loss = loss_fn(y_pred, y.squeeze(dim=1))        
        train_loss += loss
        train_f1 += metric(y_pred, y.squeeze(dim=1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss /= len(train_dl)
    train_f1 /= len(train_dl)    
    print(f"Train loss: {train_loss:.5f} | Train F1: {train_f1*100:.2f}%")
    
    test_loss, test_f1 = 0, 0
    with torch.inference_mode():         
        for X, y in test_dl:
            X = X.unsqueeze(dim=1).to(device)
            y = y.unsqueeze(dim=1).to(device)
            test_pred = model_2(X).squeeze(dim=1)   
            test_loss += loss_fn(test_pred, y.squeeze(dim=1))
            test_f1 += metric(test_pred, y.squeeze(dim=1))
        test_loss /= len(test_dl)
        test_f1 /= len(test_dl)        
        print(f"Test loss: {test_loss:.5f} | Test F1: {test_f1*100:.2f}%\n")

  2%|▏         | 1/64 [00:00<00:21,  2.97it/s]

Train loss: 0.63451 | Train F1: 67.51%
Test loss: 0.60787 | Test F1: 70.20%



  3%|▎         | 2/64 [00:00<00:17,  3.56it/s]

Train loss: 0.62576 | Train F1: 67.92%
Test loss: 0.60544 | Test F1: 70.20%



  5%|▍         | 3/64 [00:00<00:15,  3.82it/s]

Train loss: 0.62233 | Train F1: 67.92%
Test loss: 0.60167 | Test F1: 70.20%



  6%|▋         | 4/64 [00:01<00:15,  3.89it/s]

Train loss: 0.61870 | Train F1: 67.92%
Test loss: 0.59842 | Test F1: 70.20%



  8%|▊         | 5/64 [00:01<00:14,  3.99it/s]

Train loss: 0.61519 | Train F1: 68.01%
Test loss: 0.59355 | Test F1: 70.80%



  9%|▉         | 6/64 [00:01<00:14,  4.06it/s]

Train loss: 0.61084 | Train F1: 68.83%
Test loss: 0.58885 | Test F1: 71.17%



 11%|█         | 7/64 [00:01<00:13,  4.11it/s]

Train loss: 0.60621 | Train F1: 69.03%
Test loss: 0.58423 | Test F1: 71.43%



 12%|█▎        | 8/64 [00:02<00:13,  4.14it/s]

Train loss: 0.60140 | Train F1: 69.35%
Test loss: 0.57778 | Test F1: 71.43%



 14%|█▍        | 9/64 [00:02<00:13,  4.12it/s]

Train loss: 0.59637 | Train F1: 69.65%
Test loss: 0.57128 | Test F1: 71.77%



 16%|█▌        | 10/64 [00:02<00:13,  4.14it/s]

Train loss: 0.59002 | Train F1: 70.01%
Test loss: 0.56437 | Test F1: 72.32%



 17%|█▋        | 11/64 [00:02<00:12,  4.14it/s]

Train loss: 0.58340 | Train F1: 70.24%
Test loss: 0.55695 | Test F1: 72.55%



 19%|█▉        | 12/64 [00:02<00:12,  4.13it/s]

Train loss: 0.57728 | Train F1: 70.48%
Test loss: 0.54889 | Test F1: 72.88%



 20%|██        | 13/64 [00:03<00:12,  4.15it/s]

Train loss: 0.56968 | Train F1: 70.68%
Test loss: 0.53910 | Test F1: 72.92%



 22%|██▏       | 14/64 [00:03<00:12,  4.16it/s]

Train loss: 0.56209 | Train F1: 71.05%
Test loss: 0.52923 | Test F1: 73.14%



 23%|██▎       | 15/64 [00:03<00:11,  4.18it/s]

Train loss: 0.55263 | Train F1: 71.44%
Test loss: 0.51898 | Test F1: 73.10%



 25%|██▌       | 16/64 [00:03<00:11,  4.19it/s]

Train loss: 0.54367 | Train F1: 71.87%
Test loss: 0.50989 | Test F1: 73.36%



 27%|██▋       | 17/64 [00:04<00:11,  4.21it/s]

Train loss: 0.53376 | Train F1: 72.24%
Test loss: 0.49859 | Test F1: 73.43%



 28%|██▊       | 18/64 [00:04<00:10,  4.22it/s]

Train loss: 0.52282 | Train F1: 72.66%
Test loss: 0.48764 | Test F1: 73.36%



 30%|██▉       | 19/64 [00:04<00:10,  4.23it/s]

Train loss: 0.51138 | Train F1: 73.18%
Test loss: 0.47684 | Test F1: 73.90%



 31%|███▏      | 20/64 [00:04<00:10,  4.24it/s]

Train loss: 0.49913 | Train F1: 73.57%
Test loss: 0.46739 | Test F1: 73.77%



 33%|███▎      | 21/64 [00:05<00:10,  4.23it/s]

Train loss: 0.48654 | Train F1: 74.49%
Test loss: 0.45198 | Test F1: 74.73%



 34%|███▍      | 22/64 [00:05<00:09,  4.21it/s]

Train loss: 0.47399 | Train F1: 76.02%
Test loss: 0.43470 | Test F1: 74.87%



 36%|███▌      | 23/64 [00:05<00:09,  4.21it/s]

Train loss: 0.46061 | Train F1: 76.71%
Test loss: 0.42155 | Test F1: 75.40%



 38%|███▊      | 24/64 [00:05<00:09,  4.22it/s]

Train loss: 0.44850 | Train F1: 77.77%
Test loss: 0.40720 | Test F1: 76.32%



 39%|███▉      | 25/64 [00:06<00:09,  4.21it/s]

Train loss: 0.43592 | Train F1: 78.93%
Test loss: 0.39760 | Test F1: 76.25%



 41%|████      | 26/64 [00:06<00:09,  4.20it/s]

Train loss: 0.42246 | Train F1: 79.94%
Test loss: 0.38768 | Test F1: 77.09%



 42%|████▏     | 27/64 [00:06<00:08,  4.22it/s]

Train loss: 0.41272 | Train F1: 80.95%
Test loss: 0.37339 | Test F1: 77.78%



 44%|████▍     | 28/64 [00:06<00:08,  4.23it/s]

Train loss: 0.39933 | Train F1: 82.19%
Test loss: 0.36560 | Test F1: 79.02%



 45%|████▌     | 29/64 [00:07<00:08,  4.23it/s]

Train loss: 0.39117 | Train F1: 82.54%
Test loss: 0.35663 | Test F1: 79.59%



 47%|████▋     | 30/64 [00:07<00:08,  4.23it/s]

Train loss: 0.37988 | Train F1: 83.52%
Test loss: 0.34246 | Test F1: 80.53%



 48%|████▊     | 31/64 [00:07<00:07,  4.22it/s]

Train loss: 0.36924 | Train F1: 84.01%
Test loss: 0.33667 | Test F1: 81.55%



 50%|█████     | 32/64 [00:07<00:07,  4.22it/s]

Train loss: 0.36092 | Train F1: 85.00%
Test loss: 0.33095 | Test F1: 81.98%



 52%|█████▏    | 33/64 [00:07<00:07,  4.16it/s]

Train loss: 0.35091 | Train F1: 85.77%
Test loss: 0.32575 | Test F1: 82.44%



 53%|█████▎    | 34/64 [00:08<00:07,  4.12it/s]

Train loss: 0.34420 | Train F1: 86.06%
Test loss: 0.31425 | Test F1: 82.97%



 55%|█████▍    | 35/64 [00:08<00:07,  4.00it/s]

Train loss: 0.33545 | Train F1: 86.33%
Test loss: 0.30963 | Test F1: 83.57%



 56%|█████▋    | 36/64 [00:08<00:07,  3.88it/s]

Train loss: 0.32709 | Train F1: 87.17%
Test loss: 0.30813 | Test F1: 84.62%



 58%|█████▊    | 37/64 [00:09<00:06,  3.87it/s]

Train loss: 0.31908 | Train F1: 87.66%
Test loss: 0.29774 | Test F1: 85.94%



 59%|█████▉    | 38/64 [00:09<00:06,  3.88it/s]

Train loss: 0.31058 | Train F1: 87.61%
Test loss: 0.29486 | Test F1: 85.42%



 61%|██████    | 39/64 [00:09<00:06,  3.92it/s]

Train loss: 0.30419 | Train F1: 88.46%
Test loss: 0.28513 | Test F1: 87.27%



 62%|██████▎   | 40/64 [00:09<00:06,  3.95it/s]

Train loss: 0.29901 | Train F1: 88.46%
Test loss: 0.28745 | Test F1: 87.29%



 64%|██████▍   | 41/64 [00:10<00:05,  3.97it/s]

Train loss: 0.29252 | Train F1: 88.55%
Test loss: 0.28123 | Test F1: 87.21%



 66%|██████▌   | 42/64 [00:10<00:05,  3.96it/s]

Train loss: 0.28142 | Train F1: 89.03%
Test loss: 0.27619 | Test F1: 88.69%



 67%|██████▋   | 43/64 [00:10<00:05,  3.94it/s]

Train loss: 0.27252 | Train F1: 89.93%
Test loss: 0.27885 | Test F1: 86.51%



 69%|██████▉   | 44/64 [00:10<00:05,  3.94it/s]

Train loss: 0.27566 | Train F1: 89.73%
Test loss: 0.28104 | Test F1: 86.55%



 70%|███████   | 45/64 [00:11<00:04,  3.93it/s]

Train loss: 0.27178 | Train F1: 89.72%
Test loss: 0.27200 | Test F1: 88.18%



 72%|███████▏  | 46/64 [00:11<00:04,  3.93it/s]

Train loss: 0.26268 | Train F1: 90.04%
Test loss: 0.26045 | Test F1: 90.22%



 73%|███████▎  | 47/64 [00:11<00:04,  3.94it/s]

Train loss: 0.25646 | Train F1: 90.53%
Test loss: 0.27588 | Test F1: 87.19%



 75%|███████▌  | 48/64 [00:11<00:04,  3.92it/s]

Train loss: 0.25591 | Train F1: 90.56%
Test loss: 0.25804 | Test F1: 89.86%



 77%|███████▋  | 49/64 [00:12<00:03,  3.92it/s]

Train loss: 0.25192 | Train F1: 90.36%
Test loss: 0.26018 | Test F1: 89.06%



 78%|███████▊  | 50/64 [00:12<00:03,  3.93it/s]

Train loss: 0.24810 | Train F1: 90.25%
Test loss: 0.26481 | Test F1: 89.12%



 80%|███████▉  | 51/64 [00:12<00:03,  3.92it/s]

Train loss: 0.24282 | Train F1: 90.61%
Test loss: 0.24979 | Test F1: 90.73%



 81%|████████▏ | 52/64 [00:12<00:03,  3.91it/s]

Train loss: 0.23653 | Train F1: 91.21%
Test loss: 0.25856 | Test F1: 89.15%



 83%|████████▎ | 53/64 [00:13<00:02,  3.94it/s]

Train loss: 0.23357 | Train F1: 90.71%
Test loss: 0.24363 | Test F1: 91.17%



 84%|████████▍ | 54/64 [00:13<00:02,  3.95it/s]

Train loss: 0.22972 | Train F1: 91.19%
Test loss: 0.25499 | Test F1: 90.23%



 86%|████████▌ | 55/64 [00:13<00:02,  3.97it/s]

Train loss: 0.22752 | Train F1: 91.23%
Test loss: 0.24222 | Test F1: 91.06%



 88%|████████▊ | 56/64 [00:13<00:02,  3.97it/s]

Train loss: 0.22262 | Train F1: 91.71%
Test loss: 0.25278 | Test F1: 90.06%



 89%|████████▉ | 57/64 [00:14<00:01,  3.95it/s]

Train loss: 0.22023 | Train F1: 91.70%
Test loss: 0.24890 | Test F1: 90.77%



 91%|█████████ | 58/64 [00:14<00:01,  3.96it/s]

Train loss: 0.21755 | Train F1: 91.99%
Test loss: 0.24394 | Test F1: 90.71%



 92%|█████████▏| 59/64 [00:14<00:01,  3.96it/s]

Train loss: 0.21507 | Train F1: 92.12%
Test loss: 0.24603 | Test F1: 91.05%



 94%|█████████▍| 60/64 [00:14<00:01,  3.94it/s]

Train loss: 0.20695 | Train F1: 92.33%
Test loss: 0.25082 | Test F1: 90.80%



 95%|█████████▌| 61/64 [00:15<00:00,  3.93it/s]

Train loss: 0.20612 | Train F1: 92.47%
Test loss: 0.24639 | Test F1: 90.43%



 97%|█████████▋| 62/64 [00:15<00:00,  3.94it/s]

Train loss: 0.20136 | Train F1: 92.63%
Test loss: 0.24963 | Test F1: 90.43%



 98%|█████████▊| 63/64 [00:15<00:00,  3.95it/s]

Train loss: 0.20191 | Train F1: 92.26%
Test loss: 0.24221 | Test F1: 91.03%



100%|██████████| 64/64 [00:15<00:00,  4.03it/s]

Train loss: 0.20273 | Train F1: 92.52%
Test loss: 0.24221 | Test F1: 90.73%






In [50]:
validation_pred = model_2(torch.Tensor(validation_features).unsqueeze(dim=1).to(device)).cpu().detach().numpy()

In [51]:
validation_df['pred_2nd'] = np.around(validation_pred[:, 0], 0)

In [52]:
validation_df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,valued,lay,infrastructure,military,allowing,ff,dry,Is_Spam,pred,pred_2nd
5,4,5,1,4,2,3,45,1,0,16,...,0,0,0,0,0,0,0,True,1.0,1.0
15,6,2,1,0,2,0,36,3,1,8,...,0,0,0,0,0,0,0,False,0.0,0.0
17,36,21,6,14,7,17,194,25,5,59,...,0,0,0,0,0,3,0,True,1.0,1.0
20,0,0,1,1,0,0,15,1,0,2,...,0,0,0,0,0,0,0,False,0.0,0.0
22,0,3,6,0,5,0,30,0,2,6,...,0,0,0,0,0,0,0,False,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5128,5,8,10,1,0,0,40,3,1,14,...,0,0,0,0,0,0,0,False,0.0,0.0
5136,1,4,1,2,0,1,15,1,0,1,...,0,0,0,0,0,1,0,False,0.0,0.0
5146,0,3,2,0,0,0,7,0,0,1,...,0,0,0,0,0,0,0,False,0.0,0.0
5158,2,1,1,0,1,1,16,0,1,2,...,0,0,0,0,0,1,0,False,0.0,0.0


In [53]:
f1_score(validation_df['pred_2nd'], validation_df['Is_Spam'])

0.8615384615384616

In [54]:
torch.jit.script(model_2).save(os.path.join(os.getcwd(), 'conv_model.pt'))

In [55]:
confusion_matrix(validation_df['pred_2nd'], validation_df['Is_Spam'])

array([[315,  30],
       [  6, 112]])

In [1]:
#Load model and get predictions as numpy array
#Can be run on fresh kernel
import os
import torch
import numpy as np
import pandas as pd

from sklearn import preprocessing

df = pd.read_csv('emails.csv')
device = 'cpu'
scaler = preprocessing.MinMaxScaler()
X_test = scaler.fit_transform(df.loc[:, ~df.columns.isin(['Is_Spam', 'Is_Legit', 'Prediction', 'Email No.'])])
torch.set_default_device('cpu')
final_model = torch.jit.load(os.path.join(os.getcwd(), 'linear_model.pt')).to(device)
final_model.eval()
pred = np.around(final_model(torch.Tensor(X_test).to(device)).cpu().detach().numpy()[:, 0], 0)
pred

array([0., 0., 0., ..., 1., 1., 0.], dtype=float32)