In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

from torch.utils.data import DataLoader

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# Ensure PyTorch is installed and compatible
# Install the correct version of PyTorch based on your system's configuration
# For CUDA-enabled GPUs, use the appropriate CUDA version; otherwise, use the CPU version
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

import torch
import torch.nn as nn

from copy import deepcopy

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

OSError: [WinError 127] The specified procedure could not be found

In [None]:
# NOTE(review): absolute, machine-specific location — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
DATA_PATH = "C:\\Users\\Here\\Desktop\\Disertatie-Final\\4. Cod\\Data\\Phishing_Email.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

In [None]:
# Lower-casing, punctuation-splitting tokenizer shipped with torchtext.
tokenizer = get_tokenizer("basic_english")


def yield_tokenizer(batch):
    """Lazily yield the token list of every row's text field.

    Each element of ``batch`` is unpacked as a 2-item ``(text, label)`` pair;
    the label is ignored.
    """
    yield from (tokenizer(email_text) for email_text, _label in batch)


# Out-of-vocabulary tokens resolve to the "<unk>" entry.
# NOTE(review): the vocabulary is built from every row of `df`, i.e. before any
# train/validation/test split is made — confirm this is intended.
UNK_TOKEN = "<unk>"
vocab = build_vocab_from_iterator(yield_tokenizer(df.values), specials=[UNK_TOKEN])
vocab.set_default_index(vocab[UNK_TOKEN])

In [None]:
# --- Optimisation schedule ---
EPOCHS = 20   # passes over the training DataLoader
LR = 0.1      # initial SGD learning rate
STEP = 10     # StepLR: epochs between learning-rate decays
GAMMA = 0.1   # StepLR: multiplicative decay factor

# --- Data / model dimensions ---
BATCH = 128               # DataLoader batch size
VOCAB_SIZE = len(vocab)   # number of rows in the embedding table
EM_SIZE = 64              # embedding width
OUT_SIZE = 2              # width of the model's final layer (one score per class)

In [None]:
def text_pipeline(x):
    """Tokenize raw text and map each token to its vocabulary index."""
    tokens = tokenizer(x)
    return vocab(tokens)

In [None]:
def collate_batch(x):
    """Collate ``(text, label)`` samples into the flat layout EmbeddingBag consumes.

    Returns a tuple ``(texts, labels, offsets)``:
      * texts   -- 1-D int64 tensor with every sample's token ids concatenated
      * labels  -- 1-D int64 tensor, one entry per sample
      * offsets -- 1-D tensor holding the start position of each sample in ``texts``

    All three tensors are moved to the GPU when CUDA is available.
    """
    encoded, targets = [], []
    for sample_text, sample_label in x:
        encoded.append(torch.tensor(text_pipeline(sample_text), dtype=torch.int64))
        targets.append(sample_label)

    # A sample starts where the previous ones end: prefix-sum of the lengths,
    # shifted right by one (the last length is never needed).
    lengths = [tokens.size(0) for tokens in encoded]
    start_positions = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)

    collated = (
        torch.cat(encoded),
        torch.tensor(targets, dtype=torch.int64),
        start_positions,
    )

    if torch.cuda.is_available():
        collated = tuple(tensor.cuda() for tensor in collated)

    return collated

In [None]:
# 80 % of the rows go to training; the held-out 20 % is then halved into
# validation and test. Both splits use the same fixed seed for repeatability.
train, validation = train_test_split(df.values, test_size=0.2, random_state=42)
val, test = train_test_split(validation, test_size=0.5, random_state=42)

In [None]:
# Both loaders share batch size and collation; only the training loader shuffles.
loader_kwargs = dict(batch_size=BATCH, collate_fn=collate_batch)
train_dl = DataLoader(train, shuffle=True, **loader_kwargs)
val_dl = DataLoader(val, shuffle=False, **loader_kwargs)

In [None]:
class Block(nn.Module):
    """Linear -> BatchNorm1d -> ReLU unit with an optional identity skip connection.

    Args:
        in_features: width of the incoming feature vector.
        out_features: width produced by the linear layer.
        downscale: when True (default) the activated output is returned as is;
            when False the input is added back onto it, which requires
            ``in_features == out_features``.
    """

    def __init__(self, in_features, out_features, downscale=True):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.norm = nn.BatchNorm1d(out_features)
        self.relu = nn.ReLU()
        self.scale = downscale

    def forward(self, x):
        """Apply the unit to ``x``; add the skip connection unless ``self.scale`` is set."""
        activated = self.relu(self.norm(self.linear(x)))
        return activated if self.scale else activated + x

class PhishingDetector(nn.Module):
    """Bag-of-embeddings classifier: EmbeddingBag -> six ``Block`` units -> Linear -> softmax.

    Args:
        vocab_size: number of rows in the embedding table.
        em_size: embedding width (input width of the first block).
        out_size: number of output scores per sample.
    """

    def __init__(self, vocab_size, em_size, out_size):
        super().__init__()
        self.embed = nn.EmbeddingBag(vocab_size, em_size)

        # (in_features, out_features, downscale) for each stacked block, in order.
        block_specs = [
            (em_size, 128, True),
            (128, 128, False),
            (128, 128, False),
            (128, 64, True),
            (64, 64, False),
            (64, 64, False),
        ]
        self.layers = nn.Sequential(
            *(Block(n_in, n_out, downscale) for n_in, n_out, downscale in block_specs)
        )
        self.fc = nn.Linear(64, out_size)

    def forward(self, x, off):
        """Return per-class probabilities for the bags described by ``x`` and ``off``.

        ``x`` holds the concatenated token ids and ``off`` the start offset of
        each bag. The result is already softmax-normalised along dim 1, i.e.
        probabilities rather than logits — a loss that expects raw logits
        (e.g. ``nn.CrossEntropyLoss``) would apply softmax a second time.
        """
        pooled = self.embed(x, off)
        scores = self.fc(self.layers(pooled))
        return nn.functional.softmax(scores, dim=1)

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

In [None]:
class ProbabilityCrossEntropy(nn.Module):
    """Cross-entropy for a model whose output is already softmax probabilities.

    ``nn.CrossEntropyLoss`` expects raw logits and applies log-softmax itself.
    ``PhishingDetector.forward`` already returns softmax probabilities, so using
    it would normalise twice and flatten the gradients. This loss instead takes
    the negative log-likelihood of the probabilities directly.

    Args:
        eps: lower clamp applied before the logarithm to avoid ``log(0)``.
    """

    def __init__(self, eps=1e-12):
        super().__init__()
        self.eps = eps

    def forward(self, probs, target):
        """Return the mean negative log-likelihood of ``target`` under ``probs``."""
        return nn.functional.nll_loss(torch.log(probs.clamp_min(self.eps)), target)


model = PhishingDetector(VOCAB_SIZE, EM_SIZE, OUT_SIZE)
model = model.to(device)
criterion = ProbabilityCrossEntropy()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Multiply the learning rate by GAMMA every STEP epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=STEP, gamma=GAMMA)

In [None]:
def _run_epoch(dataloader, training):
    """Run one full pass over ``dataloader`` and return ``(mean_loss, accuracy)``.

    Args:
        dataloader: iterable yielding ``(text, label, offset)`` batches.
        training: when True the model is put in train mode and the optimizer
            is stepped after every batch; when False the model is put in eval
            mode and no gradients are tracked.

    Returns:
        Tuple of per-sample mean loss and fraction of correctly classified samples.
    """
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for text, label, offset in dataloader:
            if training:
                optimizer.zero_grad()

            out = model(text, offset)
            loss = criterion(out, label)

            if training:
                loss.backward()
                optimizer.step()

            batch_size = out.size(0)
            # `loss` is a per-batch mean, so weight it by the batch size; the
            # final division by `seen` then gives a true per-sample average.
            loss_sum += loss.item() * batch_size
            correct += (out.argmax(1) == label).sum().item()
            seen += batch_size

    return loss_sum / seen, correct / seen


# Snapshot of the weights with the best validation accuracy seen so far.
best_model = deepcopy(model)

best_acc = 0

train_loss = []
train_acc = []
val_loss = []
val_acc = []

for i in range(1, EPOCHS+1):
    epoch_loss, epoch_acc = _run_epoch(train_dl, training=True)
    train_loss += [epoch_loss]
    train_acc += [epoch_acc]

    # Evaluate on the held-out validation loader (not the training loader), so
    # checkpoint selection reflects generalisation rather than memorisation.
    epoch_loss, epoch_acc = _run_epoch(val_dl, training=False)
    val_loss += [epoch_loss]
    val_acc += [epoch_acc]

    # `>=` keeps the most recent checkpoint when accuracy ties.
    if val_acc[-1] >= best_acc:
        best_acc = val_acc[-1]
        best_model = deepcopy(model)

    print("Epoch {} train loss {} acc {} val loss {} acc {}".format(i, train_loss[-1],
                                                                   train_acc[-1], val_loss[-1],
                                                                   val_acc[-1]))

    scheduler.step()

In [None]:
def predict(text):
    """Classify one raw text with the best checkpoint.

    Args:
        text: raw string to classify.

    Returns:
        ``(index, score)`` where ``index`` is the arg-max class and ``score``
        is the model output for that class.
    """
    # Explicit int64: without it an empty token list would produce a float32
    # tensor, which is not a valid index dtype for the embedding lookup.
    tokens = torch.tensor(text_pipeline(text), dtype=torch.int64)
    offset = torch.tensor([0])  # a single bag starting at position 0

    if torch.cuda.is_available():
        tokens, offset = tokens.cuda(), offset.cuda()

    best_model.eval()
    with torch.no_grad():
        # Run the checkpoint that was just switched to eval mode — the best
        # model — rather than whatever state the last training epoch left behind.
        out = best_model(tokens, offset)

    index = out.argmax(1).item()

    return index, out[0][index].item()

In [None]:
# Run the classifier over every held-out test row, one sample at a time.
# Column 0 of each row is the text passed to `predict`, column 1 the reference label.
pred, proba, truth = [], [], []

for row in test:
    label, conf = predict(row[0])
    pred.append(label)
    proba.append(conf)
    truth.append(row[1])