# Analyze Dataset
---
1. Encode email text into embeddings using a BERT-based sentence encoder (SBERT).
2. Split data into test and train splits.
3. Train an SVM model to predict the class of an email.
4. Test the model on unseen emails.

## Imports

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from cuml.svm import SVC 
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [9]:
# Build the working frame in one chain: read the CSV, keep only the label and
# cleaned-text columns, expose the text as "Email Text", and add a numeric label.
# The numeric codes follow the sorted category order of "Email Type"
# (0 for ham, 1 for phishing, and 2 for spam).
email_dataframe = (
    pd.read_csv("./data/email_dataset.csv")[["Email Type", "clean_text"]]
    .rename(columns={"clean_text": "Email Text"})
    .assign(label_id=lambda frame: frame["Email Type"].astype("category").cat.codes)
)
email_dataframe

Unnamed: 0,Email Type,Email Text,label_id
0,ham,gary production high island larger block comme...,0
1,ham,calpine daily gas nomination doc,0
2,ham,fyi see note already done stella forwarded ste...,0
3,ham,fyi forwarded lauri allen hou ect pm kimberly ...,0
4,ham,jackie since inlet river plant shut last day f...,0
...,...,...,...
51760,ham,rick moen im confused thought gpled money paid...,0
51761,phish,date lonely housewife always wanted date lonel...,1
51762,ham,request submitted access request anita dupont ...,0
51763,ham,important prc mtg hi dorn john discovered rece...,0


## Encode Email Text using SBERT

Select the CUDA GPU if one is available; otherwise fall back to the CPU.

In [10]:
# Pick the compute device: the GPU when PyTorch reports CUDA support, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cuda


In [None]:
# Sentence-embedding model, placed on the device chosen earlier in the notebook
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Turn every email into one embedding row; results come back as a NumPy array
email_texts = email_dataframe['Email Text'].tolist()
X = model.encode(email_texts, convert_to_numpy=True, batch_size=32, show_progress_bar=True)

Batches: 100%|██████████| 1618/1618 [00:41<00:00, 39.44it/s] 


## Run SVM model

In [12]:
# Labels
y = email_dataframe['label_id']

# Human-readable class names, listed in the same order as the integer codes.
# `label_id` is derived from the categorical codes of "Email Type", so position i
# in the category index is the name of class i.
class_names = (
    email_dataframe['Email Type'].astype('category').cat.categories.astype(str).tolist()
)

# Train / Test split (fixed seed so the same rows are held out on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM classifier. LinearSVC's solver uses a random number generator, so it
# is seeded as well to keep the fitted model reproducible across re-runs.
clf = LinearSVC(random_state=42)
clf.fit(X_train, y_train)

# Predict on the held-out emails
y_pred = clf.predict(X_test)

# Results: pass `labels` explicitly so `target_names` always lines up with the
# codes, even if a class happens to be missing from the test split.
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names,
    )
)


              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5572
           1       0.92      0.90      0.91      1344
           2       1.00      1.00      1.00      3437

    accuracy                           0.98     10353
   macro avg       0.97      0.96      0.96     10353
weighted avg       0.98      0.98      0.98     10353



### Alternative: Use Raw BERT via HuggingFace Transformers

In [None]:
# Imports are kept local to this optional section so the main SBERT pipeline
# does not require `transformers` to be installed.
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

# NOTE: this cell rebinds `model` and `X` from the SBERT section; re-run the
# "Run SVM model" cell afterwards to train on these embeddings instead.
bert_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.to(bert_device)  # run on the GPU when one is available
model.eval()           # inference mode: disables dropout


def bert_encode_batch(texts, batch_size=32, max_length=256):
    """Encode many texts into [CLS] embeddings, `batch_size` texts per forward pass.

    Args:
        texts: Iterable of strings to encode.
        batch_size: Number of texts tokenized and run through the model together.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        A 2-D NumPy array with one row per input text, in input order. For an
        empty input the result has zero rows and `model.config.hidden_size` columns.
    """
    texts = list(texts)
    if not texts:
        # np.vstack raises on an empty list, so return an empty matrix explicitly.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch, return_tensors='pt', truncation=True, padding=True, max_length=max_length
        )
        # Inputs must live on the same device as the model.
        inputs = {name: tensor.to(bert_device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of the last hidden state is the [CLS] token; move the result
        # to host memory before converting, since .numpy() needs a CPU tensor.
        chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.vstack(chunks)


def bert_encode(text):
    """Return the flattened [CLS] embedding for `text`.

    Args:
        text: A single string, or a list of strings (encoded as one batch and
            flattened into a single 1-D array, as in the original helper).

    Returns:
        A 1-D NumPy array.
    """
    batch = [text] if isinstance(text, str) else text
    return bert_encode_batch(batch).flatten()


# Encode entire column in batches rather than one forward pass per email
X = bert_encode_batch(email_dataframe['Email Text'].tolist())
