# Analyze Dataset
---
1. Encode email text using a Sentence-BERT (SBERT) model.
2. Split the data into training and test sets.
3. Train an SVM model to predict the class of an email.
4. Test the model on unseen emails.

## Imports

In [27]:
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import joblib

## Load Data

In [20]:
# Load the cleaned dataset, keep only the label and text columns, and add an
# integer label column. Everything is built in one non-mutating chain so that
# re-running the cell is idempotent and no column is assigned onto a sliced
# frame (which is what triggers pandas' SettingWithCopyWarning).
#
# The earlier rename of "clean_text" -> "Email Text" was removed: it ran after
# the frame had already been narrowed to ["Email Type", "Email Text"], so
# "clean_text" was never present and the rename silently did nothing.
#
# "label_id" holds the category codes of "Email Type". Codes follow the sorted
# order of the distinct labels, so ham/phish/spam map to 0/1/2.
email_dataframe = (
    pd.read_parquet("./data/2_clean_email_dataset.parquet")
    .loc[:, ["Email Type", "Email Text"]]
    .assign(label_id=lambda frame: frame["Email Type"].astype("category").cat.codes)
)
email_dataframe

Unnamed: 0,Email Type,Email Text,label_id
1,ham,gary production high island larger block comme...,0
2,ham,calpine daily gas nomination doc,0
3,ham,fyi see note already done stella forwarded ste...,0
4,ham,fyi forwarded lauri allen hou ect pm kimberly ...,0
5,ham,jackie since inlet river plant shut last day f...,0
...,...,...,...
52360,ham,rick moen im confused thought gpled money paid...,0
52361,phish,date lonely housewife always wanted date lonel...,1
52362,ham,request submitted access request anita dupont ...,0
52363,ham,important prc mtg hi dorn john discovered rece...,0


## Encode Email Text using SBERT

Use CUDA (GPU) when it is available; otherwise fall back to the CPU.

In [21]:
# Pick the compute device: the GPU when PyTorch reports CUDA support,
# the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cuda


In [22]:
# Instantiate the pretrained "all-MiniLM-L6-v2" sentence encoder on the
# device selected earlier.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Turn every email body into an embedding; the result is requested as a
# NumPy array and is computed in batches of 32 with a progress bar.
email_texts = email_dataframe["Email Text"].tolist()
X = model.encode(
    email_texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)

Batches: 100%|██████████| 1618/1618 [00:39<00:00, 41.20it/s] 


In [23]:
# Target vector: the integer class codes built when the data was loaded.
y = email_dataframe["label_id"]

# Hold out 20% of the samples for evaluation. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear-kernel SVM with balanced class weights (fit returns the
# estimator itself, so construction and training can be chained).
clf = SVC(class_weight="balanced", kernel="linear").fit(X_train, y_train)

# Score the held-out emails and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.99      0.95      0.97      5523
           1       0.83      0.95      0.89      1396
           2       1.00      1.00      1.00      3434

    accuracy                           0.97     10353
   macro avg       0.94      0.97      0.95     10353
weighted avg       0.97      0.97      0.97     10353



Save the trained classifier as a `.joblib` file for later reuse.

In [None]:
# Make sure the output directory exists (no error if it is already there),
# then serialize the trained classifier into it.
models_dir = Path("./models")
models_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, "./models/svm_model.joblib")

['svm_model.joblib']

### Alternative: Use Raw BERT via HuggingFace Transformers

In [26]:
# NOTE: this entire cell is commented out and is not executed. It sketches an
# alternative encoder that takes the [CLS] token vector from
# `bert-base-uncased` for each email, one text at a time. If enabled as
# written, it would rebind `model` and `X`, which earlier cells also assign.
# import torch
# from transformers import BertTokenizer, BertModel
# import numpy as np

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')
# model.eval()

# def bert_encode(text):
#     inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=256)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     cls_embedding = outputs.last_hidden_state[:,0,:].numpy()   # CLS token
#     return cls_embedding.flatten()

# # Encode entire column
# X = np.vstack([bert_encode(t) for t in email_dataframe['Email Text']])
