# Analyze Dataset
---
1. Encode the email text as sentence embeddings using SBERT (`all-MiniLM-L6-v2`).
2. Split the data into training and test sets (80/20, stratified by class).
3. Train an SVM model to predict the class of an email.
4. Test the model on unseen emails.

## Imports

In [12]:
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import joblib
import platform

## Load Data

In [13]:
# Read the balanced email dataset and attach an integer class column in one step.
# "Email Type" is cast to a categorical and its category codes become "label_id".
# NOTE(review): the codes follow the category order pandas infers from the labels;
# the intended mapping is 0 = ham, 1 = phishing, 2 = spam — confirm against the data.
email_dataframe = pd.read_feather("./data/2_balanced_email_dataset.feather").assign(
    label_id=lambda frame: frame["Email Type"].astype("category").cat.codes
)

## Encode Email Text using SBERT

Use a GPU backend (CUDA or Apple MPS) when one is available; otherwise fall back to the CPU.

In [18]:
if platform.system() == "Windows":
    device = "cuda" if torch.cuda.is_available() else "cpu"
elif platform.system() == "Darwin":
    device = "mps" if torch.backends.mps.is_available() else "cpu"
else:
    device = "cpu"

# Store the variable to be used in 5. Demo Classify Emails.ipynb
%store device

print("Using device:", device)

Stored 'device' (str)
Using device: mps


In [15]:
# Instantiate the pretrained `all-MiniLM-L6-v2` sentence encoder on the selected device
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Embed the text of every email; the result is returned as a NumPy array,
# computed in batches of 32 with a progress bar shown while encoding
X = model.encode(
    email_dataframe["Email Text"].to_list(),
    show_progress_bar=True,
    batch_size=32,
    convert_to_numpy=True,
)

Batches: 100%|██████████| 1610/1610 [02:43<00:00,  9.85it/s]


In [16]:
# Target vector: the integer class codes stored in the "label_id" column
y = email_dataframe["label_id"]

# Hold out 20% of the samples for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear-kernel SVM with balanced class weights (fit returns the estimator itself)
clf = SVC(class_weight="balanced", kernel="linear").fit(X_train, y_train)

# Score the classifier on the held-out emails
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 summary
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.96      0.95      0.96      3434
           1       0.95      0.96      0.96      3434
           2       1.00      1.00      1.00      3435

    accuracy                           0.97     10303
   macro avg       0.97      0.97      0.97     10303
weighted avg       0.97      0.97      0.97     10303



Save the trained classifier as a `.joblib` file for later reuse.

In [17]:
# Ensure the output directory exists (no error if it is already there),
# then serialize the fitted classifier into it.
Path("./models").mkdir(exist_ok=True, parents=True)
joblib.dump(value=clf, filename="./models/svm_model.joblib")

['./models/svm_model.joblib']

### Alternative: Use Raw BERT via HuggingFace Transformers

In [26]:
# Inactive alternative, kept for reference only: builds embeddings from the
# CLS token of raw `bert-base-uncased` instead of the SentenceTransformer encoder.
# NOTE(review): if uncommented, this rebinds both `model` and `X`, replacing the
# SentenceTransformer encoder and the embeddings computed in the encoding cell —
# the train/test split and SVM training cell must be re-run afterwards.

# import torch
# from transformers import BertTokenizer, BertModel
# import numpy as np

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')
# model.eval()

# def bert_encode(text):
#     inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=256)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     cls_embedding = outputs.last_hidden_state[:,0,:].numpy()   # CLS token
#     return cls_embedding.flatten()

# # Encode entire column
# X = np.vstack([bert_encode(t) for t in email_dataframe['Email Text']])
