# Analyze Dataset
---
1. Encode email data using BERT.
2. Split the data into training and test sets.
3. Train an SVM model to predict the class of an email.
4. Test the model on unseen emails.

## Imports

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer  # TODO: install
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Load Data

In [5]:
# Read the dataset, keep only the class label and the cleaned text, and
# expose the cleaned text under the column name "Email Text".
email_dataframe = (
    pd.read_csv("./data/email_dataset.csv")[["Email Type", "clean_text"]]
    .rename(columns={"clean_text": "Email Text"})
)
# Last expression of the cell: displays the frame in the notebook.
email_dataframe

Unnamed: 0,Email Type,Email Text
0,ham,gary production high island larger block comme...
1,ham,calpine daily gas nomination doc
2,ham,fyi see note already done stella forwarded ste...
3,ham,fyi forwarded lauri allen hou ect pm kimberly ...
4,ham,jackie since inlet river plant shut last day f...
...,...,...
51760,ham,rick moen im confused thought gpled money paid...
51761,phish,date lonely housewife always wanted date lonel...
51762,ham,request submitted access request anita dupont ...
51763,ham,important prc mtg hi dorn john discovered rece...


## Encode Email Text using SBERT

In [None]:
# Load a BERT-based encoder
model = SentenceTransformer('all-MiniLM-L6-v2')   # Fast + high quality

# Encode emails. pandas reads empty CSV fields back as NaN (a float), and the
# encoder only accepts strings, so coerce any missing text to "" first.
email_texts = email_dataframe['Email Text'].fillna('').astype(str).tolist()
X = model.encode(email_texts, batch_size=32, show_progress_bar=True)

# Labels: the class column kept when loading the data is "Email Type"
# (e.g. ham / phish). The frame has no "Label" column, so indexing by that
# name raised a KeyError.
y = email_dataframe['Email Type']

# Train / Test split (80% train, 20% test; fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM classifier on the sentence embeddings
clf = LinearSVC()
clf.fit(X_train, y_train)

# Predict the class of the held-out emails
y_pred = clf.predict(X_test)

# Results: per-class precision / recall / F1 on the test split
print(classification_report(y_test, y_pred))


### Alternative: Use Raw BERT via HuggingFace Transformers

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE: this rebinds the notebook-level name `model`, which the SBERT cell
# also uses. Re-run that cell before calling `model.encode(...)` again.
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # inference mode (turns off dropout)


def bert_encode(text):
    """Return the [CLS] embedding(s) of *text* from BERT's last hidden layer.

    Args:
        text: A single string, or a list of strings that are tokenized and
            run through the model together as one padded batch. Inputs
            longer than 256 tokens are truncated.

    Returns:
        For a single string, a 1-D NumPy array holding its [CLS] vector
        (unchanged from the previous behaviour). For a list of strings, a
        2-D array with one [CLS] vector per input row.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=256)
    with torch.no_grad():  # no gradients needed for feature extraction
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()   # CLS token
    if isinstance(text, str):
        return cls_embedding.flatten()
    # Batch input: keep one row per text instead of concatenating them.
    return cls_embedding


# Encode entire column in batches rather than one forward pass per email.
# Missing text (empty CSV fields are read back as NaN) is coerced to "" so
# the tokenizer only ever receives strings.
batch_size = 32
texts = email_dataframe['Email Text'].fillna('').astype(str).tolist()
X = np.vstack([
    bert_encode(texts[start:start + batch_size])
    for start in range(0, len(texts), batch_size)
])
