In [None]:

# Cell 1 — Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import joblib
import json


In [None]:

# Cell 2 — Load dataset

# Data source switch:
#   False -> read phishing_email_dataset.csv from the notebook's folder
#   True  -> fetch the dataset through the HuggingFace `datasets` package
use_hf = False

if use_hf:
    # Imported here so `datasets` is only required when this path is chosen.
    from datasets import load_dataset
    ds = load_dataset("zefang-liu/phishing-email-dataset")
    df = pd.DataFrame(ds['train'])
else:
    df = pd.read_csv("phishing_email_dataset.csv")

# Preview the first rows to check what was loaded.
df.head()


In [None]:

# Cell 3 — Choose columns

# Auto-detect the text and label columns by case-insensitive keyword match on
# the column names.
def _columns_matching(frame, keywords):
    """Return the columns of `frame` whose lowercased name contains any of `keywords`."""
    return [c for c in frame.columns if any(k in str(c).lower() for k in keywords)]

text_cols = _columns_matching(df, ('text', 'email', 'body', 'message'))
label_cols = _columns_matching(df, ('label', 'target', 'class'))
if not label_cols:
    # Fallback keywords, used only when the primary ones match nothing, so the
    # choice is unchanged for files that already worked.
    # NOTE(review): the HuggingFace dataset named in the load cell appears to
    # call its label column "Email Type" — confirm against the dataset card.
    label_cols = _columns_matching(df, ('type', 'category'))

print("Text column candidates:", text_cols)
print("Label column candidates:", label_cols)

if not label_cols:
    raise ValueError(
        f"Could not auto-detect a label column among {list(df.columns)}; "
        "set the label column manually."
    )
label_col = label_cols[0]

# A label column can also match a text keyword (e.g. "Email Type" contains
# "email"); never use the label column as the input feature.
text_candidates = [c for c in text_cols if c != label_col]
if not text_candidates:
    raise ValueError(
        f"Could not auto-detect a text column among {list(df.columns)}; "
        "set the text column manually."
    )
text_col = text_candidates[0]

# Fill missing emails BEFORE casting: astype(str) can turn a missing value into
# the literal text "nan", after which fillna("") finds nothing left to fill.
X = df[text_col].fillna("").astype(str)
y = df[label_col]


In [None]:

# Cell 4 — Encode labels as integer codes (0, 1, ...)
# fit() learns the distinct label values; transform() replaces every label in
# y with its integer code.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Show the learned label values next to the range of codes they map onto.
class_labels = [*le.classes_]
print("Original labels:", class_labels)
print("Encoded as:", [*range(len(class_labels))])


In [None]:

# Cell 5 — Train/test split
# Hold out 20% of the rows for evaluation. The split is stratified on the
# encoded labels and uses a fixed random_state so re-runs give the same split.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)


In [None]:

# Cell 6 — TF-IDF Vectorizer
# Unigrams and bigrams, English stop words removed, vocabulary capped at
# 10,000 features.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=10000,
)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vocabulary.
X_train_t = vectorizer.fit_transform(X_train)
X_test_t = vectorizer.transform(X_test)


In [None]:

# Cell 7 — Train model
# Logistic regression on the TF-IDF training matrix; max_iter=1000 is the
# iteration limit handed to the solver.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_t, y=y_train)


In [None]:

# Cell 8 — Evaluate
# Score the held-out test split: overall accuracy plus per-class metrics.
preds = model.predict(X_test_t)
print("Accuracy:", accuracy_score(y_test, preds))

# classification_report needs string display names to lay out its table.
# le.classes_ holds the original label values, which are not strings when the
# label column is numeric (e.g. 0/1), so convert them explicitly.
class_names = [str(c) for c in le.classes_]
print(classification_report(y_test, preds, target_names=class_names))


In [None]:

# Cell 9 — Save artifacts
# Persist the fitted model, vectorizer and label encoder.
joblib.dump(model, "phishing_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(le, "label_encoder.pkl")

# Build human-friendly label map: encoded class index -> "Phishing" / "Safe".
# A class counts as phishing when its original label contains "phish" or
# "spam" (case-insensitive). str() is required because le.classes_ holds
# non-string values (e.g. ints) when the label column is numeric, and those
# have no .lower() method.
human_map = {}
for enc_val, orig in enumerate(le.classes_):
    label_text = str(orig).lower()
    is_phishing = "phish" in label_text or "spam" in label_text
    human_map[enc_val] = "Phishing" if is_phishing else "Safe"

# If no label matched (e.g. purely numeric labels), every class was mapped to
# "Safe"; say so instead of silently writing a misleading map.
if "Phishing" not in human_map.values():
    print(
        "WARNING: no class label contains 'phish' or 'spam'; every class was "
        "mapped to 'Safe'. Edit human_label_map.json by hand. Labels:",
        [str(c) for c in le.classes_],
    )

# JSON object keys are always strings, so the integer class indices are
# written as "0", "1", ... and must be looked up as strings after loading.
with open("human_label_map.json", "w") as f:
    json.dump(human_map, f, indent=2)

print("Saved phishing_model.pkl, vectorizer.pkl, label_encoder.pkl, human_label_map.json")
