# HD Spam / Phishing / URL Detection — Works With Your CSVs

This notebook inspects each CSV to auto-detect its text and label columns, then trains and evaluates the models per the HD rubric.

## 0) Setup & Paths

In [None]:
import os, re, json, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.cluster import KMeans
from sklearn.metrics import (
    precision_recall_curve, average_precision_score,
    precision_recall_fscore_support, confusion_matrix
)
from urllib.parse import urlparse
from scipy.sparse import hstack
from joblib import dump

# Input location: use /mnt/data when sms_spam.csv exists there, otherwise the
# current working directory.
if os.path.exists("/mnt/data/sms_spam.csv"):
    DATA_DIR = "/mnt/data"
else:
    DATA_DIR = "."
SMS_PATH, EMAILS_PATH, URLS_PATH = (
    os.path.join(DATA_DIR, fname)
    for fname in ("sms_spam.csv", "emails.csv", "urls.csv")
)

# Output layout, rooted at ./hd_project (resolved to an absolute path).
BASE_DIR = os.path.abspath("hd_project")
_data_root = os.path.join(BASE_DIR, "data")
RAW_DIR = os.path.join(_data_root, "raw")
PROC_DIR = os.path.join(_data_root, "processed")
MODEL_DIR = os.path.join(BASE_DIR, "models")

# Create every output directory up front; existing ones are left untouched.
for d in (RAW_DIR, PROC_DIR, MODEL_DIR):
    os.makedirs(d, exist_ok=True)

print("Data dir:", DATA_DIR)
print("Project dir:", BASE_DIR)

## 1) Robust Loaders (auto-detect columns)

In [None]:
def _inspect(name, df):
    """Print the column names of ``df`` tagged with ``name``; return its first 3 rows."""
    column_names = list(df.columns)
    print(f"[{name}] columns:", column_names)
    preview = df.head(3)
    return preview

def _to_binary_labels(s):
    """Convert a label column to integer class labels (1 = malicious, 0 = benign).

    Parameters
    ----------
    s : pandas.Series
        Raw labels. Integer and boolean columns are cast directly. Any other
        dtype is stringified, stripped and lower-cased, then resolved either
        through the known-name table below or as integral numeric text
        (e.g. ``"1"``, ``"0.0"``).

    Returns
    -------
    pandas.Series
        Integer labels with the same index as ``s``.

    Raises
    ------
    ValueError
        If any value is neither a known label name nor an integral number
        (this includes missing values).
    """
    mapping = {
        "spam": 1, "ham": 0,
        "malicious": 1, "benign": 0,
        "phish": 1, "phishing": 1, "malware": 1,
        "legit": 0, "legitimate": 0,
        "defacement": 1, "notspam": 0, "not_spam": 0,
        "true": 1, "false": 0,
    }
    # Integer and boolean columns are already numeric class labels.
    if s.dtype.kind in "iub":
        return s.astype(int)

    normalized = s.astype(str).str.strip().str.lower()
    by_name = pd.to_numeric(normalized.map(mapping), errors="coerce")

    # Values that are not known names may still be numeric text; accept them
    # only when finite and integral so nothing is silently truncated.
    numeric = pd.to_numeric(normalized, errors="coerce")
    numeric = numeric.where(np.isfinite(numeric) & (numeric == numeric.round()))

    labels = by_name.fillna(numeric)
    unresolved = labels.isna()
    if unresolved.any():
        examples = sorted(set(normalized[unresolved]))[:5]
        raise ValueError(f"Unrecognized label value(s): {examples}")
    return labels.astype(int)

def load_sms(path):
    """Load the SMS CSV into a frame with ``text``, ``label`` and ``source`` columns.

    Column names are stripped and lower-cased first. A ``v1``/``v2`` layout is
    read as label/text. Otherwise the first matching name from the candidate
    lists is used; if no text-like name matches, the first string column that
    is not itself a label candidate is taken as the text.

    Parameters
    ----------
    path : str
        CSV file path, read with ``latin-1`` encoding.

    Returns
    -------
    pandas.DataFrame
        Rows with missing text dropped and duplicate texts removed (first
        occurrence kept), index reset, ``source`` set to ``"sms"``.

    Raises
    ------
    ValueError
        If no text column or no label-like column can be identified.
    """
    df = pd.read_csv(path, encoding="latin-1")
    df.columns = [c.strip().lower() for c in df.columns]
    _inspect("SMS", df)

    if {"v1", "v2"}.issubset(df.columns):
        tcol, lcol = "v2", "v1"
    else:
        label_candidates = [c for c in ["label", "category", "spam", "class", "target"] if c in df.columns]
        text_candidates = [c for c in ["text", "message", "sms", "msg", "content"] if c in df.columns]
        if not text_candidates:
            # Never fall back to a label column as the text: that would feed
            # the labels to the model as features and break the selection below.
            str_cols = df.select_dtypes(include=["object"]).columns.tolist()
            text_candidates = [c for c in str_cols if c not in label_candidates][:1]
        if not (text_candidates and label_candidates):
            raise ValueError("SMS must have text + label-like column")
        tcol, lcol = text_candidates[0], label_candidates[0]

    out = pd.DataFrame({"text": df[tcol], "label": _to_binary_labels(df[lcol])})
    # Drop missing texts (as the email/URL loaders do) before de-duplicating.
    out = out.dropna(subset=["text"]).drop_duplicates(subset=["text"]).reset_index(drop=True)
    out["source"] = "sms"
    return out

def load_emails(path):
    """Load the email CSV into a frame with ``text``, ``label`` and ``source`` columns.

    Column names are stripped and lower-cased first. A ``text``/``spam`` pair
    is used when both are present. Otherwise the first matching name from the
    candidate lists is used; if no text-like name matches, the first string
    column that is not itself a label candidate is taken as the text.

    Parameters
    ----------
    path : str
        CSV file path.

    Returns
    -------
    pandas.DataFrame
        Rows with any missing value dropped (original index kept), with
        ``source`` set to ``"phish"``.

    Raises
    ------
    ValueError
        If no text column or no label-like column can be identified.
    """
    df = pd.read_csv(path)
    df.columns = [c.strip().lower() for c in df.columns]
    _inspect("EMAILS", df)

    if "text" in df.columns and "spam" in df.columns:
        tcol, lcol = "text", "spam"
    else:
        label_candidates = [c for c in ["label", "spam", "is_phish", "target", "class"] if c in df.columns]
        text_candidates = [
            c for c in ["text", "email_text", "body", "message", "content", "subject_body"]
            if c in df.columns
        ]
        if not text_candidates:
            # Never fall back to a label column as the text: that would feed
            # the labels to the model as features and break the selection below.
            str_cols = df.select_dtypes(include=["object"]).columns.tolist()
            text_candidates = [c for c in str_cols if c not in label_candidates][:1]
        if not (text_candidates and label_candidates):
            raise ValueError("Emails must have text + label-like column")
        tcol, lcol = text_candidates[0], label_candidates[0]

    out = pd.DataFrame({"text": df[tcol], "label": _to_binary_labels(df[lcol])}).dropna()
    # Downstream cells select text rows via source in ["sms", "phish"].
    out["source"] = "phish"
    return out

def load_urls(path):
    """Load the URL CSV into a frame with ``text`` (the URL), ``label`` and ``source``.

    Column names are stripped and lower-cased. The URL column is ``url`` when
    present, else the first column whose name contains ``"url"``. The label
    column is the first match among a fixed list of label-like names.

    Raises ``ValueError`` when either column cannot be found. Rows with a
    missing value are dropped and ``source`` is set to ``"url"``.
    """
    frame = pd.read_csv(path)
    frame.columns = [name.strip().lower() for name in frame.columns]
    _inspect("URLS", frame)

    # An exact "url" column wins; otherwise take the first name containing it.
    if "url" in frame.columns:
        url_col = "url"
    else:
        url_col = None
        for name in frame.columns:
            if "url" in name:
                url_col = name
                break
    if not url_col:
        raise ValueError("URLs need a 'url' column.")

    label_col = None
    for candidate in ["label", "is_malicious", "spam", "target", "class", "malicious"]:
        if candidate in frame.columns:
            label_col = candidate
            break
    if not label_col:
        raise ValueError("URLs need a label-like column.")

    frame["label"] = _to_binary_labels(frame[label_col])
    renamed = frame.rename(columns={url_col: "text"})
    out = renamed[["text", "label"]].dropna()
    out["source"] = "url"
    return out

## 2) Load All Three & Preview

In [None]:
# Run each loader on its configured CSV path.
sms_df = load_sms(SMS_PATH)
email_df = load_emails(EMAILS_PATH)
url_df = load_urls(URLS_PATH)

# Report the shapes on one line, then preview the first three rows of each.
loaded_frames = (sms_df, email_df, url_df)
print("Loaded shapes:", *(frame.shape for frame in loaded_frames))
for frame in loaded_frames:
    display(frame.head(3))

## 3) Clean Each & Save Clean Copies

In [None]:
def clean_text(s):
    """Return ``str(s)`` lower-cased, stripped, with whitespace runs collapsed to one space."""
    normalized = str(s).lower().strip()
    # First CR/LF/TAB runs, then any remaining whitespace runs, become a single space.
    for pattern in (r"[\r\n\t]+", r"\s+"):
        normalized = re.sub(pattern, " ", normalized)
    return normalized

# Build cleaned copies; the loaded frames themselves are left unmodified.
sms_clean = sms_df.assign(text=sms_df["text"].map(clean_text))
emails_clean = email_df.assign(text=email_df["text"].map(clean_text))
urls_clean = url_df.assign(text=url_df["text"].map(clean_text))

# Persist each cleaned frame under the processed-data directory.
for cleaned, fname in (
    (sms_clean, "sms_clean.csv"),
    (emails_clean, "emails_clean.csv"),
    (urls_clean, "urls_clean.csv"),
):
    cleaned.to_csv(os.path.join(PROC_DIR, fname), index=False)

print("Saved cleaned copies to:", PROC_DIR)

## 4) Merge → `data/processed/final.csv`

In [None]:
# Stack the three cleaned frames into one table with a fresh 0..n-1 index.
final_path = os.path.join(PROC_DIR, "final.csv")
cleaned_parts = [sms_clean, emails_clean, urls_clean]
df = pd.concat(cleaned_parts, ignore_index=True)
df.to_csv(final_path, index=False)
print("Saved merged final:", final_path, "shape:", df.shape)

# Row count per origin (sms / phish / url).
source_counts = df["source"].value_counts()
display(source_counts)

## 5) Eval Helpers (PR‑AUC, F₂ threshold, Confusion Matrix)

In [None]:
def evaluate_probabilities(y_true, prob_pos, beta=2.0):
    """Find the score threshold that maximizes F-beta on the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    prob_pos : array-like
        Scores for the positive class (higher means more likely positive).
    beta : float, default 2.0
        Recall weight in F-beta.

    Returns
    -------
    dict
        ``best_threshold``, ``best_precision``, ``best_recall``,
        ``best_fbeta`` (all taken at the same curve point) and ``pr_auc``
        (average precision).
    """
    precision, recall, thresholds = precision_recall_curve(y_true, prob_pos)
    beta_sq = beta ** 2
    # The 1e-12 keeps the ratio defined where precision and recall are both 0.
    fbeta = (1 + beta_sq) * (precision * recall) / (beta_sq * precision + recall + 1e-12)

    # precision[i] / recall[i] describe predictions with score >= thresholds[i].
    # The curve's final point (precision=1, recall=0) has no threshold, so it
    # is excluded from the search; the winning index then maps directly onto
    # ``thresholds``.
    best_idx = int(np.nanargmax(fbeta[:-1]))
    best_thr = float(thresholds[best_idx])
    return {"best_threshold": best_thr,
            "best_precision": float(precision[best_idx]),
            "best_recall": float(recall[best_idx]),
            "best_fbeta": float(fbeta[best_idx]),
            "pr_auc": float(average_precision_score(y_true, prob_pos))}

def report_at_threshold(y_true, prob_pos, thr):
    """Summarize positive-class metrics after thresholding ``prob_pos`` at ``thr``.

    ``prob_pos`` must support element-wise ``>=`` and ``.astype`` (e.g. a NumPy
    array). Scores greater than or equal to ``thr`` are predicted as class 1.

    Returns a dict with the threshold, the precision / recall / F1 of class 1,
    and the confusion matrix as nested lists ordered for labels ``[0, 1]``.
    """
    y_pred = (prob_pos >= thr).astype(int)
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Restricting to labels=[1] yields length-1 arrays for the positive class.
    prec, rec, f1_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=[1]
    )

    report = {"threshold": float(thr)}
    report["precision_malicious"] = float(prec[0])
    report["recall_malicious"] = float(rec[0])
    report["f1_malicious"] = float(f1_score[0])
    report["confusion_matrix[[TN,FP],[FN,TP]]"] = cm.tolist()
    return report

## 6) Text Models: TF‑IDF (word 1–2 + char 3–5) → LogReg & Calibrated Linear SVM (5‑fold CV)

In [None]:
# Text experiments use only the SMS and email rows.
text_df = df[df["source"].isin(["sms","phish"])].reset_index(drop=True)
X_text = text_df["text"].values
y_text = text_df["label"].values

# Word 1-2 grams plus within-word-boundary char 3-5 grams, stacked side by side.
# NOTE(review): both vectorizers are fit on all rows here, before cv_probs
# splits into folds, so vocabulary/IDF statistics see the held-out folds.
# Consider wrapping vectorizers + classifier in a Pipeline — confirm impact.
tfidf_word = TfidfVectorizer(analyzer="word", ngram_range=(1,2), min_df=1)
tfidf_char = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=1)
Xw = tfidf_word.fit_transform(X_text)
Xc = tfidf_char.fit_transform(X_text)
# Request CSR explicitly: cv_probs row-slices the matrix with X[idx], and
# hstack's default output format is not guaranteed to support indexing.
Xwc = hstack([Xw, Xc], format="csr")

logreg = LogisticRegression(max_iter=2000, class_weight="balanced")
# LinearSVC has no predict_proba; the sigmoid calibration wrapper provides it.
svm_cal = CalibratedClassifierCV(LinearSVC(class_weight="balanced"), method="sigmoid", cv=3)

def cv_probs(estimator, X, y, cv=5):
    """Return out-of-fold positive-class probabilities from stratified K-fold CV.

    Parameters
    ----------
    estimator : object
        Unfitted estimator exposing ``fit`` and ``predict_proba``. It is
        cloned for every fold, so the object passed in is never fitted or
        otherwise modified.
    X : indexable array or sparse matrix
        Feature rows. Sparse input is converted to CSR so it can be row-sliced.
    y : array-like
        Class labels, one per row of ``X``.
    cv : int, default 5
        Number of shuffled stratified folds (fixed ``random_state=42``).

    Returns
    -------
    numpy.ndarray
        Float array shaped like ``y``; entry ``i`` is column 1 of
        ``predict_proba`` from the fold model that did not train on row ``i``.
    """
    # Function-scope import: the notebook's import cell does not bring in clone.
    from sklearn.base import clone

    # COO (and some other sparse formats) cannot be indexed by row arrays.
    if hasattr(X, "tocsr"):
        X = X.tocsr()
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    out = np.zeros_like(y, dtype=float)
    for tr, te in skf.split(X, y):
        est = clone(estimator)  # fresh, unfitted copy for this fold
        est.fit(X[tr], y[tr])
        out[te] = est.predict_proba(X[te])[:, 1]
    return out

# Logistic regression: out-of-fold probabilities, F2-optimal threshold, report.
prob_log = cv_probs(logreg, Xwc, y_text, cv=5)
# Calibrated linear SVM: same procedure.
prob_svm = cv_probs(svm_cal, Xwc, y_text, cv=5)

best_log = evaluate_probabilities(y_text, prob_log, beta=2.0)
rep_log = report_at_threshold(y_text, prob_log, best_log["best_threshold"])
best_svm = evaluate_probabilities(y_text, prob_svm, beta=2.0)
rep_svm = report_at_threshold(y_text, prob_svm, best_svm["best_threshold"])

# Print each model's header followed by its two JSON summaries.
for title, best, rep in (
    ("=== Text: Logistic Regression ===", best_log, rep_log),
    ("=== Text: Linear SVM (Calibrated) ===", best_svm, rep_svm),
):
    print(title)
    print(json.dumps(best, indent=2))
    print(json.dumps(rep, indent=2))

## 7) URL Model: RandomForest (5‑fold CV) on engineered URL features

In [None]:
def shannon_entropy(s):
    """Return the Shannon entropy, in bits, of the characters of ``s`` (0.0 if empty)."""
    if not s:
        return 0.0
    # One count per distinct character, normalized into probabilities.
    counts = np.fromiter((s.count(ch) for ch in set(s)), dtype=float)
    probs = counts / counts.sum()
    # The 1e-12 offset guards log2 against a zero argument.
    weighted_logs = probs * np.log2(probs + 1e-12)
    return float(-weighted_logs.sum())

import re
# Dotted-quad pattern, compiled once because url_features runs per row.
_IPV4_RE = re.compile(r"\b\d{1,3}(?:\.\d{1,3}){3}\b")


def url_features(u):
    """Compute hand-engineered numeric features for one URL.

    Parameters
    ----------
    u : object
        The URL; coerced with ``str`` so non-string values do not crash.

    Returns
    -------
    dict
        ``len``, ``dots``, ``dashes``, ``digits``, ``specials`` (counted over
        the whole string), ``entropy`` (of netloc + path + query),
        ``num_subdomains`` (dots in the netloc), ``has_ip`` (1 if the netloc
        contains a dotted-quad number), ``tld_len`` (length after the last
        dot of the netloc, 0 if it has none) and ``path_len`` (path plus
        ``?query``).
    """
    u = str(u)
    try:
        p = urlparse(u)
        # urlparse only recognizes a netloc introduced by "//". Re-parse
        # scheme-less input such as "example.com/login" so the host-based
        # features are not silently zero.
        if not p.netloc and "://" not in u:
            p = urlparse("//" + u)
        host = p.netloc or ""
        pathq = (p.path or "") + ("?" + p.query if p.query else "")
        full = host + (p.path or "") + (p.query or "")
    except ValueError:
        # urlparse rejects some malformed input (e.g. a bad IPv6 literal);
        # fall back to whole-string features only.
        host = ""
        pathq = ""
        full = u
    return {"len": len(u), "dots": u.count("."), "dashes": u.count("-"),
            "digits": sum(ch.isdigit() for ch in u),
            "specials": sum(ch in "!@#$%^&*()_+=[]{}|;:'\\\",<>?/" for ch in u),
            "entropy": shannon_entropy(full), "num_subdomains": host.count("."),
            "has_ip": int(bool(_IPV4_RE.search(host))),
            "tld_len": len(host.split(".")[-1]) if "." in host else 0,
            "path_len": len(pathq)}

# Restrict to URL rows and turn each URL string into a numeric feature row.
url_only = df[df["source"] == "url"].reset_index(drop=True)
y_url = url_only["label"].values
X_url = pd.DataFrame(list(map(url_features, url_only["text"].tolist())))

# 5-fold stratified CV with a class-balanced random forest.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(n_estimators=400, class_weight="balanced", random_state=42, n_jobs=-1)
oof_proba = cross_val_predict(rf, X_url, y_url, cv=skf, method="predict_proba")
prob_rf = oof_proba[:, 1]  # column 1 holds the positive-class probability

best_rf = evaluate_probabilities(y_url, prob_rf, beta=2.0)
rep_rf = report_at_threshold(y_url, prob_rf, best_rf["best_threshold"])

print("=== URL: RandomForest ===")
print(json.dumps(best_rf, indent=2))
print(json.dumps(rep_rf, indent=2))

## 8) Extra 1 — K‑Means (themes)

In [None]:
if len(text_df) < 6:
    print("[K-Means skipped] Not enough text rows.")
else:
    tfidf_clu = TfidfVectorizer(analyzer="word", ngram_range=(1,2), min_df=1)
    Xc2 = tfidf_clu.fit_transform(text_df["text"].values)

    # Cluster count: floor(sqrt(n_rows) / 2), clamped to the range [2, 6].
    half_root = int(np.sqrt(len(text_df))//2)
    k = min(6, max(2, half_root))

    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labels = km.fit_predict(Xc2)

    # Rank the vocabulary inside each centroid from heaviest to lightest weight.
    terms = np.array(tfidf_clu.get_feature_names_out())
    order = km.cluster_centers_.argsort()[:, ::-1]
    cluster_top_terms = {}
    for cluster_id in range(k):
        cluster_top_terms[cluster_id] = terms[order[cluster_id, :10]].tolist()

    print("Top terms per cluster:")
    for cluster_id, top_terms in cluster_top_terms.items():
        print(cluster_id, ":", top_terms)

## 9) Extra 2 — IsolationForest (benign-only training)

In [None]:
if len(text_df) >= 10:
    tfidf_an = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=1)
    Xa = tfidf_an.fit_transform(text_df["text"].values)
    ya = text_df["label"].values
    benign_mask = (ya==0)
    iso = IsolationForest(n_estimators=400, random_state=42, contamination="auto")
    # Keep the TF-IDF matrix sparse: IsolationForest accepts sparse input, and
    # densifying a char n-gram matrix (rows x vocabulary) can exhaust memory.
    iso.fit(Xa[benign_mask])  # train on benign rows only
    scores = iso.decision_function(Xa)
    # Lower decision_function means more anomalous; flip the sign and min-max
    # scale to [0, 1] so that higher means "more likely malicious".
    prob_like = (scores.min() - scores)
    prob_like = (prob_like - prob_like.min())/(prob_like.max()-prob_like.min()+1e-12)
    # NOTE(review): the benign rows are scored by a model that was trained on
    # them, so these numbers are in-sample for class 0 — confirm this is intended.
    best_iso = evaluate_probabilities(ya, prob_like, beta=2.0)
    rep_iso  = report_at_threshold(ya, prob_like, best_iso["best_threshold"])
    print("=== IsolationForest (text) ===")
    print(json.dumps(best_iso, indent=2)); print(json.dumps(rep_iso, indent=2))
else:
    print("[IsolationForest skipped] Not enough text rows.")

## 10) Save Final Text Model (+ threshold)

In [None]:
# Refit the vectorizers and the calibrated SVM on all text rows for deployment.
tfidf_word_f = TfidfVectorizer(analyzer="word", ngram_range=(1,2), min_df=1)
tfidf_char_f = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=1)
Xw_f = tfidf_word_f.fit_transform(X_text)
Xc_f = tfidf_char_f.fit_transform(X_text)
Xwc_f = hstack([Xw_f, Xc_f])

svm_final = CalibratedClassifierCV(LinearSVC(class_weight="balanced"), method="sigmoid", cv=3)
svm_final.fit(Xwc_f, y_text)

dump(tfidf_word_f, os.path.join(MODEL_DIR, "tfidf_word.joblib"))
dump(tfidf_char_f, os.path.join(MODEL_DIR, "tfidf_char.joblib"))
dump(svm_final,    os.path.join(MODEL_DIR, "svm_calibrated.joblib"))

# Persist the F2-optimal threshold from the cross-validated SVM. Fall back to
# 0.5 only when that result is unavailable (e.g. the evaluation cell was not
# run), and say so instead of silently swallowing every exception.
thr = 0.5
try:
    thr = float(best_svm["best_threshold"])
except (NameError, KeyError, TypeError, ValueError) as exc:
    print("Using default threshold 0.5; CV threshold unavailable:", repr(exc))
with open(os.path.join(MODEL_DIR, "threshold.json"), "w", encoding="utf-8") as f:
    json.dump({"f2_threshold": thr}, f, indent=2)

print("Artifacts saved to:", MODEL_DIR)