MailGuard — Train Spam/Phishing Classifier
Purpose: prototype preprocessing, train baseline model (Tfidf + LogisticRegression),
evaluate, pick thresholds, and export artifacts for local Django inference.



In [1]:
%pip install numpy pandas joblib


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import re
from html import unescape
from pathlib import Path
import json
import numpy as np
import pandas as pd
import joblib

In [3]:
%pip install pandas scikit-learn nltk joblib matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# sparse utilities
from scipy.sparse import csr_matrix, hstack

# Setup constants
RANDOM_STATE = 42
ROOT = Path('.')
ARTIFACTS_DIR = ROOT / 'mailguard' / 'model' / 'artifacts'
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Ensure NLTK data (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\subha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\subha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\subha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
  
df = pd.read_csv("F:\Projects\MailGuard_Research_notebooks\emails.csv")
df = df.rename(columns={ 'spam': 'label' }) if 'spam' in df.columns else df
df = df[['text','label']].copy()  # keep only necessary columns for baseline
df.head()

  df = pd.read_csv("F:\Projects\MailGuard_Research_notebooks\emails.csv")


Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [8]:
# Remove exact duplicate rows
print('Before drop duplicates:', len(df))
df.drop_duplicates(inplace=True)
print('After drop duplicates:', len(df))

# Nulls
print('Null counts:\n', df.isna().sum())

Before drop duplicates: 5728
After drop duplicates: 5695
Null counts:
 text     0
label    0
dtype: int64


In [9]:
print(df['label'].value_counts())


label
0    4327
1    1368
Name: count, dtype: int64


In [10]:
# Cell 3 — Mojibake & HTML-entity cleanup helpers
MOJIBAKE_REPLACEMENTS = {
    'ΓÇó': '-', 'ΓÇô': '-', 'ΓÇ£': '"', 'ΓÇ¥': '"', 'ΓÇÖ': "'",
    'ΓÇü': 'u', 'â': '-', 'â': '-', 'â': '"', 'â': "'",
    'â¢': '-', 'â¢': '-', 'Ã©': 'e', '\ufeff': ''
}
MOJIBAKE_REGEX = re.compile('|'.join(re.escape(k) for k in MOJIBAKE_REPLACEMENTS.keys()))

URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
EMAIL_RE = re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w+\b')
PHONE_RE = re.compile(r'(\+?\d[\d\-\s]{7,}\d)')
IMAGE_PLACEHOLDER_RE = re.compile(r'\[image:[^\]]*\]', flags=re.IGNORECASE)
FORWARD_MARKERS = ['forwarded message', '---------- forwarded message', 'from:']

def fix_mojibake(s: str) -> str:
    if not s:
        return s
    return MOJIBAKE_REGEX.sub(lambda m: MOJIBAKE_REPLACEMENTS[m.group(0)], s)

def clean_text(raw: str) -> str:
    """Full OCR-aware cleaning:
       - fix mojibake, unescape HTML entities
       - remove forwarded blocks (keep only latest)
       - replace URLs, emails, phones, image placeholders with tokens
       - remove HTML tags, collapse whitespace, lowercase
    """
    if pd.isna(raw):
        return ''
    s = str(raw)
    s = fix_mojibake(s)
    s = unescape(s)
    # remove forwarded blocks
    low = s.lower()
    for m in FORWARD_MARKERS:
        idx = low.find(m)
        if idx != -1:
            s = s[:idx]
            low = s.lower()
            break
    # tokens
    s = URL_RE.sub(' <URL> ', s)
    s = EMAIL_RE.sub(' <EMAIL> ', s)
    s = PHONE_RE.sub(' <PHONE> ', s)
    s = IMAGE_PLACEHOLDER_RE.sub(' <IMAGE> ', s)
    # strip html tags
    s = re.sub(r'<[^>]+>', ' ', s)
    # remove control chars and collapse whitespace
    s = re.sub(r'[\r\n\t]+', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s.lower()

def tokenize_and_lemmatize(text: str) -> str:
    tokens = word_tokenize(text)
    out = []
    for t in tokens:
        if t.isalpha() and t not in STOPWORDS:
            out.append(LEMMATIZER.lemmatize(t))
    return ' '.join(out)


In [11]:
# Cell 4 — Apply cleaning & lemmatization
df['clean_text'] = df['text'].astype(str).map(clean_text)
df['lemmatized_text'] = df['clean_text'].map(tokenize_and_lemmatize)

In [12]:
# Optional quick sanity
print("Sample cleaned:")
print(df['clean_text'].iloc[0][:300])
print("Sample lemmatized:")
print(df['lemmatized_text'].iloc[0][:200])

Sample cleaned:
subject: naturally irresistible your corporate identity lt is really hard to recollect a company : the market is full of suqgestions and the information isoverwhelminq ; but a good catchy logo , stylish statlonery and outstanding website will make the task much easier . we do not promise that havinq
Sample lemmatized:
subject naturally irresistible corporate identity lt really hard recollect company market full suqgestions information isoverwhelminq good catchy logo stylish statlonery outstanding website make task 


In [13]:
# Cell 5 — Structured numeric features (n_urls, n_emails, has_image, text_len)
def extract_structured_features_from_series(series_text: pd.Series) -> pd.DataFrame:
    n_urls = series_text.str.count(URL_RE.pattern).fillna(0).astype(int)
    n_emails = series_text.str.count(EMAIL_RE.pattern).fillna(0).astype(int)
    has_image = series_text.str.contains(r'\[image:|\<image\>', case=False, regex=True).astype(int).fillna(0).astype(int)
    text_len = series_text.fillna('').map(len).astype(int)
    return pd.DataFrame({
        'n_urls': n_urls,
        'n_emails': n_emails,
        'has_image': has_image,
        'text_len': text_len
    })

X_text = df['lemmatized_text']
X_num_df = extract_structured_features_from_series(df['clean_text'])
y = df['label'].astype(int)

print("Numeric feature sample:")
print(X_num_df.head())

Numeric feature sample:
   n_urls  n_emails  has_image  text_len
0       0         0          0      1466
1       0         0          0       594
2       0         0          0       439
3       0         0          0       471
4       0         0          0       232


In [14]:
# Cell 6 — Train/test split (reproducible & stratified)
X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_text, X_num_df, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Reset indices
X_text_train = X_text_train.reset_index(drop=True)
X_text_test  = X_text_test.reset_index(drop=True)
X_num_train  = X_num_train.reset_index(drop=True)
X_num_test   = X_num_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test  = y_test.reset_index(drop=True)

print("Shapes after split:")
print("X_text_train:", X_text_train.shape, "X_num_train:", X_num_train.shape, "y_train:", y_train.shape)
print("X_text_test :", X_text_test.shape,  "X_num_test :", X_num_test.shape,  "y_test :", y_test.shape)

Shapes after split:
X_text_train: (4556,) X_num_train: (4556, 4) y_train: (4556,)
X_text_test : (1139,) X_num_test : (1139, 4) y_test : (1139,)


In [15]:
# Cell 7 — TF-IDF vectorization (fit on train)
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2))
X_text_train_tfidf = tfidf.fit_transform(X_text_train.tolist())
X_text_test_tfidf  = tfidf.transform(X_text_test.tolist())
print("TF-IDF shapes:", X_text_train_tfidf.shape, X_text_test_tfidf.shape)

TF-IDF shapes: (4556, 50000) (1139, 50000)


In [16]:
# Cell 8 — Scale numeric features (fit on train)
scaler = StandardScaler()
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled  = scaler.transform(X_num_test)

# convert numeric to sparse and concat
X_num_train_sparse = csr_matrix(X_num_train_scaled)
X_num_test_sparse  = csr_matrix(X_num_test_scaled)

X_train_final = hstack([X_text_train_tfidf, X_num_train_sparse])
X_test_final  = hstack([X_text_test_tfidf,  X_num_test_sparse])

print("Final shapes (train/test):", X_train_final.shape, X_test_final.shape)

Final shapes (train/test): (4556, 50004) (1139, 50004)


In [17]:
# Cell 9 — Train classifier
clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE)
clf.fit(X_train_final, y_train)
print("Model trained.")

Model trained.


In [18]:
# Cell 10 — Evaluate
y_proba = clf.predict_proba(X_test_final)[:,1]
y_pred  = (y_proba >= 0.5).astype(int)
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1       :", f1_score(y_test, y_pred))
print("ROC AUC  :", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# Print shapes to confirm exact X_train/X_test sizes
print("X_train shape (sparse):", X_train_final.shape)
print("X_test  shape (sparse):", X_test_final.shape)
print("y_train shape:", y_train.shape)
print("y_test  shape:", y_test.shape)

Precision: 0.9609929078014184
Recall   : 0.9890510948905109
F1       : 0.9748201438848921
ROC AUC  : 0.9995738576431373
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       865
           1       0.96      0.99      0.97       274

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.98      1139
weighted avg       0.99      0.99      0.99      1139

Confusion matrix:
 [[854  11]
 [  3 271]]
X_train shape (sparse): (4556, 50004)
X_test  shape (sparse): (1139, 50004)
y_train shape: (4556,)
y_test  shape: (1139,)


In [19]:
# Cell 11 — Save artifacts (as you requested)
joblib.dump(tfidf, ARTIFACTS_DIR / 'tfidf.joblib')
joblib.dump(scaler, ARTIFACTS_DIR / 'scaler.joblib')
joblib.dump(clf, ARTIFACTS_DIR / 'model_logreg.joblib')

# Also save metadata (threshold defaults you can adjust)
meta = {'label_map': {0:'benign',1:'spam'}, 'thresholds': {'auto_block': 0.95, 'review': 0.6}}
with open(ARTIFACTS_DIR / 'metadata.json', 'w') as f:
    json.dump(meta, f)

print("Artifacts saved to:", ARTIFACTS_DIR)

Artifacts saved to: mailguard\model\artifacts
