In [2]:
# SMS Spam Detection: EDA + BoW + TF-IDF + Preprocessing
# -----------------------------------------------------------
# This script:
# 1) Loads the "SMS Spam Collection" dataset via kagglehub
# 2) Performs EDA (top words/bigrams/trigrams per class + wordclouds)
# 3) Builds a baseline Bag-of-Words + Logistic Regression model
# 4) Builds a TF-IDF + Logistic Regression model
# 5) Adds text preprocessing (tokenize, lowercase, remove emails/urls/html, numbers, punctuation,
#    remove stopwords, lemmatize) + TF-IDF + Logistic Regression
# 6) Prints the F1-scores for the three setups and saves artifacts to ./artifacts
#
# Run:
#   python sms_spam_pipeline.py
#
# Notes:
# - The same classifier (LogisticRegression) is used across all three experiments.
# - Artifacts (plots and CSVs) are saved into ./artifacts by default.
# - If running in Kaggle/Colab, internet should be available for any pip installs (if needed).
# -----------------------------------------------------------

import os
import re
import sys
import json
import string
import warnings
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Optional installs for environments that may not have these
def _ensure_packages():
    """
    Best-effort bootstrap of the third-party packages imported further down.

    Tries to import nltk, wordcloud and kagglehub; each one that is missing is
    installed with pip through the current interpreter
    (``sys.executable -m pip install <package>``).

    Raises:
        subprocess.CalledProcessError: if installing nltk or wordcloud fails.
            A failed kagglehub install is only reported as a warning, because
            load_sms_spam() can fall back to a local CSV file.
    """
    import subprocess

    def _pip_install(package):
        # Use the running interpreter so the package lands in the active environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    try:
        import nltk  # noqa: F401
    except ImportError:
        _pip_install("nltk")

    try:
        from wordcloud import WordCloud  # noqa: F401
    except ImportError:
        _pip_install("wordcloud")

    # kagglehub is nice-to-have (we fall back gracefully if missing)
    try:
        import kagglehub  # noqa: F401
    except ImportError:
        try:
            _pip_install("kagglehub")
        except Exception as exc:
            # Stay best-effort, but tell the user why the download path will be skipped.
            print("[warn] Could not install kagglehub; a local CSV will be needed:", repr(exc))

_ensure_packages()

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# NLTK downloads (idempotent)
def _ensure_nltk_data():
    """
    Make sure every NLTK resource this script relies on is available.

    Each resource path is probed with ``nltk.data.find``; when that raises
    ``LookupError`` the matching package is fetched with
    ``nltk.download(..., quiet=True)``.
    """
    # download id -> resource path probed by nltk.data.find
    required = {
        "punkt": "tokenizers/punkt",
        "punkt_tab": "tokenizers/punkt_tab",  # newer nltk sometimes needs this
        "stopwords": "corpora/stopwords",
        "wordnet": "corpora/wordnet",
        "omw-1.4": "corpora/omw-1.4",
        "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
        "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",  # NLTK 3.8+
    }
    for package_id, resource_path in required.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_id, quiet=True)

_ensure_nltk_data()

# ---------------------
# Config
# ---------------------
# Directory that receives every CSV, PNG, text report and JSON summary written by this script.
ARTIFACT_DIR = Path("./artifacts")
# Created as soon as this module runs (missing parents included); a no-op if it already exists.
ARTIFACT_DIR.mkdir(exist_ok=True, parents=True)

# Seed passed to train_test_split and LogisticRegression in main().
RANDOM_STATE = 42
# Number of top n-grams per class that main() requests from eda_by_class().
TOPK = 10

# ---------------------
# Data Loading
# ---------------------
def load_sms_spam():
    """
    Load the SMS Spam Collection dataset as a two-column DataFrame.

    Lookup order:
      1. Download "uciml/sms-spam-collection-dataset" with kagglehub and look for
         CSV files in the returned directory (top level first, then nested).
      2. If kagglehub is missing, fails, or its download contains no CSV, search
         the current working directory recursively (shallowest files first).
    A file named ``spam.csv`` (case-insensitive) is preferred; otherwise the first
    candidate is used.

    Column handling:
      - ``v1``/``v2`` are renamed to ``label``/``text``;
      - ``label``/``message`` are renamed to ``label``/``text``;
      - otherwise the first two columns are taken as label and text.
    Rows with a missing label or text are dropped, labels are stripped and
    lower-cased, and only rows labelled "ham" or "spam" are kept.

    Returns:
        DataFrame with columns ['label', 'text'] and a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: no CSV file could be located.
        ValueError: the chosen CSV has fewer than two columns.
    """
    candidates = []
    try:
        import kagglehub
        print("[info] Using kagglehub to download dataset...")
        path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")
        print("[info] kagglehub path:", path)
        base = Path(path)
        # The dataset usually contains 'spam.csv'; some mirrors nest it in sub-directories.
        candidates = list(base.glob("*.csv")) or list(base.rglob("*.csv"))
        if not candidates:
            print("[warn] No CSV found in kagglehub download; searching the current directory.")
    except Exception as e:
        # Deliberately broad: a missing package, no credentials and no network all
        # mean the same thing here -- use a local copy instead.
        print("[warn] kagglehub unavailable or failed:", repr(e))

    if not candidates:
        # rglob already covers the top level; sort shallowest-first so the choice is
        # deterministic and a file next to the script wins over a nested one.
        candidates = sorted(Path(".").rglob("*.csv"), key=lambda p: (len(p.parts), str(p)))

    if not candidates:
        raise FileNotFoundError("Could not find a CSV file for the SMS dataset. "
                                "Please ensure 'spam.csv' (with columns v1,v2) is present.")

    # prefer a file named spam.csv if present
    csv_path = next((c for c in candidates if c.name.lower() == "spam.csv"), candidates[0])

    print(f"[info] Loading: {csv_path}")
    # The CSV sometimes uses latin-1
    df = pd.read_csv(csv_path, encoding="latin-1")

    # Some versions have extra unnamed columns; keep only the label and text columns.
    if "v1" in df.columns and "v2" in df.columns:
        df = df[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})
    elif "label" in df.columns and "message" in df.columns:
        df = df[["label", "message"]].rename(columns={"message": "text"})
    else:
        if df.shape[1] < 2:
            raise ValueError(f"Expected at least two columns (label, text) in {csv_path}, "
                             f"found {df.shape[1]}.")
        # Fall back to positional columns: first is the label, second is the text.
        df = df.iloc[:, :2]
        df.columns = ["label", "text"]

    # Drop rows with missing values first; astype(str) would otherwise turn NaN into
    # the literal text "nan" and feed it to the vectorizers as a real message.
    df = df.dropna(subset=["label", "text"]).copy()
    df["label"] = df["label"].astype(str).str.strip().str.lower()
    df["text"] = df["text"].astype(str)

    # Filter to only expected labels
    df = df[df["label"].isin(["ham", "spam"])].reset_index(drop=True)
    n_ham = (df["label"] == "ham").sum()
    n_spam = (df["label"] == "spam").sum()
    print(f"[info] Dataset size: {len(df)} (ham={n_ham}, spam={n_spam})")
    return df

# ---------------------
# Text Cleaning / Tokenization
# ---------------------
# Patterns are compiled once here and reused by clean_for_eda() and advanced_tokenizer().
# Shortest run of characters between '<' and '>' ('.' does not cross newlines: no DOTALL flag).
RE_HTML = re.compile(r"<.*?>")
# "http://", "https://" or "www." followed by any run of non-whitespace characters.
RE_URL = re.compile(r"(https?://\S+|www\.\S+)")
# local-part@domain.tld built from word characters, dots and hyphens, on word boundaries.
RE_EMAIL = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b")
# One or more consecutive digits.
RE_NUM = re.compile(r"\d+")
# Any single character from string.punctuation (escaped so it is safe inside a character class).
RE_PUNCT = re.compile(rf"[{re.escape(string.punctuation)}]")

# NLTK English stopwords, held in a set for membership tests in the token filters.
STOPWORDS = set(stopwords.words("english"))
# Single lemmatizer instance shared by all tokenization calls.
LEMMATIZER = WordNetLemmatizer()

def _wordnet_pos(treebank_tag):
    """
    Translate a Treebank-style POS tag into the single-letter POS code passed to the lemmatizer.

    Only the first character of the tag is inspected: J -> 'a' (adjective),
    V -> 'v' (verb), N -> 'n' (noun), R -> 'r' (adverb). Every other tag,
    including an empty one, falls back to 'n'.
    """
    first_letter_to_pos = {"J": "a", "V": "v", "N": "n", "R": "r"}
    # Slicing (rather than indexing) keeps an empty tag from raising.
    return first_letter_to_pos.get(treebank_tag[:1], "n")

def clean_for_eda(text, remove_emails_urls_html=True, lower=True, remove_numbers=True, remove_punct=True,
                  remove_stop=True, lemma=False):
    """
    Light cleaning used for EDA (top words/ngrams + wordcloud).

    The prompt only mandates removing stopwords, but obvious noise
    (urls/emails/html/punctuation/numbers) is also removed so counts and
    wordclouds are meaningful.

    Steps, each controlled by its flag and applied in this order:
      1. remove_emails_urls_html: replace HTML tags, URLs and e-mail addresses with a space
      2. lower: lower-case the text
      3. remove_numbers: replace digit runs with a space
      4. remove_punct: replace punctuation characters with a space
      5. tokenize with nltk.word_tokenize (always)
      6. remove_stop: drop English stopwords and whitespace-only tokens
      7. lemma: POS-tag the tokens and lemmatize each with its mapped WordNet POS

    Returns:
        A list of non-empty token strings.
    """
    if remove_emails_urls_html:
        # HTML first, then URLs, then e-mails -- same order as before, so overlapping
        # matches resolve the same way.
        for pattern in (RE_HTML, RE_URL, RE_EMAIL):
            text = pattern.sub(" ", text)
    if lower:
        text = text.lower()
    if remove_numbers:
        text = RE_NUM.sub(" ", text)
    if remove_punct:
        text = RE_PUNCT.sub(" ", text)

    tokens = word_tokenize(text)
    if remove_stop:
        # The NLTK stopword list is all lower-case; compare case-insensitively so that
        # "The"/"And" are still removed when the caller asked for lower=False.
        tokens = [t for t in tokens if t.lower() not in STOPWORDS and t.strip()]

    if lemma:
        # POS-tag-based lemmatization
        tagged = pos_tag(tokens)
        tokens = [LEMMATIZER.lemmatize(w, _wordnet_pos(pos)) for w, pos in tagged]

    return [t for t in tokens if t]


def advanced_tokenizer(text):
    """
    Full preprocessing pipeline for modeling step #5 (TF-IDF + preprocessing).

    This is clean_for_eda() with every cleaning step enabled plus lemmatization,
    so EDA and modeling share one implementation and cannot drift apart.
    Steps, in the order they are actually applied:
      - remove html tags, urls and email-ids (first, to avoid junk tokens)
      - convert to lower case
      - remove numbers
      - remove punctuation marks
      - tokenize the text
      - remove stop words
      - apply POS-aware lemmatization to each token

    Returns a list of lemmas suitable for scikit-learn vectorizers (tokenizer usage).
    """
    # Every flag is spelled out so a later change to clean_for_eda's defaults
    # cannot silently alter the modeling features.
    return clean_for_eda(
        text,
        remove_emails_urls_html=True,
        lower=True,
        remove_numbers=True,
        remove_punct=True,
        remove_stop=True,
        lemma=True,
    )


# ---------------------
# EDA Helpers
# ---------------------
def get_top_ngrams(tokens_list, n=1, topk=10):
    """
    Return the `topk` most frequent n-grams of a flat token list.

    Args:
        tokens_list: list of token strings, in reading order.
        n: n-gram size; must be >= 1. Tokens of an n-gram are joined by one space.
        topk: how many entries to return; passed to Counter.most_common, so
            None returns every n-gram.

    Returns:
        List of (ngram, count) tuples, most frequent first. Empty when the
        input holds fewer than `n` tokens.

    Raises:
        ValueError: if n < 1 (previously this silently produced meaningless grams).
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    if n == 1:
        grams = tokens_list
    else:
        # Zipping n views, each shifted by one more token, yields every window of n
        # consecutive tokens and stops at the last complete one.
        shifted = (tokens_list[offset:] for offset in range(n))
        grams = (" ".join(window) for window in zip(*shifted))
    return Counter(grams).most_common(topk)

def _count_ngrams_per_message(token_lists, n):
    """Count n-grams one message at a time so no n-gram straddles two messages."""
    counts = Counter()
    for toks in token_lists:
        counts.update(" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1))
    return counts


def eda_by_class(df, label_value, topk=10, save_prefix="ham"):
    """
    Compute and save EDA artifacts for a given class.

    Every message whose label equals `label_value` is cleaned with
    clean_for_eda(lemma=False). From those tokens this function writes, under
    ARTIFACT_DIR:
      - top_<save_prefix>_unigrams.csv / _bigrams.csv / _trigrams.csv holding the
        `topk` most frequent n-grams (bigrams and trigrams are counted inside each
        message, never across the boundary between two messages);
      - wordcloud_<save_prefix>.png, best-effort: any failure is printed as a
        warning and does not abort the run.

    Args:
        df: DataFrame with 'label' and 'text' columns.
        label_value: class label to select.
        topk: number of n-grams to keep per table.
        save_prefix: token inserted into the output file names.

    Returns:
        dict with DataFrames under the keys "uni", "bi" and "tri".
    """
    subset = df[df["label"] == label_value]["text"].tolist()
    # Keep tokens grouped per message; the flat list is still needed for unigrams
    # and for the wordcloud text.
    per_message = [clean_for_eda(txt, lemma=False) for txt in subset]
    tokens_all = [tok for toks in per_message for tok in toks]

    # Top n-grams
    uni = get_top_ngrams(tokens_all, n=1, topk=topk)
    bi = _count_ngrams_per_message(per_message, 2).most_common(topk)
    tri = _count_ngrams_per_message(per_message, 3).most_common(topk)

    uni_df = pd.DataFrame(uni, columns=["unigram", "count"])
    bi_df = pd.DataFrame(bi, columns=["bigram", "count"])
    tri_df = pd.DataFrame(tri, columns=["trigram", "count"])

    uni_df.to_csv(ARTIFACT_DIR / f"top_{save_prefix}_unigrams.csv", index=False)
    bi_df.to_csv(ARTIFACT_DIR / f"top_{save_prefix}_bigrams.csv", index=False)
    tri_df.to_csv(ARTIFACT_DIR / f"top_{save_prefix}_trigrams.csv", index=False)

    # Wordcloud
    try:
        from wordcloud import WordCloud
        wc_text = " ".join(tokens_all)
        if wc_text.strip():
            wc = WordCloud(width=1200, height=600, background_color="white").generate(wc_text)
            fig = plt.figure(figsize=(12,6))
            try:
                plt.imshow(wc, interpolation="bilinear")
                plt.axis("off")
                plt.tight_layout()
                out_path = ARTIFACT_DIR / f"wordcloud_{save_prefix}.png"
                plt.savefig(out_path, dpi=150)
            finally:
                # Close the figure even when drawing or saving fails, so it is not leaked.
                plt.close(fig)
            print(f"[info] Saved wordcloud: {out_path}")
        else:
            print(f"[warn] No tokens for wordcloud ({save_prefix}).")
    except Exception as e:
        print("[warn] WordCloud failed:", repr(e))

    return {"uni": uni_df, "bi": bi_df, "tri": tri_df}


# ---------------------
# Modeling
# ---------------------
def evaluate_model(name, vectorizer, model, X_train, X_test, y_train, y_test):
    """
    Train and score one vectorizer/classifier combination.

    The vectorizer is fitted on X_train and then applied to X_test; the model is
    fitted on the vectorized training data and used to predict the test data.
    The macro-F1 and the classification report are printed and also written to
    ARTIFACT_DIR / "classification_report_<name>.txt", where <name> is `name`
    lower-cased with spaces replaced by underscores.

    Returns:
        The macro-averaged F1 score on the test split.
    """
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)

    model.fit(train_matrix, y_train)
    predictions = model.predict(test_matrix)

    f1 = f1_score(y_test, predictions, average="macro")
    print(f"\n=== {name} ===")
    print(f"F1-macro: {f1:.4f}")
    print("Classification report:")
    report = classification_report(y_test, predictions, digits=4)
    print(report)

    # Persist the same numbers that were just printed.
    slug = name.replace(' ', '_').lower()
    report_path = ARTIFACT_DIR / f"classification_report_{slug}.txt"
    header = f"{name}\n\nF1-macro: {f1:.6f}\n\n"
    with open(report_path, "w", encoding="utf-8") as handle:
        handle.write(header)
        handle.write(report)
    print(f"[info] Saved report: {report_path}")
    return f1


def main():
    """
    Run the whole pipeline end to end.

    Loads the dataset, writes the per-class EDA artifacts, makes a stratified
    80/20 train/test split, evaluates three vectorizer setups with one shared
    LogisticRegression instance, and saves the score table (final_scores.csv)
    and a run summary (run_summary.json) under ARTIFACT_DIR.
    """
    warnings.filterwarnings("ignore")
    print("[info] Loading dataset...")
    df = load_sms_spam()

    # ----- EDA -----
    print("\n[info] Running EDA...")
    eda_tables = {
        cls: eda_by_class(df, cls, topk=TOPK, save_prefix=cls)
        for cls in ("ham", "spam")
    }
    # Quick previews of the unigram tables
    for cls, tables in eda_tables.items():
        tables["uni"].head(10).to_csv(ARTIFACT_DIR / f"preview_{cls}_unigrams.csv", index=False)

    # ----- Train/Test Split -----
    texts = df["text"].values
    labels = df["label"].values
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, stratify=labels, random_state=RANDOM_STATE
    )
    print("\n[info] Data split:")
    print("  Train size:", len(X_train), " Test size:", len(X_test))

    # ----- Modeling: same classifier for all experiments -----
    clf = LogisticRegression(max_iter=1000, solver="liblinear", random_state=RANDOM_STATE)

    # (row label in the score table, report title, vectorizer)
    experiments = [
        # 1) Bag-of-Words
        ("BoW", "BoW + LogisticRegression",
         CountVectorizer(stop_words="english", min_df=2)),
        # 2) TF-IDF
        ("TF-IDF", "TF-IDF + LogisticRegression",
         TfidfVectorizer(stop_words="english", min_df=2)),
        # 3) TF-IDF + Preprocessing (custom tokenizer does all cleaning steps incl. lemmatization)
        ("TF-IDF + Preprocessing", "TF-IDF+Preproc + LogisticRegression",
         TfidfVectorizer(min_df=2, tokenizer=advanced_tokenizer, preprocessor=None, token_pattern=None)),
    ]
    results = [
        (method, evaluate_model(title, vectorizer, clf, X_train, X_test, y_train, y_test))
        for method, title, vectorizer in experiments
    ]

    # ----- Final Score Table -----
    res_df = pd.DataFrame(results, columns=["Method", "F1_macro"])
    res_df = res_df.sort_values("F1_macro", ascending=False)
    print("\n=== Final Scores (same classifier across methods) ===")
    print(res_df.to_string(index=False))

    out_csv = ARTIFACT_DIR / "final_scores.csv"
    res_df.to_csv(out_csv, index=False)
    print(f"[info] Saved final scores: {out_csv}")

    # Compact JSON summary: dataset size, class counts and the three scores
    summary_path = ARTIFACT_DIR / "run_summary.json"
    summary = {
        "dataset_size": len(df),
        "class_counts": df["label"].value_counts().to_dict(),
        "scores": [{"method": method, "f1_macro": float(score)} for method, score in results],
    }
    with open(summary_path, "w", encoding="utf-8") as handle:
        json.dump(summary, handle, indent=2)
    print(f"[info] Summary JSON saved at {summary_path}")

if __name__ == "__main__":
    main()


[info] Loading dataset...
[info] Using kagglehub to download dataset...
[info] kagglehub path: C:\Users\jaypr\.cache\kagglehub\datasets\uciml\sms-spam-collection-dataset\versions\1
[info] Loading: C:\Users\jaypr\.cache\kagglehub\datasets\uciml\sms-spam-collection-dataset\versions\1\spam.csv
[info] Dataset size: 5572 (ham=4825, spam=747)

[info] Running EDA...
[info] Saved wordcloud: artifacts\wordcloud_ham.png
[info] Saved wordcloud: artifacts\wordcloud_spam.png

[info] Data split:
  Train size: 4457  Test size: 1115

=== BoW + LogisticRegression ===
F1-macro: 0.9501
Classification report:
              precision    recall  f1-score   support

         ham     0.9758    1.0000    0.9877       966
        spam     1.0000    0.8389    0.9124       149

    accuracy                         0.9785      1115
   macro avg     0.9879    0.9195    0.9501      1115
weighted avg     0.9790    0.9785    0.9777      1115

[info] Saved report: artifacts\classification_report_bow_+_logisticregressio