In [1]:
import re, string
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
import urllib.request, zipfile, io

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"

# Download the archive into memory. The context manager closes the HTTP
# response even if the read fails, and the timeout stops a stalled
# connection from hanging the kernel indefinitely.
with urllib.request.urlopen(url, timeout=60) as response:
    zip_bytes = response.read()

# Pull the single data file out of the in-memory archive and close the
# archive afterwards. errors="ignore" silently drops undecodable bytes.
with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
    raw = z.read("SMSSpamCollection").decode("utf-8", errors="ignore")

# Each usable line is "<label>\t<text>"; split on the first tab only so tabs
# inside the message body are kept, and skip lines that contain no tab.
rows = [line.split("\t", 1) for line in raw.strip().split("\n") if "\t" in line]
df = pd.DataFrame(rows, columns=["label", "text"])

# Encode the target as integers: ham -> 0, spam -> 1.
df["label"] = df["label"].map({"ham": 0, "spam": 1})
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def clean_text(t: str) -> str:
    """Normalise a raw message into lower-case, space-separated word tokens.

    Steps, in order: coerce to ``str`` and lower-case; replace URLs
    (anything starting with ``http`` or ``www.``) with a space; replace runs
    of digits with a space; delete ASCII punctuation; collapse whitespace
    runs to single spaces and trim both ends.

    Punctuation is deleted rather than replaced by a space, so ``"don't"``
    becomes ``"dont"``.

    Parameters
    ----------
    t : str
        Raw text. Non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    lowered = str(t).lower()
    without_urls = re.sub(r"http\S+|www\.\S+", " ", lowered)
    without_digits = re.sub(r"\d+", " ", without_urls)
    # Translation table that maps every ASCII punctuation character to "deleted".
    strip_punctuation = str.maketrans("", "", string.punctuation)
    without_punctuation = without_digits.translate(strip_punctuation)
    return re.sub(r"\s+", " ", without_punctuation).strip()

def remove_stopwords(t: str) -> str:
    """Drop every whitespace-separated token found in ``ENGLISH_STOP_WORDS``.

    Matching is an exact, case-sensitive membership test, so the input
    should already be normalised (for example by ``clean_text``).

    Parameters
    ----------
    t : str
        Text to filter.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; may be empty.
    """
    kept_tokens = filter(lambda token: token not in ENGLISH_STOP_WORDS, t.split())
    return " ".join(kept_tokens)

# Normalise each raw message and then strip stop words; the result is kept
# next to the original text so the two can be compared side by side.
df["clean"] = df["text"].map(lambda message: remove_stopwords(clean_text(message)))
df[["text", "clean", "label"]].head()

Unnamed: 0,text,clean,label
0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,0
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...,1
3,U dun say so early hor... U c already then say...,u dun say early hor u c say,0
4,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives,0


In [4]:
# Hold out 20% of the messages for testing. Stratifying on the label keeps
# the ham/spam ratio the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

In [5]:
def evaluate_pipeline(vec, clf):
    """Fit a vectorizer + classifier pipeline on the training split and score it on the test split.

    Both estimators are cloned before fitting. ``Pipeline`` does not copy its
    steps, so without cloning every pipeline built from the same ``vec`` or
    ``clf`` instance would share (and keep refitting) that instance, and a
    pipeline returned earlier would end up holding estimators fitted for a
    different combination.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the
    notebook's global namespace.

    Parameters
    ----------
    vec
        Text vectorizer used as the ``"vec"`` step; the passed object is not modified.
    clf
        Classifier used as the ``"clf"`` step; the passed object is not modified.

    Returns
    -------
    tuple
        ``(pipe, acc, cm, rep)``: the fitted pipeline, test-set accuracy,
        the confusion matrix, and the text classification report
        (3 decimal digits).
    """
    # clone() yields unfitted copies with identical hyper-parameters.
    pipe = Pipeline([("vec", clone(vec)), ("clf", clone(clf))])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    rep = classification_report(y_test, preds, digits=3)
    return pipe, acc, cm, rep

# Candidate classifiers, keyed by the display name used in the results table.
models = {
    "NaiveBayes": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "SVM(LinearSVC)": LinearSVC(),
}

# Candidate text representations, keyed by display name.
vectorizers = {
    "CountVectorizer": CountVectorizer(),
    "TFIDF": TfidfVectorizer(),
}

# results: one record per combination, used to build the summary table.
# store:   (vectorizer name, model name) -> fitted pipeline and diagnostics.
results, store = [], {}

# Evaluate every vectorizer x model combination.
for vname, vec in vectorizers.items():
    for mname, clf in models.items():
        pipe, acc, cm, rep = evaluate_pipeline(vec, clf)
        store[(vname, mname)] = dict(pipe=pipe, cm=cm, report=rep)
        results.append(dict(Vectorizer=vname, Model=mname, Accuracy=acc))

# Rank the combinations, best test accuracy first.
results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
results_df

Unnamed: 0,Vectorizer,Model,Accuracy
2,CountVectorizer,SVM(LinearSVC),0.986547
5,TFIDF,SVM(LinearSVC),0.98565
1,CountVectorizer,LogisticRegression,0.984753
0,CountVectorizer,NaiveBayes,0.981166
4,TFIDF,LogisticRegression,0.970404
3,TFIDF,NaiveBayes,0.967713


In [6]:
# results_df is sorted by accuracy (descending), so the first row is the winner.
# NOTE: the winner is chosen by test-set accuracy, so the score reported here
# is an optimistic estimate of how it would do on unseen data.
best_row = results_df.iloc[0]
best_vec, best_model = best_row["Vectorizer"], best_row["Model"]

print("BEST:", best_vec, "+", best_model)
print("Accuracy:", best_row["Accuracy"])

# Look up the stored diagnostics for the winning combination once.
best_entry = store[(best_vec, best_model)]
cm = best_entry["cm"]
print("\nConfusion Matrix:\n", cm)

print("\nClassification Report:\n")
print(best_entry["report"])

BEST: CountVectorizer + SVM(LinearSVC)
Accuracy: 0.9865470852017937

Confusion Matrix:
 [[965   1]
 [ 14 135]]

Classification Report:

              precision    recall  f1-score   support

           0      0.986     0.999     0.992       966
           1      0.993     0.906     0.947       149

    accuracy                          0.987      1115
   macro avg      0.989     0.953     0.970      1115
weighted avg      0.987     0.987     0.986      1115



In [7]:
# Fit a TF-IDF + logistic regression pipeline so that the learned per-word
# coefficients can be inspected.
pipe = Pipeline(
    steps=[
        ("vec", TfidfVectorizer()),
        ("clf", LogisticRegression(max_iter=2000)),
    ]
)
pipe.fit(X_train, y_train)

vec, clf = pipe.named_steps["vec"], pipe.named_steps["clf"]

feature_names = np.asarray(vec.get_feature_names_out())
coefs = clf.coef_[0]

# Sort vocabulary indices by coefficient once (ascending). The largest
# coefficients push a message towards class 1 (spam), the smallest towards
# class 0 (ham).
ascending_order = np.argsort(coefs)
top_spam = feature_names[ascending_order[::-1][:15]]
top_ham = feature_names[ascending_order[:15]]

print("Top SPAM words:", list(top_spam))
print("Top HAM words:", list(top_ham))

Top SPAM words: ['txt', 'claim', 'free', 'stop', 'mobile', 'reply', 'text', 'service', 'prize', 'win', 'won', 'new', 'urgent', 'pobox', 'box']
Top HAM words: ['ltgt', 'im', 'ok', 'ill', 'come', 'home', 'got', 'lor', 'da', 'hey', 'later', 'sorry', 'good', 'going', 'sir']


In [8]:
sample = "Congratulations! You won a free prize. Call now!"

# Apply the same preprocessing that was used on the training text.
prepared = remove_stopwords(clean_text(sample))
pred = pipe.predict([prepared])[0]

print("Text:", sample)
print("Prediction:", "HAM" if pred != 1 else "SPAM")

Text: Congratulations! You won a free prize. Call now!
Prediction: SPAM
