In [4]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

df = pd.read_csv(
    "./sms+spam+collection/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "text"],
)
df["label_num"] = df["label"].map({"spam": 1, "ham": 0})

# df2 = pd.read_csv("../Machine Learning/")

In [5]:
# quick EDA

# df['label'].count()
print("Shape:", df.shape)
print(df["label"].value_counts())
print("Entries (spam + ham): ", df["label"].count())
# print(df.sample(5).to_dict(orient='records'))

Shape: (5572, 3)
label
ham     4825
spam     747
Name: count, dtype: int64
Entries (spam + ham):  5572


In [6]:
# Train test split

X = df["text"]
y = df["label_num"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
# train and test data is ready
# Raw SMS → vectorizer (numbers) → model → prediction
tfidf_params = {
    "tfidf__ngram_range": (
        1,
        2,
    ),  # use single words (unigrams) + pairs of words (bigrams)
    "tfidf__min_df": 2,  # ignore very rare words (appear in only 1 doc)
    "tfidf__max_df": 0.9,  # ignore super common words (appear in >90% docs)
    "tfidf__stop_words": "english",  # remove common English stop words (like "the", "is")
}
pipe_nb = Pipeline(
    [
        (
            "tfidf",
            TfidfVectorizer(),
        ),  # Convert raw SMS text into numeric features using TfidfVectorizer.
        ("clf", MultinomialNB()),
    ]
)  # Feed those numeric features into a classifier (


# 3 MNB
pipe_nb.set_params(**tfidf_params)
pipe_nb.fit(X_train, y_train)
y_pred_nb = pipe_nb.predict(X_test)
print("=== MultinomialNB ===")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=["ham", "spam"]))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_nb))

=== MultinomialNB ===
Accuracy: 0.9704035874439462
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.78      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Confusion matrix:
 [[966   0]
 [ 33 116]]


In [7]:
pipe_lr = Pipeline(
    [("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(max_iter=1000))]
)


# pipe_nb → TF-IDF → Naive Bayes
# pipe_lr → TF-IDF → Logistic Regression

# 4. Fit Logistic Regression (stronger baseline)
pipe_lr.set_params(**tfidf_params)
pipe_lr.fit(X_train, y_train)
y_pred_lr = pipe_lr.predict(X_test)
print("=== LogisticRegression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr, target_names=["ham", "spam"]))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_lr))

=== LogisticRegression ===
Accuracy: 0.9704035874439462
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.78      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Confusion matrix:
 [[966   0]
 [ 33 116]]


In [None]:

# 5. Example: adjust threshold for LR (if you want more recall on spam)
probs = pipe_lr.predict_proba(X_test)[:,1]   # probability of 'spam'
threshold = 0.4
y_pred_thresh = (probs >= threshold).astype(int)
print(f"=== LogisticRegression (threshold={threshold}) ===")
print(classification_report(y_test, y_pred_thresh, target_names=['ham','spam']))

# 6. Quick cross-validation (F1) for NB
cv_scores = cross_val_score(pipe_nb, X, y, cv=5, scoring='f1', n_jobs=-1)
print("NB CV F1 scores:", cv_scores, "mean:", cv_scores.mean())

# 7. Small hyperparameter grid search example (for NB)
param_grid = {
    'tfidf__ngram_range': [(1,1),(1,2)],
    'tfidf__min_df': [1,2],
    'clf__alpha': [0.1, 0.5, 1.0]  # Laplace smoothing
}
gs = GridSearchCV(pipe_nb, param_grid, cv=4, scoring='f1', n_jobs=-1)
gs.fit(X_train, y_train)
print("GridSearch best params:", gs.best_params_)
print("Best CV f1:", gs.best_score_)

# 8. Save your best model (choose whichever performed better; here LR example)
joblib.dump(pipe_lr, "./sms_spam_classifier.joblib")

print("Saved model to sms_spam_classifier.joblib")

# # 9. Example predict
loaded = joblib.load('./sms_spam_classifier.pkl')
samples = [
    "Congratulations! You've won a $1000 Walmart gift card. Reply WIN to claim.",
    "Hey, are we meeting for dinner tonight?"
]
print("Predictions:", loaded.predict(samples))

=== LogisticRegression (threshold=0.4) ===
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.99      0.85      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

NB CV F1 scores: [0.88888889 0.86363636 0.85384615 0.86692015 0.86692015] mean: 0.8680423421107832
GridSearch best params: {'clf__alpha': 0.1, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
Best CV f1: 0.942746468204223
Saved model to sms_spam_classifier.joblib
Predictions: [1 0]
