In [4]:
import os
import re

import joblib
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [5]:
# Read the raw SMS spam CSV. It is decoded as latin-1, and the displayed
# frame shows label/text columns `v1`/`v2` plus empty `Unnamed: N` columns.
df = pd.read_csv(
    filepath_or_buffer='spam.csv',
    encoding='latin-1',
)
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [6]:
# Keep only the label/text columns under readable names and drop rows with
# missing values.
#
# The rename happens *before* the column selection on purpose: rename()
# ignores keys that are not present, so re-running this cell after `df` has
# already been renamed is a no-op instead of a KeyError on 'v1'/'v2'.
df = (
    df.rename(columns={'v1': 'label', 'v2': 'text'})
      .loc[:, ['label', 'text']]
      .dropna()
      .reset_index(drop=True)
)
print(df.shape)
print(df['label'].value_counts())
df.head()

(5572, 2)
label
ham     4825
spam     747
Name: count, dtype: int64


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
def normalize_label(x):
    """Map a raw label value onto one of the two class names.

    The value is converted to a string, stripped of surrounding whitespace
    and lower-cased. Only an exact 'spam' is reported as 'spam'; every other
    value (including unexpected ones) is reported as 'ham'.
    """
    cleaned = str(x).strip().lower()
    if cleaned == 'spam':
        return 'spam'
    return 'ham'

def preprocess(text):
    """Return a lower-cased, simplified version of a message.

    The input is stringified and lower-cased, then each pattern below is
    replaced by a single space, in order. The result has no leading or
    trailing whitespace and contains only a-z, 0-9 and single spaces.
    """
    patterns = (
        r'http\S+|www\.\S+',  # URLs
        r'\S+@\S+',           # e-mail-like tokens
        r'[^a-z0-9\s]',       # anything that is not a-z, 0-9 or whitespace
        r'\s+',               # collapse whitespace runs (must come last)
    )
    cleaned = str(text).lower()
    for pattern in patterns:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

# Normalise the labels and add a cleaned copy of each message next to the
# original text, then preview the first few rows.
df = df.assign(
    label=df['label'].map(normalize_label),
    text_clean=df['text'].map(preprocess),
)
df.head(4)


Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say


In [8]:
# Features are the cleaned message strings; the target is 1 for spam and
# 0 for everything else.
X = df['text_clean'].values
y = df['label'].eq('spam').astype(int)

# Hold out 20% of the rows for testing. The split is stratified on the
# target and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(len(X_train), len(X_test))


4457 1115


In [9]:
# Unigram token counts. The vocabulary is learned from the training split
# only and then re-used, unchanged, to encode the test split.
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=2)
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training chain.
clf = MultinomialNB(alpha=1.0).fit(X_train_counts, y_train)

print("Vocab size:", len(vectorizer.vocabulary_))


Vocab size: 3606


In [10]:
# Hard predictions and the spam-class probability for the held-out split.
y_pred = clf.predict(X_test_counts)
y_prob = clf.predict_proba(X_test_counts)[:, 1]

# Headline scores, printed one per line in a fixed order.
headline_scores = (
    ("Accuracy:", metrics.accuracy_score(y_test, y_pred)),
    ("Precision:", metrics.precision_score(y_test, y_pred, zero_division=0)),
    ("Recall:", metrics.recall_score(y_test, y_pred, zero_division=0)),
    ("F1:", metrics.f1_score(y_test, y_pred, zero_division=0)),
)
for score_label, score_value in headline_scores:
    print(score_label, score_value)

# Per-class breakdown.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print("\nConfusion matrix:\n", conf_matrix)

class_report = metrics.classification_report(
    y_test, y_pred, target_names=['ham', 'spam']
)
print("\nClassification report:\n", class_report)


Accuracy: 0.9838565022421525
Precision: 0.9645390070921985
Recall: 0.912751677852349
F1: 0.9379310344827586

Confusion matrix:
 [[961   5]
 [ 13 136]]

Classification report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       966
        spam       0.96      0.91      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [11]:
# List the tokens with the highest log P(token | class) for each class.
# numpy is already imported in the notebook's import cell, so it is not
# re-imported here.
#
# NOTE: this ranks tokens by how likely they are *within* a class, not by how
# well they separate the classes, so very common words appear in both lists.
TOP_K = 20  # how many tokens to show per class

feature_names = np.array(vectorizer.get_feature_names_out())
log_prob = clf.feature_log_prob_  # indexed [class, token]; class 1 is used as spam below

# argsort is ascending: reverse it and keep the first TOP_K entries to get the
# TOP_K largest values in descending order.
top_spam = feature_names[np.argsort(log_prob[1])[::-1][:TOP_K]]
top_ham = feature_names[np.argsort(log_prob[0])[::-1][:TOP_K]]
print("Top spam words:", top_spam)
print("Top ham words :", top_ham)


Top spam words: ['to' 'call' 'you' 'your' 'free' 'for' 'the' 'now' 'or' 'is' 'txt' 'ur'
 'have' 'from' 'on' 'and' 'stop' 'text' 'claim' 'mobile']
Top ham words : ['you' 'to' 'the' 'and' 'in' 'me' 'is' 'my' 'it' 'that' 'of' 'for' 'so'
 'have' 'can' 'but' 'your' 'not' 'on' 'are']


In [12]:
# Sanity-check the fitted model on a few hand-written messages. They go
# through the same preprocess -> vectorizer path as the training data.
examples = [
    "Quick cash opportunity",
    "Hey, are we still on for coffee tomorrow?",
    "Congratulations! You have won a free iPhone. Claim now!",
]
examples_clean = list(map(preprocess, examples))
vec = vectorizer.transform(examples_clean)

# Column 1 of predict_proba is the spam-class probability; 0.5 is the cut-off.
probs = clf.predict_proba(vec)[:, 1]
preds = ['spam' if spam_prob >= 0.5 else 'ham' for spam_prob in probs]

for t, p, pr in zip(examples, preds, probs):
    print(f"{t}\n -> pred={p}, P(spam)={pr:.4f}\n")


Quick cash opportunity
 -> pred=ham, P(spam)=0.3913

Hey, are we still on for coffee tomorrow?
 -> pred=ham, P(spam)=0.0002

Congratulations! You have won a free iPhone. Claim now!
 -> pred=spam, P(spam)=1.0000



In [13]:
# Persist the fitted vectorizer and classifier side by side: both are needed
# at inference time, because new text must be encoded with the same vocabulary
# the classifier was trained on.
# `os` comes from the notebook's import cell rather than a mid-notebook import.
#
# NOTE: joblib files are pickle-based. Only load these artefacts from a
# trusted location, since loading can execute arbitrary code.
os.makedirs('models', exist_ok=True)
joblib.dump(vectorizer, 'models/count_vectorizer.joblib')
joblib.dump(clf, 'models/multinomial_nb.joblib')
print("Saved to models/")


Saved to models/
