In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

The example below is adapted from Marsland's *Machine Learning an Algorithmic Perspective*.  The dataset describes  whether or not a student has a looming deadline, if there is a party going on, and whether or not the student feels lazy.  The activity column is the target, and your aim is to use the Na√Øve Bayes formula below:

$$P(C_i) \prod_{k} P(X_j^k = a_k | C_i)$$

In [2]:
deadline = ['urgent','urgent','near', 'none', 'none', 'none', 'near', 'near', 'near','urgent']
party = ['yes', 'no', 'yes', 'yes', 'no', 'yes', 'no', 'no', 'yes', 'no']
lazy = ['yes', 'yes', 'yes', 'no', 'yes', 'no', 'no', 'yes', 'yes', 'no']
activity = ['party', 'study', 'party', 'party', 'pub', 'party', 'study', 'tv', 'party', 'study']

In [3]:
df = pd.DataFrame({'deadline': deadline, 
                  'party': party,
                  'lazy': lazy,
                  'activity': activity})
df

Unnamed: 0,deadline,party,lazy,activity
0,urgent,yes,yes,party
1,urgent,no,yes,study
2,near,yes,yes,party
3,none,yes,no,party
4,none,no,yes,pub
5,none,yes,no,party
6,near,no,no,study
7,near,no,yes,tv
8,near,yes,yes,party
9,urgent,no,no,study


Here, $C_i$ represents the class in the `activity` columm.  Accordingly, if we want to predict a category of activity given the input: 

```
deadline = near
party = no
lazy = yes
```

This means we need four probabilities:

- $P(party) \times P(near | party) \times P(no party | party) \times P(lazy | party)$
- $P(study) \times P(near | study) \times p(noparty | study) \times P(lazy | study)$
- $P(pub) \times P(near | pub) \times P(noparty | pub) \times P(lazy | pub)$
- $P(tv) \times P(near | tv) \times P(noparty | tv) \times P(lazy |tv)$

Compute these four probabilities and assign them to the list `probs` in the order above (party, study, pub, tv). 

In [4]:
probs = [1/2*2/5*0, 3/10*1/3*1*1/3, 1/10*0, 1/10*1*1*1]
probs

[0.0, 0.03333333333333333, 0.0, 0.1]

the maximum aposteriori solution involves selecting the outcome that is associated with the highest probability. Here its of $tv$.

In [26]:
# Load SMS Spam (direct GitHub raw URL - no Kaggle needed)
url = 'https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv'
email_data = pd.read_csv(url, encoding='latin-1')

In [27]:
#dropping the columnns with NaNs
email_data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis  = 1, inplace = True)

#renaming the remaining columns
email_data = email_data.rename(columns = {"v1": "label", "v2": "text"})
email_data['label'] = email_data['label'].map({'ham': 0, 'spam': 1})

print(email_data['label'].value_counts())
email_data.head()

label
0    4825
1     747
Name: count, dtype: int64


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
# 2. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    email_data['text'],
    email_data['label'],
    test_size=0.2,
    random_state=42,
    stratify=email_data['label']
)

In [29]:
# === PIPELINE 1: CountVectorizer + MultinomialNB ===

count_nb_pipe = Pipeline([
    ('cvect', CountVectorizer()),
    ('nb', MultinomialNB())
])

# Baseline fit
count_nb_pipe.fit(X_train, y_train)
print("\nBaseline CountVectorizer + MultinomialNB accuracy:",
      count_nb_pipe.score(X_test, y_test))

# GridSearch on CountVectorizer parameters
count_params = {
    'cvect__max_features': [1000, 2000, 5000],
    'cvect__ngram_range': [(1,1), (1,2)],
    'cvect__stop_words': [None, 'english']
}

count_grid = GridSearchCV(
    count_nb_pipe,
    param_grid=count_params,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
)
count_grid.fit(X_train, y_train)

print("\nBest CountVectorizer + NB params:", count_grid.best_params_)
print("Best CV accuracy (Count + NB):", count_grid.best_score_)
print("Test accuracy (Count + NB):", count_grid.score(X_test, y_test))


Baseline CountVectorizer + MultinomialNB accuracy: 0.9838565022421525

Best CountVectorizer + NB params: {'cvect__max_features': 5000, 'cvect__ngram_range': (1, 1), 'cvect__stop_words': None}
Best CV accuracy (Count + NB): 0.9867624928910429
Test accuracy (Count + NB): 0.9847533632286996


In [30]:
# === PIPELINE 2: TfidfVectorizer + MultinomialNB ===

tfidf_nb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Baseline fit
tfidf_nb_pipe.fit(X_train, y_train)
print("\nBaseline TfidfVectorizer + MultinomialNB accuracy:",
      tfidf_nb_pipe.score(X_test, y_test))

# GridSearch on TF-IDF parameters
tfidf_params = {
    'tfidf__max_features': [1000, 2000, 5000],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__stop_words': [None, 'english']
}

tfidf_grid = GridSearchCV(
    tfidf_nb_pipe,
    param_grid=tfidf_params,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
)
tfidf_grid.fit(X_train, y_train)

print("\nBest TF-IDF + NB params:", tfidf_grid.best_params_)
print("Best CV accuracy (TF-IDF + NB):", tfidf_grid.best_score_)
print("Test accuracy (TF-IDF + NB):", tfidf_grid.score(X_test, y_test))


Baseline TfidfVectorizer + MultinomialNB accuracy: 0.9605381165919282

Best TF-IDF + NB params: {'tfidf__max_features': 2000, 'tfidf__ngram_range': (1, 1), 'tfidf__stop_words': 'english'}
Best CV accuracy (TF-IDF + NB): 0.9811530854131751
Test accuracy (TF-IDF + NB): 0.97847533632287


In [31]:
# === Detailed evaluation for the best model (choose one) ===

best_model = tfidf_grid  # or count_grid

y_pred = best_model.predict(X_test)
print("\nClassification report (best model):")
print(classification_report(y_test, y_pred, target_names=['ham', 'spam']))

print("Confusion matrix (rows=true, cols=pred):")
print(confusion_matrix(y_test, y_pred))

# Try a few custom messages
samples = [
    "Congratulations! You've won a free ticket. Call now to claim your prize.",
    "Hey, are we still meeting for lunch today?",
    "URGENT! Your account has been compromised. Reply with your PIN."
]
print("\nSample predictions (0=ham, 1=spam):")
print(list(zip(samples, best_model.predict(samples))))


Classification report (best model):
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.98      0.85      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion matrix (rows=true, cols=pred):
[[964   2]
 [ 22 127]]

Sample predictions (0=ham, 1=spam):
[("Congratulations! You've won a free ticket. Call now to claim your prize.", np.int64(1)), ('Hey, are we still meeting for lunch today?', np.int64(0)), ('URGENT! Your account has been compromised. Reply with your PIN.', np.int64(1))]


Exception ignored in: <function ResourceTracker.__del__ at 0x7498d558e020>
Traceback (most recent call last):
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x71db42b92020>
Traceback (most recent call last):
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ 