# Bernoulli Naive Bayes for Spam Detection


In [1]:
# imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

In [2]:
# load dataset with a minimal, robust parser
# rationale: the csv may not be utf-8 and may not be comma-delimited;
# sep=None with the python engine lets pandas sniff the delimiter
df = pd.read_csv("spam.csv", sep=None, engine="python", encoding="latin1")

# select label/text columns using common conventions; fall back to first/last
label_col = "label" if "label" in df.columns else ("v1" if "v1" in df.columns else df.columns[0])
text_col  = "text"  if "text"  in df.columns else ("v2" if "v2" in df.columns else df.columns[-1])
df = df[[label_col, text_col]].rename(columns={label_col: "label", text_col: "text"}).dropna()

# normalize labels to {0=ham, 1=spam}
# strip + lowercase so variants such as " Spam " still match the map
label_map = {"ham": 0, "spam": 1}
mapped = df["label"].astype(str).str.strip().str.lower().map(label_map)

# values missing from label_map come back as NaN; report them explicitly
# instead of letting astype(int) fail with an opaque NaN-to-integer cast error
if mapped.isna().any():
    bad = sorted(df.loc[mapped.isna(), "label"].astype(str).unique())
    raise ValueError(
        f"unexpected label values in column {label_col!r} "
        f"(expected one of {sorted(label_map)}): {bad[:10]}"
    )
df["label"] = mapped.astype(int)

# minimal dataset review
print('shape:', df.shape)
print('class counts:\n', df['label'].value_counts().sort_index())
df.head(3)

shape: (5572, 2)
class counts:
 label
0    4825
1     747
Name: count, dtype: int64


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [3]:
# hold out 20% for testing; stratify on the label so both splits keep the
# ham/spam ratio, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

In [4]:
# Bernoulli NB models word presence/absence, so ask for binary indicators
# instead of raw term counts.
# note: CountVectorizer lowercases by default; the vocabulary is learned from
# the training split only and then reused unchanged to encode the test split
vect = CountVectorizer(binary=True)
Xtr = vect.fit_transform(X_train)
Xte = vect.transform(X_test)

# vocabulary size = number of feature columns
V = Xtr.shape[1]

# class ids: 0=ham, 1=spam
classes = np.array([0, 1])

# training documents per class, then the class priors P(c)
Nc = np.bincount(y_train, minlength=2)
priors = np.true_divide(Nc, Nc.sum())

print('vocab size:', V, '| priors:', priors)

vocab size: 7701 | priors: [0.86582903 0.13417097]


In [5]:
# Bernoulli likelihoods with Laplace (add-one) smoothing:
#   P(w|c) = (N_cw + 1) / (N_c + 2)
# where N_cw is the number of class-c training documents that contain word w
# and N_c is the number of class-c training documents.

# boolean row selectors, one per class
mask0 = (y_train.values == 0)
mask1 = ~mask0

# Xtr is binary, so a column sum over a class's rows is that class's document
# frequency per term; the sparse sum comes back as (1, V), so flatten to (V,)
N0w = np.asarray(Xtr[mask0].sum(axis=0)).ravel()
N1w = np.asarray(Xtr[mask1].sum(axis=0)).ravel()

# smoothed estimates: +1 in the numerator, +2 in the denominator
P_w_c0 = (N0w + 1.0) / (Nc[0] + 2.0)   # shape (V,)
P_w_c1 = (N1w + 1.0) / (Nc[1] + 2.0)   # shape (V,)

# stack into a (2, V) matrix (row 0 = ham, row 1 = spam) and keep every
# probability strictly inside (0, 1) so the later log() calls stay finite
eps = 1e-12
P_w_c = np.clip(np.vstack([P_w_c0, P_w_c1]), eps, 1 - eps)

print('example P(w|ham), P(w|spam) for first 5 terms:\n',
      P_w_c[:, :5])

example P(w|ham), P(w|spam) for first 5 terms:
 [[0.000259   0.000259   0.000259   0.000259   0.000259  ]
 [0.01833333 0.035      0.00333333 0.00333333 0.00333333]]


In [6]:
# inference with the Bernoulli NB log-posterior (up to an additive constant):
#   log P(c|x) ∝ log P(c) + sum_w [ x_w log p_wc + (1 - x_w) log(1 - p_wc) ]
# rewritten so that only the non-zero entries of x are touched:
#   sum_w (1 - x_w) log(1 - p) = sum_w log(1 - p) - sum_w x_w log(1 - p)
#   => score_c = [ log P(c) + sum_w log(1 - p_wc) ] + x . log( p_wc / (1 - p_wc) )

log_priors = np.log(priors)                    # (2,)
log_one_minus = np.log(1.0 - P_w_c)            # (2, V)  log(1 - p)
log_ratio = np.log(P_w_c) - log_one_minus      # (2, V)  log(p / (1 - p))

# document-independent part of each class score
base = log_priors + log_one_minus.sum(axis=1)  # (2,)

# sparse (n_test, V) @ (V, 2) -> (n_test, 2); base broadcasts across rows
scores = (Xte @ log_ratio.T) + base

# predicted class = index of the highest-scoring column
y_pred = np.argmax(scores, axis=1)


In [7]:
# hold-out evaluation: confusion matrix, accuracy, and precision/recall/F1
# (average="binary" scores the positive class, i.e. label 1 = spam)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_test,
    y_pred,
    average="binary",
    zero_division=0,
)

print(f"accuracy: {acc:.4f} | precision: {prec:.4f} | recall: {rec:.4f} | f1: {f1:.4f}")
print("confusion matrix:\n", cm)
print(
    "\nclassification report:\n",
    classification_report(y_test, y_pred, target_names=["ham","spam"], zero_division=0),
)

accuracy: 0.9758 | precision: 1.0000 | recall: 0.8188 | f1: 0.9004
confusion matrix:
 [[966   0]
 [ 27 122]]

classification report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       1.00      0.82      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115

