
# Naive Bayes with **Laplace (add-1) smoothing**

This notebook is the same tiny multinomial Naive Bayes classifier as before,
but now we apply **Laplace correction** to avoid zero probabilities.

What changes with Laplace?
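- We add 1 to every word count before normalizing.
- The denominator becomes (total tokens in class + |V|), where |V| is the size of the training vocabulary, so each class's word probabilities still sum to 1: P(w | c) = (count(w, c) + 1) / (N_c + |V|).
- This prevents log(0) during prediction and usually improves robustness on small data.
- Words that never appear in the training vocabulary are still skipped at prediction time; smoothing only covers vocabulary words that a given class has not seen.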


In [1]:

import numpy as np
from collections import Counter, defaultdict
import re
import math
# Seeded generator: it drives the index shuffle for the train/test split, so
# the split is the same on every run.
rng = np.random.default_rng(7)

def simple_tokenize(text):
    """Split *text* into lowercase alphanumeric tokens.

    The text is lowercased first; every maximal run of the characters
    ``a-z`` / ``0-9`` becomes one token and everything else acts as a
    separator.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]

# Toy corpus: the first four sentences are sports, the last four are tech.
docs = [
    "The team won the football match",
    "A great game of cricket last night",
    "He scored three goals in the league",
    "Tennis players train hard for tournaments",
    "New smartphone release features faster chip",
    "The laptop battery life is excellent",
    "Software update improves security and speed",
    "AI model achieves state of the art results",
]
y = np.array([0] * 4 + [1] * 4)  # 0=Sports, 1=Tech

# Shuffle the document indices, then use the first 75% (rounded down) for
# training and the remainder for testing. The split is not stratified.
idx = np.arange(len(docs))
rng.shuffle(idx)
split = int(0.75 * len(docs))
train_idx, test_idx = idx[:split], idx[split:]

X_train, X_test = ([docs[i] for i in part] for part in (train_idx, test_idx))
y_train, y_test = y[train_idx], y[test_idx]

# Vocabulary: maps each token to its column index (built on train)
vocab = {}
def add_to_vocab(token):
    """Register *token* in ``vocab`` under the next free index.

    A token that is already present keeps its existing index.
    """
    vocab.setdefault(token, len(vocab))

# Tokenize each training document once, growing the vocabulary as we go
tokenized_train = []
for txt in X_train:
    toks = simple_tokenize(txt)
    tokenized_train.append(toks)
    for t in toks:
        add_to_vocab(t)

V = len(vocab)
# Labels are used directly as row indices below, so size the tables from the
# largest label in the full label array instead of from the labels that
# happen to land in the training split. Counting unique training labels
# would index out of bounds whenever the split misses a lower-numbered class.
# Assumes labels are non-negative integers.
C = int(y.max()) + 1 if y.size else 0

# Count words per class
word_counts = np.zeros((C, V), dtype=np.int64)  # [c, j]: occurrences of word j in class c
class_counts = np.zeros(C, dtype=np.int64)      # total training tokens per class
doc_counts = np.zeros(C, dtype=np.int64)        # training documents per class

for toks, label in zip(tokenized_train, y_train):
    doc_counts[label] += 1
    class_counts[label] += len(toks)
    for t in toks:
        word_counts[label, vocab[t]] += 1

# Priors (a class with no training documents gets prior 0; predict() clamps
# it before taking the log)
priors = doc_counts / doc_counts.sum()

# Laplace smoothing (add-1)
# prob[c, j] = (count + 1) / (total_tokens_in_class + V)
# The denominator is clamped to 1 only to cover the degenerate V == 0 case.
prob = (word_counts + 1.0) / np.maximum(class_counts + V, 1)[:, np.newaxis]

def vectorize(text):
    """Return the bag-of-words count vector of *text* over ``vocab``.

    The result has length ``V`` and dtype int64. Tokens that are not in
    ``vocab`` are ignored.
    """
    counts = np.zeros(V, dtype=np.int64)
    for tok in simple_tokenize(text):
        col = vocab.get(tok)
        if col is not None:
            counts[col] += 1
    return counts

def predict(text):
    """Return the index of the class with the highest log-posterior score.

    The score of a class is ``log(prior)`` plus, for every vocabulary word
    present in *text*, ``count * log(prob[class, word])``. Probabilities and
    priors are clamped to at least 1e-12 before the log is taken.
    """
    counts = vectorize(text)
    # Only words that actually occur contribute; keep ascending column order.
    present = [(col, n) for col, n in enumerate(counts) if n > 0]

    class_scores = []
    for cls in range(C):
        log_likelihood = 0.0
        for col, n in present:
            log_likelihood += n * math.log(max(prob[cls, col], 1e-12))
        class_scores.append(math.log(max(priors[cls], 1e-12)) + log_likelihood)
    return int(np.argmax(class_scores))

# Evaluate on the held-out documents
y_pred = np.array([predict(t) for t in X_test])
acc = (y_pred == y_test).mean()
print("Vocabulary size:", V)
print("Class priors:", priors)
print("Test docs:")
for t, yt, yp in zip(X_test, y_test, y_pred):
    print(f" - '{t}' | true={yt} pred={yp}")
print("Accuracy:", round(float(acc), 3))

# Show that zero-probability is gone: for every in-vocabulary token of each
# test document, record the classes (if any) whose smoothed probability is
# still exactly zero.
zero_issues = []
for t in X_test:
    # One bucket per class id, so the check works for any number of classes
    # rather than assuming exactly two.
    unseen_by_class = {c: [] for c in range(C)}
    for token in simple_tokenize(t):
        if token not in vocab:
            # vectorize() ignores out-of-vocabulary tokens, so they cannot
            # contribute a zero likelihood.
            continue
        j = vocab[token]
        for c in range(C):
            if prob[c, j] == 0.0:
                unseen_by_class[c].append(token)
    zero_issues.append((t, unseen_by_class))

print("\nTokens causing zero-likelihood by class (should be empty):")
for t, issues in zero_issues:
    print(f"'{t}' ->", issues)


Vocabulary size: 35
Class priors: [0.33333333 0.66666667]
Test docs:
 - 'A great game of cricket last night' | true=0 pred=1
 - 'Tennis players train hard for tournaments' | true=0 pred=1
Accuracy: 0.0

Tokens causing zero-likelihood by class (should be empty):
'A great game of cricket last night' -> {0: [], 1: []}
'Tennis players train hard for tournaments' -> {0: [], 1: []}
