
# Naive Bayes Text Classifier (from scratch, **no Laplace smoothing**)

Beginner-friendly notebook that implements a tiny **multinomial Naive Bayes** classifier for text.
We keep it pure NumPy + Python so you can see how it works under the hood.

What you'll do:
- Create a tiny toy text dataset (two classes)
- Tokenize and build a vocabulary
- Compute class priors and word likelihoods **without** smoothing
- Make predictions and measure accuracy
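- See what goes wrong when a test word never appears in one class's training documents: its likelihood for that class is zero, which wipes out that class's entire score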


In [1]:

import numpy as np
from collections import Counter, defaultdict
import re
import math
# Seeded generator: makes the shuffle behind the train/test split reproducible
rng = np.random.default_rng(7)

def simple_tokenize(text):
    """Split `text` into lowercase alphanumeric tokens.

    The text is lowercased first; every maximal run of the characters
    a-z / 0-9 becomes one token, and everything else acts as a separator.
    Tokens are returned in order of appearance (empty list if none).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]

# Toy corpus: eight one-sentence documents, labelled by position in `y` below
docs = [
    "The team won the football match",
    "A great game of cricket last night",
    "He scored three goals in the league",
    "Tennis players train hard for tournaments",
    "New smartphone release features faster chip",
    "The laptop battery life is excellent",
    "Software update improves security and speed",
    "AI model achieves state of the art results",
]
# Label encoding: 0 = Sports, 1 = Tech (first four docs, then last four)
y = np.array([0] * 4 + [1] * 4)

# Fixed 75/25 train/test split: shuffle the document indices in place,
# then cut the shuffled order at the 75% mark.
idx = np.arange(len(docs))
rng.shuffle(idx)
split = int(0.75 * len(docs))
train_idx = idx[:split]
test_idx = idx[split:]

# Texts are plain Python lists; labels are sliced out of the NumPy array.
X_train = [docs[i] for i in train_idx]
X_test = [docs[i] for i in test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

# Vocabulary: maps each known token to its column index (assigned in first-seen order)
vocab = {}
def add_to_vocab(token):
    """Register `token` in `vocab` with the next free index; no-op if already present."""
    # len(vocab) is evaluated before insertion, so a new token gets index == current size.
    vocab.setdefault(token, len(vocab))

# Tokenize each training document once and keep the token lists for counting later
tokenized_train = [simple_tokenize(txt) for txt in X_train]

# Register tokens document by document so indices follow first-seen order
for toks in tokenized_train:
    for t in toks:
        add_to_vocab(t)

# Vocabulary size = number of distinct training tokens
V = len(vocab)
# Number of classes. Labels are used directly as row indices into the count
# tables below, so size the tables from the full label array `y` rather than
# from the classes that happened to land in the training split: an unlucky
# shuffle could leave a class out of `y_train`, and a label >= C would then
# index out of bounds.
C = int(y.max()) + 1 if y.size else 0

# Count words per class (multinomial NB)
word_counts = np.zeros((C, V), dtype=np.int64)    # counts of token j in class c
class_counts = np.zeros(C, dtype=np.int64)        # total tokens in class c
doc_counts = np.zeros(C, dtype=np.int64)          # number of docs in class c

for toks, label in zip(tokenized_train, y_train):
    doc_counts[label] += 1
    class_counts[label] += len(toks)              # every token is in vocab (built from training data)
    for t in toks:
        word_counts[label, vocab[t]] += 1

# Priors P(class): fraction of training documents in each class
priors = doc_counts / doc_counts.sum()

# Likelihoods P(word|class) WITHOUT smoothing — beware of zeros!
# prob[c, j] = count(word j in class c) / total tokens in class c
# A class with no training tokens keeps an all-zero row: its denominator is
# clamped to 1 to avoid dividing by zero.
safe_totals = np.maximum(class_counts, 1)
prob = word_counts / safe_totals[:, np.newaxis]

def vectorize(text):
    """Return the bag-of-words count vector of `text` over the training vocabulary.

    The result is a length-V int64 array whose entry `vocab[token]` holds how
    many times that token occurs in `text`. Tokens that are not in `vocab`
    (never seen during training) are silently dropped.
    """
    counts = np.zeros(V, dtype=np.int64)
    known_columns = (vocab[tok] for tok in simple_tokenize(text) if tok in vocab)
    for col in known_columns:
        counts[col] += 1
    return counts

def predict(text):
    """Return the index of the class with the highest log-posterior score for `text`.

    score(c) = log P(c) + sum_j x_j * log P(word_j | c), where x is the count
    vector from `vectorize`. Because the likelihoods are unsmoothed, a single
    word with P(word | c) == 0 sends that class's score to -inf.
    """
    counts = vectorize(text)
    class_scores = []
    for c in range(C):
        log_likelihood = 0.0
        for j, n_occurrences in enumerate(counts):
            if n_occurrences <= 0:
                continue  # word absent from this document
            word_prob = prob[c, j]
            if word_prob <= 0.0:
                # log(0): this class is ruled out, no need to look further
                log_likelihood = -np.inf
                break
            log_likelihood += n_occurrences * math.log(word_prob)
        # The prior is clamped so a class with zero prior does not raise in log()
        log_prior = math.log(max(priors[c], 1e-12))
        class_scores.append(log_prior + log_likelihood)
    return int(np.argmax(class_scores))

# Evaluate: predict every held-out document and compare against the true labels
y_pred = np.array(list(map(predict, X_test)))
acc = np.mean(y_pred == y_test)  # fraction of test docs classified correctly

print("Vocabulary size:", V)
print("Class priors:", priors)
print("Test docs:")
for t, yt, yp in zip(X_test, y_test, y_pred):
    print(f" - '{t}' | true={yt} pred={yp}")
print("Accuracy:", round(float(acc), 3))

# Show where zero-probability bites us (if any test word never seen in a class).
# For each test document, list per class the in-vocabulary tokens whose
# likelihood for that class is exactly zero. Out-of-vocabulary tokens are
# skipped here because `vectorize` ignores them too.
zero_issues = []
for t in X_test:
    # One bucket per class, built from C so it matches the loop over range(C)
    # below (a hard-coded {0: [], 1: []} would raise KeyError for a third class).
    unseen_by_class = {c: [] for c in range(C)}
    for token in simple_tokenize(t):
        if token not in vocab:
            continue
        j = vocab[token]
        for c in range(C):
            if prob[c, j] == 0.0:
                unseen_by_class[c].append(token)
    zero_issues.append((t, unseen_by_class))

print("\nTokens causing zero-likelihood by class (if any):")
for t, issues in zero_issues:
    print(f"'{t}' ->", issues)


Vocabulary size: 35
Class priors: [0.33333333 0.66666667]
Test docs:
 - 'A great game of cricket last night' | true=0 pred=1
 - 'Tennis players train hard for tournaments' | true=0 pred=1
Accuracy: 0.0

Tokens causing zero-likelihood by class (if any):
'A great game of cricket last night' -> {0: ['of'], 1: []}
'Tennis players train hard for tournaments' -> {0: [], 1: []}
