<a href="https://colab.research.google.com/github/Thuku2/Computer-Vision/blob/main/Naive%20-bayes%20-classifier-from%20-scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# import libraries
import pandas as pd
import numpy as np
import re, math
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Read the tweet sample CSV and preview it.
csv_path = '/content/sample.csv'
df = pd.read_csv(csv_path)

# Rich (HTML) preview of the first rows of the frame.
display(df.head())

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


In [24]:
# Column holding the document text and column holding the class label.
# Change these two names if your CSV uses different headers.
text_col = 'text'
label_col = 'inbound'

# Fail fast and name exactly which required column is absent.
missing_cols = [col for col in (text_col, label_col) if col not in df.columns]
assert not missing_cols, (
    f"Missing column(s) {missing_cols} in the dataset; "
    "set text_col/label_col above to match your CSV headers."
)

# Rows lacking text or label are unusable: drop them, then renumber the index
# from 0 so that positional row numbers and index labels coincide.
df = df.dropna(subset=[text_col, label_col]).reset_index(drop=True)

In [25]:

# Preprocessing & Vectorizer (from scratch)
# minimal stopword list
# Small hand-picked set of very common English words to ignore as features.
STOPWORDS = {
    'a', 'an', 'and', 'but', 'for', 'in', 'is', 'it', 'not',
    'of', 'on', 'that', 'the', 'this', 'to', 'was', 'were', 'with',
}

def preprocess(text, remove_stopwords=True):
    """Turn raw text into a list of lowercase alphabetic tokens.

    The input is coerced with ``str`` and lowercased. Any run starting with
    ``http`` up to the next whitespace is blanked out, and every character
    other than ``a``-``z`` or whitespace is replaced by a space before the
    text is split on whitespace.

    Args:
        text: Value to tokenize (anything ``str()`` accepts).
        remove_stopwords: When true, tokens found in ``STOPWORDS`` are dropped.

    Returns:
        list[str]: The remaining tokens in their original order.
    """
    lowered = str(text).lower()
    without_urls = re.sub(r'http\S+', ' ', lowered)
    letters_only = re.sub(r'[^a-z\s]', ' ', without_urls)
    # str.split() with no argument never yields empty strings.
    words = letters_only.split()
    if not remove_stopwords:
        return words
    return [word for word in words if word not in STOPWORDS]

def build_vocab(docs, min_freq=1):
    """Count tokens across ``docs`` and derive a sorted vocabulary.

    Args:
        docs: Iterable of raw documents; each one is tokenized by ``preprocess``.
        min_freq: Minimum total count a token needs to enter the vocabulary.

    Returns:
        tuple: ``(vocab, counts)`` where ``vocab`` is the alphabetically sorted
        list of tokens with count >= ``min_freq`` and ``counts`` is the
        ``Counter`` of all tokens (including those filtered out of ``vocab``).
    """
    token_counts = Counter(tok for doc in docs for tok in preprocess(doc))
    kept = sorted(tok for tok, freq in token_counts.items() if freq >= min_freq)
    return kept, token_counts

def vectorize_docs(docs, vocab):
    """Build a bag-of-words count matrix for ``docs`` over ``vocab``.

    Args:
        docs: Sequence of raw documents (must support ``len``); each is
            tokenized by ``preprocess``.
        vocab: Sequence of vocabulary words; position in the sequence is the
            column index of that word.

    Returns:
        np.ndarray: ``int32`` array of shape ``(len(docs), len(vocab))`` whose
        entry ``[i, j]`` is how often ``vocab[j]`` occurs in ``docs[i]``.
        Tokens not in ``vocab`` are ignored.
    """
    column_of = {word: col for col, word in enumerate(vocab)}
    counts = np.zeros((len(docs), len(vocab)), dtype=np.int32)
    for row, doc in enumerate(docs):
        for token in preprocess(doc):
            col = column_of.get(token)
            if col is not None:
                counts[row, col] += 1
    return counts

# Build vocab from your dataset
texts = df[text_col].tolist()
vocab, global_counts = build_vocab(texts, min_freq=1)
print(f"Vocabulary size: {len(vocab)}")

# Count matrix (one row per document, one column per vocab word) and labels as strings.
X = vectorize_docs(texts, vocab)
y = df[label_col].astype(str).values

# Simple from-scratch Multinomial Naive Bayes
class MultinomialNBFromScratch:
    """Multinomial Naive Bayes classifier over non-negative count features.

    Args:
        alpha: Additive smoothing constant added to every per-class word count.
            Must be >= 0. With ``alpha=0`` a word never seen in a class gets a
            log-probability of ``-inf``.

    Attributes set by ``fit``:
        classes_: Sorted unique labels found in ``y``.
        class_to_index: Mapping from label to its row in the per-class arrays.
        class_count_docs_: Number of training documents per class.
        class_word_counts_: Summed feature counts per class, shape
            ``(n_classes, n_features)``.
        class_total_words_: Row sums of ``class_word_counts_``.
        class_log_prior_: ``log(class_count_docs_ / n_docs)``.
        feature_log_prob_: Smoothed ``log P(word | class)``, shape
            ``(n_classes, n_features)``.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    @staticmethod
    def _as_array(X):
        """Wrap plain sequences in an ndarray; leave array-like objects as-is."""
        # Anything that already offers .dot (ndarray, DataFrame, sparse
        # matrix) is passed through untouched so existing callers are unaffected.
        return X if hasattr(X, "dot") else np.asarray(X)

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class word probabilities.

        Args:
            X: 2-D count matrix of shape ``(n_docs, n_features)``; nested
                lists are accepted.
            y: 1-D sequence of ``n_docs`` class labels.

        Returns:
            self, to allow call chaining.

        Raises:
            ValueError: If ``alpha`` is negative, ``X`` is not 2-D or is
                empty, or ``y`` does not hold one label per row of ``X``.
        """
        if self.alpha < 0:
            raise ValueError(f"alpha must be non-negative, got {self.alpha!r}")

        X = self._as_array(X)
        y = np.asarray(y)
        if getattr(X, "ndim", None) != 2:
            raise ValueError("X must be a 2-D (n_docs, n_features) matrix")
        n_docs, n_features = X.shape
        if n_docs == 0:
            raise ValueError("cannot fit on an empty training set")
        if y.ndim != 1 or y.shape[0] != n_docs:
            raise ValueError(
                f"y must be 1-D with one label per row of X "
                f"(X has {n_docs} rows, y has shape {y.shape})"
            )

        self.classes_ = np.unique(y)
        self.class_to_index = {c: i for i, c in enumerate(self.classes_)}
        n_classes = len(self.classes_)

        # Per-class document counts and summed word counts.
        self.class_count_docs_ = np.zeros(n_classes, dtype=np.int64)
        self.class_word_counts_ = np.zeros((n_classes, n_features), dtype=np.float64)
        for i, c in enumerate(self.classes_):
            mask = (y == c)
            # Every label in classes_ came from y, so each mask selects >= 1 row.
            self.class_count_docs_[i] = mask.sum()
            self.class_word_counts_[i, :] = np.asarray(X[mask].sum(axis=0)).ravel()

        # Priors: no fudge term is needed because every class count is >= 1.
        self.class_log_prior_ = np.log(self.class_count_docs_ / n_docs)

        # Additively smoothed conditional probabilities (per class, per word).
        self.class_total_words_ = self.class_word_counts_.sum(axis=1)
        denom = self.class_total_words_[:, None] + self.alpha * n_features
        self.feature_log_prob_ = np.log((self.class_word_counts_ + self.alpha) / denom)
        return self

    def _joint_log_likelihood(self, X):
        """Return ``log P(class) + sum_w count_w * log P(w | class)`` per row of ``X``."""
        X = self._as_array(X)
        return np.asarray(X.dot(self.feature_log_prob_.T)) + self.class_log_prior_[None, :]

    def predict(self, X):
        """Return, for each row of ``X``, the label with the highest joint log-likelihood."""
        jll = self._joint_log_likelihood(X)
        return self.classes_[np.argmax(jll, axis=1)]

    def predict_proba(self, X):
        """Return class probabilities per row, columns ordered like ``classes_``."""
        jll = self._joint_log_likelihood(X)
        # Subtract the row maximum before exponentiating so exp() cannot overflow.
        shifted = np.exp(jll - np.max(jll, axis=1, keepdims=True))
        return shifted / shifted.sum(axis=1, keepdims=True)

# Train/test split (row positions are split alongside so test rows can be traced back to df)
row_ids = np.arange(X.shape[0])
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, row_ids, test_size=0.25, random_state=42, stratify=y
)

model = MultinomialNBFromScratch(alpha=1.0).fit(X_train, y_train)

# Predict & Evaluate
y_pred = model.predict(X_test)
is_correct = (y_pred == y_test)
accuracy = is_correct.mean()
print(f"Accuracy: {accuracy:.4f}\n")
print("Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))

class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print("Confusion matrix (rows=true, cols=pred):")
print(pd.DataFrame(cm, index=class_labels, columns=class_labels))

# Error analysis: per-class scores and probabilities for every test row
jll = model._joint_log_likelihood(X_test)
probs = model.predict_proba(X_test)

# Positions (within the test split) where prediction and truth disagree.
mis_idx = np.flatnonzero(y_pred != y_test)
print(f"\nNumber of misclassifications: {len(mis_idx)}\n")

def print_mis(idx_list, n=10, sort_by='margin_desc'):
    """Print up to ``n`` misclassified test examples, ordered by score margin.

    The margin of an example is its highest joint log-likelihood minus the
    second highest; when only one class score exists, 0.0 is used in place of
    the runner-up.

    Reads the notebook globals ``idx_test``, ``df``, ``text_col``, ``y_test``,
    ``y_pred``, ``probs``, ``jll`` and ``model``.

    Args:
        idx_list: Positions within the test split to report.
        n: Maximum number of examples to print.
        sort_by: ``'margin_desc'`` prints the largest margins first,
            ``'margin_asc'`` the smallest first; any other value keeps the
            order of ``idx_list``.

    Returns:
        None. Output goes to stdout.
    """
    rows = []
    for j in idx_list:
        # Sort the class scores high-to-low. The previous np.partition(scores, -2)
        # call was unused and raised when only a single class score was present.
        ranked = np.sort(jll[j])[::-1]
        runner_up = ranked[1] if len(ranked) > 1 else 0.0
        rows.append({
            'orig_index': idx_test[j],
            'text': df.loc[idx_test[j], text_col],
            'true': y_test[j],
            'pred': y_pred[j],
            'probs': probs[j],
            'margin': ranked[0] - runner_up,
        })

    if sort_by == 'margin_desc':
        rows = sorted(rows, key=lambda r: -r['margin'])
    elif sort_by == 'margin_asc':
        rows = sorted(rows, key=lambda r: r['margin'])

    for r in rows[:n]:
        print("Index:", r['orig_index'])
        print("Text:", r['text'])
        print("True:", r['true'], "| Pred:", r['pred'], "| Margin:",
              f"{r['margin']:.4f}")
        # Per-class probabilities, labelled in the order of model.classes_.
        for cls, p in zip(model.classes_, r['probs']):
            print(f"   P({cls}) = {p:.3f}")
        print('-'*60)

# Report the five most and the five least confidently wrong predictions.
for heading, ordering in (
    ("Top confident (high margin) misclassifications:", 'margin_desc'),
    ("\nTop uncertain misclassifications (low margin):", 'margin_asc'),
):
    print(heading)
    print_mis(mis_idx, n=5, sort_by=ordering)

# show top words per class (inspect model)
def top_words_per_class(model, vocab, topn=15):
    """Print the ``topn`` words with the highest log-probability for each class.

    Args:
        model: Object exposing ``classes_`` and a per-class ``feature_log_prob_``
            array whose columns line up with ``vocab``.
        vocab: Sequence of words, one per feature column.
        topn: How many words to list per class.
    """
    words = np.array(vocab)
    for c, log_probs in zip(model.classes_, model.feature_log_prob_):
        # argsort is ascending, so reverse it to get highest log-probability first.
        best_first = np.argsort(log_probs)[::-1]
        print(f"\nTop words for class '{c}':")
        print(words[best_first[:topn]])

# List the 12 highest-probability words of each class for the fitted model.
top_words_per_class(model=model, vocab=vocab, topn=12)

Vocabulary size: 586
Accuracy: 0.9583

Classification report:
              precision    recall  f1-score   support

       False       0.92      1.00      0.96        11
        True       1.00      0.92      0.96        13

    accuracy                           0.96        24
   macro avg       0.96      0.96      0.96        24
weighted avg       0.96      0.96      0.96        24

Confusion matrix (rows=true, cols=pred):
       False  True
False     11     0
True       1    12

Number of misclassifications: 1

Top confident (high margin) misclassifications:
Index: 88
Text: @105860 I wish Amazon had an option of where I can just get it shipped to the ups store so I can avoid a lot of the struggle
True: True | Pred: False | Margin: 0.3961
   P(False) = 0.598
   P(True) = 0.402
------------------------------------------------------------

Top uncertain misclassifications (low margin):
Index: 88
Text: @105860 I wish Amazon had an option of where I can just get it shipped to the ups st

### Naive Bayes Classifier from Scratch: Process and Challenges

**Process:**

1.  **Data Preparation**: Load data, identify text/label columns, and handle missing values.
2.  **Preprocessing**: Clean text (lowercase, remove URLs/non-letters), tokenize, and apply stopword removal.
3.  **Vocabulary Building**: Construct vocabulary with optional frequency filtering.
4.  **Vectorization**: Represent documents as word count vectors (Bag-of-Words).
5.  **Model Implementation**: Implement Multinomial Naive Bayes including:
    *   Calculating class priors.
    *   Calculating smoothed conditional word probabilities per class.
    *   Computing joint log-likelihoods.
    *   Predicting class labels and probabilities.
6.  **Training & Evaluation**: Split data, train the model, and evaluate using accuracy, classification report, and confusion matrix.
7.  **Error Analysis**: Identify and analyze misclassified examples, considering prediction margin.
8.  **Model Inspection**: Analyze top words for each class to understand model behavior.

**Challenges:**

*   **Implementation Accuracy**: Ensuring correct mathematical implementation of Naive Bayes formulas.
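*   **Zero Probabilities**: Handling words that never occur with a given class in the training data by applying additive (Laplace) smoothing (`alpha`), so their conditional probability is small but non-zero.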
*   **Numerical Stability**: Using log probabilities to avoid underflow.
*   **Preprocessing Impact**: Selecting appropriate text cleaning and vocabulary building parameters.
*   **Debugging**: Verifying matrix operations and probabilistic calculations.