# CDA305 — Assignment 7
# Title: Naive Bayes Classifier
# Student: Rupesh Varma
# Roll no. 2312res547 
# Date: Nov. 11, 2025

In [78]:
from datasets import load_dataset
import random

# Load the training split of the "stanfordnlp/sst2" dataset.
ds = load_dataset("stanfordnlp/sst2", split="train")

# Shuffle with a fixed seed (42), then keep the first 800 rows of the
# shuffled result as the working sample.
sampled = (
    ds.shuffle(seed=42)
      .select(range(800))
)

# Turn the numeric label of one example into its sentiment name.
def convert_label(x):
    """Return ``"positive"`` when ``x["label"]`` equals 1, else ``"negative"``."""
    if x["label"] == 1:
        return "positive"
    return "negative"

# write data.tsv: one "<label><TAB><sentence>" row per sampled example
with open("data.tsv", "w", encoding="utf-8") as out:
    for row in sampled:
        tag = convert_label(row)
        # A tab inside the sentence would add a third column, so replace it.
        sentence = row["sentence"].replace("\t", " ")
        out.write(tag + "\t" + sentence + "\n")

print("Saved data.tsv!")


Saved data.tsv!


# Load Data

In [50]:
import csv

def load_data(path):
    """Read a two-column ``label<TAB>text`` file into parallel lists.

    Args:
        path: Location of the UTF-8 encoded, tab-separated file.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists, in file order.

    Raises:
        ValueError: If a non-blank row does not contain exactly two fields.
    """
    texts, labels = [], []
    # newline="" hands line-ending handling to the csv module, as its
    # documentation recommends for file objects.
    with open(path, "r", encoding="utf-8", newline="") as f:
        # Rows are plain text with no CSV quoting, so a '"' in a sentence
        # must be kept literally. The default dialect would treat it as a
        # quote character, strip it, and could merge several lines into one.
        reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row:
                # Blank line (e.g. trailing newline at end of file).
                continue
            if len(row) != 2:
                raise ValueError(
                    f"{path}: line {reader.line_num}: expected 2 "
                    f"tab-separated fields, got {len(row)}"
                )
            label, text = row
            texts.append(text)
            labels.append(label)
    return texts, labels


# Preprocessing & Vocabulary

In [52]:
import re
from collections import Counter, defaultdict
import numpy as np

def tokenize(text):
    """Split *text* into lowercase word tokens.

    The text is lower-cased first; each token is then a maximal run of the
    letters ``a``-``z`` and apostrophes. Every other character acts as a
    separator and is dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z']+", lowered)]

def build_vocab(texts):
    """Return the sorted list of distinct tokens occurring in *texts*.

    Each text is split with ``tokenize``; duplicates are collapsed, so every
    token appears exactly once in the result.
    """
    seen = set()
    for document in texts:
        seen.update(tokenize(document))
    return sorted(seen)


# Multinomial Naive Bayes (Word Counts)

In [54]:
class MultinomialNB:
    """Multinomial Naive Bayes text classifier over word counts.

    ``fit`` counts how often each vocabulary word occurs in the training
    texts of every class. ``predict_one`` scores a class as

        log P(c) + sum over known tokens of
            log((count(w, c) + alpha) / (total_words(c) + alpha * |V|))

    and returns the best-scoring class. Tokens absent from the training
    vocabulary are ignored at prediction time.
    """

    def __init__(self, alpha=1.0):
        # Additive (Laplace) smoothing constant used in the word likelihoods.
        self.alpha = alpha

    def fit(self, texts, labels):
        """Collect per-class word counts from parallel *texts* and *labels*.

        Returns:
            ``self``, so calls can be chained.
        """
        # First-seen order instead of list(set(...)): set iteration order for
        # strings varies between interpreter runs, which made the winner of a
        # tied score in predict_one non-reproducible.
        self.labels = list(dict.fromkeys(labels))
        self.vocab = build_vocab(texts)
        self.word_index = {w: i for i, w in enumerate(self.vocab)}

        # word count per class
        self.word_counts = {
            label: np.zeros(len(self.vocab)) for label in self.labels
        }
        self.class_counts = Counter(labels)

        for text, label in zip(texts, labels):
            counts = self.word_counts[label]
            for w in tokenize(text):
                idx = self.word_index.get(w)
                if idx is not None:
                    counts[idx] += 1

        # total words per class
        self.total_words = {c: self.word_counts[c].sum() for c in self.labels}

        return self

    def predict_one(self, text):
        """Return the most probable label for a single *text*.

        Ties go to the class that appeared first in the training labels.
        """
        # Resolve tokens to vocabulary indices once, not once per class.
        known = [
            self.word_index[w] for w in tokenize(text) if w in self.word_index
        ]
        # Loop invariants hoisted out of the per-class / per-token loops.
        total_docs = sum(self.class_counts.values())
        vocab_size = len(self.vocab)

        log_probs = {}
        for c in self.labels:
            # prior
            log_prob = np.log(self.class_counts[c] / total_docs)

            counts = self.word_counts[c]
            den = self.total_words[c] + self.alpha * vocab_size
            for idx in known:
                log_prob += np.log((counts[idx] + self.alpha) / den)

            log_probs[c] = log_prob

        return max(log_probs, key=log_probs.get)

    def predict(self, texts):
        """Return a list with the predicted label for every text in *texts*."""
        return [self.predict_one(t) for t in texts]


# Bernoulli Naive Bayes (Word Presence/Absence)

In [56]:
class BernoulliNB:
    """Bernoulli Naive Bayes text classifier over word presence/absence.

    ``fit`` counts, per class, how many training texts contain each
    vocabulary word. ``predict_one`` scores a class as

        log P(c) + sum over ALL vocabulary words of
            log(p)      if the word occurs in the text
            log(1 - p)  otherwise

    with ``p = (docs_with_word(c) + alpha) / (docs(c) + 2 * alpha)``.
    """

    def __init__(self, alpha=1.0):
        # Additive (Laplace) smoothing constant for the presence probability.
        self.alpha = alpha

    def fit(self, texts, labels):
        """Collect per-class document-presence counts.

        Returns:
            ``self``, so calls can be chained.
        """
        # First-seen order instead of list(set(...)): set iteration order for
        # strings varies between interpreter runs, which made the winner of a
        # tied score in predict_one non-reproducible.
        self.labels = list(dict.fromkeys(labels))
        self.vocab = build_vocab(texts)
        self.word_index = {w: i for i, w in enumerate(self.vocab)}

        # presence count
        self.present = {c: np.zeros(len(self.vocab)) for c in self.labels}
        self.class_counts = Counter(labels)

        for text, label in zip(texts, labels):
            doc_counts = self.present[label]
            # set(): a word counts once per document however often it repeats.
            for w in set(tokenize(text)):
                idx = self.word_index.get(w)
                if idx is not None:
                    doc_counts[idx] += 1

        return self

    def predict_one(self, text):
        """Return the most probable label for a single *text*.

        Ties go to the class that appeared first in the training labels.
        """
        # Boolean mask over the vocabulary: True where the word is in the text.
        in_text = np.zeros(len(self.vocab), dtype=bool)
        hits = [
            self.word_index[w]
            for w in set(tokenize(text))
            if w in self.word_index
        ]
        if hits:
            in_text[hits] = True

        total_docs = sum(self.class_counts.values())
        log_probs = {}

        for c in self.labels:
            prob = (self.present[c] + self.alpha) / (
                self.class_counts[c] + 2 * self.alpha
            )
            # One vectorised pass over the vocabulary replaces a Python loop
            # with a scalar np.log call per word. The terms summed are the
            # same; only floating-point summation order can differ.
            per_word = np.where(in_text, np.log(prob), np.log(1 - prob))

            # prior + per-word evidence
            log_probs[c] = (
                np.log(self.class_counts[c] / total_docs) + per_word.sum()
            )

        return max(log_probs, key=log_probs.get)

    def predict(self, texts):
        """Return a list with the predicted label for every text in *texts*."""
        return [self.predict_one(t) for t in texts]


# Cross-Validation (5-Fold)

In [58]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold

def cross_validate(ModelClass, texts, labels, k=5):
    """Score a classifier class with shuffled k-fold cross-validation.

    For every fold a fresh ``ModelClass()`` is fitted on the training part
    and evaluated on the held-out part.

    Args:
        ModelClass: Zero-argument callable returning an object with
            ``fit(texts, labels)`` and ``predict(texts)``.
        texts: Indexable sequence of documents.
        labels: Indexable sequence of labels, parallel to *texts*.
        k: Number of folds handed to ``KFold``.

    Returns:
        ``(accs, f1s)``: two lists with one accuracy and one F1 value per
        fold. F1 is computed with ``"positive"`` as the positive label.
    """
    def take(seq, positions):
        # Gather the items of seq at the given fold positions.
        return [seq[p] for p in positions]

    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_acc = []
    fold_f1 = []

    for train_pos, test_pos in splitter.split(texts):
        train_texts = take(texts, train_pos)
        held_out_texts = take(texts, test_pos)
        train_labels = take(labels, train_pos)
        gold = take(labels, test_pos)

        clf = ModelClass()
        clf.fit(train_texts, train_labels)
        predicted = clf.predict(held_out_texts)

        fold_acc.append(accuracy_score(gold, predicted))
        fold_f1.append(f1_score(gold, predicted, pos_label="positive"))

    return fold_acc, fold_f1


# Run Evaluation

In [60]:
# Read the parallel (texts, labels) lists from data.tsv in the working directory.
texts, labels = load_data("data.tsv")

In [61]:

# Cross-validate Multinomial NB and print per-fold scores with their means.
print(" Multinomial Naive Bayes ")
acc_m, f1_m = cross_validate(MultinomialNB, texts, labels)
print("Accuracy:", acc_m)
# Average over the folds actually returned rather than a literal 5, so the
# mean stays correct if cross_validate is called with a different k.
print("Average accuracy:", sum(acc_m) / len(acc_m))
print("F1-score:", f1_m)
print("Average F1:", sum(f1_m) / len(f1_m))


 Multinomial Naive Bayes 
Accuracy: [0.65625, 0.6625, 0.675, 0.675, 0.68125]
Average accuracy: 0.67
F1-score: [0.7417840375586855, 0.7452830188679245, 0.7346938775510204, 0.7450980392156863, 0.7085714285714285]
Average F1: 0.7350860803529491


In [62]:
# Cross-validate Bernoulli NB and print per-fold scores with their means.
print("\n Bernoulli Naive Bayes ")
acc_b, f1_b = cross_validate(BernoulliNB, texts, labels)
print("Accuracy:", acc_b)
# Average over the folds actually returned rather than a literal 5, so the
# mean stays correct if cross_validate is called with a different k.
print("Average accuracy:", sum(acc_b) / len(acc_b))
print("F1-score:", f1_b)
print("Average F1:", sum(f1_b) / len(f1_b))


 Bernoulli Naive Bayes 
Accuracy: [0.60625, 0.63125, 0.58125, 0.68125, 0.60625]
Average accuracy: 0.6212500000000001
F1-score: [0.7319148936170212, 0.7510548523206751, 0.7148936170212766, 0.7866108786610879, 0.7174887892376681]
Average F1: 0.7403926061715458


## Cross-Validation Results

We performed **5-fold cross-validation** for both Multinomial Naive Bayes and Bernoulli Naive Bayes models. For each model, the accuracy and F1-score were calculated across all 5 folds.

### Multinomial Naive Bayes
- Accuracy (5 folds): 0.656, 0.662, 0.675, 0.675, 0.681  
- **Average Accuracy:** 0.67  
- F1-score (5 folds): 0.741, 0.745, 0.734, 0.745, 0.708  
- **Average F1-score:** 0.735  

### Bernoulli Naive Bayes
- Accuracy (5 folds): 0.606, 0.631, 0.581, 0.681, 0.606  
- **Average Accuracy:** 0.621  
- F1-score (5 folds): 0.731, 0.751, 0.714, 0.786, 0.717  
- **Average F1-score:** 0.740  

### Summary
The **Multinomial Naive Bayes** model achieved the higher average accuracy (0.670 vs. 0.621), while the **Bernoulli Naive Bayes** model produced a similar, marginally higher average F1-score (0.740 vs. 0.735). Overall, Multinomial NB performed slightly better for this text sentiment classification task and gives the more accurate result.
