<a href="https://colab.research.google.com/github/Nirmalbhatii/CMCA546_NLP/blob/main/S25MCAG0039_LAB5_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from collections import defaultdict
import math


emails = [
    ("Win money now", "spam"),
    ("Lowest price guaranteed", "spam"),
    ("Cheap meds available", "spam"),
    ("Hello friend how are you", "ham"),
    ("Let’s have lunch tomorrow", "ham"),
    ("Meeting schedule attached", "ham"),
    ("Win a free lottery ticket", "spam"),
    ("See you at the conference", "ham"),
    ("Project deadline reminder", "ham"),
    ("Cheap loans available", "spam")
]


def tokenize(text):
    return text.lower().split()


word_counts = {"spam": defaultdict(int), "ham": defaultdict(int)}
class_counts = {"spam": 0, "ham": 0}
vocab = set()

for text, label in emails:
    class_counts[label] += 1
    for word in tokenize(text):
        word_counts[label][word] += 1
        vocab.add(word)

V = len(vocab)

def calculate_posterior(test_words):
    posteriors = {}
    for label in ["spam", "ham"]:

        log_prob = math.log(class_counts[label] / len(emails))
        total_words = sum(word_counts[label].values())

        for w in test_words:
            count = word_counts[label][w]
            log_prob += math.log((count + 1) / (total_words + V))
        posteriors[label] = log_prob
    return posteriors

test_email = ["cheap", "price", "now"]
posterior = calculate_posterior(test_email)

print("Posterior Probabilities:")
for label in posterior:
    print(label, math.exp(posterior[label]))


Posterior Probabilities:
spam 4.8000000000000015e-05
ham 3.358477132129207e-06


In [2]:
class NaiveBayesClassifier:
    def __init__(self):
        self.word_counts = {"spam": defaultdict(int), "ham": defaultdict(int)}
        self.class_counts = {"spam": 0, "ham": 0}
        self.vocab = set()

    def train(self, data):
        for text, label in data:
            self.class_counts[label] += 1
            for word in tokenize(text):
                self.word_counts[label][word] += 1
                self.vocab.add(word)
        self.V = len(self.vocab)

    def predict(self, text):
        words = tokenize(text)
        scores = {}
        for label in ["spam", "ham"]:
            log_prob = math.log(self.class_counts[label] / sum(self.class_counts.values()))
            total_words = sum(self.word_counts[label].values())
            for w in words:
                count = self.word_counts[label][w]
                log_prob += math.log((count + 1) / (total_words + self.V))
            scores[label] = log_prob
        return max(scores, key=scores.get)


nb = NaiveBayesClassifier()
nb.train(emails)


test_samples = [
    "win a lottery",
    "project meeting tomorrow",
    "cheap price now"
]
for t in test_samples:
    print(t, "->", nb.predict(t))


win a lottery -> spam
project meeting tomorrow -> ham
cheap price now -> spam


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

texts = [t for t, _ in emails]
labels = [l for _, l in emails]


vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)


clf = MultinomialNB()
clf.fit(X, labels)


y_pred = clf.predict(X)
print("Accuracy:", accuracy_score(labels, y_pred))


test = ["cheap price now"]
print("Prediction:", clf.predict(vectorizer.transform(test))[0])


Accuracy: 1.0
Prediction: spam
