<a href="https://colab.research.google.com/github/Sammodi0711/NLP-Sem-1/blob/main/nlp_assignment_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import defaultdict
import math

# Toy labelled corpus used to fit the classifier below: each entry is an
# (email text, class label) pair, with five "Spam" and five "Ham" examples.
emails = [
    ("Win money now", "Spam"),
    ("Lowest price guaranteed", "Spam"),
    ("Cheap meds available", "Spam"),
    ("Hello friend how are you", "Ham"),
    ("Let’s have lunch tomorrow", "Ham"),
    ("Meeting schedule attached", "Ham"),
    ("Win a free lottery ticket", "Spam"),
    ("See you at the conference", "Ham"),
    ("Project deadline reminder", "Ham"),
    ("Cheap loans available", "Spam"),
]
def tokenize(text):
    """Lower-case *text* and split it on whitespace.

    No punctuation handling is performed: a token keeps any punctuation
    attached to it.

    Args:
        text: The raw message string.

    Returns:
        A list of lower-cased tokens; empty when *text* has no
        non-whitespace characters.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

# Pool the tokens of every email by class; any label other than "Spam" is
# treated as ham.
spam_words = [
    token
    for text, label in emails
    if label == "Spam"
    for token in tokenize(text)
]
ham_words = [
    token
    for text, label in emails
    if label != "Spam"
    for token in tokenize(text)
]

# The vocabulary is every distinct token seen in either class.
vocab = set(spam_words).union(ham_words)

# Per-class term frequencies. defaultdict(int) lets unseen words read as 0.
spam_count = defaultdict(int)
ham_count = defaultdict(int)
for counts, tokens in ((spam_count, spam_words), (ham_count, ham_words)):
    for token in tokens:
        counts[token] += 1

# Total number of tokens (not distinct words) observed in each class.
total_spam_words = len(spam_words)
total_ham_words = len(ham_words)

# Class priors: the fraction of emails labelled "Spam", and its complement.
p_spam = len([label for _, label in emails if label == "Spam"]) / len(emails)
p_ham = 1 - p_spam

def likelihood(word, label):
    """Return the add-one (Laplace) smoothed estimate of P(word | label).

    Args:
        word: The token to score.
        label: "Spam" selects the spam statistics; any other value selects
            the ham statistics.

    Returns:
        (count of *word* in the class + 1) / (tokens in the class + |vocab|).
    """
    if label == "Spam":
        occurrences, class_total = spam_count[word], total_spam_words
    else:
        occurrences, class_total = ham_count[word], total_ham_words
    return (occurrences + 1) / (class_total + len(vocab))

def classify(text):
    """Return the posterior probabilities of *text* being spam and ham.

    Each class score is the log prior plus the sum of the log likelihoods
    of the tokens in *text*. The two scores are then normalised so that the
    returned probabilities sum to 1.

    Args:
        text: The raw message string; it is tokenised with ``tokenize``.

    Returns:
        A ``(P(Spam | text), P(Ham | text))`` tuple of floats.
    """
    log_prob_spam = math.log(p_spam)
    log_prob_ham = math.log(p_ham)

    for w in tokenize(text):
        log_prob_spam += math.log(likelihood(w, "Spam"))
        log_prob_ham += math.log(likelihood(w, "Ham"))

    # Normalise in log space (log-sum-exp). Exponentiating the raw scores
    # underflows to 0.0 for long messages, which would make the denominator
    # zero; shifting by the larger score pins one term at exp(0) == 1.0, so
    # the sum is always at least 1.0.
    peak = max(log_prob_spam, log_prob_ham)
    prob_spam = math.exp(log_prob_spam - peak)
    prob_ham = math.exp(log_prob_ham - peak)

    total = prob_spam + prob_ham
    return prob_spam / total, prob_ham / total

# Classify one hand-written example and report both posteriors.
test_email = "cheap price now"

# Bind the result to fresh names: classify() reads the module-level priors
# p_spam / p_ham, so unpacking into those names would overwrite the priors
# with this email's posteriors and skew every later classify() call.
post_spam, post_ham = classify(test_email)

print(f"Email: '{test_email}'")
print(f"P(Spam | Email) = {post_spam:.3f}")
print(f"P(Ham  | Email) = {post_ham:.3f}")
print("Classification:", "Spam" if post_spam > post_ham else "Ham")

Email: 'cheap price now'
P(Spam | Email) = 0.935
P(Ham  | Email) = 0.065
Classification: Spam


In [None]:
import re
import math
from collections import defaultdict

# Toy labelled corpus of (email text, class label) pairs. The code below
# preprocesses every entry and splits the list positionally into train and
# test portions, so the order of the entries matters.
emails = [
    ("Win money now", "Spam"),
    ("Lowest price guaranteed", "Spam"),
    ("Cheap meds available", "Spam"),
    ("Hello friend how are you", "Ham"),
    ("Let’s have lunch tomorrow", "Ham"),
    ("Meeting schedule attached", "Ham"),
    ("Win a free lottery ticket", "Spam"),
    ("See you at the conference", "Ham"),
    ("Project deadline reminder", "Ham"),
    ("Cheap loans available", "Spam"),
]
stopwords = {"the", "is", "at", "a", "an", "how", "are", "let’s", "have"}

# Matches every character that is neither a lower-case ASCII letter nor
# whitespace; compiled once because preprocess() runs for every document.
_NON_LETTER = re.compile(r"[^a-z\s]")

# Tokens are compared against the stop-word list only AFTER punctuation has
# been stripped, so the list must be cleaned the same way. Otherwise an entry
# such as "let’s" (which becomes "lets" in the text) could never match.
_clean_stopwords = {_NON_LETTER.sub("", word.lower()) for word in stopwords}


def preprocess(text):
    """Normalise *text* into a list of content-bearing tokens.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is removed, the result is split on whitespace, and tokens
    that match a (cleaned) stop-word are dropped.

    Args:
        text: The raw message string.

    Returns:
        The remaining tokens, in their original order. Empty when nothing
        survives the filtering.
    """
    cleaned = _NON_LETTER.sub("", text.lower())
    return [token for token in cleaned.split() if token not in _clean_stopwords]

# Tokenise every email once, keeping its label alongside.
documents = [(preprocess(text), label) for text, label in emails]

# Positional hold-out split: the first eight documents train the model and
# the remainder is kept for evaluation.
train_data, test_data = documents[:8], documents[8:]

# Vocabulary = distinct tokens in the training portion only.
vocab = {token for words, _ in train_data for token in words}
V = len(vocab)

# Per-class term frequencies and document counts. Any label other than
# "Spam" is counted as ham.
word_count_spam = defaultdict(int)
word_count_ham = defaultdict(int)
spam_docs = 0
ham_docs = 0

for words, label in train_data:
    if label == "Spam":
        spam_docs += 1
        class_counts = word_count_spam
    else:
        ham_docs += 1
        class_counts = word_count_ham
    for token in words:
        class_counts[token] += 1

# Total number of training tokens per class.
total_spam_words = sum(word_count_spam.values())
total_ham_words = sum(word_count_ham.values())

# Class priors estimated from the training documents.
n_train = len(train_data)
p_spam = spam_docs / n_train
p_ham = ham_docs / n_train

def likelihood(word, label):
    """Return the add-one (Laplace) smoothed estimate of P(word | label).

    Args:
        word: The token to score.
        label: "Spam" selects the spam statistics; any other value selects
            the ham statistics.

    Returns:
        (training count of *word* in the class + 1) divided by
        (training tokens in the class + vocabulary size ``V``).
    """
    if label == "Spam":
        occurrences, class_total = word_count_spam[word], total_spam_words
    else:
        occurrences, class_total = word_count_ham[word], total_ham_words
    return (occurrences + 1) / (class_total + V)

def classify(words):
    """Predict the class of an already-tokenised message.

    Each class score is its log prior plus the log likelihood of every
    token in *words*.

    Args:
        words: Iterable of tokens.

    Returns:
        "Spam" when the spam score is strictly greater than the ham score,
        otherwise "Ham" (so ties resolve to "Ham").
    """
    # Insertion order (Spam, then Ham) fixes the order likelihood() is called.
    scores = {"Spam": math.log(p_spam), "Ham": math.log(p_ham)}

    for token in words:
        for cls in scores:
            scores[cls] += math.log(likelihood(token, cls))

    if scores["Spam"] > scores["Ham"]:
        return "Spam"
    return "Ham"

# Score the held-out documents, printing one line per prediction.
correct = 0
for words, label in test_data:
    prediction = classify(words)
    print(f"Email: {' '.join(words)} | Actual: {label} | Predicted: {prediction}")
    correct += int(prediction == label)

# Accuracy is the fraction of held-out documents predicted correctly.
accuracy = correct / len(test_data)
print("\nAccuracy:", accuracy)

Email: project deadline reminder | Actual: Ham | Predicted: Ham
Email: cheap loans available | Actual: Spam | Predicted: Spam

Accuracy: 1.0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Toy labelled corpus of (email text, class label) pairs; the texts and
# labels are separated below before being split into train and test sets.
emails = [
    ("Win money now", "Spam"),
    ("Lowest price guaranteed", "Spam"),
    ("Cheap meds available", "Spam"),
    ("Hello friend how are you", "Ham"),
    ("Let’s have lunch tomorrow", "Ham"),
    ("Meeting schedule attached", "Ham"),
    ("Win a free lottery ticket", "Spam"),
    ("See you at the conference", "Ham"),
    ("Project deadline reminder", "Ham"),
    ("Cheap loans available", "Spam"),
]

# Separate the corpus into parallel lists of texts and labels.
texts, labels = (list(column) for column in zip(*emails))

# Hold out 20% of the emails for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words features. The vocabulary is learned from the training texts
# only and then reused to encode the test texts.
vectorizer = CountVectorizer(stop_words="english")
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and fitting chain.
model = MultinomialNB().fit(X_train_bow, y_train)

y_pred = model.predict(X_test_bow)
acc = accuracy_score(y_test, y_pred)

print("Predictions:", y_pred)
print("Actual:", y_test)
print("Accuracy:", acc)

Predictions: ['Ham' 'Ham']
Actual: ['Ham', 'Spam']
Accuracy: 0.5
