<a href="https://colab.research.google.com/github/SRINIDHISAGI/NLP1/blob/main/confusion_metrix_and_Bigram_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# Class labels, listed in the same order as the matrix rows and columns.
classes = ["Cat", "Dog", "Rabbit"]

# Confusion matrix indexed as conf_matrix[predicted, actual].
conf_matrix = np.array([
    [5, 10, 5],    # predicted Cat
    [15, 20, 10],  # predicted Dog
    [0, 15, 10],   # predicted Rabbit
])

# Row totals count every prediction of a class (TP + FP); column totals
# count every true member of a class (TP + FN).
predicted_totals = conf_matrix.sum(axis=1)
actual_totals = conf_matrix.sum(axis=0)

# Correct predictions sit on the diagonal; the remainder of each row is a
# false positive and the remainder of each column is a false negative.
TP = np.diag(conf_matrix)
FP = predicted_totals - TP
FN = actual_totals - TP

# Per-class scores.
precision = TP / predicted_totals
recall = TP / actual_totals

# Macro average: unweighted mean of the per-class scores.
macro_precision = np.mean(precision)
macro_recall = np.mean(recall)

# Micro average: pool the raw counts over all classes, then take the ratio.
TP_total, FP_total, FN_total = TP.sum(), FP.sum(), FN.sum()
micro_precision = TP_total / (TP_total + FP_total)
micro_recall = TP_total / (TP_total + FN_total)

# Report.
print("Per-class Metrics:")
for cls, cls_precision, cls_recall in zip(classes, precision, recall):
    print(f"{cls}: Precision = {cls_precision:.3f}, Recall = {cls_recall:.3f}")

print("\nMacro-averaged Metrics:")
print(f"Precision = {macro_precision:.3f}, Recall = {macro_recall:.3f}")

print("\nMicro-averaged Metrics:")
print(f"Precision = {micro_precision:.3f}, Recall = {micro_recall:.3f}")

Per-class Metrics:
Cat: Precision = 0.250, Recall = 0.250
Dog: Precision = 0.444, Recall = 0.444
Rabbit: Precision = 0.400, Recall = 0.400

Macro-averaged Metrics:
Precision = 0.365, Recall = 0.365

Micro-averaged Metrics:
Precision = 0.389, Recall = 0.389


In [2]:
from collections import defaultdict

# Toy training corpus: one whitespace-delimited sentence per entry.
corpus = [
    " I love NLP ",
    " I love deep learning ",
    " deep learning is fun "
]

# ----------------------------------------------------------
# 1. Tokenize corpus
# ----------------------------------------------------------
# Splitting on whitespace also drops the padding around each sentence.
tokenized = list(map(str.split, corpus))

# ----------------------------------------------------------
# 2. Count unigrams and bigrams
# ----------------------------------------------------------
unigram_counts = defaultdict(int)
bigram_counts = defaultdict(int)

for tokens in tokenized:
    for token in tokens:
        unigram_counts[token] += 1
    # Pair every token with its successor; the last token starts no bigram.
    for pair in zip(tokens, tokens[1:]):
        bigram_counts[pair] += 1

# ----------------------------------------------------------
# 3. Compute bigram probabilities (MLE)
# P(w_i | w_{i-1}) = count(w_{i-1}, w_i) / count(w_{i-1})
# ----------------------------------------------------------
bigram_probs = {
    pair: freq / unigram_counts[pair[0]]
    for pair, freq in bigram_counts.items()
}

# ----------------------------------------------------------
# 4. Function to compute sentence probability
# ----------------------------------------------------------
def sentence_probability(sentence, bigram_probs, unigram_counts=None):
    """Return the probability of ``sentence`` under a bigram model.

    The sentence is split on whitespace and the conditional probabilities
    of each adjacent word pair are multiplied together.

    Args:
        sentence: Whitespace-delimited text to score.
        bigram_probs: Mapping from ``(previous_word, word)`` tuples to the
            conditional probability of ``word`` given ``previous_word``.
        unigram_counts: Not used by the computation; accepted (and now
            optional) so existing three-argument calls keep working.

    Returns:
        The product of the bigram probabilities as a float. A sentence
        with fewer than two words contains no bigrams, so the empty
        product ``1.0`` is returned. Any bigram missing from
        ``bigram_probs`` makes the result ``0.0``.
    """
    words = sentence.split()
    prob = 1.0
    # Walk adjacent word pairs directly instead of indexing by position.
    for pair in zip(words, words[1:]):
        if pair not in bigram_probs:
            # Unseen bigram -> probability 0 under MLE; no later factor can
            # change a zero product, so stop here.
            return 0.0
        prob *= bigram_probs[pair]
    return prob

# ----------------------------------------------------------
# 5. Test sentences
# ----------------------------------------------------------
s1, s2 = " I love NLP ", " I love deep learning "

# Score both candidates with the same trained model.
p1, p2 = (
    sentence_probability(candidate, bigram_probs, unigram_counts)
    for candidate in (s1, s2)
)

print("P(S1 =", s1, ") =", p1)
print("P(S2 =", s2, ") =", p2)

# ----------------------------------------------------------
# 6. Model preference
# ----------------------------------------------------------
# Strict comparisons in both directions leave ties for the final branch.
if p2 < p1:
    print("Model prefers Sentence 1:", s1, "because it has higher probability.")
elif p1 < p2:
    print("Model prefers Sentence 2:", s2, "because it has higher probability.")
else:
    print("Model considers both sentences equally probable.")


P(S1 =  I love NLP  ) = 0.5
P(S2 =  I love deep learning  ) = 0.5
Model considers both sentences equally probable.
