In [1]:
from collections import defaultdict

# -----------------------------
# 1. Training Corpus
# -----------------------------
# Each training sentence is whitespace-tokenised and already carries its
# own <s> / </s> boundary markers.
corpus = [
    "<s> I love NLP </s>",
    "<s> I love deep learning </s>",
    "<s> deep learning is fun </s>",
]

# -----------------------------
# 2. Compute Unigram & Bigram Counts
# -----------------------------
# defaultdict(int) lets a key be incremented without initialising it first.
unigram_counts = defaultdict(int)
bigram_counts = defaultdict(int)

for line in corpus:
    tokens = line.split()
    # Every token contributes one unigram observation.
    for token in tokens:
        unigram_counts[token] += 1
    # Pairing the token list with itself shifted by one yields each
    # adjacent (previous, next) pair exactly once, in sentence order.
    for pair in zip(tokens, tokens[1:]):
        bigram_counts[pair] += 1

print("Unigram Counts:")
for token, total in unigram_counts.items():
    print(f"{token}: {total}")

print("\nBigram Counts:")
for pair, total in bigram_counts.items():
    print(f"{pair}: {total}")

# -----------------------------
# 3. Bigram Probability (MLE)
# -----------------------------
def bigram_prob(w1, w2):
    """Return the maximum-likelihood estimate of P(w2 | w1).

    The estimate is count(w1, w2) / count(w1), read from the module-level
    ``bigram_counts`` and ``unigram_counts`` tables. No smoothing is applied.

    Args:
        w1: The preceding (conditioning) token.
        w2: The token that follows ``w1``.

    Returns:
        The relative frequency as a float. 0.0 when ``w1`` has no recorded
        count, or when the pair ``(w1, w2)`` was never counted.
    """
    # Read with .get() instead of indexing: the count tables are
    # defaultdicts, and indexing a missing key would insert a zero-count
    # entry, so a mere lookup would silently grow the model's vocabulary.
    context_count = unigram_counts.get(w1, 0)
    if context_count == 0:
        # Unseen context: avoid dividing by zero and keep the result a float.
        return 0.0
    return bigram_counts.get((w1, w2), 0) / context_count

# -----------------------------
# 4. Sentence Probability Function
# -----------------------------
def sentence_probability(sentence):
    """Return the bigram-model probability of a whitespace-tokenised sentence.

    The result is the product of ``bigram_prob(prev, nxt)`` over every
    adjacent token pair, taken left to right. A sentence with fewer than
    two tokens has no pairs, so the result is 1.0.

    Args:
        sentence: The sentence as a single string; tokens are obtained
            with ``str.split()``.

    Returns:
        The product of the bigram probabilities as a float.
    """
    tokens = sentence.split()
    likelihood = 1.0
    # zip against the list shifted by one walks the adjacent pairs in order.
    for prev_token, next_token in zip(tokens, tokens[1:]):
        likelihood = likelihood * bigram_prob(prev_token, next_token)
    return likelihood

# -----------------------------
# 5. Test Sentences
# -----------------------------
# The two candidate sentences to score against the trained counts.
s1 = "<s> I love NLP </s>"
s2 = "<s> I love deep learning </s>"

# Score both sentences, s1 first and then s2.
p1, p2 = (sentence_probability(candidate) for candidate in (s1, s2))

# One print call with a newline separator emits the heading and both
# probabilities, each on its own line.
print(
    "\nSentence Probabilities:",
    f"P(S1) = {p1:.4f}",
    f"P(S2) = {p2:.4f}",
    sep="\n",
)

# -----------------------------
# 6. Model Preference
# -----------------------------
# Choose the message first, then print once.
if p1 > p2:
    verdict = "\nModel prefers S1 because it has higher probability."
elif p2 > p1:
    verdict = "\nModel prefers S2 because it has higher probability."
else:
    verdict = "\nBoth sentences have equal probability."
print(verdict)

Unigram Counts:
<s>: 3
I: 2
love: 2
NLP: 1
</s>: 3
deep: 2
learning: 2
is: 1
fun: 1

Bigram Counts:
('<s>', 'I'): 2
('I', 'love'): 2
('love', 'NLP'): 1
('NLP', '</s>'): 1
('love', 'deep'): 1
('deep', 'learning'): 2
('learning', '</s>'): 1
('<s>', 'deep'): 1
('learning', 'is'): 1
('is', 'fun'): 1
('fun', '</s>'): 1

Sentence Probabilities:
P(S1) = 0.3333
P(S2) = 0.1667

Model prefers S1 because it has higher probability.
