In [1]:
from collections import defaultdict

# --------------------------------------------------
# 1. Training corpus
# --------------------------------------------------
# Each training sentence is pre-tokenized and wrapped in the boundary
# markers "<s>" (start) and "</s>" (end).
corpus = [
    ["<s>", "I", "love", "NLP", "</s>"],
    ["<s>", "I", "love", "deep", "learning", "</s>"],
    ["<s>", "deep", "learning", "is", "fun", "</s>"],
]

# --------------------------------------------------
# 2. Compute unigram and bigram counts
# --------------------------------------------------
unigramCounts = defaultdict(int)
bigramCounts = defaultdict(int)

for tokens in corpus:
    # Every token, boundary markers included, is one unigram observation.
    for token in tokens:
        unigramCounts[token] += 1

    # Pair each token with its successor; zip stops at the shorter input,
    # so the final token of a sentence never starts a bigram.
    for pair in zip(tokens, tokens[1:]):
        bigramCounts[pair] += 1

print("Unigram Counts:")
for token, freq in unigramCounts.items():
    print(token, ":", freq)

print("\nBigram Counts:")
for pair, freq in bigramCounts.items():
    print(pair, ":", freq)

# --------------------------------------------------
# 3. Estimate bigram probabilities using MLE
# --------------------------------------------------
# P(w2 | w1) = count(w1, w2) / count(w1)
bigramProbs = {
    pair: freq / unigramCounts[pair[0]]
    for pair, freq in bigramCounts.items()
}

print("\nBigram Probabilities (MLE):")
for pair, estimate in bigramProbs.items():
    print(pair, ":", round(estimate,3))

# --------------------------------------------------
# 4. Function to compute sentence probability
# --------------------------------------------------
def sentenceProbability(sentence, probs=None):
    """Return the bigram-model probability of a tokenized sentence.

    The result is the product of P(w2 | w1) over every pair of adjacent
    tokens in ``sentence``.

    Args:
        sentence: Sequence of tokens, e.g. ["<s>", "I", "love", "NLP", "</s>"].
        probs: Optional mapping from (w1, w2) tuples to conditional
            probabilities. When omitted, the module-level ``bigramProbs``
            table is used.

    Returns:
        The probability as a float. It is 0.0 as soon as any bigram is
        missing from the table (no smoothing is applied), and 1.0 when the
        sentence has fewer than two tokens (empty product).
    """
    if probs is None:
        # Resolved at call time so the function follows the current table.
        probs = bigramProbs

    prob = 1.0
    for pair in zip(sentence, sentence[1:]):
        if pair not in probs:
            # Unseen bigram: return a float so the result type is always
            # the same, whichever branch is taken.
            return 0.0
        prob *= probs[pair]

    return prob

# --------------------------------------------------
# 5. Test sentences
# --------------------------------------------------
# Two candidate sentences to score with the trained bigram model.
s1 = ["<s>", "I", "love", "NLP", "</s>"]
s2 = ["<s>", "I", "love", "deep", "learning", "</s>"]

p1, p2 = sentenceProbability(s1), sentenceProbability(s2)

print("\nSentence Probabilities:")
print("P(S1) =", p1)
print("P(S2) =", p2)

# --------------------------------------------------
# 6. Model preference
# --------------------------------------------------
# Report whichever sentence the model scores higher; handle the tie first.
if p1 == p2:
    print("\nBoth sentences have equal probability.")
elif p1 > p2:
    print("\nModel prefers S1 because it has higher probability.")
else:
    print("\nModel prefers S2 because it has higher probability.")


Unigram Counts:
<s> : 3
I : 2
love : 2
NLP : 1
</s> : 3
deep : 2
learning : 2
is : 1
fun : 1

Bigram Counts:
('<s>', 'I') : 2
('I', 'love') : 2
('love', 'NLP') : 1
('NLP', '</s>') : 1
('love', 'deep') : 1
('deep', 'learning') : 2
('learning', '</s>') : 1
('<s>', 'deep') : 1
('learning', 'is') : 1
('is', 'fun') : 1
('fun', '</s>') : 1

Bigram Probabilities (MLE):
('<s>', 'I') : 0.667
('I', 'love') : 1.0
('love', 'NLP') : 0.5
('NLP', '</s>') : 1.0
('love', 'deep') : 0.5
('deep', 'learning') : 1.0
('learning', '</s>') : 0.5
('<s>', 'deep') : 0.333
('learning', 'is') : 0.5
('is', 'fun') : 1.0
('fun', '</s>') : 1.0

Sentence Probabilities:
P(S1) = 0.3333333333333333
P(S2) = 0.16666666666666666

Model prefers S1 because it has higher probability.
