In [2]:
from collections import Counter
from typing import List, Tuple, Dict

# ----------------------------
# 1) Training corpus (given)
# ----------------------------
# Toy training corpus: one whitespace-separated sentence per entry, each
# wrapped in explicit <s> (start) and </s> (end) boundary tokens so that
# sentence-initial and sentence-final bigrams are counted as well.
TRAINING_SENTENCES = [
    "<s> I love NLP </s>",
    "<s> I love deep learning </s>",
    "<s> deep learning is fun </s>",
]

# ----------------------------
# Helpers
# ----------------------------
def tokenize(sentence: str) -> List[str]:
    """Split *sentence* into tokens on runs of whitespace.

    Leading and trailing whitespace produce no empty tokens, because
    ``str.split()`` with no separator already discards them.
    """
    return sentence.split()

def get_bigrams(tokens: List[str]) -> List[Tuple[str, str]]:
    """Return every adjacent ``(previous, next)`` token pair, in order.

    A sequence with fewer than two tokens yields an empty list.
    """
    # zip() stops at the shorter input, so pairing the sequence with its own
    # one-step shift gives exactly the len(tokens) - 1 adjacent pairs.
    followers = tokens[1:]
    return list(zip(tokens, followers))

# ----------------------------
# 2) Compute unigram & bigram counts
# ----------------------------
def build_counts(corpus: List[str]) -> Tuple[Counter, Counter]:
    """Count unigrams and bigrams over every sentence in *corpus*.

    Each sentence is tokenized with ``tokenize`` and its adjacent pairs are
    taken with ``get_bigrams``; bigrams never span two sentences.

    Returns:
        ``(unigram_counts, bigram_counts)`` where unigram keys are tokens and
        bigram keys are ``(w1, w2)`` tuples.
    """
    tokenized = [tokenize(sent) for sent in corpus]

    # Feeding the counters in corpus order keeps first-seen ordering, which
    # is what most_common() uses to break ties between equal counts.
    unigram_counts = Counter(tok for tokens in tokenized for tok in tokens)
    bigram_counts = Counter(
        pair for tokens in tokenized for pair in get_bigrams(tokens)
    )
    return unigram_counts, bigram_counts

# ----------------------------
# 3) MLE bigram probabilities
#    P(w2 | w1) = count(w1,w2) / count(w1)
# ----------------------------
def mle_bigram_prob(w1: str, w2: str, unigram_counts: Counter, bigram_counts: Counter) -> float:
    """Return the maximum-likelihood estimate of ``P(w2 | w1)``.

    Computed as ``count(w1, w2) / count(w1)``. When ``w1`` has a count of
    zero the estimate is defined here as ``0.0`` instead of dividing by zero.
    """
    pair_count, context_count = bigram_counts[(w1, w2)], unigram_counts[w1]
    return pair_count / context_count if context_count != 0 else 0.0

# ----------------------------
# 4) Sentence probability under bigram model
#    P(w1..wn) = Π P(wi | w(i-1))
#    (No smoothing: any unseen bigram => prob = 0)
# ----------------------------
def sentence_probability(sentence: str, unigram_counts: Counter, bigram_counts: Counter) -> float:
    """Return the probability of *sentence* under the unsmoothed bigram model.

    The result is the product of the MLE probabilities of every adjacent
    token pair. Any unseen bigram contributes a factor of 0, and a sentence
    with fewer than two tokens has no factors and therefore scores 1.0.
    """
    prob = 1.0
    for prev_tok, next_tok in get_bigrams(tokenize(sentence)):
        prob *= mle_bigram_prob(prev_tok, next_tok, unigram_counts, bigram_counts)
    return prob

def explain_sentence(sentence: str, unigram_counts: Counter, bigram_counts: Counter) -> None:
    """Print each bigram factor of *sentence* with its counts and MLE value.

    Output is a header line naming the sentence, then one line per adjacent
    token pair showing ``count(w1,w2)/count(w1)`` and the resulting
    probability to six decimal places.
    """
    pairs = get_bigrams(tokenize(sentence))
    print(f"\nSentence: {sentence}")
    for pair in pairs:
        w1, w2 = pair
        c = bigram_counts[pair]
        p = mle_bigram_prob(w1, w2, unigram_counts, bigram_counts)
        print(f"  P({w2} | {w1}) = count({w1},{w2})/count({w1}) = {c}/{unigram_counts[w1]} = {p:.6f}")

# ----------------------------
# Main
# ----------------------------
def main():
    """Run the bigram-model demo end to end and print a report.

    Builds unigram and bigram counts from ``TRAINING_SENTENCES``, prints both
    tables, shows the per-bigram factors for two fixed test sentences, prints
    their probabilities, and states which sentence the model prefers.
    """
    unigram_counts, bigram_counts = build_counts(TRAINING_SENTENCES)

    print("=== Unigram Counts ===")
    for tok, c in unigram_counts.most_common():
        print(f"{tok:>10} : {c}")

    print("\n=== Bigram Counts ===")
    for pair, c in bigram_counts.most_common():
        w1, w2 = pair
        print(f"({w1:>4}, {w2:<8}) : {c}")

    # The two sentences being compared.
    s1 = "<s> I love NLP </s>"
    s2 = "<s> I love deep learning </s>"

    # Per-factor breakdown first, then the overall products.
    for candidate in (s1, s2):
        explain_sentence(candidate, unigram_counts, bigram_counts)

    p1, p2 = (
        sentence_probability(candidate, unigram_counts, bigram_counts)
        for candidate in (s1, s2)
    )

    print("\n=== Sentence Probabilities (Bigram MLE, no smoothing) ===")
    print(f"P(s1) = P({s1}) = {p1:.10f}")
    print(f"P(s2) = P({s2}) = {p2:.10f}")

    # Decide which sentence scores higher; equal scores are reported as a tie.
    larger_product = "because its product of bigram MLE probabilities is larger."
    if p1 > p2:
        preferred, why = "s1", larger_product
    elif p2 > p1:
        preferred, why = "s2", larger_product
    else:
        preferred, why = "tie", "because both sentences have the same probability under this model."

    print("\n=== Model Preference ===")
    verdict = (
        "The model is indifferent (tie)."
        if preferred == "tie"
        else f"The model prefers {preferred}."
    )
    print(verdict)
    print(f"Reason: {why}")

    # Fixed commentary for these two test sentences: they share the opening
    # bigrams (<s>, I) and (I, love), so only the tail bigrams differ.
    for line in (
        "\nExtra explanation:",
        "Both sentences share the same prefix bigrams: (<s>, I) and (I, love).",
        "So the preference is determined by the remaining bigrams in each sentence.",
    ):
        print(line)

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

=== Unigram Counts ===
       <s> : 3
      </s> : 3
         I : 2
      love : 2
      deep : 2
  learning : 2
       NLP : 1
        is : 1
       fun : 1

=== Bigram Counts ===
( <s>, I       ) : 2
(   I, love    ) : 2
(deep, learning) : 2
(love, NLP     ) : 1
( NLP, </s>    ) : 1
(love, deep    ) : 1
(learning, </s>    ) : 1
( <s>, deep    ) : 1
(learning, is      ) : 1
(  is, fun     ) : 1
( fun, </s>    ) : 1

Sentence: <s> I love NLP </s>
  P(I | <s>) = count(<s>,I)/count(<s>) = 2/3 = 0.666667
  P(love | I) = count(I,love)/count(I) = 2/2 = 1.000000
  P(NLP | love) = count(love,NLP)/count(love) = 1/2 = 0.500000
  P(</s> | NLP) = count(NLP,</s>)/count(NLP) = 1/1 = 1.000000

Sentence: <s> I love deep learning </s>
  P(I | <s>) = count(<s>,I)/count(<s>) = 2/3 = 0.666667
  P(love | I) = count(I,love)/count(I) = 2/2 = 1.000000
  P(deep | love) = count(love,deep)/count(love) = 1/2 = 0.500000
  P(learning | deep) = count(deep,learning)/count(deep) = 2/2 = 1.000000
  P(</s> | learning) 