### Task 0:
Build an N-gram language model

In [2]:
import nltk
from collections import defaultdict, Counter

# Training text: a short excerpt that serves as the entire corpus for every model below.
corpus = """
Alice was beginning to get very tired of sitting by her sister on the bank,
and of having nothing to do. Once or twice she had peeped into the book her sister was reading,
but it had no pictures or conversations in it, and what is the use of a book, thought Alice without pictures or conversations?
"""

# Lower-case before tokenizing so capitalised and plain forms (e.g. "Alice"/"alice") share counts.
# NOTE(review): nltk.word_tokenize presumably needs NLTK's tokenizer data to be installed
# beforehand — confirm this cell runs on a fresh kernel/environment.
tokens = nltk.word_tokenize(corpus.lower())

def build_ngram(tokens, n):
    """Count which token follows each (n-1)-token context.

    Args:
        tokens: Sequence of tokens; must support ``len()`` and slicing.
        n: Order of the model (2 = bigram, 3 = trigram). Must be >= 1.
            With ``n == 1`` every token is counted under the empty context ``()``.

    Returns:
        A ``defaultdict`` mapping each context tuple of length ``n - 1`` to a
        ``Counter`` of the tokens observed immediately after that context.
        The result is empty when ``tokens`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this guard, negative
            slice indices would silently produce a meaningless model.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    model = defaultdict(Counter)
    # Slide a window of n tokens over the sequence: the first n-1 tokens are
    # the context, the last one is the word being predicted.
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        model[tuple(window[:-1])][window[-1]] += 1
    return model

# Fit one model per order on the shared token stream.
bigram_model = build_ngram(tokens, n=2)
trigram_model = build_ngram(tokens, n=3)

# Print the first five contexts of each model together with their follower counts.
for heading, ngram_model in (
    ("Bigram model (sample):", bigram_model),
    ("\nTrigram model (sample):", trigram_model),
):
    print(heading)
    for ctx, cnt in list(ngram_model.items())[:5]:
        print(f"{ctx} -> {dict(cnt)}")

Bigram model (sample):
('alice',) -> {'was': 1, 'without': 1}
('was',) -> {'beginning': 1, 'reading': 1}
('beginning',) -> {'to': 1}
('to',) -> {'get': 1, 'do': 1}
('get',) -> {'very': 1}

Trigram model (sample):
('alice', 'was') -> {'beginning': 1}
('was', 'beginning') -> {'to': 1}
('beginning', 'to') -> {'get': 1}
('to', 'get') -> {'very': 1}
('get', 'very') -> {'tired': 1}


### Task 1:
Compare bi- and tri-gram models

In [4]:
def predict_next(model, context):
    """Return the most frequent continuation of ``context``, or ``None``.

    Args:
        model: Mapping from context tuples to ``Counter`` objects of follower
            counts, as produced by ``build_ngram``.
        context: Tuple of tokens to look up; its length must match the
            context length the model was built with to ever be found.

    Returns:
        The follower with the highest count (ties go to the one counted
        first, per ``Counter.most_common``), or ``None`` when the context is
        unknown or has no recorded followers.
    """
    # .get() never inserts a key, so probing a defaultdict leaves it unchanged.
    followers = model.get(context)
    # An empty Counter can exist if someone indexed the defaultdict directly;
    # treat it like an unseen context instead of failing on most_common(1)[0].
    if not followers:
        return None
    return followers.most_common(1)[0][0]

# Same starting word, queried with one token of history and with two.
context_bigram = ("alice",)
context_trigram = ("alice", "was")

bigram_guess = predict_next(bigram_model, context_bigram)
trigram_guess = predict_next(trigram_model, context_trigram)

print("\nBigram prediction for 'alice':", bigram_guess)
print("Trigram prediction for 'alice was':", trigram_guess)


Bigram prediction for 'alice': was
Trigram prediction for 'alice was': beginning


### Task 2:
Apply interpolation/backoff to your model so that it can better handle unknown words/prompts.

In [6]:
import random
def backoff_predict(context):
    """Predict the next token, backing off from trigram to bigram to random.

    Reads the module-level ``trigram_model``, ``bigram_model`` and ``tokens``.

    Args:
        context: Sequence of preceding tokens (tuple or list). Only the last
            two tokens are used, so longer histories are accepted.

    Returns:
        The most frequent follower of the last two tokens in the trigram
        model; failing that, of the last token in the bigram model; failing
        that, a token drawn at random from ``tokens`` (non-deterministic
        unless the ``random`` module has been seeded).
    """
    # Model keys are tuples; normalising here lets callers pass lists too.
    context = tuple(context)

    # Try the longest usable history first: (model, number of context tokens).
    for model, history_len in ((trigram_model, 2), (bigram_model, 1)):
        if len(context) < history_len:
            continue
        followers = model.get(context[-history_len:])
        # Skip unseen contexts and empty Counters left by defaultdict lookups.
        if followers:
            return followers.most_common(1)[0][0]

    # Nothing matched: fall back to sampling a token from the corpus.
    return random.choice(tokens)

# Example query. Note that ("tired", "of") does occur in the corpus
# ("... very tired of sitting ..."), so the trigram model answers directly
# and no backoff to the bigram model is needed for this context.
context_example = ("tired", "of")
backoff_guess = backoff_predict(context_example)
print("\nBackoff prediction for 'tired of':", backoff_guess)


Backoff prediction for 'tired of': sitting


### Task 3: Generate a sentence

In [8]:
def generate_sentence(prompt, length=10):
    """Extend ``prompt`` with ``length`` predicted tokens and join them with spaces.

    Args:
        prompt: Iterable of starting tokens; it is copied, not modified.
        length: How many tokens to append (default 10).

    Returns:
        A single string containing the prompt tokens followed by the
        predicted tokens, separated by single spaces.
    """
    words = [*prompt]
    for _ in range(length):
        # Slicing the last two items also covers histories shorter than two.
        history = tuple(words[-2:])
        words.append(backoff_predict(history))
    return " ".join(words)

# Seed the generator with two tokens so the trigram model can be used from the first step.
prompt = ["alice", "was"]
generated_sentence = generate_sentence(prompt, 10)
print("\nGenerated sentence:", generated_sentence, sep="\n")


Generated sentence:
alice was beginning to get very tired of sitting by her sister
