In [6]:
import os
import re
import string
from collections import Counter, defaultdict
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import ngrams
import nltk
import numpy as np
from itertools import chain
import unicodedata
from dateutil import parser
import random
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Loading and Preprocessing Data

In [9]:
corpus_folder = r"C:\Users\User\Downloads\NLP\BPE_dataset\dataset"
test_folder = r"C:\Users\User\Downloads\NLP\BPE_test"

list_numbering_pattern = r'^\s*\(?\d+[\.\)\-_\*]+\s*'  
punctuation_pattern = r'[^\w\s:.,]' 

def remove_list_numbering(text):
    """ Remove list numbering and bullets while preserving content. """
    cleaned_lines = []
    extracted_numbers = []
    
    for line in text.split("\n"):
        match = re.match(list_numbering_pattern, line)
        if match:
            extracted_numbers.append(match.group()) 
            line = re.sub(list_numbering_pattern, '', line)  
        
        cleaned_lines.append(line.strip())
    
    return extracted_numbers, " ".join(cleaned_lines)

def normalize_text(text):
    """ Normalize accented characters and remove unnecessary punctuation. """
    text = ''.join(c for c in unicodedata.normalize('NFKD', text) if not unicodedata.combining(c))
    text = re.sub(punctuation_pattern, '', text)
    return text

def process_text_files(folder_path):
    """ Process all text files in a given folder and return cleaned corpus. """
    corpus = ""
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except UnicodeDecodeError:
                with open(file_path, 'r', encoding='latin-1', errors='ignore') as f:
                    text = f.read()
            
            extracted_numbers, cleaned_text = remove_list_numbering(text)
            cleaned_text = normalize_text(cleaned_text)
            
            print(f"Extracted list numbers from {filename}: \n{extracted_numbers}")
            print("-" * 50)
            
            corpus += cleaned_text + " "
    
    return corpus

# Process both datasets
corpus = process_text_files(corpus_folder).lower()
corpus = corpus.replace(",", "").replace(".", "")
corpus = re.sub(r"\s+", " ", corpus).strip() 

test = process_text_files(test_folder)
test = test.replace(",", "").replace(".", "")
test = re.sub(r"\s+", " ", test).strip() 

print("Corpus and test dataset processing complete.")


Extracted list numbers from 100_Day11.txt: 
['1. ', '2. ', '3. ', '4. ', '5. ', '6. ', '7. ', '8. ', '9. ', '10. ']
--------------------------------------------------
Extracted list numbers from 104_Day11.txt: 
['1.\t', '2.\t', '3.\t', '4.\t', '5.\t', '6.\t', '7.\t', '8.\t', '9.\t']
--------------------------------------------------
Extracted list numbers from 125_Day1.txt: 
['1. ', '2. ', '3. ', '4. ', '5. ', '6. ']
--------------------------------------------------
Extracted list numbers from 125_Day14.txt: 
['1. ', '2. ', '3. ', '4. ', '5. ', '6. ', '7. ', '8. ', '9. ']
--------------------------------------------------
Extracted list numbers from 135_Day11.txt: 
['1. ', '2. ', '3. ', '4. ', '5. ', '6. ', '7. ', '8. ', '9. ']
--------------------------------------------------
Extracted list numbers from 158_Day3.txt: 
['1. ', '2. ', '3. ', '4. ', '5. ', '6. ', '7. ', '8. ', '9. ', '10. ']
--------------------------------------------------
Extracted list numbers from 158_Day7.txt: 
[

In [5]:
corpus[:1000]

'subha 5 bjhey uthna perha trip thaa jaldi jaldi ready hua aur 0540 ghar saay nikal gaay 0615 bus chal perhee joo kaay first time thaa kaay trip time saay chala nust kaay 3 larkay thy unsaay batain kee phir bluetooth trip coordinator kaay pass thaa tou humm shoor daltay rahay kaay song change krr dain phir murree mein nashtay kaay leya uthay phir usskay baad bluetooth humain mil gaya mein tou soogaya phir hum jaga prr phouch gaay udhar 3 peaks theen 2 perr char gaay aik prr sahi ghalat raastay saay gaay thy full steep thaa mujhey tou laga mein gaya peak prr pohouch gaya phir picks leen phir slow wala group bhee aa gaya thaa unn kaay sath dubara picks leen wapsi prr jeep mein bohot rash thaa phir dinner keya ghar gaay aur soogaau subah 8 bjy utha fresh hua nashta kiya aur university k liye tyaar honay laga 9 bjy apne bike pe university k liye nikal gaya aj friday ha aur aj mere sirf 2 classes huti hain university ponch k classes li pehla lecture nlp ka tha jis me bhut maza aata ha q k h

# Tokenization

In [13]:
tokens = list(word_tokenize(corpus))
len(tokens)

16238

In [15]:
tokens = []
for i in nltk.sent_tokenize(corpus):
    tokens.extend(list(word_tokenize(i)))
len(tokens)

16238

# Training n-gram models

In [18]:
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

lenbigram = 0
for gram in bigrams:
    # print(gram)
    lenbigram+=1
lenbigram

16237

In [20]:
token_count = Counter(tokens)
bigram_count = Counter(bigrams)
trigram_count = Counter(trigrams)
# trigram_count['chala', 'gaya', '.']
# trigram_count
bigram_count

Counter({('ke', 'baad'): 63,
         ('aur', 'phir'): 55,
         ('kiya', 'aur'): 49,
         ('ki', 'namaz'): 49,
         ('k', 'baad'): 46,
         ('chala', 'gaya'): 45,
         ('ki', 'aur'): 44,
         ('ke', 'liye'): 44,
         ('or', 'phir'): 42,
         ('khana', 'khaya'): 39,
         ('k', 'liye'): 32,
         ('namaz', 'parhi'): 31,
         ('k', 'sath'): 29,
         ('uske', 'baad'): 28,
         ('nashta', 'kiya'): 27,
         ('raat', 'ko'): 26,
         ('raha', 'tha'): 26,
         ('ke', 'sath'): 26,
         ('thodi', 'der'): 23,
         ('mein', 'ne'): 21,
         ('ki', 'class'): 21,
         ('so', 'gaya'): 20,
         ('ka', 'kaam'): 20,
         ('ke', 'saath'): 20,
         ('tha', 'to'): 20,
         ('ka', 'khana'): 19,
         ('kiya', 'phir'): 18,
         ('ho', 'gaya'): 18,
         ('utha', 'aur'): 18,
         ('ka', 'time'): 17,
         ('university', 'ka'): 17,
         ('ghar', 'walon'): 17,
         ('kay', 'baad'): 17,
         

In [22]:
# for bigram in bigrams:
#     bigram_probs = bigram_count[bigram] / token_count[bigram[0]]
#     # print(bigram, bigram_probs)

# for trigram in trigrams:
#     trigram_probs = trigram_count[trigram] / bigram_count[trigram[:2]]
#     # print(trigram, trigram[:2], trigram_probs)

total_tokens = sum(token_count.values())
unigram_probs = {token: count / total_tokens for token, count in token_count.items()}

bigram_probs = defaultdict()
for bigram in bigrams:
    bigram_probs[bigram] = bigram_count[bigram] / token_count[bigram[0]]

trigram_probs = defaultdict()
for trigram in trigrams:
    if bigram_count[trigram[:2]] > 0:  
        trigram_probs[trigram] = trigram_count[trigram] / bigram_count[trigram[:2]]
bigram_probs

defaultdict(None,
            {('subha', '5'): 0.047619047619047616,
             ('5', 'bjhey'): 0.06666666666666667,
             ('bjhey', 'uthna'): 1.0,
             ('uthna', 'perha'): 0.16666666666666666,
             ('perha', 'trip'): 1.0,
             ('trip', 'thaa'): 0.14285714285714285,
             ('thaa', 'jaldi'): 0.14285714285714285,
             ('jaldi', 'jaldi'): 0.18181818181818182,
             ('jaldi', 'ready'): 0.022727272727272728,
             ('ready', 'hua'): 0.3333333333333333,
             ('hua', 'aur'): 0.0967741935483871,
             ('aur', '0540'): 0.00211864406779661,
             ('0540', 'ghar'): 1.0,
             ('ghar', 'saay'): 0.006578947368421052,
             ('saay', 'nikal'): 0.3333333333333333,
             ('nikal', 'gaay'): 0.037037037037037035,
             ('gaay', '0615'): 0.1,
             ('0615', 'bus'): 1.0,
             ('bus', 'chal'): 0.03125,
             ('chal', 'perhee'): 0.16666666666666666,
             ('perhee', 'joo

# Generating Diary Entries

# Base Version

## Unigram

In [96]:
#weighted random
words, probs = list(unigram_probs.keys()), list(unigram_probs.values())  
prev_word = None  #store last word of prev sent

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_word and prev_word in words:  
        sentence = [prev_word]  #start with the last word of the previous sentence
        sent_len -= 1  
    else:
        sentence = [random.choices(words, weights=probs)[0]]  #start fresh if no valid transition

    sentence += random.choices(words, weights=probs, k=sent_len)  
    prev_word = sentence[-1]  #store the last word for the next sentence transition
    
    print(" ".join(sentence).capitalize() + ".")


Dair ki doston celebrate thi hue class hue.
Hue walk or phir jo attend paarh hai baje ki dekha liye.
Liye hamien karne gya khaya mein lekin.
Lekin markaz cafe break class mai mehmaan lekin deer.


In [114]:
# max
words, probs = list(unigram_probs.keys()), list(unigram_probs.values())  
prev_word = None 

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_word and prev_word in words:  
        sentence = [prev_word]  #start with the last word of the previous sentence
        sent_len -= 1  
    else:
        sentence = [max(unigram_probs, key=unigram_probs.get)]  #start fresh with the most freq unigram
        # sentence = [random.choices(words, weights=probs)[0]]  #start fresh if no valid transition
        
    sentence += [max(unigram_probs, key=unigram_probs.get) for _ in range(sent_len)]  # fill sentence with most probable words
    prev_word = sentence[-1] 

    print(" ".join(sentence).capitalize() + ".")


Aur aur aur aur aur aur aur aur aur.
Aur aur aur aur aur aur aur aur aur aur aur aur.
Aur aur aur aur aur aur aur.
Aur aur aur aur aur aur aur aur.


## Bigram 

In [118]:
# random
prev_word = None  
for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_word:  
        #find possible next words using bigrams
        next_words = [b[1] for b in bigram_probs if b[0] == prev_word]
        start_word = random.choice(next_words) if next_words else random.choice(list(unigram_probs.keys()))  
    else:
        start_word = random.choice(list(unigram_probs.keys()))  #first sentence starts randomly

    sentence = [start_word]

    for _ in range(sent_len - 1):
        prev_word = sentence[-1]
        next_words = [b[1] for b in bigram_probs if b[0] == prev_word]

        if next_words:
            sentence.append(random.choice(next_words))  #randomly choose next word from bigrams
        else:
            break  #stop sentence if no valid bigram found

    prev_word = sentence[-1]  #store last word for next sent transition
    print(" ".join(sentence).capitalize() + ".")


3:00 baje pehle ek aur ek option.
Jo fast university 10:10 tak phir hamne asr.
Khatam tha koi kaam nai gaya agli class.
Kaafi acha uthne ki waja sy bohat sy bohat sy kam kar.
Bank ja na nlp ke teacher ne apnay bhai.


In [140]:
#max
prev_word = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_word:  
        #find possible next words with probabilities
        next_words = [(b[1], bigram_probs[b]) for b in bigram_probs if b[0] == prev_word]
        start_word = max(next_words, key=lambda x: x[1])[0] if next_words else max(unigram_probs, key=unigram_probs.get)  
    else:
        start_word = max([(b[0], bigram_probs[b]) for b in bigram_probs], key=lambda x: x[1])[0]  

    sentence = [start_word]

    for _ in range(sent_len - 1):
        prev_word = sentence[-1]
        next_words = [(b[1], bigram_probs[b]) for b in bigram_probs if b[0] == prev_word]

        if next_words:
            sentence.append(max(next_words, key=lambda x: x[1])[0])  #pick most probable next word
        else:
            break  #stop sentence if no valid bigram found

    prev_word = sentence[-1]  #store last word for next sent transition
    print(" ".join(sentence).capitalize() + ".")


Bjhey uthna para aur phir hum ne khana khaya.
Aur phir hum ne khana khaya aur.
Phir hum ne khana khaya aur phir hum ne khana khaya aur.
Phir hum ne khana khaya aur phir hum ne khana khaya.
Aur phir hum ne khana khaya aur phir hum ne khana.


In [64]:
# backoff and max
num_sentences = random.randint(3, 5)  #random number of sentences
sentences = []  #store generated sentences

for i in range(num_sentences):
    sent_len = random.randint(7, 12)
    start_word = random.choice([b[0] for b in bigram_probs])
    sentence = [start_word]
    
    # Generate sentence using bigrams
    for _ in range(sent_len - 1):
        prev_word = sentence[-1]  #get last word in the sentence
    
        next_words = []
        #find all valid next words
        for b in bigram_probs:
            # print(b, b[0])
            if b[0] == prev_word:
               next_words.append((b[1], bigram_probs[b]) )
    
        if next_words:
            #pick the word with the highest probability
            next_word = max(next_words, key=lambda x: x[1])[0]
            sentence.append(next_word)
        else:
            # Backoff, choose a word based on unigram probabilities
            print("Backoff triggered")
            next_word = random.choices(list(unigram_probs.keys()), weights=unigram_probs.values())[0]

    
    print(" ".join(sentence).capitalize() +".")


Udhar 3 ghantay free hukr university ka kaam kiya aur.
Ki namaz parhi aur phir hum ne khana khaya aur phir hum.
Gaya aur phir hum ne khana khaya aur phir hum ne.
Hai aur phir hum ne khana khaya aur phir hum ne.


In [72]:
# fixed interpolation weight, backoff and max
lambda_interp = 0.7  # Interpolation weight, controls mix of bigram and unigram probabilities

num_sentences = random.randint(3, 5)  # Generate a random number of sentences
sentences = []  # List to store generated sentences

for i in range(num_sentences):
    sent_len = random.randint(7, 12)  # Random sentence length within a given range
    start_word = random.choice([b[0] for b in bigram_probs])  # Pick a random starting word from bigrams
    sentence = [start_word]  # Initialize sentence with the start word
    
    for _ in range(sent_len - 1):
        prev_word = sentence[-1]  # Get the last word in the sentence
        
        next_words = []  # List to store candidate next words with their probabilities
        
        # Find all valid bigram continuations
        for b in bigram_probs:
            if b[0] == prev_word:
                bigram_prob = bigram_probs[b]  
                unigram_prob = unigram_probs.get(b[1], 1e-6)  #prob of the word as a unigram with smoothing for unseen words
                interpolated_prob = lambda_interp * bigram_prob + (1 - lambda_interp) * unigram_prob  #interpolated prob
                
                next_words.append((b[1], interpolated_prob))  #store word and its interpolated prb

        if next_words:
            #select the word with the highest interpolated prob
            next_word = max(next_words, key=lambda x: x[1])[0]
        else:
            #backoff
            print("Backoff triggered")
            next_word = random.choices(list(unigram_probs.keys()), weights=unigram_probs.values())[0]
        
        sentence.append(next_word) 
    
    print(" ".join(sentence).capitalize() + ".") 


Isha ki namaz parhi aur phir hum ne khana.
Average hi nahi tha to mai na nashta kiya aur phir.
Baad mai na nashta kiya aur phir hum ne khana khaya.
Thi to mai na nashta kiya aur phir hum ne khana.


In [151]:
prev_word = None 
words, probs = list(unigram_probs.keys()), list(unigram_probs.values())  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_word:  
        #find possible next words using bigram
        next_words = [b[1] for b in bigram_probs if b[0] == prev_word]
        start_word = random.choice(next_words) if next_words else random.choices(words, weights=probs)[0]  
    else:
        start_word = random.choices(words, weights=probs)[0]  #first sentence starts based on unigram probability

    sentence = [start_word]

    for _ in range(sent_len - 1):
        prev_word = sentence[-1]
        next_words = [b[1] for b in bigram_probs if b[0] == prev_word]

        if next_words:
            sentence.append(random.choice(next_words))  #pick randomly from valid bigrams
        else:
            sentence.append(random.choices(words, weights=probs)[0])  #backoff to unigram probability

    prev_word = sentence[-1]  #store last word for next sentence transition
    print(" ".join(sentence).capitalize() + ".")


Chuti hui tou humm shoor daltay rahay thei.
Keh diya tha uth keri sha ki routine likh.
Rha 7 bajay wo bas bed se reh.
Gaya masjid chala ke kaam niptaye ghar pohanchny k mene thori.
Daer kae maine jaldi ready hona tha bus.


In [159]:
# fixed interpolation weight, backoff and random
lambda_bigram = 0.7  #weight for bigram
lambda_unigram = 1 - lambda_bigram  #weight for unigram
prev_word = None  
words, probs = list(unigram_probs.keys()), list(unigram_probs.values())  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_word:  
        #find possible next words using bigrams
        next_words = [(b[1], bigram_probs[b]) for b in bigram_probs if b[0] == prev_word]
        if next_words:
            start_word = random.choices([w for w, _ in next_words], weights=[p for _, p in next_words])[0]  #random from bigram
        else:
            start_word = random.choices(words, weights=probs)[0]  #backoff to unigram
    else:
        #first sentence starts with the most frequent first word in bigrams
        # start_word = max([(b[0], bigram_probs[b]) for b in bigram_probs], key=lambda x: x[1])[0]  
        start_word = random.choices(words, weights=probs)[0]  #first sentence starts based on unigram probability

    sentence = [start_word]

    for _ in range(sent_len - 1):
        prev_word = sentence[-1]
        next_words = [(b[1], bigram_probs[b]) for b in bigram_probs if b[0] == prev_word]

        if next_words:
            #interpolate bigram and unigram probabilities
            words_list, probs_list = zip(*next_words)  
            bigram_probs_norm = [p / sum(probs_list) for p in probs_list]  #normalize bigram probs
            unigram_probs_norm = [unigram_probs[w] / sum(probs) for w in words_list]  #normalize unigram probs

            #combine using fixed lambda
            final_probs = [(lambda_bigram * bp) + (lambda_unigram * up) for bp, up in zip(bigram_probs_norm, unigram_probs_norm)]
            sentence.append(random.choices(words_list, weights=final_probs)[0])  #sample from interpolated probs
        else:
            sentence.append(random.choices(words, weights=probs)[0])  #backoff to unigram

    prev_word = sentence[-1]  #store last word for next sentence transition
    print(" ".join(sentence).capitalize() + ".")


Ki kiunke aaj mai khana khaya aur mun dhoye.
Aur uske bd meri mama kay nikalnay tak classes huti.
Hain university k sath wo hua to doston.
Ka workload bhi dhoond liya thodi dair library ja raha.
Tha suba 6 bajay tak masjid mein khaya thory ghr wlon.


## Trigram

In [183]:
# random
prev_bigram = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_bigram:  
        next_words = [(t[2], trigram_probs[t]) for t in trigram_probs if t[:2] == prev_bigram]
        if next_words:
            words, probs = zip(*next_words)
            start_word = random.choices(words, weights=probs)[0]  
        else:
            start_word = random.choice(list(bigram_probs.keys()))[1]  

        sentence = [prev_bigram[0], prev_bigram[1], start_word]  #ensure valid start
    else:
        start_bigram = random.choice(list(trigram_probs.keys()))[:2]
        sentence = [start_bigram[0], start_bigram[1]]  #start with valid bigram

    for _ in range(sent_len - len(sentence)):
        prev_bigram = tuple(sentence[-2:])
        next_words = [(t[2], trigram_probs[t]) for t in trigram_probs if t[:2] == prev_bigram]

        if next_words:
            words, probs = zip(*next_words)
            sentence.append(random.choices(words, weights=probs)[0])  
        else:
            break  

    prev_bigram = tuple(sentence[-2:])  
    print(" ".join(sentence).capitalize() + ".")


Ki dost jo apnay ghar kashmir gayi hui thi.
Hui thi tou mere ghar walo ne mughe.
Ne mughe wish kea online tohfay bhi bhejay packing mukamal.


In [196]:
# max
prev_bigram = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_bigram:  
        next_words = [(t[2], trigram_probs[t]) for t in trigram_probs if t[:2] == prev_bigram]
        start_word = max(next_words, key=lambda x: x[1])[0] if next_words else max(bigram_probs, key=bigram_probs.get)[1]  
        sentence = [prev_bigram[0], prev_bigram[1], start_word]  #ensure valid start
    else:
        # start_bigram = max([(t[:2], trigram_probs[t]) for t in trigram_probs], key=lambda x: x[1])[0]
        start_bigram = random.choice(list(trigram_probs.keys()))[:2]        
        sentence = [start_bigram[0], start_bigram[1]]  #start with valid bigram

    for _ in range(sent_len - len(sentence)):
        prev_bigram = tuple(sentence[-2:])
        next_words = [(t[2], trigram_probs[t]) for t in trigram_probs if t[:2] == prev_bigram]

        if next_words:
            next_word = max(next_words, key=lambda x: x[1])[0]
            if next_word == prev_bigram[1]:  #prevent infinite loops
                break  
            sentence.append(next_word)
        else:
            break  

    prev_bigram = tuple(sentence[-2:])  #store last bigram for transition
    print(" ".join(sentence).capitalize() + ".")


Gym gya gym say a ker fresh howa.
Fresh howa namz magrib peri or kuch khaya piya 1.
Piya 1 ganta aram kiya phir asr ki namaz parhi.


In [200]:
# max + backoff
prev_bigram = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_bigram:  
        next_words = [(t[2], trigram_probs[t]) for t in trigram_probs if t[:2] == prev_bigram]
        start_word = max(next_words, key=lambda x: x[1])[0] if next_words else max(bigram_probs, key=bigram_probs.get)[1]  
        sentence = [prev_bigram[0], prev_bigram[1], start_word]  
    else:
        # start_bigram = max([(t[:2], trigram_probs[t]) for t in trigram_probs], key=lambda x: x[1])[0]
        start_bigram = random.choice(list(trigram_probs.keys()))[:2]        
        sentence = [start_bigram[0], start_bigram[1]]  

    for _ in range(sent_len - len(sentence)):
        prev_bigram = tuple(sentence[-2:])
        next_words = [(t[2], trigram_probs[t]) for t in trigram_probs if t[:2] == prev_bigram]

        if next_words:
            sentence.append(max(next_words, key=lambda x: x[1])[0])  
        else:
            next_bigrams = [(b[1], bigram_probs[b]) for b in bigram_probs if b[0] == prev_bigram[1]]
            if next_bigrams:
                sentence.append(max(next_bigrams, key=lambda x: x[1])[0])  
            else:
                sentence.append(max(unigram_probs, key=unigram_probs.get))  

    prev_bigram = tuple(sentence[-2:])  
    print(" ".join(sentence).capitalize() + ".")


Aaj koi naya recipe try ki jo din.
Jo din ki tayari ki ab ham agla 3 ghantay free tha.
Free tha tou mujhay khud nashta banana pada.
Banana pada chota bhai ko bhi school drop kerna tha tou mujhay.


In [202]:
# fixed lambda interpolation , backoff , max
lambda3, lambda2, lambda1 = 0.7, 0.2, 0.1  

prev_bigram = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_bigram:  
        next_words = [(t[2], trigram_probs[t] * lambda3) for t in trigram_probs if t[:2] == prev_bigram]
        start_word = max(next_words, key=lambda x: x[1])[0] if next_words else max(bigram_probs, key=bigram_probs.get)[1]  
        sentence = [prev_bigram[0], prev_bigram[1], start_word]  
    else:
        # start_bigram = max([(t[:2], trigram_probs[t]) for t in trigram_probs], key=lambda x: x[1])[0]
        start_bigram = random.choice(list(trigram_probs.keys()))[:2]
        sentence = [start_bigram[0], start_bigram[1]]  

    for _ in range(sent_len - len(sentence)):
        prev_bigram = tuple(sentence[-2:])
        next_words = [(t[2], trigram_probs[t] * lambda3) for t in trigram_probs if t[:2] == prev_bigram]

        if next_words:
            next_word = max(next_words, key=lambda x: x[1])[0]
        else:
            next_bigrams = [(b[1], bigram_probs[b] * lambda2) for b in bigram_probs if b[0] == prev_bigram[1]]
            if next_bigrams:
                next_word = max(next_bigrams, key=lambda x: x[1])[0]
            else:
                next_word = max(unigram_probs, key=lambda k: unigram_probs[k] * lambda1)

        sentence.append(next_word)

    prev_bigram = tuple(sentence[-2:])  
    print(" ".join(sentence).capitalize() + ".")


Kaafi dusty ho gaye meri pehli class.
Pehli class kaafi interesting thi lecture dhyan se suna important.
Suna important points note kiye aur phir so.
Phir so gya phir 10 am per dubara utha or mu hath.
Mu hath dho kar nashta kiya aur phir.


In [192]:
# random + backoff
prev_bigram = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_bigram:  
        next_words = [(t[2], trigram_probs[t]) for t in trigram_probs if t[:2] == prev_bigram]
        if next_words:
            words, probs = zip(*next_words)
            start_word = random.choices(words, weights=probs)[0]  
        else:
            start_word = random.choice(list(bigram_probs.keys()))[1]  

        sentence = [prev_bigram[0], prev_bigram[1], start_word]  
    else:
        start_bigram = random.choice(list(trigram_probs.keys()))[:2]
        sentence = [start_bigram[0], start_bigram[1]]  

    for _ in range(sent_len - len(sentence)):
        prev_bigram = tuple(sentence[-2:])
        next_words = [(t[2], trigram_probs[t]) for t in trigram_probs if t[:2] == prev_bigram]

        if next_words:
            words, probs = zip(*next_words)
            next_word = random.choices(words, weights=probs)[0]  
        else:
            next_bigrams = [(b[1], bigram_probs[b]) for b in bigram_probs if b[0] == prev_bigram[1]]
            if next_bigrams:
                words, probs = zip(*next_bigrams)
                next_word = random.choices(words, weights=probs)[0]  
            else:
                next_word = random.choices(list(unigram_probs.keys()), weights=unigram_probs.values())[0]  

        sentence.append(next_word)

    prev_bigram = tuple(sentence[-2:])  
    print(" ".join(sentence).capitalize() + ".")


Mobile use kia aur phir nashta karnay neechay chali gayi.
Chali gayi baba lahore se aye hoye thay weekend kay liye tayar.
Liye tayar ho kar ami k sath jummah.
Sath jummah ki namaz ada ki or submit karvai is k baad.
K baad hamne khana khaya aur phir hum ghar ponchy ghar.


In [194]:
# fixed lambda interpolation, backoff, random
lambda3, lambda2, lambda1 = 0.7, 0.2, 0.1  

prev_bigram = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_bigram:  
        next_words = [(t[2], trigram_probs[t] * lambda3) for t in trigram_probs if t[:2] == prev_bigram]
        if next_words:
            words, probs = zip(*next_words)
            start_word = random.choices(words, weights=probs)[0]  
        else:
            start_word = random.choice(list(bigram_probs.keys()))[1]  

        sentence = [prev_bigram[0], prev_bigram[1], start_word]  
    else:
        start_bigram = random.choice(list(trigram_probs.keys()))[:2]
        sentence = [start_bigram[0], start_bigram[1]]  

    for _ in range(sent_len - len(sentence)):
        prev_bigram = tuple(sentence[-2:])
        next_words = [(t[2], trigram_probs[t] * lambda3) for t in trigram_probs if t[:2] == prev_bigram]

        if next_words:
            words, probs = zip(*next_words)
            next_word = random.choices(words, weights=probs)[0]  
        else:
            next_bigrams = [(b[1], bigram_probs[b] * lambda2) for b in bigram_probs if b[0] == prev_bigram[1]]
            if next_bigrams:
                words, probs = zip(*next_bigrams)
                next_word = random.choices(words, weights=probs)[0]  
            else:
                words, probs = zip(*unigram_probs.items())
                next_word = random.choices(words, weights=[p * lambda1 for p in probs])[0]  

        sentence.append(next_word)

    prev_bigram = tuple(sentence[-2:])  
    print(" ".join(sentence).capitalize() + ".")


Tha isliye aram se uthi aur university chal diya wahan.
Diya wahan ja kr nascon room kay pass rkha uskay.
Rkha uskay bad humnay bohot si batain ki khana khaya.
Khana khaya aik dost ne poster design karne ke baad.
Ke baad kuch deyr phone istemaal kiya aur 8:30 wali.


# Backward Bigram Model

In [211]:
# max
prev_word = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_word:  
        #find possible previous words based on the second word of the bigram
        prev_words = [(b[0], bigram_probs[b]) for b in bigram_probs if b[1] == prev_word]
        start_word = max(prev_words, key=lambda x: x[1])[0] if prev_words else max(unigram_probs, key=unigram_probs.get)  
    else:
        #first sentence starts with most frequent word
        start_word = max(unigram_probs, key=unigram_probs.get)  

    sentence = [start_word]  

    for _ in range(sent_len - 1):
        prev_word = sentence[-1]
        prev_words = [(b[0], bigram_probs[b]) for b in bigram_probs if b[1] == prev_word]

        if prev_words:
            sentence.append(max(prev_words, key=lambda x: x[1])[0])  #pick most probable previous word
        else:
            break  #stop if no valid bigram is found  

    prev_word = sentence[-1]  #store last word for next sentence transition
    print(" ".join(reversed(sentence)).capitalize() + ".")  #reverse before printing


Jagah dhoond liya 9:30 tk hm sb cousins ikatay huway aur.
Waja s khelne tio me taqrebban 4.
Prhai kee phir usskay baad hamen namaz miss hogyi jiski.
Kee phir thoridere prhai kee phir thoridere prhai kee phir thoridere.
Phir thoridere prhai kee phir thoridere prhai.


In [213]:
#random
prev_word = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    if prev_word:  
        prev_words = [(b[0], bigram_probs[b]) for b in bigram_probs if b[1] == prev_word]
        if prev_words:
            words, probs = zip(*prev_words)
            start_word = random.choices(words, weights=probs)[0]  
        else:
            start_word = random.choices(list(unigram_probs.keys()), weights=unigram_probs.values())[0]  
    else:
        start_word = random.choices(list(unigram_probs.keys()), weights=unigram_probs.values())[0]  

    sentence = [start_word]  

    for _ in range(sent_len - 1):
        prev_word = sentence[-1]
        prev_words = [(b[0], bigram_probs[b]) for b in bigram_probs if b[1] == prev_word]

        if prev_words:
            words, probs = zip(*prev_words)
            sentence.append(random.choices(words, weights=probs)[0])  
        else:
            break  

    prev_word = sentence[-1]  
    print(" ".join(reversed(sentence)).capitalize() + ".")  


Dekha laptop khola proposal submit karvai is dauraan thoda pending assignments start.
Hamra samaan unpack kea online thori daer kae maine juch dair tv.
Se bezti hui uske baad mainy thori sardi bhi.
Gy qk aj mera alaaka meh coffee achay.
Khola proposal submit karvai is dauraan thoda hawa lene pohonch.


# Bidirectional Bigram Model

In [222]:
#max
prev_word = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    #pick the most frequent unigram as the starting word
    start_word = max(unigram_probs, key=unigram_probs.get)  
    sentence_forward = [start_word]  
    sentence_backward = [start_word]  

    #generate forward sentence (left to right)
    for _ in range(sent_len // 2):
        prev_word = sentence_forward[-1]
        next_words = [(b[1], bigram_probs[b]) for b in bigram_probs if b[0] == prev_word]

        if next_words:
            sentence_forward.append(max(next_words, key=lambda x: x[1])[0])  
        else:
            break  

    #generate backward sentence (right to left)
    for _ in range(sent_len // 2):
        next_word = sentence_backward[0]  
        prev_words = [(b[0], bigram_probs[b]) for b in bigram_probs if b[1] == next_word]

        if prev_words:
            sentence_backward.insert(0, max(prev_words, key=lambda x: x[1])[0])  
        else:
            break  

    #merge backward and forward sentences
    full_sentence = sentence_backward[:-1] + sentence_forward  
    print(" ".join(full_sentence).capitalize() + ".")  


Hm sb cousins ikatay huway aur phir hum ne khana khaya.
Cousins ikatay huway aur phir hum ne.
Sb cousins ikatay huway aur phir hum ne khana.
Hm sb cousins ikatay huway aur phir hum ne khana khaya.


In [228]:
# random
#this generates sentences using both left-to-right and right-to-left bigram models with random selection
prev_word = None  

for _ in range(random.randint(3, 5)):  
    sent_len = random.randint(7, 12)  

    #pick a random starting word based on unigram probabilities
    start_word = random.choices(list(unigram_probs.keys()), weights=unigram_probs.values())[0]  
    sentence_forward = [start_word]  
    sentence_backward = [start_word]  

    #generate forward sentence (left to right)
    for _ in range(sent_len // 2):
        prev_word = sentence_forward[-1]
        next_words = [(b[1], bigram_probs[b]) for b in bigram_probs if b[0] == prev_word]

        if next_words:
            words, probs = zip(*next_words)
            sentence_forward.append(random.choices(words, weights=probs)[0])  
        else:
            break  

    #generate backward sentence (right to left)
    for _ in range(sent_len // 2):
        next_word = sentence_backward[0]  
        prev_words = [(b[0], bigram_probs[b]) for b in bigram_probs if b[1] == next_word]

        if prev_words:
            words, probs = zip(*prev_words)
            sentence_backward.insert(0, random.choices(words, weights=probs)[0])  
        else:
            break  

    #merge backward and forward sentences
    full_sentence = sentence_backward[:-1] + sentence_forward  
    print(" ".join(full_sentence).capitalize() + ".")  


Tesri alarm baji or bohat kuch khanay.
Upcoming exams ki chuhti thi to ham ne biryani thi ke.
Pehle agle hafte ka islye dobara neend aa k liye nikala wapis akr.


# Evaluating Model 

## Perplexity

In [24]:
import math
from collections import defaultdict
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

#tokenize test data
test_tokens = list(word_tokenize(test))

#unigram perplexity
def calculate_unigram_perplexity(test_tokens, unigram_probs):
    log_prob = 0
    N = len(test_tokens)

    for word in test_tokens:
        prob = unigram_probs.get(word, 1e-10)  #small value for unseen words
        log_prob += math.log(prob)
    
    return math.exp(-log_prob / N)

#bigram perplexity
def calculate_bigram_perplexity(test_tokens, bigram_probs, unigram_probs):
    log_prob = 0
    N = len(test_tokens)

    for i in range(1, N):
        bigram = (test_tokens[i-1], test_tokens[i])
        prob = bigram_probs.get(bigram, unigram_probs.get(test_tokens[i], 1e-10))  #backoff to unigram
        log_prob += math.log(prob)

    return math.exp(-log_prob / N)

#trigram perplexity
def calculate_trigram_perplexity(test_tokens, trigram_probs, bigram_probs, unigram_probs):
    log_prob = 0
    N = len(test_tokens)

    for i in range(2, N):
        trigram = (test_tokens[i-2], test_tokens[i-1], test_tokens[i])
        bigram = (test_tokens[i-1], test_tokens[i])
        unigram = test_tokens[i]

        if trigram in trigram_probs:
            prob = trigram_probs[trigram]
        elif bigram in bigram_probs:
            prob = bigram_probs[bigram]  #backoff to bigram
        else:
            prob = unigram_probs.get(unigram, 1e-10)  #backoff to unigram
        
        log_prob += math.log(prob)

    return math.exp(-log_prob / N)

#calculate perplexities
unigram_pp = calculate_unigram_perplexity(test_tokens, unigram_probs)
bigram_pp = calculate_bigram_perplexity(test_tokens, bigram_probs, unigram_probs)
trigram_pp = calculate_trigram_perplexity(test_tokens, trigram_probs, bigram_probs, unigram_probs)

print(f"Unigram Perplexity: {unigram_pp}")
print(f"Bigram Perplexity: {bigram_pp}")
print(f"Trigram Perplexity: {trigram_pp}")


Unigram Perplexity: 5917.763161660183
Bigram Perplexity: 1266.3712776217656
Trigram Perplexity: 861.7606020242736
