In [None]:
import pandas as pd

# Load the precomputed n-gram tables, one CSV per n-gram order.
# Each file is named "<order>.csv" and is read with pandas' defaults.
unigram_df, bigram_df, trigram_df, quadrigram_df = (
    pd.read_csv(f"{order}.csv")
    for order in ("unigram", "bigram", "trigram", "quadrigram")
)

# Quick look at the first rows of the unigram table.
print("Unigram sample:")
display(unigram_df.head())


Unigram sample:


Unnamed: 0,Ngram,Count,Raw,Add-One,Add-K,Token-Type
0,.,2039988,0.078923,0.076459,0.077426,4e-06
1,<s>,1812421,0.070119,0.06793,0.068789,4e-06
2,</s>,1812421,0.070119,0.06793,0.068789,4e-06
3,‌,967629,0.037436,0.036267,0.036726,3e-06
4,",",710485,0.027487,0.026629,0.026966,2e-06


In [None]:
import random, re

# Read the news corpus; only the 'body' column is used below.
bbc_df = pd.read_csv("train_telugu_news.csv")

# A sentence is a run of non-terminator characters followed by exactly one
# terminator (danda, full stop, '!' or '?'). Text after the last terminator
# of a body does not match the pattern and is therefore not collected.
sentence_pattern = re.compile(r'[^।.!?]+[।.!?]')

# Wrap every extracted sentence in boundary markers, skipping blank matches.
sentences = [
    f"<s> {piece.strip()} </s>"
    for text in bbc_df['body'].dropna().tolist()
    for piece in sentence_pattern.findall(text)
    if piece.strip()
]

# Fixed seed so the same sentences are drawn on every run.
random.seed(42)
sample_size = min(1000, len(sentences))
sampled_sentences = random.sample(sentences, sample_size)

print("Total sampled sentences:", len(sampled_sentences))
print("Example sentence:", sampled_sentences[0])

Total sampled sentences: 1000
Example sentence: <s> బాలీవుడ్‌లో మాఫియా అన్న పదం వినిపిస్తే చాలు అందరి కళ్లూ సంజయ్ దత్ వైపే చూస్తాయి. </s>


In [None]:
import numpy as np

def sentence_logprob(sentence, model, smoothing='Add-One', unseen_prob=1e-12):
    """
    Compute the log probability of a sentence under an n-gram table.

    The sentence is split on whitespace and every window of ``n`` consecutive
    tokens is looked up in ``model['Ngram']``; the natural logs of the
    matching probabilities are summed. ``n`` is inferred from the number of
    whitespace-separated tokens in the first row's ``Ngram`` value.

    Parameters
    ----------
    sentence : str
        Whitespace-tokenised sentence (boundary markers, if any, included).
    model : pandas.DataFrame
        Table with an ``Ngram`` column (tokens joined by single spaces) and
        one probability column per smoothing scheme.
    smoothing : str, default 'Add-One'
        Name of the probability column to read.
    unseen_prob : float, default 1e-12
        Probability substituted for n-grams that are absent from the table
        or whose stored probability is not strictly positive (including NaN).

    Returns
    -------
    float
        Sum of log probabilities. A sentence with fewer than ``n`` tokens
        contains no n-gram and therefore scores 0.0.

    Raises
    ------
    ValueError
        If ``unseen_prob`` is not strictly positive.
    KeyError
        If ``smoothing`` is not a column of ``model``.
    """
    if not unseen_prob > 0:
        raise ValueError("unseen_prob must be a positive probability")

    tokens = sentence.split()

    # Order of the model, taken from its first entry (as the table is assumed
    # to hold n-grams of a single order). Clamp to 1 so a degenerate first
    # entry cannot produce negative window starts.
    n = max(len(model['Ngram'].iloc[0].split()), 1)

    # Every window of n consecutive tokens, in sentence order.
    ngrams = [' '.join(tokens[i - n + 1:i + 1]) for i in range(n - 1, len(tokens))]
    if not ngrams:
        return 0.0

    # Scan the table once per call instead of once per token: keep only the
    # rows whose n-gram occurs in this sentence, then resolve by dict lookup.
    mask = model['Ngram'].isin(set(ngrams))
    probs = {}
    for ngram, p in zip(model.loc[mask, 'Ngram'], model.loc[mask, smoothing]):
        # First occurrence wins, matching a top-down search of the table.
        probs.setdefault(ngram, p)

    floor_log = np.log(unseen_prob)
    log_prob = 0.0
    for ngram in ngrams:
        p = probs.get(ngram)
        # `not p > 0` also catches NaN, which must fall back to the floor.
        if p is None or not p > 0:
            log_prob += floor_log
        else:
            log_prob += np.log(p)

    return log_prob


In [5]:
# Preview: Add-One log-probabilities of the first ten sampled sentences
# under each n-gram order, one row per sentence.
results = []
for s in sampled_sentences[:10]:
    res = {'Sentence': s}
    for label, table in (('Unigram', unigram_df),
                         ('Bigram', bigram_df),
                         ('Trigram', trigram_df),
                         ('Quadrigram', quadrigram_df)):
        res[f"{label}_AddOne"] = sentence_logprob(s, table, 'Add-One')
    results.append(res)

pd.DataFrame(results)


Unnamed: 0,Sentence,Unigram_AddOne,Bigram_AddOne,Trigram_AddOne,Quadrigram_AddOne
0,<s> బాలీవుడ్‌లో మాఫియా అన్న పదం వినిపిస్తే చాల...,-170.549231,-329.74715,-331.572253,-303.941232
1,<s> మరిన్ని వివరాలు తెలియాల్సి ఉంది. </s>,-62.359425,-90.465366,-78.968455,-69.430591
2,<s> లేకుంటే ఆయనపై చెప్పుల దాడికి కూడా వెనుకాడబ...,-114.826491,-244.338134,-248.67919,-221.048169
3,<s> ' అంటూ సాగిపోతున్న ఈ పాట ఫ్యామిలీ ఆడియన్స్...,-122.377517,-223.406164,-248.67919,-221.048169
4,"<s> వాళ్లని సూటిగా గుర్తిస్తారు ఇతర పాత్రలూ, క...",-124.853547,-234.780327,-221.048169,-193.417148
5,<s> కాంగ్రె్‌సతో తన జీవిత కాలంపాటు ములాయం పోరా...,-101.826326,-178.808043,-165.786127,-138.155106
6,<s> మరిన్ని యూనిట్లు ఏర్పాటు చేసేందుకు ఔత్సాహి...,-103.176997,-216.751954,-221.048169,-193.417148
7,<s> స్టాలిన్ కూడా పన్నీరు సెల్వం వైపే మొగ్గుచూ...,-100.712032,-192.313673,-193.417148,-165.786127
8,<s> మస్లిజ్ పార్టీకి ఓటు వేయడం ద్వారా ప్రజలు త...,-154.690902,-288.476803,-331.572253,-303.941232
9,<s> జైలుకు వెళ్ళడం లాలూకు కొత్తకాదు. </s>,-70.79254,-124.488044,-110.524084,-82.893063


In [6]:
# Score every sampled sentence under each (n-gram order, smoothing) pair,
# then persist the table and show its first rows.
ngram_models = [
    ('Unigram', unigram_df),
    ('Bigram', bigram_df),
    ('Trigram', trigram_df),
    ('Quadrigram', quadrigram_df),
]
smoothing_columns = ['Add-One', 'Add-K', 'Token-Type']

final_results = []
for s in sampled_sentences:
    res = {'Sentence': s}
    # Column order: all smoothings of one model before moving to the next.
    for model_name, model in ngram_models:
        for smoothing in smoothing_columns:
            res[f"{model_name}_{smoothing}"] = sentence_logprob(s, model, smoothing)
    final_results.append(res)

final_df = pd.DataFrame(final_results)
final_df.to_csv("sentence_logprobabilities.csv", index=False)
final_df.head()



Unnamed: 0,Sentence,Unigram_Add-One,Unigram_Add-K,Unigram_Token-Type,Bigram_Add-One,Bigram_Add-K,Bigram_Token-Type,Trigram_Add-One,Trigram_Add-K,Trigram_Token-Type,Quadrigram_Add-One,Quadrigram_Add-K,Quadrigram_Token-Type
0,<s> బాలీవుడ్‌లో మాఫియా అన్న పదం వినిపిస్తే చాల...,-170.549231,-170.440078,-216.530185,-329.74715,-329.737498,-331.206661,-331.572253,-331.572253,-331.572253,-303.941232,-303.941232,-303.941232
1,<s> మరిన్ని వివరాలు తెలియాల్సి ఉంది. </s>,-62.359425,-62.297752,-93.476524,-90.465366,-90.433431,-96.159702,-78.968455,-77.961855,-86.964694,-69.430591,-68.96059,-71.73317
2,<s> లేకుంటే ఆయనపై చెప్పుల దాడికి కూడా వెనుకాడబ...,-114.826491,-114.745691,-161.520175,-244.338134,-244.316927,-248.313159,-248.67919,-248.67919,-248.67919,-221.048169,-221.048169,-221.048169
3,<s> ' అంటూ సాగిపోతున్న ఈ పాట ఫ్యామిలీ ఆడియన్స్...,-122.377517,-122.281976,-175.344752,-223.406164,-223.370056,-234.307167,-248.67919,-248.67919,-248.67919,-221.048169,-221.048169,-221.048169
4,"<s> వాళ్లని సూటిగా గుర్తిస్తారు ఇతర పాత్రలూ, క...",-124.853547,-124.76393,-161.992245,-234.780327,-234.781649,-234.680951,-221.048169,-221.048169,-221.048169,-193.417148,-193.417148,-193.417148
