In [1]:
# %%
# Cell 1: Imports & Counter Loader
import pandas as pd
import numpy as np
import math
from collections import Counter
import os

# Robust CSV counter loader
def load_counter(csv_file):
    if not os.path.exists(csv_file):
        raise FileNotFoundError(f"{csv_file} not found!")
    
    df = pd.read_csv(csv_file)
    
    # Normalize column names
    df.columns = df.columns.str.strip().str.lower()
    
    # Detect columns
    ngram_col = "ngram" if "ngram" in df.columns else "word"
    count_col = "count"
    
    df[ngram_col] = df[ngram_col].astype(str)
    df[count_col] = df[count_col].astype(int)
    
    return Counter(dict(zip(df[ngram_col], df[count_col])))

# Load n-gram counts
unigram_c    = load_counter("../ASSIGNMENT-4/unigram.csv")
bigram_c     = load_counter("../ASSIGNMENT-4/bigram.csv")
trigram_c    = load_counter("../ASSIGNMENT-4/trigram.csv")
quadrigram_c = load_counter("../ASSIGNMENT-4/quadrigram.csv")

print("Unigrams:", len(unigram_c))
print("Bigrams:", len(bigram_c))
print("Trigrams:", len(trigram_c))
print("Quadrigrams:", len(quadrigram_c))


# %%
# Cell 2: Build Vocabulary
vocab = set()
for ng in unigram_c:
    for w in ng.split():
        vocab.add(w)

vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)


# %%
# Cell 3: Good-Turing Smoothing
def good_turing_probs(counter, vocab_size, n):
    N = sum(counter.values())
    freq_of_freq = Counter(counter.values())
    N1 = freq_of_freq.get(1, 0)

    probs = {}
    for ng, c in counter.items():
        Nc = freq_of_freq.get(c, 0)
        Nc1 = freq_of_freq.get(c+1, 0)
        if Nc > 0:
            c_star = (c+1) * Nc1 / Nc
        else:
            c_star = c
        probs[ng] = c_star / N

    # unseen events
    if n == 1:
        unseen_count = vocab_size - len(counter)
    else:
        unseen_count = vocab_size**n - len(counter)

    p_unseen = (N1 / N) / max(1, unseen_count)
    return probs, p_unseen

# Build models
uni_probs, uni_pu   = good_turing_probs(unigram_c, vocab_size, 1)
bi_probs, bi_pu     = good_turing_probs(bigram_c, vocab_size, 2)
tri_probs, tri_pu   = good_turing_probs(trigram_c, vocab_size, 3)
quad_probs, quad_pu = good_turing_probs(quadrigram_c, vocab_size, 4)

print("Good-Turing models built successfully.")

Unigrams: 60951
Bigrams: 231960
Trigrams: 302928
Quadrigrams: 326143
Vocabulary size: 60951
Good-Turing models built successfully.


In [2]:
# Cell 4: Sentence Probability (log-space to avoid underflow)
def sentence_log_prob(sentence, probs, p_unseen, n):
    tokens = ["<s>"]*(n-1) + sentence.strip().split() + ["</s>"]
    log_prob = 0.0
    for i in range(len(tokens)-n+1):
        ng = " ".join(tokens[i:i+n])
        p = probs.get(ng, p_unseen)
        log_prob += math.log(p if p > 0 else 1e-15)
    return log_prob

def sentence_perplexity(sentence, probs, p_unseen, n):
    tokens = ["<s>"]*(n-1) + sentence.strip().split() + ["</s>"]
    log_prob = 0.0
    for i in range(len(tokens)-n+1):
        ng = " ".join(tokens[i:i+n])
        p = probs.get(ng, p_unseen)
        log_prob += math.log(p if p > 0 else 1e-15)
    length = len(tokens)   # include <s> and </s>
    return math.exp(-log_prob / length)


In [4]:
# Cell 5: Load validation & test sets
val = pd.read_csv("validation.csv")
test = pd.read_csv("test.csv")

val_sentences = val["sentence"].tolist()
test_sentences = test["sentence"].tolist()

print("Validation sentences:", len(val_sentences))
print("Test sentences:", len(test_sentences))


Validation sentences: 1000
Test sentences: 1000


In [5]:
# Cell 6: Evaluate on first 10 validation sentences with all models
models = [
    ("Unigram", uni_probs, uni_pu, 1),
    ("Bigram", bi_probs, bi_pu, 2),
    ("Trigram", tri_probs, tri_pu, 3),
    ("Quadgram", quad_probs, quad_pu, 4)
]

for s in val_sentences[:10]:
    print(f"\nSentence: {s}")
    for name, probs, pu, n in models:
        logp = sentence_log_prob(s, probs, pu, n)
        ppl  = sentence_perplexity(s, probs, pu, n)
        print(f"  {name:8s} -> LogProb: {logp:.4f}, Perplexity: {ppl:.4f}")



Sentence: దేశ రాజకీయ వ్యవస్థను ఒప్పించి తెలంగాణ ప్రత్యేక రాష్ట్రంగా ఏర్పడిందన్నారు. ఇప్పుడు తెలంగాణ పునర్నిర్మాణం జరుగుతుందన్నారు. తెలంగాణ పునర్నిర్మాణం కూడా ఓ ఉద్యమమే అన్నారు. బంగారు తెలంగాణ కోసం పాటుపడదామన్నారు. తుమ్మల రాజకీయ దురంధరుడని, ఖమ్మం జిల్లాలో పార్టీని ముందుకు తీసుకెళ్లే ప్రధాన బాధ్యత తీసుకోవాలన్నారు.
  Unigram  -> LogProb: -419.4797, Perplexity: 228122.5615
  Bigram   -> LogProb: -719.5151, Perplexity: 847308059.0045
  Trigram  -> LogProb: -1108.0479, Perplexity: 23291471257134.5820
  Quadgram -> LogProb: -1469.4469, Perplexity: 176974433131128416.0000

Sentence: మీడియాపార్ట్‌ జర్నలిస్ట్ పాల్ జెస్నియెర్ తన కథనంలో ఏం చెప్పారు?
  Unigram  -> LogProb: -84.6525, Perplexity: 12159.0497
  Bigram   -> LogProb: -202.9618, Perplexity: 652406983.4803
  Trigram  -> LogProb: -299.1719, Perplexity: 648188584421.9867
  Quadgram -> LogProb: -397.6604, Perplexity: 246495651245056.7188

Sentence: గురుకులాల్లో ప్రైవేటు భాగస్వామ్యం
  Unigram  -> LogProb: -23.0958, Perplexity: 321.8076
  Bigr

In [6]:
# Cell 7: Task 3 - Good-Turing Frequency Tables

def good_turing_table(counter, top_k=100):
    # Build frequency-of-frequency
    freq_of_freq = Counter(counter.values())
    rows = []
    
    for c in sorted(freq_of_freq.keys())[:top_k]:
        Nc = freq_of_freq[c]
        Nc1 = freq_of_freq.get(c+1, 0)
        if Nc > 0:
            c_star = (c+1) * Nc1 / Nc
        else:
            c_star = c
        rows.append((c, Nc, c_star))
    
    df = pd.DataFrame(rows, columns=["c", "Nc", "c*"])
    return df

# Generate tables
uni_table  = good_turing_table(unigram_c)
bi_table   = good_turing_table(bigram_c)
tri_table  = good_turing_table(trigram_c)
quad_table = good_turing_table(quadrigram_c)

print("Top Good-Turing frequencies for Unigrams:")
display(uni_table.head(20))

print("\nTop Good-Turing frequencies for Bigrams:")
display(bi_table.head(20))

print("\nTop Good-Turing frequencies for Trigrams:")
display(tri_table.head(20))

print("\nTop Good-Turing frequencies for Quadrigrams:")
display(quad_table.head(20))


Top Good-Turing frequencies for Unigrams:


Unnamed: 0,c,Nc,c*
0,1,37959,0.435891
1,2,8273,1.362384
2,3,3757,2.364653
3,4,2221,3.226024
4,5,1433,4.53873
5,6,1084,4.856089
6,7,752,6.191489
7,8,582,7.252577
8,9,469,8.358209
9,10,392,9.456633



Top Good-Turing frequencies for Bigrams:


Unnamed: 0,c,Nc,c*
0,1,204202,0.153123
1,2,15634,0.92318
2,3,4811,1.834962
3,4,2207,2.734481
4,5,1207,4.076222
5,6,820,4.52439
6,7,530,5.781132
7,8,383,6.344648
8,9,270,7.851852
9,10,212,8.924528



Top Good-Turing frequencies for Trigrams:


Unnamed: 0,c,Nc,c*
0,1,290065,0.055794
1,2,8092,0.737395
2,3,1989,1.695324
3,4,843,2.301305
4,5,388,4.329897
5,6,280,4.95
6,7,198,5.373737
7,8,133,6.293233
8,9,93,10.107527
9,10,94,8.893617



Top Good-Turing frequencies for Quadrigrams:


Unnamed: 0,c,Nc,c*
0,1,319695,0.023529
1,2,3761,0.752991
2,3,944,1.919492
3,4,453,2.328918
4,5,211,4.720379
5,6,166,5.73494
6,7,136,4.176471
7,8,71,8.239437
8,9,65,11.692308
9,10,76,7.092105
