In [2]:
import pandas as pd
import numpy as np
import math
from collections import Counter

def load_counter(csv_file):
    df = pd.read_csv(csv_file)
    df["Ngram"] = df["Ngram"].astype(str)   
    df["Count"] = df["Count"].astype(int)
    return Counter(dict(zip(df["Ngram"], df["Count"])))

unigram_c    = load_counter("../lab4/unigram.csv")
bigram_c     = load_counter("../lab4/bigram.csv")
trigram_c    = load_counter("../lab4/trigram.csv")
quadrigram_c = load_counter("../lab4/quadrigram.csv")

print("Unigrams:", len(unigram_c))
print("Bigrams:", len(bigram_c))
print("Trigrams:", len(trigram_c))
print("Quadrigrams:", len(quadrigram_c))


Unigrams: 200000
Bigrams: 200000
Trigrams: 200000
Quadrigrams: 200000


In [3]:
vocab = set()
for ng in unigram_c:
    for w in ng.split():
        vocab.add(w)

vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)


Vocabulary size: 200000


In [4]:
def good_turing_probs(counter, vocab_size, n):
    N = sum(counter.values())
    freq_of_freq = Counter(counter.values())
    N1 = freq_of_freq[1]

    probs = {}
    for ng, c in counter.items():
        Nc = freq_of_freq[c]
        Nc1 = freq_of_freq.get(c+1, 0)
        if Nc > 0:
            c_star = (c+1) * Nc1 / Nc
        else:
            c_star = c
        probs[ng] = c_star / N

    # unseen events
    if n == 1:
        unseen_count = vocab_size - len(counter)
    else:
        unseen_count = vocab_size**n - len(counter)

    p_unseen = (N1 / N) / max(1, unseen_count)
    return probs, p_unseen

uni_probs, uni_pu   = good_turing_probs(unigram_c, vocab_size, 1)
bi_probs, bi_pu     = good_turing_probs(bigram_c, vocab_size, 2)
tri_probs, tri_pu   = good_turing_probs(trigram_c, vocab_size, 3)
quad_probs, quad_pu = good_turing_probs(quadrigram_c, vocab_size, 4)

print("Good-Turing models built successfully.")


Good-Turing models built successfully.


In [None]:
def sentence_log_prob(sentence, probs, p_unseen, n):
    tokens = ["<s>"]*(n-1) + sentence.strip().split() + ["</s>"]
    log_prob = 0.0
    for i in range(len(tokens)-n+1):
        ng = " ".join(tokens[i:i+n])
        p = probs.get(ng, p_unseen)
        log_prob += math.log(p if p > 0 else 1e-15)
    return log_prob

def sentence_perplexity(sentence, probs, p_unseen, n):
    tokens = ["<s>"]*(n-1) + sentence.strip().split() + ["</s>"]
    log_prob = 0.0
    for i in range(len(tokens)-n+1):
        ng = " ".join(tokens[i:i+n])
        p = probs.get(ng, p_unseen)
        log_prob += math.log(p if p > 0 else 1e-15)
    length = len(tokens)   
    return math.exp(-log_prob / length)


In [6]:
val = pd.read_csv("val_sentences.csv")
test = pd.read_csv("test_sentences.csv")

val_sentences = val["sentence"].tolist()
test_sentences = test["sentence"].tolist()

print("Validation sentences:", len(val_sentences))
print("Test sentences:", len(test_sentences))


Validation sentences: 1000
Test sentences: 1000


In [7]:
models = [
    ("Unigram", uni_probs, uni_pu, 1),
    ("Bigram", bi_probs, bi_pu, 2),
    ("Trigram", tri_probs, tri_pu, 3),
    ("Quadgram", quad_probs, quad_pu, 4)
]

for s in val_sentences[:10]:
    print(f"\nSentence: {s}")
    for name, probs, pu, n in models:
        logp = sentence_log_prob(s, probs, pu, n)
        ppl  = sentence_perplexity(s, probs, pu, n)
        print(f"  {name:8s} -> LogProb: {logp:.4f}, Perplexity: {ppl:.4f}")



Sentence: అయితే బర్డ్ ఫ్లూ మనుషులకు వ్యాప్తి చెందే అవకాశం ఉన్నప్పటికీ ఇప్పటివరకు ఈ దేశంలో ఎవరికీ బర్డ్ ఫ్లూ లక్షణాలు కనిపించలేదని , ఈ విషయంలో భయపడాల్సిన అవసరం లేదని కేంద్ర స్పష్టం చేసింది .
  Unigram  -> LogProb: -704.1796, Perplexity: 212185432381.7236
  Bigram   -> LogProb: -684.4864, Perplexity: 41374991970.1936
  Trigram  -> LogProb: -838.1928, Perplexity: 3568618558190.1792
  Quadgram -> LogProb: -932.5470, Perplexity: 31622776601683.1992

Sentence: దీంతో ఫిరోజ్ , ఆమె ప్రియుడ్ని అదుపులోకి తీసుకుని పోలీసులు విచారిస్తున్నారు .
  Unigram  -> LogProb: -289.4139, Perplexity: 266956658704.1897
  Bigram   -> LogProb: -286.0461, Perplexity: 22508827411.8606
  Trigram  -> LogProb: -333.3403, Perplexity: 136769190445.2947
  Quadgram -> LogProb: -356.7269, Perplexity: 116422439110.2750

Sentence: ఇంటికి వెళ్లి ‘ చై ’ కి ఓ పెద్ద హగ్ ‌ ఇవ్వాలని ఆతృతగా ఉంది ’ అని ట్వీట్ ‌ చేశారు నాగ్ .
  Unigram  -> LogProb: -613.2157, Perplexity: 4805334177372.3389
  Bigram   -> LogProb: -450.5850, Perplexit

In [8]:

def good_turing_table(counter, top_k=100):
    freq_of_freq = Counter(counter.values())
    rows = []
    
    for c in sorted(freq_of_freq.keys())[:top_k]:
        Nc = freq_of_freq[c]
        Nc1 = freq_of_freq.get(c+1, 0)
        if Nc > 0:
            c_star = (c+1) * Nc1 / Nc
        else:
            c_star = c
        rows.append((c, Nc, c_star))
    
    df = pd.DataFrame(rows, columns=["c", "Nc", "c*"])
    return df

uni_table  = good_turing_table(unigram_c)
bi_table   = good_turing_table(bigram_c)
tri_table  = good_turing_table(trigram_c)
quad_table = good_turing_table(quadrigram_c)

print("Top Good-Turing frequencies for Unigrams:")
display(uni_table.head(20))

print("\nTop Good-Turing frequencies for Bigrams:")
display(bi_table.head(20))

print("\nTop Good-Turing frequencies for Trigrams:")
display(tri_table.head(20))

print("\nTop Good-Turing frequencies for Quadrigrams:")
display(quad_table.head(20))


Top Good-Turing frequencies for Unigrams:


Unnamed: 0,c,Nc,c*
0,3,17564,6.456844
1,4,28352,3.372954
2,5,19126,4.416397
3,6,14078,5.374059
4,7,10808,6.612139
5,8,8933,7.239897
6,9,7186,8.352352
7,10,6002,9.363379
8,11,5109,10.200822
9,12,4343,11.772738



Top Good-Turing frequencies for Bigrams:


Unnamed: 0,c,Nc,c*
0,10,11565,16.347341
1,11,17187,10.33409
2,12,14801,11.049253
3,13,12580,11.828776
4,14,10629,12.981936
5,15,9199,14.098924
6,16,8106,15.076857
7,17,7189,16.139658
8,18,6446,16.742166
9,19,5680,18.017606



Top Good-Turing frequencies for Trigrams:


Unnamed: 0,c,Nc,c*
0,6,18056,11.351739
1,7,29281,5.877395
2,8,21512,6.927389
3,9,16558,7.912187
4,10,13101,8.874055
5,11,10569,9.869997
6,12,8693,10.985621
7,13,7346,11.777838
8,14,6180,12.951456
9,15,5336,14.50075



Top Good-Turing frequencies for Quadrigrams:


Unnamed: 0,c,Nc,c*
0,4,55323,3.15818
1,5,34944,3.807005
2,6,22172,4.826944
3,7,15289,5.785597
4,8,11057,6.742064
5,9,8283,7.81118
6,10,6470,8.832303
7,11,5195,9.729355
8,12,4212,10.929012
9,13,3541,11.528947


In [12]:


def mle_prob_ngram(ngram_str, n, counts_dict, vocab_size=None):
    """
    Compute MLE probability with add-one smoothing
    """
    if n == 1:
        total = sum(counts_dict[1].values()) + (vocab_size if vocab_size else 0)
        return (counts_dict[1].get(ngram_str, 0) + 1) / total
    prefix = " ".join(ngram_str.split()[:-1])
    prefix_count = counts_dict[n-1].get(prefix, 0)
    
    if prefix_count == 0:
        return 1 / (sum(counts_dict[n-1].values()) + (vocab_size if vocab_size else 0))
    return (counts_dict[n].get(ngram_str, 0) + 1) / (prefix_count + (vocab_size if vocab_size else 0))


def estimate_lambdas_deleted(quad_counts, tri_counts, bi_counts, uni_counts, vocab_size, top_k=5000):
    lambda4 = lambda3 = lambda2 = lambda1 = 0.0
    quad_ngrams = list(quad_counts.keys())[:top_k]  
    for qg in quad_ngrams:
        words = qg.split()
        if len(words) != 4:
            continue
        w1,w2,w3,w4 = words

        p4 = mle_prob_ngram(f"{w1} {w2} {w3} {w4}", 4, {1: uni_counts,2: bi_counts,3: tri_counts,4: quad_counts}, vocab_size)
        p3 = mle_prob_ngram(f"{w2} {w3} {w4}", 3, {1: uni_counts,2: bi_counts,3: tri_counts,4: quad_counts}, vocab_size)
        p2 = mle_prob_ngram(f"{w3} {w4}", 2, {1: uni_counts,2: bi_counts,3: tri_counts,4: quad_counts}, vocab_size)
        p1 = mle_prob_ngram(f"{w4}", 1, {1: uni_counts,2: bi_counts,3: tri_counts,4: quad_counts}, vocab_size)

        probs = [p4, p3, p2, p1]
        best = probs.index(max(probs))
        count = quad_counts[qg]

        if best == 0: lambda4 += count
        elif best == 1: lambda3 += count
        elif best == 2: lambda2 += count
        else: lambda1 += count

    total = lambda1 + lambda2 + lambda3 + lambda4
    return [l/total for l in [lambda4, lambda3, lambda2, lambda1]]

lambdas = estimate_lambdas_deleted(quadrigram_c, trigram_c, bigram_c, unigram_c, vocab_size)
print("Estimated λ weights (quad, tri, bi, uni):", lambdas)


def interpolated_prob_quad(w1, w2, w3, w4, lambdas, vocab_size):
    p4 = mle_prob_ngram(f"{w1} {w2} {w3} {w4}", 4, {1: unigram_c,2: bigram_c,3: trigram_c,4: quadrigram_c}, vocab_size)
    p3 = mle_prob_ngram(f"{w2} {w3} {w4}", 3, {1: unigram_c,2: bigram_c,3: trigram_c,4: quadrigram_c}, vocab_size)
    p2 = mle_prob_ngram(f"{w3} {w4}", 2, {1: unigram_c,2: bigram_c,3: trigram_c,4: quadrigram_c}, vocab_size)
    p1 = mle_prob_ngram(f"{w4}", 1, {1: unigram_c,2: bigram_c,3: trigram_c,4: quadrigram_c}, vocab_size)
    λ4, λ3, λ2, λ1 = lambdas
    return λ4*p4 + λ3*p3 + λ2*p2 + λ1*p1


example_prob = interpolated_prob_quad("the","quick","brown","fox", lambdas, vocab_size)
print("Interpolated probability for 'the quick brown fox':", example_prob)


Estimated λ weights (quad, tri, bi, uni): [0.07179329861723585, 0.10951752805024886, 0.6698238375347991, 0.1488653357977161]
Interpolated probability for 'the quick brown fox': 5.171158097520929e-08
