In [None]:
import csv
import pandas as pd
from collections import Counter, defaultdict

# Load dataset
# The parquet file is expected to provide a "sentence" column (read below).
df = pd.read_parquet("tokenized_gujarati_sentences.parquet")

# Extract sentences
# Materialise the column as a plain Python list for the comprehension below.
sentences = df["sentence"].tolist()

# --- Step 1: Tokenization ---
# For Gujarati, simple whitespace split works as a baseline
# Each sentence becomes a list of whitespace-delimited tokens.
tokenized_sentences = [s.strip().split() for s in sentences]

# --- Step 2: Function to build n-gram models ---
def build_ngram_model(sentences, n):
    """Estimate maximum-likelihood n-gram probabilities from token lists.

    Every sentence is padded with ``n - 1`` ``"<s>"`` markers on the left and
    one ``"</s>"`` marker on the right before its n-grams are counted.

    Args:
        sentences: Iterable of token lists.
        n: Order of the model (1 = unigram, 2 = bigram, ...).

    Returns:
        A ``defaultdict(dict)`` mapping a context tuple (the first ``n - 1``
        tokens of the n-gram; ``()`` for unigrams) to ``{word: probability}``.
        Unigram probabilities are ``count / total tokens``; higher orders use
        ``count(n-gram) / count(context)``.
    """
    ngram_counts = Counter()
    context_counts = Counter()
    left_pad = ["<s>"] * (n - 1)

    for sentence in sentences:
        padded = left_pad + sentence + ["</s>"]
        for start in range(len(padded) - n + 1):
            gram = tuple(padded[start:start + n])
            ngram_counts[gram] += 1
            if n > 1:
                # Each n-gram occurrence is also one occurrence of its context.
                context_counts[gram[:-1]] += 1

    model = defaultdict(dict)

    if n == 1:
        # No context for unigrams: normalise by the total number of tokens.
        total_tokens = sum(ngram_counts.values())
        for gram, count in ngram_counts.items():
            model[()][gram[0]] = count / total_tokens
        return model

    for gram, count in ngram_counts.items():
        context, word = gram[:-1], gram[-1]
        model[context][word] = count / context_counts[context]
    return model


# --- Step 3: Build models ---
# One MLE model per order, from 1 (unigram) through 4 (quadrigram).
unigram_model, bigram_model, trigram_model, quadrigram_model = (
    build_ngram_model(tokenized_sentences, order) for order in (1, 2, 3, 4)
)

# --- Step 4: Save models to CSV ---

# Function to save n-gram model into CSV
def save_ngram_to_csv(model, ngram_type, filename):
    """Write an n-gram model to a UTF-8 CSV file, one row per (context, word).

    Args:
        model: Mapping of context tuple -> ``{word: probability}``.
        ngram_type: Label written to the ``Ngram_Type`` column of every row.
        filename: Destination path; an existing file is overwritten.
    """
    header = ["Ngram_Type", "Context", "Word", "Probability"]
    with open(filename, "w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        # The context tuple is flattened into one space-separated cell.
        writer.writerows(
            [ngram_type, " ".join(context), word, prob]
            for context, next_words in model.items()
            for word, prob in next_words.items()
        )


# Save each model separately
for _label, _model, _path in (
    ("Unigram", unigram_model, "unigram_probs.csv"),
    ("Bigram", bigram_model, "bigram_probs.csv"),
    ("Trigram", trigram_model, "trigram_probs.csv"),
    ("Quadrigram", quadrigram_model, "quadrigram_probs.csv"),
):
    save_ngram_to_csv(_model, _label, _path)

print("✅ All n-gram probabilities saved into CSV files successfully!")



✅ All n-gram probabilities saved into CSV files successfully!


In [None]:
import pandas as pd
import random, math
from collections import Counter

# -------------------------
# Load dataset
# -------------------------
# Re-read the corpus and whitespace-tokenize the "sentence" column.
df = pd.read_parquet("tokenized_gujarati_sentences.parquet")
sentences = df["sentence"].tolist()
tokenized_sentences = [s.strip().split() for s in sentences]

# -------------------------
# Step 1: Train/Val/Test Split
# -------------------------
# Fixed seed so the in-place shuffle (and therefore the split) is repeatable.
random.seed(42)
random.shuffle(tokenized_sentences)
# First 1000 shuffled sentences -> validation, next 1000 -> test, rest -> train.
val_set  = tokenized_sentences[:1000]
test_set = tokenized_sentences[1000:2000]
train_set = tokenized_sentences[2000:]

# Build vocabulary from training set
# The boundary markers are added explicitly; V is the number of distinct types.
vocab = set(["<s>", "</s>"])
for sent in train_set:
    vocab.update(sent)
V = len(vocab)

# -------------------------
# Step 2: Build N-gram Counts
# -------------------------
def build_ngram_counts(sentences, n):
    """Count every n-gram occurring in the padded sentences.

    Args:
        sentences: Iterable of token lists.
        n: n-gram order.

    Returns:
        A ``Counter`` keyed by n-gram tuple. Each sentence is padded with
        ``n - 1`` ``"<s>"`` markers on the left and one ``"</s>"`` on the right.
    """
    left_pad = ["<s>"] * (n - 1)
    counts = Counter()
    for sentence in sentences:
        seq = left_pad + sentence + ["</s>"]
        counts.update(tuple(seq[k:k + n]) for k in range(len(seq) - n + 1))
    return counts

# Training-set counts for each order, from 1 (unigram) through 4 (quadrigram).
unigram_counts, bigram_counts, trigram_counts, quadrigram_counts = (
    build_ngram_counts(train_set, order) for order in (1, 2, 3, 4)
)

# -------------------------
# Step 3: Good-Turing Model
# -------------------------
def good_turing_model(ngram_counts: Counter, vocab_size: int, n: int):
    """Build Good-Turing estimates from a table of n-gram counts.

    Args:
        ngram_counts: Counter keyed by n-gram tuple.
        vocab_size: Number of word types; ``vocab_size ** n`` is taken as the
            number of possible n-grams.
        n: n-gram order.

    Returns:
        A dict with keys:
            ``"seen_probs"``: n-gram -> ``c* / N`` where
                ``c* = (c + 1) * N_{c+1} / N_c``, falling back to the raw
                count ``c`` when no n-gram occurs ``c + 1`` times;
            ``"unseen_prob"``: ``(N_1 / N)`` shared equally among the unseen
                n-gram types (0.0 when there are none or the table is empty);
            ``"Nc"``: count -> number of n-gram types with that count;
            ``"N"``: total number of n-gram occurrences;
            ``"num_unseen_types"``: possible n-grams minus observed ones.
    """
    N = sum(ngram_counts.values())
    Nc = Counter(ngram_counts.values())

    # vocab_size ** 1 == vocab_size, so one expression covers every order.
    num_unseen_types = max(pow(vocab_size, n) - len(ngram_counts), 0)

    if num_unseen_types > 0 and N > 0:
        unseen_prob = (Nc.get(1, 0) / N) / num_unseen_types
    else:
        unseen_prob = 0.0

    # The adjusted count depends only on c, so compute it once per distinct c.
    adjusted = {}
    for c, types_with_c in Nc.items():
        types_with_next = Nc.get(c + 1, 0)
        if types_with_c > 0 and types_with_next > 0:
            adjusted[c] = (c + 1) * (types_with_next / types_with_c)
        else:
            adjusted[c] = c

    seen_probs = {
        ng: (adjusted[c] / N if N > 0 else 0.0)
        for ng, c in ngram_counts.items()
    }

    return {
        "seen_probs": seen_probs,
        "unseen_prob": unseen_prob,
        "Nc": dict(Nc),
        "N": N,
        "num_unseen_types": num_unseen_types,
    }

# Good-Turing estimates for each order, all using the training vocabulary size V.
unigram_gt, bigram_gt, trigram_gt, quadrigram_gt = (
    good_turing_model(counts, V, order)
    for order, counts in enumerate(
        (unigram_counts, bigram_counts, trigram_counts, quadrigram_counts),
        start=1,
    )
)

# -------------------------
# Step 4: Sentence Probability
# -------------------------
def sentence_logprob(tokens, gt_model, n):
    """Sum the log Good-Turing probabilities of every n-gram in a sentence.

    Args:
        tokens: List of tokens (without boundary markers).
        gt_model: Dict with ``"seen_probs"`` (n-gram -> probability) and
            ``"unseen_prob"`` (probability used for any other n-gram).
        n: n-gram order; the sentence is padded with ``n - 1`` ``"<s>"``
            markers and one ``"</s>"`` marker.

    Returns:
        The sum of ``log(p + 1e-12)`` over the padded sentence's n-grams.
    """
    padded = ["<s>"] * (n - 1) + tokens + ["</s>"]
    total = 0.0
    for end in range(n, len(padded) + 1):
        gram = tuple(padded[end - n:end])
        seen = gt_model["seen_probs"]
        prob = seen[gram] if gram in seen else gt_model["unseen_prob"]
        # The small epsilon keeps log() defined when prob is exactly 0.
        total += math.log(prob + 1e-12)
    return total

# Example: evaluate first validation sentence under bigram model
# The header line is printed first, then the summed log-probability.
print("\nExample validation sentence log-prob (bigram GT):")
print(sentence_logprob(val_set[0], bigram_gt, n=2))

# -------------------------
# Step 5: Frequency Tables
# -------------------------
def build_frequency_table(ngram_counts: Counter, vocab_size: int, n: int, top_k: int = 100):
    """Build, save and return a Good-Turing frequency-of-frequencies table.

    Row ``c`` describes the n-grams seen exactly ``c`` times:
        ``"C (MLE)"``: the raw count ``c``;
        ``"Nc"``: number of n-gram types with that count (for ``c = 0`` the
            number of unseen types, ``vocab_size ** n`` minus observed types);
        ``"C*"``: adjusted count ``(c + 1) * N_{c+1} / N_c`` (raw ``c`` when
            either ``N_c`` or ``N_{c+1}`` is zero; ``N_1 / N_0`` for ``c = 0``);
        ``"P* (c*/N)"``: ``C*`` divided by the total number of occurrences.

    Args:
        ngram_counts: Counter keyed by n-gram tuple.
        vocab_size: Number of word types.
        n: n-gram order (also used in the output file name).
        top_k: Number of leading rows kept, as interpreted by
            ``DataFrame.head``.

    Returns:
        The table as a DataFrame. It is also written to
        ``frequency_table_n{n}_top{top_k}.csv`` in the working directory.
    """
    Nc = Counter(ngram_counts.values())
    num_seen_types = len(ngram_counts)
    N = sum(ngram_counts.values())
    N1 = Nc.get(1, 0)

    # vocab_size ** 1 == vocab_size, so one expression covers every order.
    num_unseen_types = max(pow(vocab_size, n) - num_seen_types, 0)

    # c = 0 row: the mass of the singletons is spread over the unseen types.
    c_star_0 = (N1 / num_unseen_types) if num_unseen_types > 0 else 0.0
    p_star_0 = (c_star_0 / N) if N > 0 else 0.0
    rows = [{"C (MLE)": 0, "Nc": num_unseen_types, "C*": c_star_0, "P* (c*/N)": p_star_0}]

    max_c = max(Nc) if Nc else 0
    # Only the first top_k rows survive head(top_k) and row 0 is already used,
    # so there is no point materialising rows beyond c = top_k - 1 (max_c can
    # be in the millions for frequent words). A negative top_k makes head()
    # drop rows from the end instead, which needs the full table.
    last_c = min(max_c, top_k - 1) if top_k >= 0 else max_c
    for c in range(1, last_c + 1):
        Nc_c = Nc.get(c, 0)
        Nc_c1 = Nc.get(c + 1, 0)
        if Nc_c > 0 and Nc_c1 > 0:
            c_star = (c + 1) * (Nc_c1 / Nc_c)
        else:
            c_star = float(c)
        p_star = (c_star / N) if N > 0 else 0.0
        rows.append({"C (MLE)": c, "Nc": Nc_c, "C*": c_star, "P* (c*/N)": p_star})

    table = pd.DataFrame(rows).head(top_k)
    csv_filename = f"frequency_table_n{n}_top{top_k}.csv"
    table.to_csv(csv_filename, index=False, encoding="utf-8")
    print(f"\nSaved frequency table for n={n} → {csv_filename}")
    return table

# Save frequency tables
# One table per order; each call also writes its CSV file.
df_uni, df_bi, df_tri, df_quad = (
    build_frequency_table(counts, V, n=order, top_k=100)
    for order, counts in enumerate(
        (unigram_counts, bigram_counts, trigram_counts, quadrigram_counts),
        start=1,
    )
)

# -------------------------
# Step 6: Deleted Interpolation (Quadrigrams)
# -------------------------
# Not implemented in this cell: the following cell tunes the λ's with a coarse grid search on val_set (EM would be an alternative).
# Idea:
#   For each quadrigram, interpolate with trigram, bigram, unigram probs.
#   Optimize λ1..λ4 so they sum to 1 and maximize val likelihood.



Example validation sentence log-prob (bigram GT):
-177.4398762869915

Saved frequency table for n=1 → frequency_table_n1_top100.csv

Saved frequency table for n=2 → frequency_table_n2_top100.csv

Saved frequency table for n=3 → frequency_table_n3_top100.csv

Saved frequency table for n=4 → frequency_table_n4_top100.csv


In [None]:
import math

def mle_prob(ngram, counts, lower_counts):
    """Compute the maximum-likelihood probability of an n-gram.

    Args:
        ngram: Tuple of tokens.
        counts: Mapping of n-gram tuple -> count, for n-grams of this order.
        lower_counts: Mapping of context tuple (``ngram[:-1]``) -> count.
            Ignored for unigrams.

    Returns:
        For a unigram, ``count / total count``; otherwise
        ``count(ngram) / count(context)``. Returns 0.0 whenever the
        denominator is missing or zero (empty table, unseen context), so the
        result is always a valid probability and never raises
        ``ZeroDivisionError``.
    """
    if len(ngram) == 1:
        denominator = sum(counts.values())
    else:
        denominator = lower_counts.get(ngram[:-1], 0)

    if not denominator:
        # Nothing was observed to normalise against.
        return 0.0
    return counts.get(ngram, 0) / denominator

def build_context_counts(counts):
    """Aggregate n-gram counts by context (the n-gram minus its last token).

    Args:
        counts: Mapping of n-gram tuple -> count.

    Returns:
        A ``Counter`` mapping each context tuple to the summed count of all
        n-grams that start with it.
    """
    totals = Counter()
    for ngram, freq in counts.items():
        prefix = ngram[:-1]
        totals[prefix] = totals[prefix] + freq
    return totals

# Precompute context counts
# These are the denominators for the bigram, trigram and quadrigram MLEs.
bigram_contexts, trigram_contexts, quadrigram_contexts = map(
    build_context_counts,
    (bigram_counts, trigram_counts, quadrigram_counts),
)

def sentence_logprob_interpolated(tokens, lambdas):
    """Sentence log-prob under interpolated quadrigram model.

    Each token's probability is the weighted sum of its unigram, bigram,
    trigram and quadrigram MLE probabilities taken from the module-level
    count tables.

    Args:
        tokens: List of tokens (without boundary markers); the sentence is
            padded with three ``"<s>"`` markers and one ``"</s>"`` marker.
        lambdas: Four weights ``(λ1, λ2, λ3, λ4)`` applied to the unigram,
            bigram, trigram and quadrigram probabilities respectively.

    Returns:
        The sum over positions of ``log(p + 1e-12)``.
    """
    padded = ["<s>"] * 3 + tokens + ["</s>"]

    # The unigram denominator is identical for every position. Summing the
    # whole table once per call, instead of once per token, removes an
    # O(vocabulary) cost from the inner loop.
    unigram_total = sum(unigram_counts.values())

    logp = 0.0
    for i in range(3, len(padded)):
        uni = (padded[i],)
        bi = tuple(padded[i - 1:i + 1])
        tri = tuple(padded[i - 2:i + 1])
        quad = tuple(padded[i - 3:i + 1])

        # An empty unigram table yields probability 0 rather than a crash.
        p1 = unigram_counts.get(uni, 0) / unigram_total if unigram_total else 0.0
        p2 = mle_prob(bi, bigram_counts, bigram_contexts)
        p3 = mle_prob(tri, trigram_counts, trigram_contexts)
        p4 = mle_prob(quad, quadrigram_counts, quadrigram_contexts)

        p = lambdas[0]*p1 + lambdas[1]*p2 + lambdas[2]*p3 + lambdas[3]*p4
        logp += math.log(p + 1e-12)  # avoid log(0)
    return logp

def grid_search_lambdas_fast(val_set, step=0.2, max_sent=200):
    """
    Fast grid search for λ's (default step=0.2, 200 val sentences).

    λ1..λ3 are taken from the grid ``0, step, 2*step, ...`` (each rounded to
    two decimals) and λ4 is the remainder ``1 - (λ1 + λ2 + λ3)``;
    combinations whose first three weights exceed 1 are skipped.

    Args:
        val_set: List of token lists used to score each combination.
        step: Grid spacing for λ1..λ3.
        max_sent: Only the first ``max_sent`` validation sentences are scored.

    Returns:
        ``(best_lambdas, best_score)``: the weight tuple with the highest
        total log-likelihood and that log-likelihood. ``best_lambdas`` is
        ``None`` if no combination was evaluated.
    """
    # λ1 + λ2 + λ3 is a float sum, so a combination that adds up to exactly 1
    # on paper can come out a few ulps above or below. The tolerance keeps
    # such combinations instead of rejecting them on a -2e-16 remainder.
    tolerance = 1e-9

    grid = [round(i * step, 2) for i in range(int(1 / step) + 1)]
    sample = val_set[:max_sent]  # sliced once, not once per combination

    best_score = -1e18
    best_lambdas = None
    for l1 in grid:
        for l2 in grid:
            for l3 in grid:
                remainder = 1 - (l1 + l2 + l3)
                if remainder < -tolerance:  # invalid combo
                    continue
                # Strip float noise (0.3999999999999999 -> 0.4, 1e-16 -> 0.0)
                # and never let λ4 go negative.
                l4 = max(0.0, round(remainder, 10))
                lambdas = (l1, l2, l3, l4)
                total_logp = 0.0
                for sent in sample:
                    total_logp += sentence_logprob_interpolated(sent, lambdas)
                if total_logp > best_score:
                    best_score = total_logp
                    best_lambdas = lambdas
    return best_lambdas, best_score

# -------------------------
# Example: run fast search
# -------------------------
# Coarse grid (step 0.2) scored on the first 200 validation sentences.
best_lambdas, best_score = grid_search_lambdas_fast(val_set, step=0.2, max_sent=200)
print("Best λ's (λ1, λ2, λ3, λ4):", best_lambdas)
print("Validation log-likelihood:", best_score)


Best λ's (λ1, λ2, λ3, λ4): (0.6, 0.4, 0.0, 0.0)
Validation log-likelihood: -30816.31385418249
