In [16]:

from collections import Counter
import io, csv
import pandas as pd

In [10]:
# Corpus file passed to build_counts(), which opens it as UTF-8 and treats
# every non-empty line as one sentence of whitespace-separated tokens.
path = "telugu_tokenized_sentences.txt"

In [None]:
def build_counts(path):
    """Count 1- to 4-grams over a sentence-per-line corpus.

    Every non-blank line of the UTF-8 text file at ``path`` is split on
    whitespace and wrapped in ``<s>`` / ``</s>`` markers before counting.

    Returns a 4-tuple of ``Counter`` objects, in order: unigram counts keyed
    by token string, then bigram, trigram and quadrigram counts keyed by
    tuples of tokens.
    """
    # Slot i holds the counts for n-grams of order i + 1.
    tallies = tuple(Counter() for _ in range(4))

    with io.open(path, "r", encoding="utf-8", buffering=1024*1024) as corpus:
        for raw_line in corpus:
            tokens = raw_line.strip().split()
            if len(tokens) == 0:
                # Blank or whitespace-only line: contributes nothing.
                continue

            padded = ["<s>", *tokens, "</s>"]

            tallies[0].update(padded)
            for order in (2, 3, 4):
                # Zipping the sentence against its own shifted copies yields
                # every window of `order` consecutive tokens; zip stops at the
                # shortest copy, so short sentences simply yield no windows.
                windows = zip(*(padded[shift:] for shift in range(order)))
                tallies[order - 1].update(windows)

    print("Finished building counts")
    return tallies

In [None]:
# Build all four n-gram tables in one pass over the corpus file.
unigram_c, bigram_c, trigram_c, quadrigram_c = build_counts(path)

# Corpus-level totals: N is the number of tokens counted in unigram_c
# (sentence markers included); each V_* is the number of distinct n-grams
# of that order.
N = sum(unigram_c.values())
V_uni, V_bi, V_tri = len(unigram_c), len(bigram_c), len(trigram_c)

print(f"Unique unigrams: {V_uni}")
print(f"Unique bigrams: {V_bi}")
print(f"Unique trigrams: {V_tri}")
print(f"Unique quadrigrams: {len(quadrigram_c)}")
print(f"Total tokens (N): {N}")


Finished building counts
Unique unigrams: 832991
Unique bigrams: 7658547
Unique trigrams: 14233784
Unique quadrigrams: 16723829
Total tokens (N): 25847778


In [None]:
def _context_of(ngram):
    """Return the conditioning context (all tokens but the last) of an n-gram.

    A bigram's context is returned as the bare token rather than a 1-tuple,
    because the unigram counter is keyed by token strings.
    """
    head = ngram[:-1]
    return head[0] if len(head) == 1 else head


def _conditional_model_frame(ngram_counter, context_counter, column):
    """Build the maximum-likelihood table for one n-gram order.

    ngram_counter   : Counter of n-gram tuples
    context_counter : Counter of the (n-1)-gram contexts (denominators)
    column          : name of the column holding the n-gram tuples

    Returns a DataFrame with columns [column, 'count', 'prefix',
    'probability'], where probability = count(ngram) / count(prefix).

    Raises ZeroDivisionError if an n-gram's prefix is missing from
    context_counter (a Counter reports 0 for unknown keys).

    The columns are computed with plain list passes instead of
    DataFrame.apply(axis=1), which builds a Series per row and is very slow
    on tables with millions of n-grams.
    """
    ngrams = list(ngram_counter.keys())
    counts = list(ngram_counter.values())
    contexts = [_context_of(ngram) for ngram in ngrams]
    probabilities = [
        count / context_counter[context]
        for count, context in zip(counts, contexts)
    ]
    return pd.DataFrame({
        column: ngrams,
        'count': counts,
        'prefix': contexts,
        'probability': probabilities,
    })


# Unigram model: relative frequency of each token over the whole corpus.
unigram_df = pd.DataFrame({
    'unigram': list(unigram_c.keys()),
    'count': list(unigram_c.values())
})
unigram_df['probability'] = unigram_df['count'] / N

# Higher-order models: each order is conditioned on the counter one order
# below it.
bigram_df = _conditional_model_frame(bigram_c, unigram_c, 'bigram')
trigram_df = _conditional_model_frame(trigram_c, bigram_c, 'trigram')
quadrigram_df = _conditional_model_frame(quadrigram_c, trigram_c, 'quadrigram')

print("\n✅ Language models built successfully!")



✅ Language models built successfully!


In [19]:
# Write each model table to "<order>_model.csv" in the current working
# directory, leaving the DataFrame index out of the file.
for csv_name, model_frame in (
    ("unigram_model.csv", unigram_df),
    ("bigram_model.csv", bigram_df),
    ("trigram_model.csv", trigram_df),
    ("quadrigram_model.csv", quadrigram_df),
):
    model_frame.to_csv(csv_name, index=False)

print("Models saved as CSV files.")

Models saved as CSV files.


In [None]:

def save_ngram_probs(counter, base_counter, filename_prefix, top_n, k=0.6,
                     total_tokens=None, vocab_size=None):
    """
    Save the most frequent n-grams with raw and smoothed probabilities to CSV.

    counter        : Counter of n-grams (token strings for unigrams, tuples
                     of tokens for higher orders)
    base_counter   : Counter of the (n-1)-gram contexts used as denominators
                     (None for unigrams)
    filename_prefix: output name without extension; the file written is
                     "<filename_prefix>.csv"
    top_n          : number of top n-grams to save
    k              : value for Add-K smoothing
    total_tokens   : corpus token count, used as the unigram denominator and
                     as the fallback denominator when a context has no count;
                     defaults to the module-level N
    vocab_size     : V in the smoothing formulas; defaults to
                     len(base_counter), or len(counter) for unigrams.
                     NOTE(review): for trigrams and above that default is the
                     number of distinct contexts, not the word-vocabulary
                     size used in textbook add-one smoothing. Pass
                     vocab_size=len(unigram_c) for the latter; confirm which
                     is intended.

    Columns written, with c = n-gram count and d = context count:
        Raw        = c / d
        Add-One    = (c + 1) / (d + V)
        Add-K      = (c + k) / (d + k*V)
        Token-Type = (c + V) / (d + V*V)
    """

    has_context = base_counter is not None
    total = N if total_tokens is None else total_tokens
    if vocab_size is None:
        V = len(base_counter) if has_context else len(counter)
    else:
        V = vocab_size
    csv_file = f"{filename_prefix}.csv"

    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)

        writer.writerow(["Ngram", "Count", "Raw", "Add-One", "Add-K", "Token-Type"])

        for ngram, count in counter.most_common(top_n):
            key = " ".join(ngram) if isinstance(ngram, tuple) else ngram

            if has_context:
                context = ngram[:-1]
                denom = base_counter.get(context, 0)
                if denom == 0 and isinstance(context, tuple) and len(context) == 1:
                    # A bigram's context slices to a 1-tuple, but a unigram
                    # counter is keyed by bare tokens. Without this retry
                    # every bigram misses its context and is silently
                    # scored against the corpus total instead.
                    denom = base_counter.get(context[0], 0)
            else:
                denom = total

            if denom <= 0:
                # Context not present in base_counter: fall back to the
                # corpus total rather than dividing by zero.
                denom = total

            raw = count / denom
            add1 = (count + 1) / (denom + V)
            addk = (count + k) / (denom + k*V)
            tokT = (count + V) / (denom + V*V)

            writer.writerow([key, count, raw, add1, addk, tokT])

    print(f"Saved top {top_n} {filename_prefix}s → {csv_file}")


In [None]:
# Export the 200000 most frequent n-grams of each order. Every order above
# unigram is paired with the counter one order below it for its denominators.
for ngram_counts, context_counts, out_name in (
    (unigram_c, None, "unigram"),
    (bigram_c, unigram_c, "bigram"),
    (trigram_c, bigram_c, "trigram"),
    (quadrigram_c, trigram_c, "quadrigram"),
):
    save_ngram_probs(ngram_counts, context_counts, out_name, top_n=200000)


Saved top 200000 unigrams → unigram.csv
Saved top 200000 bigrams → bigram.csv
Saved top 200000 trigrams → trigram.csv
Saved top 200000 quadrigrams → quadrigram.csv
