In [13]:
# %%
import re
import itertools
from collections import Counter, defaultdict
from nltk.util import ngrams
import csv
import random
import os

# =========================
# 1. Load Telugu dataset
# =========================
dataset_path = "../ASSIGNMENT-1/telugu_dataset.txt" 
NLP_LAB= "./" 
with open(dataset_path, "r", encoding="utf-8") as f:
    raw_sentences = [line.strip() for line in f if line.strip()]
raw_sentences = raw_sentences[:10000]
print(f"Total sentences loaded: {len(raw_sentences)}")
print("Sample sentence:", raw_sentences[0])

# =========================
# 2. Sentence & word tokenizer
# =========================
def telugu_sentence_tokenizer(text):
    return re.split(r'(?<=[।!?॥.])\s+', text)

def telugu_word_tokenizer(text):
    url = r'https?://\S+'
    email = r'\b[\w\.-]+@[\w\.-]+\.\w+\b'
    date = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
    decimal = r'\b\d+\.\d+\b'
    number = r'\b\d+\b'
    telugu = r'[\u0C00-\u0C7F]+'
    english = r'[a-zA-Z]+'
    punctuation = r'[.,!?;:"(){}\[\]<>|/@#$%^&*_+=~`\'“”‘’₹…-]'
    pattern = f'{url}|{email}|{date}|{decimal}|{number}|{telugu}|{english}|{punctuation}'
    return re.findall(pattern, text)

# =========================
# 3. Split dataset
# =========================
random.seed(42)
random.shuffle(raw_sentences)
val_set = raw_sentences[:1000]
test_set = raw_sentences[1000:2000]
train_set = raw_sentences[2000:]

print(f"Train: {len(train_set)}, Validation: {len(val_set)}, Test: {len(test_set)}")

# =========================
# 4. Count n-grams
# =========================
def count_ngrams(sentences, n):
    counter = Counter()
    for sent in sentences:
        tokens = telugu_word_tokenizer(sent)
        if not tokens:
            continue
        padded = ['<s>']*(n-1) + tokens + ['</s>']*(n-1)
        counter.update(ngrams(padded, n))
    return counter

print("Counting n-grams...")
unigram_counts = count_ngrams(train_set, 1)
bigram_counts = count_ngrams(train_set, 2)
trigram_counts = count_ngrams(train_set, 3)
quadrigram_counts = count_ngrams(train_set, 4)

print(f"Unique unigrams: {len(unigram_counts):,}")
print(f"Unique bigrams: {len(bigram_counts):,}")
print(f"Unique trigrams: {len(trigram_counts):,}")
print(f"Unique quadrigrams: {len(quadrigram_counts):,}")

# =========================
# 5. Good-Turing smoothing
# =========================
def good_turing_probs(ngram_counter, vocab_size, n):
    count_of_counts = Counter(ngram_counter.values())
    N1 = count_of_counts.get(1, 0)
    N = sum(ngram_counter.values())

    if n == 1:
        unseen_count = vocab_size - len(ngram_counter)
    else:
        unseen_count = vocab_size ** n - len(ngram_counter)

    P_unseen = (N1 / N) / max(unseen_count, 1)
    probs = {ng: count / N for ng, count in ngram_counter.items()}
    return probs, P_unseen

vocab = set(itertools.chain.from_iterable([ng[0] for ng in unigram_counts]))
V = len(vocab)

unigram_probs, unigram_P_unseen = good_turing_probs(unigram_counts, V, 1)
bigram_probs, bigram_P_unseen = good_turing_probs(bigram_counts, V, 2)
trigram_probs, trigram_P_unseen = good_turing_probs(trigram_counts, V, 3)
quadrigram_probs, quadrigram_P_unseen = good_turing_probs(quadrigram_counts, V, 4)

# =========================
# 6. Save n-grams to CSV
# =========================
def save_ngram_csv(filename, ngram_counts, ngram_probs, P_unseen):
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ngram', 'count', 'MLE_prob', 'GoodTuring_prob', 'P_unseen'])
        total_count = sum(ngram_counts.values())
        for ng in ngram_counts:
            writer.writerow([
                ' '.join(ng),
                ngram_counts[ng],
                ngram_counts[ng]/total_count,
                ngram_probs[ng],
                P_unseen
            ])
    print(f"Saved {filename}")

save_ngram_csv("unigram.csv", unigram_counts, unigram_probs, unigram_P_unseen)
save_ngram_csv("bigram.csv", bigram_counts, bigram_probs, bigram_P_unseen)
save_ngram_csv("trigram.csv", trigram_counts, trigram_probs, trigram_P_unseen)
save_ngram_csv("quadrigram.csv", quadrigram_counts, quadrigram_probs, quadrigram_P_unseen)

# =========================
# 7. Sentence probability using smoothed n-grams
# =========================
def sentence_prob(sentence, n, ngram_probs, P_unseen):
    tokens = telugu_word_tokenizer(sentence)
    padded = ['<s>']*(n-1) + tokens + ['</s>']*(n-1)
    ngrams_list = list(ngrams(padded, n))
    prob = 1.0
    for ng in ngrams_list:
        prob *= ngram_probs.get(ng, P_unseen)
    return prob

# Example usage
example_sentence = "ఈ రోజు వాతావరణం బాగుంది."
prob_bigram = sentence_prob(example_sentence, 2, bigram_probs, bigram_P_unseen)
print(f"Sentence probability (Bigram Good-Turing): {prob_bigram:.8f}")

Total sentences loaded: 10000
Sample sentence: అమెరికా అధ్యక్షుడు డొనాల్డ్ ట్రంప్ కు రాష్ట్రపతి భవన్ వద్ద ఘనస్వాగతం లభించింది. ఆయనకు రాష్ట్రపతి రామ్ నాథ్ కోవింద్ దంపతులు, ప్రధాని మోదీ సాదరంగా ఆహ్వానం పలకడంతో పాటు సైనికులు గౌరవ వందనాన్ని అందించారు.
Train: 8000, Validation: 1000, Test: 1000
Counting n-grams...
Unique unigrams: 60,951
Unique bigrams: 231,960
Unique trigrams: 302,928
Unique quadrigrams: 326,143
Saved unigram.csv
Saved bigram.csv
Saved trigram.csv
Saved quadrigram.csv
Sentence probability (Bigram Good-Turing): 0.00000000
