In [None]:
from collections import defaultdict
import random
import csv

# Read the pre-tokenized corpus: every non-blank line becomes one sentence,
# stored as the list of its whitespace-separated tokens.
with open("tokenized_sentences.txt", "r", encoding="utf-8") as f:
    sentences = [
        words
        for words in (raw_line.strip().split() for raw_line in f)
        if words
    ]

# Flat token stream over the whole corpus, in file order.
tokens_all = [token for words in sentences for token in words]

# Fixed seed so the sampled evaluation subset is identical on every run.
random.seed(42)
sentences_1000 = random.sample(sentences, min(1000, len(sentences)))

# V is the number of distinct token types seen in the corpus.
vocab = set(tokens_all)
V = len(vocab)
print(f"Loaded {len(tokens_all)} tokens, {len(sentences_1000)} sentences, vocab size = {V}")

def build_ngram(tokens, n):
    """Count every contiguous window of ``n`` tokens in ``tokens``.

    Args:
        tokens: sequence of tokens to slide the window over.
        n: window length (the n-gram order).

    Returns:
        A ``defaultdict(int)`` mapping each n-gram, as a tuple of tokens, to
        the number of times it occurs. Sequences shorter than ``n`` produce
        an empty mapping.
    """
    counts = defaultdict(int)
    window_starts = range(len(tokens) - n + 1)
    for window in (tuple(tokens[start:start + n]) for start in window_starts):
        counts[window] += 1
    return counts

def h_counts(tokens, n):
    """Return the n-gram counts of ``tokens`` together with their history counts.

    Args:
        tokens: sequence of tokens to count over.
        n: n-gram order.

    Returns:
        A ``(ngram_counts, hcounts)`` pair. ``ngram_counts`` comes from
        ``build_ngram(tokens, n)``. For ``n > 1`` ``hcounts`` holds the
        (n-1)-gram counts; otherwise it is ``{(): len(tokens)}``.
    """
    grams = build_ngram(tokens, n)
    if n <= 1:
        # A unigram has the empty history, which occurs once per token.
        return grams, {(): len(tokens)}
    return grams, build_ngram(tokens, n - 1)

def smoothingfunc(ngram_counts, hcounts, vocabsize, smoothing="add1", k=0.5):
    """Convert raw n-gram counts into probabilities P(last token | history).

    Args:
        ngram_counts: mapping of n-gram tuple -> occurrence count.
        hcounts: mapping of history tuple (the n-gram without its last token)
            -> occurrence count; a missing history is treated as count 0.
        vocabsize: vocabulary size used in the smoothing denominators.
        smoothing: ``"add1"``, ``"addk"`` or ``"tokentype"``. Any other value
            gives the unsmoothed ratio count / history count, or 0.0 when the
            history count is not positive.
        k: pseudo-count, used only by ``"addk"``.

    Returns:
        A dict mapping every n-gram in ``ngram_counts`` to its probability.
    """
    # Distinct tokens observed after each history; only "tokentype" needs it.
    followers = defaultdict(set)
    if smoothing == "tokentype":
        for gram in ngram_counts:
            followers[gram[:-1]].add(gram[-1])

    def estimate(gram, freq):
        history = gram[:-1]
        total = hcounts.get(history, 0)
        if smoothing == "add1":
            return (freq + 1) / (total + vocabsize)
        if smoothing == "addk":
            return (freq + k) / (total + k * vocabsize)
        if smoothing == "tokentype":
            # The pseudo-count is the number of distinct followers of the history.
            types = len(followers[history]) if history in followers else 1
            return (freq + types) / (total + types * vocabsize)
        return freq / total if total > 0 else 0.0

    return {gram: estimate(gram, freq) for gram, freq in ngram_counts.items()}

def sentence_prob(sentence_tokens, n, probs, vocabsize, smoothing="add1", k=0.5):
    """Return the probability of a sentence under an n-gram model.

    The sentence is padded with ``n - 1`` ``"<s>"`` markers in front and one
    ``"</s>"`` marker at the end, then the probabilities of all n-grams in the
    padded sequence are multiplied together.

    Args:
        sentence_tokens: tokens of the sentence (any sequence; not modified).
        n: n-gram order.
        probs: mapping of n-gram tuple -> probability.
        vocabsize: vocabulary size, used for n-grams missing from ``probs``.
        smoothing: accepted for backward compatibility; the fallback no longer
            depends on it.
        k: accepted for backward compatibility; the fallback no longer
            depends on it.

    Returns:
        The product of the n-gram probabilities as a float.
    """
    # Fallback for n-grams absent from ``probs``. With a zero n-gram count and
    # a zero history count, add-one gives 1/V, add-k gives k/(k*V) = 1/V, and
    # the token-type estimate gives 1/V, so 1/V is used for every scheme.
    # The previous 1/(k*V) exceeded 1 whenever k*V < 1.
    unseen_prob = 1 / vocabsize if vocabsize > 0 else 0.0

    padded = ["<s>"] * (n - 1) + list(sentence_tokens) + ["</s>"]
    prob = 1.0
    for end in range(n - 1, len(padded)):
        ngram = tuple(padded[end - (n - 1):end + 1])
        prob *= probs.get(ngram, unseen_prob)
    return prob

# Estimate smoothed probability tables for every n-gram order.
#
# sentence_prob() scores sentences padded with n-1 "<s>" markers and one
# "</s>" marker, so the counts must come from a stream padded the same way.
# Counting over the raw, unpadded tokens_all left every boundary n-gram
# unseen and produced windows that straddle two sentences.
# Concatenating the padded sentences is safe: any window that crosses a
# sentence boundary has "</s>" in a non-final position, and sentence_prob()
# never looks such an n-gram up.
# NOTE(review): V is computed from the raw tokens, so it does not count
# "</s>" as a vocabulary item -- confirm whether it should.
ngram_probs = {}
for n in [1, 2, 3, 4]:
    padded_tokens = []
    for sent in sentences:
        padded_tokens.extend(["<s>"] * (n - 1) + sent + ["</s>"])

    ngram_counts, hcounts = h_counts(padded_tokens, n)

    add1_probs = smoothingfunc(ngram_counts, hcounts, V, "add1")
    addk_probs = smoothingfunc(ngram_counts, hcounts, V, "addk", k=0.5)
    tokentype_probs = smoothingfunc(ngram_counts, hcounts, V, "tokentype")

    # Indexed first by order, then by smoothing name.
    ngram_probs[n] = {
        "add1": add1_probs,
        "addk": addk_probs,
        "tokentype": tokentype_probs,
    }

# Write one CSV row per sampled sentence: a 1-based sentence id followed by
# the sentence probability for every (n-gram order, smoothing) combination,
# in the same order as the header columns.
output_file = "sentence_probabilities.csv"
header = ["sentence_id"] + [f"{n}-gram_{s}" for n in [1,2,3,4] for s in ["add1","addk","tokentype"]]

with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)

    for idx, sent_tokens in enumerate(sentences_1000):
        row = [idx + 1]
        for n in [1, 2, 3, 4]:
            for smoothing in ["add1", "addk", "tokentype"]:
                prob = sentence_prob(sent_tokens, n, ngram_probs[n][smoothing], V, smoothing=smoothing)
                row.append(prob)
        writer.writerow(row)

# Report the real number of rows: the sample holds fewer than 1000 sentences
# when the corpus itself is smaller than that.
print(f"Saved probabilities of {len(sentences_1000)} sentences to {output_file}")


Loaded 59936 tokens, 1000 sentences, vocab size = 10231
Saved probabilities of 1000 sentences to sentence_probabilities.csv


: 