# In [None]:
import math
from collections import Counter
from pathlib import Path
import random

# =====================================================
# PATH
# =====================================================

# Directory two levels above this file (the parent of the folder holding it).
BASE_DIR = Path(__file__).resolve().parents[1]
# Corpus file location: <BASE_DIR>/Corpora/News/corpus.txt
CORPUS_PATH = BASE_DIR.joinpath("Corpora", "News", "corpus.txt")

# =====================================================
# READ CORPUS
# =====================================================

def load_tokens(path):
    """Read a UTF-8 text file and return its whitespace-separated tokens.

    The file is consumed line by line and each line is split on runs of
    whitespace, so blank lines contribute no tokens.

    Args:
        path: Location of the text file, in any form accepted by ``open``.

    Returns:
        A flat list of token strings in the order they appear in the file.
    """
    with open(path, encoding="utf-8") as corpus:
        # str.split() with no arguments already ignores leading and trailing
        # whitespace, so no separate strip() is required.
        return [word for row in corpus for word in row.split()]

# =====================================================
# SPLIT TRAIN / TEST
# =====================================================

def split_data(tokens, train_ratio=0.8):
    """Cut a token sequence into a leading train part and a trailing test part.

    Args:
        tokens: Sliceable sequence of tokens.
        train_ratio: Fraction of the tokens assigned to the train part; the
            cut position is ``int(len(tokens) * train_ratio)``.

    Returns:
        A ``(train, test)`` tuple of two slices of ``tokens``.
    """
    boundary = int(len(tokens) * train_ratio)
    train_part = tokens[:boundary]
    test_part = tokens[boundary:]
    return train_part, test_part

# =====================================================
# BUILD MODELS (MLE)
# =====================================================

def build_unigram(tokens):
    """Count how often each token occurs.

    Args:
        tokens: Sequence of hashable tokens.

    Returns:
        A ``(counts, total)`` tuple: a ``Counter`` mapping each token to its
        frequency, and the overall number of tokens.
    """
    return Counter(tokens), len(tokens)

def build_bigram(tokens):
    """Count every pair of adjacent tokens.

    Args:
        tokens: Sliceable sequence of hashable tokens.

    Returns:
        A ``Counter`` keyed by ``(w1, w2)`` tuples of consecutive tokens.
    """
    followers = tokens[1:]
    # zip stops at the shorter input, so the last token never starts a pair.
    return Counter(zip(tokens, followers))

def build_trigram(tokens):
    """Count every run of three adjacent tokens.

    Args:
        tokens: Sliceable sequence of hashable tokens.

    Returns:
        A ``Counter`` keyed by ``(w1, w2, w3)`` tuples of consecutive tokens.
    """
    second = tokens[1:]
    third = tokens[2:]
    # zip stops at the shortest input, so only complete triples are counted.
    return Counter(zip(tokens, second, third))

# =====================================================
# PERPLEXITY (NO SMOOTHING)
# =====================================================

def perplexity_unigram(test_tokens, counts, total):
    """Compute the unsmoothed unigram perplexity of ``test_tokens``.

    Each token ``w`` is scored with the maximum-likelihood estimate
    ``counts[w] / total``. The perplexity is ``exp`` of the negative mean
    log-probability over all test tokens.

    Args:
        test_tokens: Sequence of tokens to evaluate.
        counts: Mapping from token to its training count (e.g. a ``Counter``).
        total: Total number of training tokens.

    Returns:
        The perplexity as a float, or ``float("inf")`` as soon as a token
        with a zero training count is encountered.

    Raises:
        ValueError: If ``test_tokens`` is empty, because the mean
            log-probability is undefined for zero tokens.
    """
    token_count = len(test_tokens)
    if token_count == 0:
        raise ValueError("cannot compute unigram perplexity of an empty token sequence")

    log_prob = 0.0
    for w in test_tokens:
        # .get never inserts missing keys and also works for plain dicts.
        seen = counts.get(w, 0)
        if seen == 0:
            # Without smoothing an unseen token has probability zero.
            return float("inf")
        log_prob += math.log(seen / total)

    return math.exp(-log_prob / token_count)

def perplexity_bigram(test_tokens, bigram_counts, unigram_counts):
    """Compute the unsmoothed bigram perplexity of ``test_tokens``.

    Each adjacent pair ``(w1, w2)`` is scored with the maximum-likelihood
    estimate ``bigram_counts[(w1, w2)] / unigram_counts[w1]``. The perplexity
    is ``exp`` of the negative mean log-probability over all pairs.

    Args:
        test_tokens: Sliceable sequence of tokens to evaluate.
        bigram_counts: Mapping from ``(w1, w2)`` to its training count.
        unigram_counts: Mapping from token to its training count.

    Returns:
        The perplexity as a float, or ``float("inf")`` as soon as a pair
        with a zero training count is encountered.

    Raises:
        ValueError: If ``test_tokens`` has fewer than two tokens, because
            there is then no bigram to score.
    """
    pair_count = len(test_tokens) - 1
    if pair_count < 1:
        # Previously one token divided by zero and zero tokens produced a
        # meaningless 1.0 via a negative divisor.
        raise ValueError("bigram perplexity needs at least 2 test tokens")

    log_prob = 0.0
    for w1, w2 in zip(test_tokens[:-1], test_tokens[1:]):
        # .get never inserts missing keys and also works for plain dicts.
        seen = bigram_counts.get((w1, w2), 0)
        if seen == 0:
            # Without smoothing an unseen bigram has probability zero.
            return float("inf")
        log_prob += math.log(seen / unigram_counts[w1])

    return math.exp(-log_prob / pair_count)

def perplexity_trigram(test_tokens, trigram_counts, bigram_counts):
    """Compute the unsmoothed trigram perplexity of ``test_tokens``.

    Each run of three adjacent tokens ``(w1, w2, w3)`` is scored with the
    maximum-likelihood estimate
    ``trigram_counts[(w1, w2, w3)] / bigram_counts[(w1, w2)]``. The
    perplexity is ``exp`` of the negative mean log-probability over all
    triples.

    Args:
        test_tokens: Sliceable sequence of tokens to evaluate.
        trigram_counts: Mapping from ``(w1, w2, w3)`` to its training count.
        bigram_counts: Mapping from ``(w1, w2)`` to its training count.

    Returns:
        The perplexity as a float, or ``float("inf")`` as soon as a triple
        with a zero training count is encountered.

    Raises:
        ValueError: If ``test_tokens`` has fewer than three tokens, because
            there is then no trigram to score.
    """
    triple_count = len(test_tokens) - 2
    if triple_count < 1:
        # Previously two tokens divided by zero and shorter inputs produced
        # a meaningless 1.0 via a negative divisor.
        raise ValueError("trigram perplexity needs at least 3 test tokens")

    log_prob = 0.0
    for w1, w2, w3 in zip(test_tokens[:-2], test_tokens[1:-1], test_tokens[2:]):
        # .get never inserts missing keys and also works for plain dicts.
        seen = trigram_counts.get((w1, w2, w3), 0)
        if seen == 0:
            # Without smoothing an unseen trigram has probability zero.
            return float("inf")
        log_prob += math.log(seen / bigram_counts[(w1, w2)])

    return math.exp(-log_prob / triple_count)

# =====================================================
# MAIN
# =====================================================

def main():
    """Evaluate unsmoothed unigram, bigram and trigram models on the corpus.

    Loads the tokens at ``CORPUS_PATH``, splits them with ``split_data``,
    builds the three count tables from the train part, and prints the
    perplexity of each model on the test part.
    """
    print("Loading corpus...")
    corpus = load_tokens(CORPUS_PATH)
    print("Total tokens:", len(corpus))

    train_part, test_part = split_data(corpus)
    print("Train tokens:", len(train_part))
    print("Test tokens:", len(test_part))

    # Count tables are estimated from the train part only.
    uni_counts, train_size = build_unigram(train_part)
    bi_counts = build_bigram(train_part)
    tri_counts = build_trigram(train_part)

    # All three scores are computed before anything is reported.
    results = (
        ("Unigram Perplexity:", perplexity_unigram(test_part, uni_counts, train_size)),
        ("Bigram Perplexity:", perplexity_bigram(test_part, bi_counts, uni_counts)),
        ("Trigram Perplexity:", perplexity_trigram(test_part, tri_counts, bi_counts)),
    )

    print("\n=== Task 1 Results (No Smoothing) ===")
    for label, value in results:
        print(label, value)


# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()