# tc_model

In [2]:
import math
import random
import pandas as pd
from collections import defaultdict
import random
from collections import Counter

In [3]:
# Trigram Language Model
class TrigramLanguageModel:
    def __init__(self):
        self.trigrams = defaultdict(lambda: defaultdict(int))
        self.bigrams = defaultdict(int)

    def train(self, sentences):
        for sentence in sentences:
            tokens = ['<s>', '<s>'] + sentence + ['</s>']
            for i in range(len(tokens) - 2):
                bigram = (tokens[i], tokens[i + 1])
                next_word = tokens[i + 2]
                self.trigrams[bigram][next_word] += 1
                self.bigrams[bigram] += 1

    def predict_next(self, context):
        bigram = tuple(context)
        if bigram in self.trigrams:
            next_words = self.trigrams[bigram]
            return max(next_words, key=next_words.get)
        return "<unk>"

    def generate_sentence(self):
        result = ['<s>', '<s>']
        while True:
            next_word = self.predict_next(result[-2:])
            if next_word == '</s>' or len(result) > 20:
                break
            result.append(next_word)
        return result[2:]

In [4]:
# Load CoNLL2003 data from local file
def load_conll2003_file(filepath):
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as file:
        words = []
        for line in file:
            line = line.strip()
            if not line:
                if words:
                    sentences.append(words)
                    words = []
            else:
                parts = line.split()
                if len(parts) >= 1:
                    words.append(parts[0])
        if words:
            sentences.append(words)
    return sentences

In [5]:
# Evaluate model by checking next word predictions
def evaluate_detailed(model, test_sentences, k=3):
    vocab = set(word for trigram in model.trigrams.values() for word in trigram)
    
    total = 0
    correct = 0
    top_k_correct = 0
    log_probs = []

    for sentence in test_sentences:
        tokens = ['<s>', '<s>'] + sentence + ['</s>']
        for i in range(2, len(tokens)):
            context = (tokens[i - 2], tokens[i - 1])
            actual_word = tokens[i]

            # Get counts for smoothing
            trigram_counts = model.trigrams.get(context, {})
            total_count = model.bigrams.get(context, 0) + len(vocab)

            # Smoothed probability for actual word
            actual_count = trigram_counts.get(actual_word, 0)
            prob = (actual_count + 1) / total_count
            log_probs.append(math.log(prob))

            # Accuracy
            prediction = model.predict_next(context)
            if prediction == actual_word:
                correct += 1

            # Top-k Accuracy
            smoothed_probs = {
                word: (trigram_counts.get(word, 0) + 1) / total_count
                for word in vocab
            }
            top_k_preds = sorted(smoothed_probs, key=smoothed_probs.get, reverse=True)[:k]
            if actual_word in top_k_preds:
                top_k_correct += 1

            total += 1

    accuracy = correct / total if total > 0 else 0
    top_k_accuracy = top_k_correct / total if total > 0 else 0
    perplexity = math.exp(-sum(log_probs) / total) if total > 0 else float('inf')

    return accuracy, top_k_accuracy, perplexity


In [10]:
def main():
    # Load training data
    sentences = load_conll2003_file("conll2003/eng.train")

    # Split 80/20
    split_idx = int(0.8 * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    # Train and evaluate
    model = TrigramLanguageModel()
    model.train(train_sentences)

    accuracy, top_k_acc, ppl = evaluate_detailed(model, test_sentences, k=3)
    print(f"\nEvaluation Metrics:")
    print(f"Accuracy         : {accuracy:.2%}")
    print(f"Top-3 Accuracy   : {top_k_acc:.2%}")
    print(f"Perplexity       : {ppl:.2f}")

    print("\nGenerated Sentence:", ' '.join(model.generate_sentence()))

    # Predict next words from a sample sentence
    test_sentence = ["He", "was", "born", "in", "London", "on", "May"]
    print("\nFriend's Test Sentence:", test_sentence)
    print("Predicted Next Words:")
    tokens = ['<s>', '<s>'] + test_sentence
    for i in range(2, len(tokens)):
        context = [tokens[i - 2], tokens[i - 1]]
        prediction = model.predict_next(context)
        print(f"{context} -> {prediction}")

In [11]:
if __name__ == "__main__":
    main()


Evaluation Metrics:
Accuracy         : 12.66%
Top-3 Accuracy   : 16.79%
Perplexity       : 12401.77

Generated Sentence: -DOCSTART-

Friend's Test Sentence: ['He', 'was', 'born', 'in', 'London', 'on', 'May']
Predicted Next Words:
['<s>', '<s>'] -> -DOCSTART-
['<s>', 'He'] -> said
['He', 'was'] -> the
['was', 'born'] -> Agnes
['born', 'in'] -> Buenos
['in', 'London'] -> .
['London', 'on'] -> Wednesday
