# tc_ner_model

## Imports

In [33]:
import math
import random
import pandas as pd
from collections import defaultdict
import random
from collections import defaultdict

In [34]:
# Trigram Language Model
class TrigramLanguageModel:
    def __init__(self):
        self.trigrams = defaultdict(lambda: defaultdict(int))
        self.bigrams = defaultdict(int)

    def train(self, sentences):
        for sentence in sentences:
            tokens = ['<s>', '<s>'] + sentence + ['</s>']
            for i in range(len(tokens) - 2):
                bigram = (tokens[i], tokens[i + 1])
                next_word = tokens[i + 2]
                self.trigrams[bigram][next_word] += 1
                self.bigrams[bigram] += 1

    def predict_next(self, context):
        bigram = tuple(context)
        if bigram in self.trigrams:
            next_words = self.trigrams[bigram]
            return max(next_words, key=next_words.get)
        return "<unk>"

    def generate_sentence(self):
        result = ['<s>', '<s>']
        while True:
            next_word = self.predict_next(result[-2:])
            if next_word == '</s>' or len(result) > 20:
                break
            result.append(next_word)
        return result[2:]

## Data Loading and Preprocessing
This function loads and parses a CSV-formatted NER dataset.

In [35]:
# Load CoNLL2003 data from local file
def load_conll2003_file(filepath):
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as file:
        words = []
        for line in file:
            line = line.strip()
            if not line:
                if words:
                    sentences.append(words)
                    words = []
            else:
                parts = line.split()
                if len(parts) >= 1:
                    words.append(parts[0])
        if words:
            sentences.append(words)
    return sentences

## Evaluation
This function computes the model's tagging accuracy.

In [36]:
# Evaluate model by checking next word predictions
def evaluate(model, test_sentences):
    total = 0
    correct = 0
    for sentence in test_sentences:
        tokens = ['<s>', '<s>'] + sentence + ['</s>']
        for i in range(2, len(tokens)):
            context = [tokens[i - 2], tokens[i - 1]]
            prediction = model.predict_next(context)
            if prediction == tokens[i]:
                correct += 1
            total += 1
    return correct / total if total > 0 else 0

## Main Execution
Train the HMM, evaluate its performance, and test a sample sentence.

In [37]:
def main():
    # Load training data
    sentences = load_conll2003_file("conll2003/eng.train")

    # Split 80/20
    split_idx = int(0.8 * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    # Train and evaluate
    model = TrigramLanguageModel()
    model.train(train_sentences)

    accuracy = evaluate(model, test_sentences)
    print(f"Model Accuracy: {accuracy:.2%}")

    print("\nGenerated Sentence:", ' '.join(model.generate_sentence()))

    # Predict next words from a sample sentence
    test_sentence = ["Manila", "is", "the", "capital", "of", "the", "Philippines"]
    print("\nFriend's Test Sentence:", test_sentence)
    print("Predicted Next Words:")
    tokens = ['<s>', '<s>'] + test_sentence
    for i in range(2, len(tokens)):
        context = [tokens[i - 2], tokens[i - 1]]
        prediction = model.predict_next(context)
        print(f"{context} -> {prediction}")

In [38]:
if __name__ == "__main__":
    main()

Model Accuracy: 12.66%

Generated Sentence: -DOCSTART-

Friend's Test Sentence: ['Manila', 'is', 'the', 'capital', 'of', 'the', 'Philippines']
Predicted Next Words:
['<s>', '<s>'] -> -DOCSTART-
['<s>', 'Manila'] -> international
['Manila', 'is'] -> <unk>
['is', 'the'] -> first
['the', 'capital'] -> .
['capital', 'of'] -> Etruria
['of', 'the'] -> season
