# tc_ner_model

## Imports

In [21]:
import math
import random
import pandas as pd
from collections import defaultdict


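## Trigram Language Model Class
This class encapsulates the trigram language model: it counts trigrams during training, predicts the next word from the two preceding words, and generates sentences.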

In [22]:
class TrigramLanguageModel:
    """Count-based trigram language model over tokenised sentences.

    Every training sentence is padded with two start markers (``'<s>'``) and
    one end marker (``'</s>'``).  For each position the model counts how often
    a token follows the two tokens before it.

    Attributes:
        trigram_counts: ``trigram_counts[(w1, w2)][w3]`` is the number of
            times ``w3`` followed the pair ``(w1, w2)`` in training.
        bigram_counts: ``bigram_counts[(w1, w2)]`` is the number of times the
            pair ``(w1, w2)`` was seen as a context.
        vocab: every token seen in training, including both markers.
        smoothing: additive constant used by :meth:`probability`.
    """

    START = '<s>'
    END = '</s>'

    def __init__(self, smoothing=1e-5):
        self.trigram_counts = defaultdict(lambda: defaultdict(int))
        self.bigram_counts = defaultdict(int)
        self.vocab = set()
        self.smoothing = smoothing
        # Lazily built list of tokens eligible for the random fallback in
        # predict_next(); reset by train() whenever the vocabulary may grow.
        self._fallback_words = None

    def train(self, sentences):
        """Accumulate trigram counts from an iterable of token sequences.

        Args:
            sentences: iterable of sentences, each an iterable of tokens
                (lists, tuples and generators are all accepted).  Calling
                ``train`` repeatedly adds to the existing counts.
        """
        for sentence in sentences:
            tokens = [self.START, self.START, *sentence, self.END]
            self.vocab.update(tokens)
            for first, second, target in zip(tokens, tokens[1:], tokens[2:]):
                context = (first, second)
                self.bigram_counts[context] += 1
                self.trigram_counts[context][target] += 1
        self._fallback_words = None

    def probability(self, prev_two, word):
        """Return the additively smoothed estimate of P(word | prev_two).

        Computed as ``(count + smoothing) / (context_count + smoothing * V)``
        where ``V`` is ``len(self.vocab)``.  Returns 0.0 when the denominator
        is zero (nothing trained, or ``smoothing == 0`` with an unseen
        context).  Lookups never insert new keys into the count tables.
        """
        context = tuple(prev_two)
        denominator = (self.bigram_counts.get(context, 0)
                       + self.smoothing * len(self.vocab))
        if denominator == 0:
            return 0.0
        count = self.trigram_counts.get(context, {}).get(word, 0)
        return (count + self.smoothing) / denominator

    def predict_next(self, prev_two):
        """Return the most likely token after the two-token context.

        Args:
            prev_two: sequence of exactly the two preceding tokens.

        Returns:
            The token seen most often after ``prev_two`` (first-seen wins on
            ties).  For a context never seen in training, a random vocabulary
            token other than the start marker; ``'</s>'`` if the model has
            not been trained at all.
        """
        context = tuple(prev_two)
        candidates = self.trigram_counts.get(context)
        if candidates:
            # Adding the same constant to every count and dividing by a
            # shared total cannot change the ranking, so the raw counts
            # give the same argmax as the smoothed probabilities.
            return max(candidates, key=candidates.get)

        if self._fallback_words is None:
            # '<s>' never occurs as a predicted position, so it is excluded.
            # Sorting makes seeded runs reproducible despite set ordering.
            self._fallback_words = sorted(self.vocab - {self.START}, key=str)
        if not self._fallback_words:
            return self.END
        return random.choice(self._fallback_words)

    def generate_sentence(self, max_len=15):
        """Greedily generate a sentence starting from the start markers.

        Args:
            max_len: maximum number of tokens to emit.

        Returns:
            List of generated tokens without the boundary markers.  Generation
            stops early when the end marker is predicted.
        """
        sentence = [self.START, self.START]
        for _ in range(max_len):
            next_word = self.predict_next(sentence[-2:])
            if next_word == self.END:
                break
            sentence.append(next_word)
        return sentence[2:]


## Data Loading and Preprocessing
This function loads a CSV-formatted NER dataset and splits each entry of its `Sentence` column into a list of whitespace-separated tokens.

In [28]:

def load_data(filepath):
    """Load tokenised sentences from a CSV file.

    Args:
        filepath: path of the CSV file to read (decoded as ISO-8859-1).

    Returns:
        A list of sentences, each a list of whitespace-separated tokens taken
        from the file's ``Sentence`` column.  Rows whose ``Sentence`` cell is
        empty are skipped.

    Raises:
        KeyError: if the file has no ``Sentence`` column; the message lists
            the columns that are present.
    """
    # Read the file the caller asked for rather than a hard-coded name.
    df = pd.read_csv(filepath, encoding='ISO-8859-1')
    if 'Sentence' not in df.columns:
        raise KeyError(
            f"'Sentence' column not found in {filepath!r}; "
            f"available columns: {list(df.columns)}"
        )
    # Empty cells are read as NaN (a float), which str.split cannot handle.
    texts = df['Sentence'].dropna().astype(str)
    return [text.split() for text in texts]


## Evaluation
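This function computes the model's next-word prediction accuracy: the fraction of positions in the test sentences (including the end-of-sentence marker) where the predicted token equals the actual one.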

In [24]:

def evaluate(model, test_sentences):
    """Return the model's next-token prediction accuracy.

    Each sentence is padded with two ``'<s>'`` markers and one ``'</s>'``
    marker.  For every position after the padding, the model is asked for the
    token following the two preceding tokens and the answer is compared with
    the actual token (the end marker counts as a position).

    Args:
        model: object exposing ``predict_next(prev_two)``, called with a
            two-element list of tokens.
        test_sentences: iterable of sentences, each an iterable of tokens.

    Returns:
        Fraction of positions predicted correctly, in ``[0.0, 1.0]``;
        ``0.0`` when there are no sentences to score.
    """
    correct = 0
    total = 0
    for sentence in test_sentences:
        tokens = ['<s>', '<s>', *sentence, '</s>']
        for first, second, target in zip(tokens, tokens[1:], tokens[2:]):
            if model.predict_next([first, second]) == target:
                correct += 1
            total += 1
    # An empty test set has no positions; avoid dividing by zero.
    if total == 0:
        return 0.0
    return correct / total



## Main Execution
Train the trigram language model, evaluate its next-word accuracy, generate a sentence, and print next-word predictions for a sample sentence.

In [None]:
def main():
    """Run the demo: load data, train, score, generate, and show predictions.

    Reads ``ner.csv``, uses the first 80% of the sentences for training and
    the rest for evaluation, prints the accuracy and one generated sentence,
    then prints the predicted next word for each two-token context of a fixed
    sample sentence.
    """
    corpus = load_data("ner.csv")
    cutoff = int(0.8 * len(corpus))
    train_sentences, test_sentences = corpus[:cutoff], corpus[cutoff:]

    model = TrigramLanguageModel()
    model.train(train_sentences)

    print(f"Model Accuracy: {evaluate(model, test_sentences):.2%}")

    generated = model.generate_sentence()
    print("\nGenerated Sentence:", ' '.join(generated))

    # Friend's test sentence
    test_sentence = ["Manila", "is", "the", "capital", "of", "the", "Philippines"]
    print("\nFriend's Test Sentence:", test_sentence)
    print("Predicted Next Words:")

    # Walk every adjacent pair of the padded sentence except the final one,
    # so each context shown is followed by a real word of the sentence.
    padded = ['<s>', '<s>'] + test_sentence
    for first, second in zip(padded, padded[1:-1]):
        context = [first, second]
        prediction = model.predict_next(context)
        print(f"{context} -> {prediction}")

In [29]:
# Notebook entry point: run the pipeline defined in main().
main()

KeyError: 'Sentence'