In [None]:
# Hidden Markov Model
## Imports
import math
from collections import defaultdict

In [None]:
# Hidden Markov Model for NER
class HMM_NER:
    """First-order Hidden Markov Model tagger for named-entity recognition.

    The model is estimated from raw counts collected by ``train`` and decoded
    with the Viterbi algorithm in ``predict``. Probabilities are add-one
    (Laplace) smoothed at decoding time so that unseen transitions and
    out-of-vocabulary words receive a small, properly normalised probability
    instead of being scored like events that were observed once.

    Attributes:
        transition_probs: tag -> next_tag -> count. Despite the name these are
            raw counts; the name is kept for backward compatibility.
        emission_probs: tag -> word -> count (raw counts, see above).
        tag_counts: tag -> number of occurrences (includes the start marker,
            counted once per training sentence).
        unique_tags: set of real tags seen in training (no start/end markers).
        vocab: set of words seen in training, used to size emission smoothing.
    """

    START_TAG = "<s>"
    END_TAG = "</s>"

    def __init__(self):
        self.transition_probs = defaultdict(lambda: defaultdict(int))  # tag -> next_tag -> count
        self.emission_probs = defaultdict(lambda: defaultdict(int))    # tag -> word -> count
        self.tag_counts = defaultdict(int)
        self.unique_tags = set()
        self.vocab = set()

    def train(self, sentences, tags):
        """Accumulate transition and emission counts from tagged sentences.

        Args:
            sentences: iterable of word sequences.
            tags: iterable of tag sequences, parallel to ``sentences``.

        Calling ``train`` again adds to the existing counts. Sentences and
        tag sequences are paired with ``zip``, so surplus items on either
        side are ignored.
        """
        for words, ner_tags in zip(sentences, tags):
            prev_tag = self.START_TAG
            self.tag_counts[prev_tag] += 1
            for word, tag in zip(words, ner_tags):
                self.transition_probs[prev_tag][tag] += 1
                self.emission_probs[tag][word] += 1
                self.tag_counts[tag] += 1
                self.unique_tags.add(tag)
                self.vocab.add(word)
                prev_tag = tag
            # Every tag occurrence therefore has exactly one outgoing
            # transition, so tag_counts[tag] is also the transition total.
            self.transition_probs[prev_tag][self.END_TAG] += 1

    def _log_transition(self, prev_tag, next_tag):
        """Return the add-one smoothed log P(next_tag | prev_tag)."""
        # .get() keeps decoding from inserting keys into the defaultdicts.
        seen = self.transition_probs.get(prev_tag, {}).get(next_tag, 0)
        # Possible outcomes after any state: each real tag, or end-of-sentence.
        outcomes = len(self.unique_tags) + 1
        return math.log((seen + 1) / (self.tag_counts.get(prev_tag, 0) + outcomes))

    def _log_emission(self, tag, word):
        """Return the add-one smoothed log P(word | tag)."""
        seen = self.emission_probs.get(tag, {}).get(word, 0)
        # The extra +1 reserves probability mass for out-of-vocabulary words.
        outcomes = len(self.vocab) + 1
        return math.log((seen + 1) / (self.tag_counts.get(tag, 0) + outcomes))

    def predict(self, sentence):
        """Return the most likely tag sequence for ``sentence`` (Viterbi).

        Args:
            sentence: indexable sequence of words.

        Returns:
            A list of tags with the same length as ``sentence``; an empty
            list for an empty sentence.

        Raises:
            ValueError: if the model has not been trained on any tag.
        """
        if len(sentence) == 0:
            return []
        if not self.unique_tags:
            raise ValueError("HMM_NER.predict() called before train(): no tags are known")

        # Sorted once so iteration order (and thus the result) is reproducible.
        tags = sorted(self.unique_tags)

        # Transition scores do not depend on the position, so compute them once.
        log_trans = {
            (prev_tag, next_tag): self._log_transition(prev_tag, next_tag)
            for prev_tag in tags
            for next_tag in tags + [self.END_TAG]
        }

        scores = {
            tag: self._log_transition(self.START_TAG, tag) + self._log_emission(tag, sentence[0])
            for tag in tags
        }
        # backpointers[i][tag] = best predecessor of `tag` at position i + 1.
        backpointers = []

        for t in range(1, len(sentence)):
            step_scores = {}
            step_pointers = {}
            for curr_tag in tags:
                # Ties are broken by the lexicographically larger tag name.
                best_score, best_prev = max(
                    (scores[prev_tag] + log_trans[(prev_tag, curr_tag)], prev_tag)
                    for prev_tag in tags
                )
                step_scores[curr_tag] = best_score + self._log_emission(curr_tag, sentence[t])
                step_pointers[curr_tag] = best_prev
            scores = step_scores
            backpointers.append(step_pointers)

        # Include the end-of-sentence transition that train() records.
        _, final_tag = max(
            (scores[tag] + log_trans[(tag, self.END_TAG)], tag) for tag in tags
        )

        best_path = [final_tag]
        for step_pointers in reversed(backpointers):
            best_path.append(step_pointers[best_path[-1]])
        best_path.reverse()
        return best_path

In [None]:
## Data Loading and Preprocessing
def load_conll2003_file(filepath):
    """Read a CoNLL-2003 style file into parallel word and tag sequences.

    Each non-blank line is split on whitespace; the first field is taken as
    the word and the last field as the NER tag. Blank lines end the current
    sentence. ``-DOCSTART-`` lines are document boundaries, not tokens, so
    they also end the current sentence and are not loaded as data.

    Args:
        filepath: path to a UTF-8 encoded text file.

    Returns:
        A ``(sentences, tags)`` tuple of two equally long lists; item ``i``
        of each is the word list / tag list of sentence ``i``. Empty
        sentences are never emitted.
    """
    sentences = []
    tags = []
    words, ner_tags = [], []

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts or parts[0] == "-DOCSTART-":
                # Sentence (or document) boundary: flush what was collected.
                if words:
                    sentences.append(words)
                    tags.append(ner_tags)
                    words, ner_tags = [], []
                continue
            # Lines with fewer than four fields are skipped, as before.
            if len(parts) >= 4:
                words.append(parts[0])
                ner_tags.append(parts[-1])

    # The file may end without a trailing blank line.
    if words:
        sentences.append(words)
        tags.append(ner_tags)
    return sentences, tags

In [None]:
## Evaluation
def evaluate(model, test_sentences, test_tags):
    """Compute token-level tagging accuracy of ``model``.

    Args:
        model: object exposing ``predict(sentence)`` that returns a tag sequence.
        test_sentences: iterable of word sequences.
        test_tags: iterable of gold tag sequences, parallel to ``test_sentences``.

    Returns:
        The fraction of compared tokens whose predicted tag equals the gold
        tag, or ``0`` when no tokens were compared.
    """
    n_tokens = 0
    n_hits = 0
    for tokens, gold in zip(test_sentences, test_tags):
        # Predictions and gold tags are paired positionally; any surplus on
        # either side is not counted.
        pairs = list(zip(model.predict(tokens), gold))
        n_tokens += len(pairs)
        n_hits += sum(1 for guess, truth in pairs if guess == truth)

    if n_tokens > 0:
        return n_hits / n_tokens
    return 0

In [None]:
## Main Execution
def main():
    """Train an HMM tagger on CoNLL-2003, report accuracy, and tag a demo sentence.

    The corpus at ``conll2003/eng.train`` is split in file order: the first
    80% of sentences are used for training and the remainder for evaluation.
    """
    all_sentences, all_tags = load_conll2003_file("conll2003/eng.train")

    # Index of the first held-out sentence (80/20 split, no shuffling).
    cut = int(0.8 * len(all_sentences))

    hmm = HMM_NER()
    hmm.train(all_sentences[:cut], all_tags[:cut])

    score = evaluate(hmm, all_sentences[cut:], all_tags[cut:])
    print(f"Model Accuracy: {score:.2%}")

    # Tag a hand-written example to eyeball the output.
    sample = ["Manila", "is", "the", "capital", "of", "the", "Philippines"]
    print("\nFriend's Test Sentence:", sample)
    print("Predicted NER Tags:")
    print(hmm.predict(sample))

In [None]:
# Run the pipeline only when this code is the entry point (script or
# notebook kernel), not when it is imported as a module.
if __name__ == "__main__":
    main()