`Deliverable URL`:
https://colab.research.google.com/drive/1b9Sitk8VgLx4KkiFB68wkUVSvNWAJcS7?usp=sharing

In [None]:
!wget https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/refs/heads/main/Viterbi_assignment/train_data.txt
!wget https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/refs/heads/main/Viterbi_assignment/test_data.txt
!wget https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/refs/heads/main/Viterbi_assignment/noisy_test_data.txt

SyntaxError: invalid syntax (1465806638.py, line 1)

In [2]:
def load_data(file_path):
    """Read a POS-tagged corpus that stores one sentence per line.

    Each whitespace-separated token must look like ``word/TAG``. The token is
    split on its *last* slash, so words that themselves contain a slash
    (e.g. ``1/2/NUM``) keep it.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of sentences, each a non-empty list of ``(word, tag)`` tuples.
        Blank or whitespace-only lines are skipped rather than returned as
        empty sentences, because an empty sentence carries no tokens and
        cannot be decoded by the tagger.

    Raises:
        ValueError: If a token contains no ``/`` separator.
        OSError: If the file cannot be opened.
    """
    data = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            sentence = []
            for token in tokens:
                word, tag = token.rsplit('/', 1)  # split on the last slash only
                sentence.append((word, tag))
            data.append(sentence)
    return data

# Locations of the three corpora under /content: training, clean test, noisy test
train_data_file = '/content/train_data.txt'
test_data_file = '/content/test_data.txt'
noisy_test_data_file = '/content/noisy_test_data.txt'

# Parse each corpus, in order, into a list of (word, tag) sentences
train_data, test_data, noisy_test_data = map(
    load_data,
    (train_data_file, test_data_file, noisy_test_data_file),
)

# Show the first training sentence as a quick sanity check
print(train_data[0])

[('He', 'PRON'), ('let', 'VERB'), ('her', 'PRON'), ('tell', 'VERB'), ('him', 'PRON'), ('all', 'PRT'), ('about', 'ADP'), ('the', 'DET'), ('church', 'NOUN'), ('.', '.')]


In [3]:
import numpy as np
from collections import defaultdict

class HMMViterbiPOS:
    """First-order HMM part-of-speech tagger decoded with the Viterbi algorithm.

    Start, transition and emission probabilities are estimated from the
    training corpus with add-one smoothing. Decoding works in log space.

    Attributes set by ``__init__`` / ``train_hmm``:
        training_sentences, test_sentences, noisy_test_sentences:
            Lists of sentences, each a list of ``(word, tag)`` tuples.
        states: Set of tags seen in training.
        word_set: Set of words seen in training.
        start_probs: ``{tag: P(tag starts a sentence)}``.
        transition_probs: ``{prev_tag: {tag: P(tag | prev_tag)}}``.
        emission_probs: ``{tag: {word: P(word | tag)}}`` for training words.
    """

    # Probability used when a transition/emission is absent from the trained
    # tables, e.g. for a word that never occurred in the training corpus.
    UNKNOWN_PROB = 1e-6

    def __init__(self, train_file, test_file, noisy_test_file):
        """Load the three corpora and train the model on the first one.

        Args:
            train_file: Path of the training corpus.
            test_file: Path of the clean test corpus.
            noisy_test_file: Path of the noisy test corpus.
        """
        self.training_sentences = self.load_data(train_file)
        self.test_sentences = self.load_data(test_file)
        self.noisy_test_sentences = self.load_data(noisy_test_file)
        self.states = set()
        self.word_set = set()
        self.transition_probs = defaultdict(lambda: defaultdict(float))
        self.emission_probs = defaultdict(lambda: defaultdict(float))
        self.start_probs = defaultdict(float)
        self.train_hmm()

    def load_data(self, file_path):
        """Read a corpus with one sentence per line and ``word/TAG`` tokens.

        Tokens are split on their last slash. Blank lines are skipped instead
        of being returned as empty sentences, which cannot be decoded.

        Args:
            file_path: Path of the text file to read (decoded as UTF-8).

        Returns:
            A list of non-empty sentences, each a list of ``(word, tag)``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist. This is a
                subclass of ``Exception``, so existing ``except Exception``
                handlers keep working.
            IndexError: If a token contains no ``/`` separator.
        """
        data = []
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    tokens = line.split()
                    if not tokens:
                        continue
                    sentence = []
                    for token in tokens:
                        parts = token.rsplit('/', 1)  # split once per token
                        sentence.append((parts[0], parts[1]))
                    data.append(sentence)
        except FileNotFoundError as exc:
            raise FileNotFoundError(f"File {file_path} not found") from exc
        return data

    def train_hmm(self):
        """Estimate add-one-smoothed HMM parameters from ``training_sentences``.

        Populates ``states``, ``word_set``, ``start_probs``,
        ``transition_probs`` and ``emission_probs``.
        """
        transition_counts = defaultdict(lambda: defaultdict(int))
        emission_counts = defaultdict(lambda: defaultdict(int))
        start_counts = defaultdict(int)

        for sentence in self.training_sentences:
            previous_tag = None
            for word, tag in sentence:
                self.states.add(tag)
                self.word_set.add(word)
                emission_counts[tag][word] += 1

                if previous_tag is None:
                    start_counts[tag] += 1
                else:
                    transition_counts[previous_tag][tag] += 1
                previous_tag = tag

        num_states = len(self.states)
        vocab_size = len(self.word_set)

        # Count sentences that actually have a first tag, so that an empty
        # sentence cannot inflate the denominator and make the start
        # distribution sum to less than one.
        total_starts = sum(start_counts.values())
        self.start_probs = {
            tag: (start_counts[tag] + 1) / (total_starts + num_states)
            for tag in self.states
        }

        for prev_tag in self.states:
            total_transitions = sum(transition_counts[prev_tag].values()) + num_states
            for tag in self.states:
                self.transition_probs[prev_tag][tag] = (
                    (transition_counts[prev_tag][tag] + 1) / total_transitions
                )

        for tag in self.states:
            total_emissions = sum(emission_counts[tag].values()) + vocab_size
            for word in self.word_set:
                self.emission_probs[tag][word] = (
                    (emission_counts[tag][word] + 1) / total_emissions
                )

    def viterbi_algorithm(self, sentence):
        """Return the most probable tag sequence for ``sentence``.

        Args:
            sentence: Sequence of word strings.

        Returns:
            A list of ``(word, tag)`` tuples, one per input word. An empty
            input yields an empty list.

        Raises:
            ValueError: If the model has no states (nothing was trained).
        """
        sentence = list(sentence)
        if not sentence:
            return []
        if not self.states:
            raise ValueError("Cannot decode: the model has no states (empty training data)")

        # Fixed iteration order; ties are then broken by tag name everywhere,
        # independent of set ordering / string hash randomisation.
        states = sorted(self.states)
        unknown = self.UNKNOWN_PROB

        # Transition logs do not depend on the position in the sentence, so
        # compute them once per call instead of once per (t, state, prev_state).
        log_transition = {
            prev_state: {
                state: np.log(self.transition_probs[prev_state].get(state, unknown))
                for state in states
            }
            for prev_state in states
        }

        def log_emission(state, word):
            # Words missing from the emission table fall back to UNKNOWN_PROB.
            return np.log(self.emission_probs[state].get(word, unknown))

        viterbi_matrix = [{
            state: np.log(self.start_probs[state]) + log_emission(state, sentence[0])
            for state in states
        }]
        backpointer = [{state: None for state in states}]

        for t in range(1, len(sentence)):
            previous_scores = viterbi_matrix[t - 1]
            scores, pointers = {}, {}
            for state in states:
                emit = log_emission(state, sentence[t])
                # Tuples compare by score first, then by tag name on ties.
                scores[state], pointers[state] = max(
                    (previous_scores[prev_state] + log_transition[prev_state][state] + emit,
                     prev_state)
                    for prev_state in states
                )
            viterbi_matrix.append(scores)
            backpointer.append(pointers)

        _, current_state = max((viterbi_matrix[-1][state], state) for state in states)

        # Follow the backpointers from the last word to the first, then flip.
        best_path = []
        for t in range(len(sentence) - 1, -1, -1):
            best_path.append(current_state)
            current_state = backpointer[t][current_state]
        best_path.reverse()

        return list(zip(sentence, best_path))


### Evaluation
An example of evaluation:

In [4]:
# Worked example: score one hand-written prediction against the gold tags.
sentence = [
    ('Get', 'VERB'), ('copper', 'NOUN'), ('or', 'CONJ'), ('earthenware', 'NOUN'),
    ('mugs', 'NOUN'), ('that', 'PRON'), ('keep', 'VERB'), ('beer', 'NOUN'),
    ('chilled', 'VERB'), ('or', 'CONJ'), ('soup', 'NOUN'), ('hot', 'ADJ'), ('.', '.'),
]
predicted_tags = [
    'DET', 'NOUN', 'CONJ', 'NOUN', 'ADP', 'PRON', 'VERB',
    'NOUN', '.', 'CONJ', 'NOUN', 'ADJ', '.',
]
# Gold tags are the second element of every (word, tag) pair above.
true_tags = tuple(gold for _, gold in sentence)

# Accuracy = matching positions / number of gold tags.
correct = sum(1 for guess, gold in zip(predicted_tags, true_tags) if guess == gold)
total = len(true_tags)
accuracy = correct / total

print(f"Baseline Viterbi Accuracy: {accuracy * 100:.2f}%")
# print(f"Viterbi with Noise Handling Accuracy: {accuracy * 100:.2f}%")  # similarly calculate for Noise Handling


Baseline Viterbi Accuracy: 76.92%


In [None]:
def evaluate_accuracy(hmm_model, test_data):
    """Compute token-level tagging accuracy of a model on a tagged corpus.

    Args:
        hmm_model: Object exposing ``viterbi_algorithm(words)`` that returns a
            list of ``(word, predicted_tag)`` pairs for a list of words.
        test_data: Iterable of sentences, each a list of ``(word, true_tag)``.

    Returns:
        Fraction of tokens whose predicted tag equals the true tag, as a
        float in ``[0, 1]``. Returns ``0.0`` when there are no tokens to
        score, instead of dividing by zero.
    """
    correct, total = 0, 0

    for sentence in test_data:
        if not sentence:
            # Nothing to score, and the decoder needs at least one word.
            continue

        words = [word for word, _ in sentence]
        predicted = hmm_model.viterbi_algorithm(words)

        for (_, pred_tag), (_, true_tag) in zip(predicted, sentence):
            if pred_tag == true_tag:
                correct += 1
            total += 1

    if total == 0:
        return 0.0
    return correct / total

# Build the tagger: the constructor reads all three corpora and trains on the first
hmm_model = HMMViterbiPOS("/content/train_data.txt", "/content/test_data.txt", "/content/noisy_test_data.txt")

# Score the clean test set first, then the noisy one
test_accuracy, noisy_test_accuracy = (
    evaluate_accuracy(hmm_model, corpus)
    for corpus in (hmm_model.test_sentences, hmm_model.noisy_test_sentences)
)

# Display both accuracies as the cell output
test_accuracy, noisy_test_accuracy


Viterbi Algorithm Accuracy: 83.45%
