In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import ast

In [2]:
train_path = r"C:\Users\mitta\OneDrive - iiit-b\Documents\NLP\Assignment1\dataset\TRAIN.csv"
test_path = r"C:\Users\mitta\OneDrive - iiit-b\Documents\NLP\Assignment1\dataset\TEST.csv"

train_data = pd.read_csv(train_path, header=None, names=['sentence'])
test_data = pd.read_csv(test_path, header=None, names=['sentence'])

In [3]:
train_data['sentence'] = train_data['sentence'].apply(ast.literal_eval)
test_data['sentence'] = test_data['sentence'].apply(ast.literal_eval)

words = set(word for sentence in train_data['sentence'] for word, _ in sentence)
tags = set(tag for sentence in train_data['sentence'] for _, tag in sentence)

transition_counts = defaultdict(lambda: defaultdict(int))
emission_counts = defaultdict(lambda: defaultdict(int))
initial_counts = defaultdict(int)

In [4]:
def train_hmm(train_data):
    for sentence in train_data['sentence']:
        prev_tag = None
        for word, tag in sentence:
            emission_counts[tag][word] += 1
            if prev_tag is not None:
                transition_counts[prev_tag][tag] += 1
            else:
                initial_counts[tag] += 1
            prev_tag = tag


In [5]:
def normalize_counts(counts):
    probabilities = {}
    for key, sub_counts in counts.items():
        total = sum(sub_counts.values())
        probabilities[key] = {k: v / total for k, v in sub_counts.items()}
    return probabilities

In [6]:
train_hmm(train_data)
transition_probs = normalize_counts(transition_counts)
emission_probs = normalize_counts(emission_counts)
initial_probs = {k: v / sum(initial_counts.values()) for k, v in initial_counts.items()}


In [7]:
def viterbi(words, transition_probs, emission_probs, initial_probs, tags):
    V = [{}]
    backpointer = [{}]
    
    for tag in tags:
        V[0][tag] = initial_probs.get(tag, 0) * emission_probs[tag].get(words[0], 0)
        backpointer[0][tag] = None
    
    for t in range(1, len(words)):
        V.append({})
        backpointer.append({})
        for tag in tags:
            max_prob, best_prev_tag = max(
                (V[t-1][prev_tag] * transition_probs[prev_tag].get(tag, 0) * emission_probs[tag].get(words[t], 0), prev_tag)
                for prev_tag in tags
            )
            V[t][tag] = max_prob
            backpointer[t][tag] = best_prev_tag
    
    best_last_tag = max(V[-1], key=V[-1].get)
    best_path = [best_last_tag]
    for t in range(len(words) - 1, 0, -1):
        best_path.insert(0, backpointer[t][best_path[0]])
    
    return best_path

In [8]:
def evaluate(test_data):
    correct, total = 0, 0
    for sentence in test_data['sentence']:
        words = [word for word,_ in sentence]
        actual_tags = [tag for _, tag in sentence]
        predicted_tags = viterbi(words, transition_probs, emission_probs, initial_probs, tags)
        correct += sum(p == a for p, a in zip(predicted_tags, actual_tags))
        total += len(actual_tags)
    accuracy = correct / total if total > 0 else 0
    print("Model Accuracy: "+str(accuracy*100)+"%")

evaluate(test_data)


Model Accuracy: 82.45245454028328%


In [9]:
sentence=['He','drew','a','deep','breath']
predicted_tags = viterbi(sentence, transition_probs, emission_probs, initial_probs)
predicted_tags

TypeError: viterbi() missing 1 required positional argument: 'tags'