In [6]:
from decimal import Decimal as dec


class HindiTagger:
    """Bigram hidden-Markov-model tagger with Viterbi decoding.

    Input files are read by ``process_input_file``: the first line is skipped,
    every other line is ``word~tag``, and a line containing only ``~`` closes
    the current sentence.

    Count tables (all built by ``train``):
        p_word          -- word -> number of occurrences
        p_tag           -- tag -> number of occurrences
        p_word_tag      -- "word_tag" -> number of co-occurrences
        dict_transition -- "tag_prevtag" -> number of times ``tag`` followed
                           ``prevtag``; ``prevtag`` is "start" for the first
                           word of a sentence
        cnt             -- total number of "start" transitions

    The tables are per-instance, so several taggers can coexist without
    contaminating each other's counts.
    """

    train_file_name = 'train_set_hmm.csv'
    test_file_name = 'test_set_hmm.csv'

    def __init__(self, train_file_name=None, test_file_name=None):
        """Create a tagger and train it immediately.

        Args:
            train_file_name: Optional path overriding the class-level default
                training file.
            test_file_name: Optional path overriding the class-level default
                test file used by ``hmm_bi_gram``/``predict``.
        """
        if train_file_name is not None:
            self.train_file_name = train_file_name
        if test_file_name is not None:
            self.test_file_name = test_file_name
        self.train()

    def _reset_counts(self):
        """Start from empty count tables owned by this instance."""
        self.p_word_tag = {}
        self.p_word = {}
        self.p_tag = {}
        self.dict_transition = {}
        self.cnt = dec(0)

    def process_input_file(self, file_name, train_data=False):
        """Parse a ``word~tag`` file into a list of sentences.

        Args:
            file_name: Path of the UTF-8 encoded file to read.
            train_data: When true, tag-bigram counts are accumulated into
                ``self.dict_transition`` while reading.

        Returns:
            A list of sentences, each a list of ``[word, tag]`` pairs.

        Raises:
            IndexError: If a non-blank line other than ``~`` has no ``~``
                separator.
        """
        vakya_list = []
        with open(file_name, encoding='utf8') as input_file:
            s_list = []
            prev = "start"
            for idx, line in enumerate(input_file):
                if idx == 0:
                    # The first line is never treated as data.
                    continue
                stripped = line.strip()
                if not stripped:
                    # Blank lines carry no token; skipping them avoids an
                    # IndexError on e.g. a trailing empty line.
                    continue
                if stripped == "~":
                    vakya_list.append(s_list)
                    s_list = []
                    prev = "start"
                    continue
                fields = line.split('~')
                word, tag = fields[0].strip(), fields[1].strip()
                s_list.append([word, tag])
                if train_data:
                    key = f"{tag}_{prev}"
                    self.dict_transition[key] = self.dict_transition.get(key, 0) + 1
                prev = tag
        if s_list:
            # The file did not end with a "~" line: keep the last sentence
            # instead of silently dropping it (its transitions were counted).
            vakya_list.append(s_list)
        return vakya_list

    def train(self):
        """(Re)build all count tables from ``self.train_file_name``."""
        # Reset first so that calling train() again does not double the counts.
        self._reset_counts()
        vakya_list = self.process_input_file(self.train_file_name, train_data=True)
        for word_list in vakya_list:
            for word, tag in word_list:
                self.p_word[word] = self.p_word.get(word, 0) + 1
                self.p_tag[tag] = self.p_tag.get(tag, 0) + 1
                pair = f"{word}_{tag}"
                self.p_word_tag[pair] = self.p_word_tag.get(pair, 0) + 1

        # Denominator for sentence-initial transition probabilities.
        self.cnt = sum(
            (dec(self.dict_transition.get(f"{tag}_start", 0)) for tag in self.p_tag),
            dec(0),
        )

    def transition_prob(self, tag, prev_tag):
        """Return the unsmoothed probability of ``tag`` following ``prev_tag``.

        ``prev_tag == "start"`` selects the sentence-initial distribution.
        Returns ``Decimal(0)`` when the denominator is zero (unknown
        ``prev_tag`` or no training data) instead of raising a Decimal
        division error.
        """
        if prev_tag == "start":
            denominator = dec(self.cnt)
        else:
            denominator = dec(self.p_tag.get(prev_tag, 0))
        if denominator == 0:
            return dec(0)
        return dec(self.dict_transition.get(f"{tag}_{prev_tag}", 0)) / denominator

    def emission_prob(self, word, tag):
        """Return the add-one-smoothed score of ``word`` being emitted by ``tag``.

        NOTE(review): the smoothing term added to the denominator is the
        number of distinct tags, not the vocabulary size that textbook add-one
        smoothing uses. Kept unchanged so existing scores stay the same --
        confirm whether this is intended.
        """
        return (dec(self.p_word_tag.get(f"{word}_{tag}", 0)) + dec(1)) / (
                dec(self.p_tag.get(tag, 0)) + dec(len(self.p_tag)))

    def VITERBI(self, vakya):
        """Return the highest-scoring tag sequence for a sentence.

        Args:
            vakya: List of words.

        Returns:
            A list of tags with exactly one tag per word (``[]`` for an empty
            sentence).
        """
        if not vakya:
            return []

        tags = list(self.p_tag)
        viterbi_table = [dict.fromkeys(tags, 0) for _ in vakya]
        prev_column = [dict.fromkeys(tags) for _ in vakya]

        for t in tags:
            viterbi_table[0][t] = (self.transition_prob(t, "start")
                                   * self.emission_prob(vakya[0], t))

        # Start at 1: column 0 has no predecessor (index 0 would wrap around
        # to the last column through viterbi_table[-1]).
        for idx in range(1, len(vakya)):
            previous = viterbi_table[idx - 1]
            current = viterbi_table[idx]
            for t in tags:
                emission = self.emission_prob(vakya[idx], t)
                for t_dash in tags:
                    prev_prob = previous[t_dash] * self.transition_prob(t, t_dash) * emission
                    if prev_prob > current[t]:
                        current[t] = prev_prob
                        prev_column[idx][t] = t_dash

        seq = [max(viterbi_table[-1], key=viterbi_table[-1].get)]
        for idx in range(len(vakya) - 1, 0, -1):
            back = prev_column[idx][seq[-1]]
            if back is None:
                # No path with non-zero probability reaches this tag (the
                # transitions are unsmoothed). Fall back to the best tag of
                # the previous column so the output keeps one tag per word.
                back = max(viterbi_table[idx - 1], key=viterbi_table[idx - 1].get)
            seq.append(back)
        seq.reverse()
        return seq

    def hmm_bi_gram(self):
        """Tag ``self.test_file_name`` and print the evaluation counts.

        Prints the number of correctly tagged words together with the total
        number of words, then the confusion matrix (rows: gold tag, columns:
        predicted tag, both in ``self.p_tag`` order). Words whose gold tag was
        never seen in training are included in the total but have no row in
        the matrix.
        """
        inner_p_tag = {tag: position for position, tag in enumerate(self.p_tag)}
        size = len(inner_p_tag)
        cm = [[0] * size for _ in range(size)]

        word_count = 0
        for vakya in self.process_input_file(self.test_file_name):
            word_count += len(vakya)
            words = [shabd[0] for shabd in vakya]
            predicted_tags = self.VITERBI(words)
            for (_, tag), tag_predicted in zip(vakya, predicted_tags):
                row = inner_p_tag.get(tag)
                if row is None:
                    # Gold tag unseen in training: it can never be predicted.
                    continue
                cm[row][inner_p_tag[tag_predicted]] += 1

        pred_actual = sum(cm[i][i] for i in range(size))

        print(pred_actual, word_count)
        print(cm)

    def predict(self):
        """Run the evaluation on the test file (see ``hmm_bi_gram``)."""
        self.hmm_bi_gram()


In [7]:
# Constructing the tagger calls train() from __init__, which reads the training file.
tagger = HindiTagger()
# predict() delegates to hmm_bi_gram(), which tags the test file and prints the
# confusion-matrix diagonal sum, the total word count, and the matrix itself.
tagger.predict()

81030 98896
[[4633, 15, 5, 142, 35, 325, 59, 7, 3, 236, 73, 107, 63, 37, 13, 41, 19, 50, 72, 26, 481, 211], [3, 16269, 1, 65, 20, 9, 151, 2, 1, 19, 5, 5, 10, 4, 7, 3, 21, 2, 14, 1, 91, 2], [73, 11, 4590, 75, 12, 34, 0, 1, 8, 30, 98, 12, 8, 3, 6, 16, 14, 8, 53, 3, 173, 11], [34, 29, 3, 6967, 170, 48, 14, 0, 3, 27, 12, 26, 62, 5, 25, 24, 44, 21, 56, 45, 220, 22], [6, 3, 1, 196, 3413, 4, 0, 0, 2, 11, 2, 5, 6, 1, 0, 4, 1, 1, 4, 2, 26, 8], [763, 74, 53, 385, 152, 16995, 225, 19, 16, 472, 466, 170, 475, 78, 76, 152, 304, 216, 459, 221, 1613, 397], [31, 47, 10, 96, 32, 207, 6388, 8, 10, 224, 184, 213, 138, 25, 32, 51, 34, 18, 284, 97, 728, 27], [0, 8, 0, 1, 2, 0, 0, 2190, 0, 14, 10, 4, 3, 4, 8, 0, 1, 1, 8, 1, 64, 2], [35, 3, 0, 5, 2, 1, 0, 0, 9511, 25, 17, 1, 5, 0, 2, 1, 0, 0, 1, 0, 23, 5], [131, 64, 25, 163, 65, 384, 92, 13, 22, 3417, 200, 214, 156, 44, 16, 68, 21, 14, 205, 24, 1433, 88], [3, 0, 2, 0, 0, 4, 4, 0, 0, 5, 76, 1, 0, 1, 1, 0, 3, 2, 0, 2, 11, 1], [0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 3, 