## Assignment 3
### Foundations of Machine Learning (CS564)

### *HMM model for PoS tagging on the Brown dataset*

<table style=\"font-size:15px\">
    <thead>
        <td><b>Name of Student</b></td>
        <td><b>Roll No.</b></td>
        <td><b>Date</b></td>
    </thead>
    <tr>
        <td>M. Maheeth Reddy</td>
        <td>1801CS31</td>
        <td>09-Nov-2021</td>
    </tr>
</table>

# import libraries

In [1]:
from os import stat
import numpy as np
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import KFold, train_test_split

# function to get all the states for hmm

In [2]:
def get_states(state_seq):
    states = dict(Counter(state_seq))
    return states

# function to create the initial probabilities

In [3]:
def create_initial_prob(states):
    return {state: states[state]/sum(states.values()) for state in states.keys()}

# function to create the state transition probabilities

In [4]:
def create_transition_prob(data_seq, states):
    transit_prob = {}

    for row in states.keys():
        transit_prob[row] = {}
        for col in states.keys():
            transit_prob[row][col] = 0

    for sample_seq in data_seq:
        for i in range(len(sample_seq)-1):
            transit_prob[sample_seq[i]][sample_seq[i+1]] += 1

    for row in states.keys():
        for col in states.keys():
            transit_prob[row][col] = (transit_prob[row][col]+1)/(states[row]+len(states))

    return transit_prob

# function to create the emission probabilities

In [5]:
def create_emission_prob(data_obs, data_seq, states, corpus):
    emiss_prob = {}

    for state in states.keys():
        emiss_prob[state] = {}
        for obs in corpus.keys():
            emiss_prob[state][obs] = 0

    for t in range(len(data_seq)):
        for w in range(len(data_seq[t])):
            emiss_prob[data_seq[t][w]][data_obs[t][w]] += 1

    for state in states.keys():
        for obs in corpus.keys():
            emiss_prob[state][obs] = (emiss_prob[state][obs]+1)/(states[state]+len(corpus))

    return emiss_prob

# function to implement Viterbi algorithm

In [6]:
def viterbi_algorithm(initial_prob, transit_prob, emiss_prob, states, obs_seq):
    i = 0
    tree = [{}]
    state_keys = list(states.keys())

    for state in state_keys:
        tree[0][state] = {'p':initial_prob[state]*emiss_prob[state][obs_seq[0]] if emiss_prob[state].get(obs_seq[0]) != None else 0, 'prev':-1}

    for i in range(1, len(obs_seq)):
        tree.append({})

        for curr_state in state_keys:
            max_prob = 0
            prev = state_keys[0]

            for prev_state in state_keys:
                prob = tree[i-1][prev_state]['p']*transit_prob[prev_state][curr_state]*emiss_prob[curr_state][obs_seq[i]] if emiss_prob[curr_state].get(obs_seq[i]) != None else 0
                
                if prob >= max_prob:
                    max_prob = prob
                    prev = prev_state

            tree[i][curr_state] = {'p': max_prob, 'prev': prev}

    max_prob_state = state_keys[0]
    for state in state_keys[1:]:
        if tree[i][state]['p'] > tree[i][max_prob_state]['p']:
            max_prob_state = state

    pred_state_seq = [max_prob_state]
    while(i > 0):
        pred_state_seq.append(tree[i][max_prob_state]['prev'])
        max_prob_state = tree[i][max_prob_state]['prev']
        i -= 1

    pred_state_seq.reverse()
    return pred_state_seq

# function calculate the forward probabilities

In [7]:
def get_forward_prob(initial_prob, transit_prob, emiss_prob, states, obs_seq):
    state_keys = list(states.keys())
    alpha = [{}]
    for state in state_keys:
        alpha[0][state] = initial_prob[state]*emiss_prob[state][obs_seq[0]] if emiss_prob[state].get(obs_seq[0]) != None else 0

    for t in range(1, len(obs_seq)):
        alpha.append({})
        for curr_state in state_keys:
            sum_prob = 0
            for prev_state in state_keys:
                sum_prob += (alpha[t-1][prev_state]*transit_prob[prev_state][curr_state])

            alpha[t][curr_state] = emiss_prob[curr_state][obs_seq[t]]*sum_prob if emiss_prob[curr_state].get(obs_seq[t]) != None else 0

    return alpha

# function calculate the backward probabilities

In [8]:
def get_backward_prob(initial_prob, transit_prob, emiss_prob, states, obs_seq):
    # T = len(obs_seq)
    state_keys = list(states.keys())
    beta = [{} for i in range(len(obs_seq))]

    for state in state_keys:
        beta[len(obs_seq)-1][state] = 1

    for t in range(len(obs_seq)-2, -1, -1):
        for curr_state in state_keys:
            sum_prob = 0
            for next_state in state_keys:
                if emiss_prob[next_state].get(obs_seq[t+1]) != None:
                    sum_prob += (beta[t+1][next_state]*transit_prob[curr_state][next_state]*emiss_prob[next_state][obs_seq[t+1]])

            beta[t][curr_state] = sum_prob

    return beta

In [9]:
def get_temp_variables(initial_prob, transit_prob, emiss_prob, states, alpha, beta, obs_seq):
    state_keys = list(states.keys())
    y = [{} for i in range(len(obs_seq))]
    epi = [{} for i in range(len(obs_seq))]

    for t in range(0, len(obs_seq)):
        for state in state_keys:
            sum_y = 0
            for all_s in state_keys:
                sum_y += (alpha[t][all_s]*beta[t][all_s])

            y[t][state] = (alpha[t][state]*beta[t][state])/sum_y if sum_y > 0 else 0

    for t in range(0, len(obs_seq)-1):
        for i in state_keys:
            epi[t][i] = {}

            for j in state_keys:
                sum_epi = 0
                for k in state_keys:
                    for w in state_keys:
                        if emiss_prob[w].get(obs_seq[t+1]) != None:
                            sum_epi += (alpha[t][k]*transit_prob[k][w]*beta[t+1][w]*emiss_prob[w][obs_seq[t+1]])

                if (emiss_prob[j].get(obs_seq[t+1]) != None) and sum_epi > 0:
                    epi[t][i][j] = (alpha[t][i]*transit_prob[i][j]*beta[t+1][j]*emiss_prob[j][obs_seq[t+1]])/sum_epi 
                else:
                    epi[t][i][j] = 0

    return y, epi

# function to train the model

In [10]:
def train(initial_prob, transit_prob, emiss_prob, states, x_train, y_train, epochs):
    samples = x_train.shape[0]
    state_keys = list(states.keys())

    for epoch in range(0,epochs):
        print('Epoch ', (epoch+1), end='\r')
        alpha,beta = [],[]
        y,epi = [],[]
        
        for r in range(0, samples):
            alpha.append(get_forward_prob(x_train[r]))
            beta.append(get_backward_prob(x_train[r]))
            temp1, temp2 = get_temp_variables(alpha[r], beta[r], x_train[r])
            y.append(temp1)
            epi.append(temp2)

        for state in state_keys:
            initial_prob[state] = 0
            for r in range(0, samples):
                initial_prob[state] += (y[r][0][state]/samples)

        for i in state_keys:
            for j in state_keys:
                num,den = 0,0
                for r in range(0, samples):
                    for t in range(0, len(epi[r])-1):
                        num += epi[r][t][i][j]
                        den += y[r][t][i]

                transit_prob[i][j] = num/den if den > 0 else 0

        for i in state_keys:
            for r in range(0, samples):
                for k in x_train[r]:
                    num,den = 0,0
                    for t in range(0, len(y[r])):
                        if x_train[t] == k:
                            num += y[r][t][i]
                        den += y[r][t][i]

                emiss_prob[i][j] = num/den if den > 0 else 0

    return initial_prob, transit_prob, emiss_prob

In [11]:
def get_obs_seq(sentences):
    corpus = {}
    state_seq = []

    data_obs,data_seq = [],[]

    for sent in sentences:
        words = sent.split(' ')
        sent_tag_list,sent_word_list = [],[]
        for word in words:
            word_split = word.rsplit('/', 1)
            word, tag = word_split[0], word_split[1]
            sent_tag_list.append(tag)
            sent_word_list.append(word)

            if corpus.get(word) == None:
                corpus[word] = 1
            else:
                corpus[word] += 1
            state_seq.append(tag)

        data_obs.append(sent_word_list)
        data_seq.append(sent_tag_list)

    return data_obs, data_seq, corpus, state_seq

# function to get the sentences from Brown Dataset

In [12]:
def get_all_sentences(filename):
    sentences = []
    with open(filename, 'r') as infile:
        for line in infile:
            sentences.append(line.rstrip())
    print('Number of sentences in the brown dataset are', len(sentences))
    return sentences

# function for all predictions

In [13]:
def get_and_write_preds(data_obs, outfile, init, trans, emi, states):
    outputfile = open(outfile, 'w')
    all_predictions = []
    for words in data_obs:
        pred_seq = viterbi_algorithm(init, trans, emi, states, words)
        all_predictions.append(pred_seq)
        for i in range(len(pred_seq)):
            outputfile.write(words[i] + '\t' + pred_seq[i] + '\n')
        outputfile.write('\n')

    outputfile.close()
    return all_predictions

In [14]:
def flatten_state_seq(data_seq):
    vec = []
    for seq in data_seq:
        vec.extend(seq)

    return vec

# driver function

In [15]:
def driver(filename, outfile):
    sentences = get_all_sentences(filename)
    train_sentences, test_sentences = train_test_split(sentences, test_size=0.2, random_state=117)
    data_obs, data_seq, corpus, state_seq = get_obs_seq(train_sentences)
    test_data_obs, test_data_seq, test_corpus, test_state_seq = get_obs_seq(test_sentences)
    len_corpus = len(corpus)
    states = dict(Counter(state_seq))
    state_keys = list(states.keys())

    kf = KFold(n_splits=5, shuffle=False)

    np_obs = np.array(data_obs)
    np_seq = np.array(data_seq)

    for train_index, test_index in kf.split(np_obs):
        x_train, y_train = np_obs[train_index], np_seq[train_index]
        x_test, y_test = np_obs[test_index], np_seq[test_index]

        init = create_initial_prob(states)
        trans = create_transition_prob(y_train, states)
        emi = create_emission_prob(x_train, y_train, states, corpus)

        pred_seq = []
        for t in range(0, len(x_test)):
            pred_seq.extend(viterbi_algorithm(init, trans, emi, states, x_test[t]))

        print(precision_recall_fscore_support(flatten_state_seq(y_test), pred_seq, labels=state_keys),end='\n\n')

    init = create_initial_prob(states)
    trans = create_transition_prob(y_train, states)
    emi = create_emission_prob(x_train, y_train, states, corpus)

    test_pred_seq = get_and_write_preds(test_data_obs, outfile, init, trans, emi, states)
    with open('precision_recall_fscore.txt', 'w') as infile:
        print('precision, recall and fscore respectively\n',file=infile)
        print(precision_recall_fscore_support(flatten_state_seq(test_data_seq),flatten_state_seq(test_pred_seq), labels=state_keys), file=infile)
    
    return

# Results

In [16]:
driver("Brown_train.txt", "test_pred.txt")

Number of sentences in the brown dataset are 27491


  np_obs = np.array(data_obs)
  np_seq = np.array(data_seq)


(array([0.86756542, 0.93788922, 0.98386531, 0.95824176, 0.89314463,
       0.84454303, 0.97751346, 0.91765221, 0.86735898, 0.91824752,
       0.99097938, 0.152     ]), array([0.98637548, 0.89421875, 0.98490169, 0.88779666, 0.96612764,
       0.8203758 , 0.99698929, 0.96100917, 0.83465819, 0.82911909,
       0.70875576, 0.44705882]), array([0.92316347, 0.91553351, 0.98438323, 0.92167511, 0.92820371,
       0.83228402, 0.98715532, 0.93883038, 0.85069444, 0.8714102 ,
       0.8264374 , 0.22686567]), array([ 9982, 19200,  2848, 14242, 10451,  5801, 11293,  4360,  4403,
        2452,  1085,    85]))

(array([0.87623805, 0.93923751, 0.98381422, 0.95954147, 0.89173296,
       0.84214223, 0.97681357, 0.91853809, 0.86710189, 0.91331546,
       0.98985507, 0.04481132]), array([0.98034303, 0.89432185, 0.98485382, 0.88611361, 0.96717314,
       0.82151422, 0.99634814, 0.96462428, 0.83460503, 0.82253521,
       0.6878147 , 0.17431193]), array([0.92537178, 0.91622954, 0.98433374, 0.92136691, 0.92792