In [122]:
import numpy as np
import nltk
from hmmlearn import hmm
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
from collections import defaultdict, Counter
'''
Reading the input file and storing the 
values of the 3 columns of each row in
a tuple: (<Word>, <POS_TAG>, <CHUNK_TAG>)
'''
f = open("train.txt", "r")
sentence_corpus = []
sentence = []

for line in f:
    line = line.strip()
    if line == "":
        sentence_corpus.append(sentence)
        sentence = []
    else:
        word, pos_tag, _ = line.split(" ")
        #ignoring the chunk tag for this task
        sentence.append((word, pos_tag))
f.close()

# Add the last sentence (if any)
if sentence:
    sentence_corpus.append(sentence)

In [90]:
'''
Splitting the data into train and test
to improve accuracy. Splitting is done
in order to maintain the ordering of the
sentences.
'''
# train_size = int(len(sentence_corpus) * 0.8)
# train_data = sentence_corpus[:train_size]
# test_data = sentence_corpus[train_size:]

In [91]:
print(sentence_corpus[:2])

[[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN'), ('from', 'IN'), ('July', 'NNP'), ('and', 'CC'), ('August', 'NNP'), ("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS'), ('.', '.')], [('Chancellor', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('Exchequer', 'NNP'), ('Nigel', 'NNP'), ('Lawson', 'NNP'), ("'s", 'POS'), ('restated', 'VBN'), ('commitment', 'NN'), ('to', 'TO'), ('a', 'DT'), ('firm', 'NN'), ('monetary', 'JJ'), ('policy', 'NN'), ('has', 'VBZ'), ('helped', 'VBN'), ('to', 'TO'), ('prevent', 'VB'), ('a', 'DT'), ('freefall', 'NN'), ('in', 'IN'), ('s

In [92]:
for sent in sentence_corpus[:2]:
  for tuple in sent:
    print(tuple)

('Confidence', 'NN')
('in', 'IN')
('the', 'DT')
('pound', 'NN')
('is', 'VBZ')
('widely', 'RB')
('expected', 'VBN')
('to', 'TO')
('take', 'VB')
('another', 'DT')
('sharp', 'JJ')
('dive', 'NN')
('if', 'IN')
('trade', 'NN')
('figures', 'NNS')
('for', 'IN')
('September', 'NNP')
(',', ',')
('due', 'JJ')
('for', 'IN')
('release', 'NN')
('tomorrow', 'NN')
(',', ',')
('fail', 'VB')
('to', 'TO')
('show', 'VB')
('a', 'DT')
('substantial', 'JJ')
('improvement', 'NN')
('from', 'IN')
('July', 'NNP')
('and', 'CC')
('August', 'NNP')
("'s", 'POS')
('near-record', 'JJ')
('deficits', 'NNS')
('.', '.')
('Chancellor', 'NNP')
('of', 'IN')
('the', 'DT')
('Exchequer', 'NNP')
('Nigel', 'NNP')
('Lawson', 'NNP')
("'s", 'POS')
('restated', 'VBN')
('commitment', 'NN')
('to', 'TO')
('a', 'DT')
('firm', 'NN')
('monetary', 'JJ')
('policy', 'NN')
('has', 'VBZ')
('helped', 'VBN')
('to', 'TO')
('prevent', 'VB')
('a', 'DT')
('freefall', 'NN')
('in', 'IN')
('sterling', 'NN')
('over', 'IN')
('the', 'DT')
('past', 'JJ')
('

In [97]:
#Splitting the corpus data into train_data and test_data (validadtion) (80/20 split)
train_set,test_set =train_test_split(sentence_corpus,train_size=0.80,test_size=0.20,random_state = 101)

# List of all the tags in the train and the test set (it may not be unique)
train_tag_corpus = [ t for sentence in train_set for t in sentence ]
test_tag_corpus = [ t for sentence in test_set for t in sentence ]
print(len(train_tag_corpus))
print(len(test_tag_corpus))

170288
41439


In [99]:
print(train_tag_corpus[:20])

[('Besides', 'IN'), ('sacking', 'VBG'), ('other', 'JJ'), ('senior', 'JJ'), ('Politburo', 'NNP'), ('officials', 'NNS'), ('who', 'WP'), ('allied', 'VBD'), ('themselves', 'PRP'), ('with', 'IN'), ('Mr.', 'NNP'), ('Honecker', 'NNP'), (',', ','), ('Mr.', 'NNP'), ('Krenz', 'NNP'), ('could', 'MD'), ('loosen', 'VB'), ('controls', 'NNS'), ('on', 'IN'), ('the', 'DT')]


In [105]:
# Finding number of unique tags and words (Vocabulary)
train_tag_set = {tag for word, tag in train_tag_corpus}
vocab = {word for word, tag in train_tag_corpus}

In [110]:
#Methods to compute transition and emission

'''
prev_tag -> current_tag 
Pr(current_tag | prev_tag) = (# of prev_tag -> current_tag)/(# of prev_tag)
'''
def computeTransition(prev_tag, current_tag):
    tags = [tag for _, tag in train_tag_corpus]
    
    #Count of prev_tag
    cnt_prev_tag = len([tag for tag in tags if tag == prev_tag])
    cnt_prev_curr_tag = 0
    
    for i in range(1, len(tags)):
        if tags[i-1] == prev_tag and tags[i] == current_tag:
            cnt_prev_curr_tag += 1
    
    return cnt_prev_curr_tag / cnt_prev_tag

In [118]:
#The crux of HMM is the emission and transition probabilities

#Transition
transition = np.zeros((len(train_tag_set), len(train_tag_set)), dtype='float32')
train_tag_list = list(train_tag_set)
for i in range(len(train_tag_list)):
    for j in range(len(train_tag_list)):
        transition[i,j] = computeTransition(train_tag_list[i], train_tag_list[j])

In [120]:
# compute Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
#now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
 
     
    return (count_w_given_tag, count_tag)

In [124]:
tags_df = pd.DataFrame(transition, columns = list(train_tag_list), index=list(train_tag_list))
display(tags_df)

Unnamed: 0,NN,NNPS,VBZ,#,RBS,RB,PRP$,PRP,IN,RP,...,SYM,.,NNS,PDT,VBP,CC,TO,JJ,POS,WP$
NN,0.115926,0.000207,0.038587,4.1e-05,0.000165,0.01731,0.000165,0.005082,0.247304,8.3e-05,...,4.1e-05,0.104896,0.085891,4.1e-05,0.003553,0.039703,0.041727,0.009543,0.022681,0.000248
NNPS,0.073964,0.005917,0.029586,0.0,0.0,0.002959,0.0,0.0,0.112426,0.0,...,0.0,0.106509,0.014793,0.0,0.06213,0.085799,0.026627,0.014793,0.032544,0.0
VBZ,0.039591,0.0,0.001347,0.0,0.001616,0.133854,0.008618,0.023431,0.096687,0.001616,...,0.0,0.029087,0.016967,0.000269,0.001616,0.003232,0.042015,0.077027,0.0,0.0
#,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RBS,0.039216,0.0,0.0,0.0,0.0,0.078431,0.0,0.0,0.20915,0.0,...,0.0,0.013072,0.150327,0.0,0.013072,0.0,0.0,0.411765,0.0,0.0
RB,0.008116,0.000189,0.039826,0.000378,0.000378,0.059456,0.002454,0.010004,0.134579,0.0,...,0.0,0.061533,0.004908,0.000566,0.030389,0.006606,0.025104,0.102492,0.000566,0.0
PRP$,0.448366,0.000654,0.0,0.0,0.005882,0.00719,0.0,0.000654,0.0,0.0,...,0.0,0.001307,0.194118,0.0,0.000654,0.0,0.0,0.218954,0.0,0.0
PRP,0.001633,0.0,0.217903,0.0,0.000327,0.05227,0.0,0.003267,0.04149,0.0,...,0.0,0.032342,0.001307,0.0,0.192421,0.006534,0.012088,0.010454,0.0,0.0
IN,0.111081,0.00196,0.001416,0.000762,0.001797,0.015137,0.037354,0.030438,0.029349,0.0,...,0.0,0.000926,0.061966,0.001416,0.000436,0.00196,0.003485,0.083311,0.0,0.0
RP,0.059701,0.0,0.0,0.0,0.0,0.0,0.044776,0.0,0.328358,0.0,...,0.0,0.0,0.074627,0.0,0.0,0.0,0.014925,0.104478,0.0,0.0


In [129]:
def Viterbi_memoization(words, train_bag=train_tag_corpus):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    # memoization dictionary
    memo = {}
    
    for key, word in enumerate(words):
        # initialise list of probability column for a given observation
        p = []
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]

            # compute emission and state probabilities
            # check memoization dictionary first
            if (key, tag) in memo:
                emission_p = memo[(key, tag)]
            else:
                emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
                memo[(key, tag)] = emission_p

            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)]
        state.append(state_max)
        
    return list(zip(words, state))

In [134]:
# Let's test our Viterbi algorithm on a few sample sentences of test dataset
random.seed(1234)      #define a random seed to get same sentences when run multiple times
 
# choose random 10 numbers
rndom = [random.randint(1,len(test_set)) for x in range(10)]
 
# list of 10 sents on which we test the model
test_run = [test_set[i] for i in rndom]
 
# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]
 
# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [135]:
#Here We will only test 10 sentences to check the accuracy
#as testing the whole training set takes huge amount of time
start = time.time()
tagged_seq = Viterbi_memoization(test_tagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)
 
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Time taken in seconds:  194.15836691856384
Viterbi Algorithm Accuracy:  95.87628865979381


In [137]:
#(takes alot of time to run s0 we wont run it here)
# tagging the test sentences()
test_tagged_words = [tup for sent in test_set for tup in sent]
test_untagged_words = [tup[0] for sent in test_set for tup in sent]
test_untagged_words
 
start = time.time()
tagged_seq = Viterbi_memoization(test_untagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)
 
# accuracy
check = [i for i, j in zip(test_tagged_words, test_untagged_words) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

KeyboardInterrupt: 