In [1]:
import nltk
import random
import numpy as np
import pandas as pd
import pprint, time
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

'''
Reading the input file and storing the 
values of the 3 columns of each row in
a tuple: (<Word>, <POS_TAG>, <CHUNK_TAG>)
'''
f = open("train.txt", "r")
sentence_corpus = []
sentence = []

for line in f:
    line = line.strip()
    if line == "":
        sentence_corpus.append(sentence)
        sentence = []
    else:
        word, pos_tag, _ = line.split(" ")
        #ignoring the chunk tag for this task
        sentence.append((word, pos_tag))
f.close()

# Add the last sentence (if any)
if sentence:
    sentence_corpus.append(sentence)

In [2]:
print(sentence_corpus[:2])

[[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN'), ('from', 'IN'), ('July', 'NNP'), ('and', 'CC'), ('August', 'NNP'), ("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS'), ('.', '.')], [('Chancellor', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('Exchequer', 'NNP'), ('Nigel', 'NNP'), ('Lawson', 'NNP'), ("'s", 'POS'), ('restated', 'VBN'), ('commitment', 'NN'), ('to', 'TO'), ('a', 'DT'), ('firm', 'NN'), ('monetary', 'JJ'), ('policy', 'NN'), ('has', 'VBZ'), ('helped', 'VBN'), ('to', 'TO'), ('prevent', 'VB'), ('a', 'DT'), ('freefall', 'NN'), ('in', 'IN'), ('s

In [3]:
'''
First implementation: Vertebi Algorithm from scratch
Note: Time consuming: Test data running for more than
      3 Hours.
'''

'\nFirst implementation: Vertebi Algorithm from scratch\nNote: Time consuming: Test data running for more than\n      3 Hours.\n'

In [4]:
#Splitting the corpus data into train_data and test_data (validadtion) (80/20 split)
train_set,test_set =train_test_split(sentence_corpus,train_size=0.80,test_size=0.20,random_state = 101)

# List of all the tags in the train and the test set (it may not be unique)
train_tag_corpus = [ t for sentence in train_set for t in sentence ]
test_tag_corpus = [ t for sentence in test_set for t in sentence ]
print(len(train_tag_corpus))
print(len(test_tag_corpus))

170288
41439


In [5]:
print(train_tag_corpus[:20])

[('Besides', 'IN'), ('sacking', 'VBG'), ('other', 'JJ'), ('senior', 'JJ'), ('Politburo', 'NNP'), ('officials', 'NNS'), ('who', 'WP'), ('allied', 'VBD'), ('themselves', 'PRP'), ('with', 'IN'), ('Mr.', 'NNP'), ('Honecker', 'NNP'), (',', ','), ('Mr.', 'NNP'), ('Krenz', 'NNP'), ('could', 'MD'), ('loosen', 'VB'), ('controls', 'NNS'), ('on', 'IN'), ('the', 'DT')]


In [6]:
# Finding number of unique tags and words (Vocabulary)
train_tag_set = {tag for word, tag in train_tag_corpus}
vocab = {word for word, tag in train_tag_corpus}

In [7]:
#Methods to compute transition and emission

'''
prev_tag -> current_tag 
Pr(current_tag | prev_tag) = (# of prev_tag -> current_tag)/(# of prev_tag)
'''
def computeTransition(prev_tag, current_tag):
    tags = [tag for _, tag in train_tag_corpus]
    
    #Count of prev_tag
    cnt_prev_tag = len([tag for tag in tags if tag == prev_tag])
    cnt_prev_curr_tag = 0
    
    for i in range(1, len(tags)):
        if tags[i-1] == prev_tag and tags[i] == current_tag:
            cnt_prev_curr_tag += 1
    
    return cnt_prev_curr_tag / cnt_prev_tag

In [119]:
#The crux of HMM is the emission and transition probabilities

#Transition
transition = np.zeros((len(train_tag_set), len(train_tag_set)), dtype='float32')
train_tag_list = list(train_tag_set)
for i in range(len(train_tag_list)):
    for j in range(len(train_tag_list)):
        transition[i,j] = computeTransition(train_tag_list[i], train_tag_list[j])

In [125]:
# compute Emission Probability
def computeEmission(word, tag):
    train_bag = train_tag_corpus
    tag_list = [tg for tg in train_bag if tg[1]==tag]
    cnt_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
 
    return count_w_given_tag / cnt_tag

In [118]:
# emission = {}
# vocab_list = list(vocab)
# for word in vocab_list:
#     for tag in train_tag_list:
#         if (word, tag) in emission.keys():
#             continue
#         else:
#             emission[(word, tag)] = computeEmission(word, tag)

In [126]:
tags_df = pd.DataFrame(transition, columns = list(train_tag_list), index=list(train_tag_list))

In [55]:
import concurrent.futures
import threading

def viterbi_memoization_threaded(words):
    train_bag = train_tag_corpus
    tags = list(set([pair[1] for pair in train_bag]))
    
    # initialize memoization dictionary
    memo = {}
    lock = threading.Lock()
    
    # initialize probability matrix
    T = len(words)
    prob_matrix = np.zeros((T, len(tags)))
    
    # fill in first column of probability matrix
    for i, tag in enumerate(tags):
        if (words[0], tag) in memo:
            emission_p = memo[(words[0], tag)]
        else:
            emission_p = computeEmission(words[0], tag)[0] / word_given_tag(words[0], tag)[1]
            memo[(words[0], tag)] = emission_p
        prob_matrix[0][i] = tags_df.loc['.', tag] * emission_p
        
    # define worker function for multithreading
    def worker(i, j, tag, tags):
        max_prob = 0
        for k, prev_tag in enumerate(tags):
            transition_p = tags_df.loc[prev_tag, tag]
            prob = prob_matrix[i-1][k] * transition_p
            if prob > max_prob:
                max_prob = prob
                lock.acquire()
                if (words[i], tag) in memo:
                    emission_p = memo[(words[i], tag)]
                else:
                    emission_p = computeEmission(words[i], tag)[0] / word_given_tag(words[i], tag)[1]
                    memo[(words[i], tag)] = emission_p
                prob_matrix[i][j] = max_prob * emission_p
                lock.release()
    
    # fill in remaining columns of probability matrix using multithreading
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for i in range(1, T):
            for j, tag in enumerate(tags):
                futures.append(executor.submit(worker, i, j, tag, tags))
        concurrent.futures.wait(futures)
    print("Aman")
    
    print(prob_matrix)
    # backtrack to find optimal sequence of tags
    state = []
    max_prob = max(prob_matrix[-1])
    prev_tag = None
    for i in range(T-1, -1, -1):
        for j, tag in enumerate(tags):
            if prob_matrix[i][j] == max_prob:
                if prev_tag:
                    state.insert(0, prev_tag)
                max_prob /= memo[(words[i], tag)]
                max_prob /= tags_df.loc[prev_tag, tag]
                prev_tag = tag
                break
    
    state.insert(0, prev_tag)
    return list(zip(words, state))


In [53]:
import threading

def viterbi_parallel(words, train_bag=train_tag_corpus):
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    # create a lock
    lock = threading.Lock()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # create list of tasks to be executed concurrently
        tasks = []
        for key, word in enumerate(words):
            task = executor.submit(process_word, key, word, state, T, lock)
            tasks.append(task)

        # wait for tasks to complete and collect results
        for task in concurrent.futures.as_completed(tasks):
            result = task.result()
            state = result[:]

    return list(zip(words, state))

def process_word(key, word, state, T, lock):
    # acquire the lock
    lock.acquire()

    #initialise list of probability column for a given observation
    p = []
    for tag in T:
        if key == 0:
            transition_p = tags_df.loc['.', tag]
        else:
            transition_p = tags_df.loc[state[-1], tag]

        # compute emission and state probabilities
        emission_p = computeEmission(word, tag)[0]/word_given_tag(word, tag)[1]
        state_probability = emission_p * transition_p
        p.append(state_probability)

    pmax = max(p)
    # getting state for which probability is maximum
    state_max = T[p.index(pmax)]

    # append the state_max to the state list
    state.append(state_max)

    # release the lock
    lock.release()

    return state

In [127]:
def Viterbi(words, train_bag = train_tag_corpus):
    pos = []
    dp = {}
    tags = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        p = []
        pmax = -float("inf")
        p_index = 0
        for i, tag in enumerate(tags):
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[pos[-1], tag]
            
            if (tag, word) in dp.keys():
                emission_p = dp[(tag, word)]
            else:
                emission_p = computeEmission(words[key], tag)
                dp[(tag, word)] = emission_p
                
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            if state_probability > pmax:
                pmax = state_probability
                p_index = i
             
        #pmax = max(p)
        # getting state for which probability is maximum
        pos_max = tags[p_index] 
        pos.append(pos_max)
    return list(zip(words, pos))

In [272]:
rndom = [random.randint(1,len(test_set)) for x in range(10)]
test_run = [test_set[i] for i in rndom]
test_run_base = [tup for sent in test_run for tup in sent]
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [273]:
# test_tagged_words = [tup for sent in test_set for tup in sent]
# test_untagged_words = [tup[0] for sent in test_set for tup in sent]
# #test_untagged_words

In [274]:
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
diff = end-start
print("Time = ", diff)
# print(tagged_seq[:10])
# #accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

 #Accuracy of random 10 sentences on the split test data set is 94% using Viterbi.

Time =  109.052237033844
Viterbi Algorithm Accuracy:  88.47736625514403


In [124]:
'''
Second Implementation: Using NLTK's hmm
Takes less time and easier to impement
'''

"\nSecond Implementation: Using NLTK's hmm\nTakes less time and easier to impement\n"

In [182]:
#Creating HMM object
HmmModel = nltk.HiddenMarkovModelTagger.train(train_set)

true_pos_tags = [tag for sentences in test_run for word, tag in sentences]

predicted_pos_tags=[]
for sentences in test_run:
    predicted_pos_tags += [tag for _, tag in HmmModel.tag([word for word, _ in sentences])]

In [172]:
#Accuracy
print (classification_report(true_pos_tags, predicted_pos_tags))
#Accuracy of random 10 sentences on the split test data set is 95% using nltk's hmm

              precision    recall  f1-score   support

           $       0.67      1.00      0.80         2
          ''       0.00      0.00      0.00         0
           ,       1.00      1.00      1.00        12
           .       0.91      1.00      0.95        10
          CC       1.00      1.00      1.00         9
          CD       0.90      1.00      0.95         9
          DT       0.95      1.00      0.98        21
          EX       1.00      1.00      1.00         1
          IN       1.00      1.00      1.00        24
          JJ       0.94      0.89      0.91        18
         JJR       1.00      1.00      1.00         1
          MD       0.75      1.00      0.86         3
          NN       0.97      0.92      0.95        39
         NNP       0.93      1.00      0.97        14
         NNS       1.00      0.83      0.91        18
         POS       1.00      1.00      1.00         1
         PRP       1.00      1.00      1.00         9
        PRP$       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [251]:
true_pos_tags = [tag for sentences in test_set for word, tag in sentences]

predicted_pos_tags=[]
for sentences in test_set:
    predicted_pos_tags += [tag for _, tag in HmmModel.tag([word for word, _ in sentences])]

In [252]:
#Accuracy
print (classification_report(true_pos_tags, predicted_pos_tags))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           #       1.00      1.00      1.00         6
           $       0.81      1.00      0.90       311
          ''       0.77      1.00      0.87       297
           (       0.97      0.97      0.97        39
           )       0.64      0.92      0.76        39
           ,       0.98      1.00      0.99      2127
           .       0.95      1.00      0.97      1767
           :       0.99      0.99      0.99       203
          CC       0.97      1.00      0.99      1054
          CD       0.95      0.91      0.93      1599
          DT       0.95      0.99      0.97      3576
          EX       1.00      0.89      0.94        46
          FW       1.00      0.10      0.18        10
          IN       0.97      0.99      0.98      4399
          JJ       0.90      0.86      0.88      2549
         JJR       0.86      0.93      0.89       169
         JJS       0.98      0.87      0.92        75
          MD       0.92    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Accuracy on the split test data set is 93%

In [219]:
patterns = [
    (r'.*ing$', 'VBG'),               # gerund
    (r'.*ed$', 'VBD'),                # past tense 
    (r'.*es$', 'VBZ'),                # verb    
    (r'.*end$', 'VB'),
    (r'^[A-Z].*$', 'NNP'),            # possessive nouns
    (r'\b\w+s\b', 'NNS'),             # plural nouns
    (r'\b\w+NN\b', 'NN'),             # singuular noun
    (r'\*T?\*?-[0-9]+$', 'X'),        # X
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                     # nouns
]
 
# rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)

In [220]:
print(rule_based_tagger.tag(['cat']))

[('cat', 'NN')]


In [253]:
i = 0
cntt = 0
for sentence in test_set:
    for word, _ in sentence:
        if predicted_pos_tags[i] == tmp:
            cntt += 1
            predicted_pos_tags[i] = rule_based_tagger.tag([word])[0][1]
        i += 1
print(cntt)

378


In [254]:
print(len(predicted_pos_tags))

41439


In [235]:
print (classification_report(true_pos_tags, predicted_pos_tags))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           #       1.00      1.00      1.00         6
           $       0.81      1.00      0.90       311
          ''       0.77      1.00      0.87       297
           (       0.97      0.97      0.97        39
           )       0.64      0.92      0.76        39
           ,       0.98      1.00      0.99      2127
           .       0.95      1.00      0.97      1767
           :       0.99      0.99      0.99       203
          CC       0.97      1.00      0.99      1054
          CD       0.95      0.91      0.93      1599
          DT       0.95      0.99      0.97      3576
          EX       1.00      0.89      0.94        46
          FW       1.00      0.10      0.18        10
          IN       0.97      0.99      0.98      4399
          JJ       0.90      0.86      0.88      2549
         JJR       0.86      0.93      0.89       169
         JJS       0.98      0.87      0.92        75
          MD       0.92    

  _warn_prf(average, modifier, msg_start, len(result))


In [236]:
print(true_pos_tags[:50], predicted_pos_tags[:50],tmp)

['NNP', 'NNP', 'NNP', 'NNP', 'NNP', ',', 'VBG', 'DT', 'NN', ',', 'VBD', 'PRP', 'MD', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NN', '.', 'CC', 'WRB', 'JJ', 'MD', 'NNS', 'VB', '.', '``', 'PRP', 'VBZ', 'RB', 'DT', 'JJ', ':', 'PRP', 'VBZ', 'TO', 'VB', 'DT', 'NN', 'IN', 'DT', 'NN', '.', "''", 'DT', 'NN', 'VBZ', 'JJ', 'NNP'] ['NNP', 'NNP', 'NNP', 'NNP', 'NNP', ',', 'VBG', 'DT', 'NN', ',', 'VBD', 'PRP', 'MD', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NN', '.', 'CC', 'WRB', 'RB', 'MD', 'VB', 'NN', '.', 'NN', 'PRP', 'VBZ', 'RB', 'DT', 'NN', ':', 'PRP', 'VBZ', 'TO', 'VB', 'DT', 'NN', 'IN', 'DT', 'NN', '.', "''", 'DT', 'NN', 'VBZ', 'JJ', 'NNP'] ``


In [262]:
f_out = open("test_data.txt", "r")
test_corpus = []

for line in f_out:
    line = line.strip()
    if line == "":
        continue
    else:
        test_corpus.append(line)
f_out.close()

In [263]:
test_corpus[:5]

['Rockwell', 'International', 'Corp.', "'s", 'Tulsa']

In [264]:
predicted_pos_tags=[]
for word in test_corpus:
    predicted_pos_tags += [tag for _, tag in HmmModel.tag([word])]

In [265]:
cnt = 0
for i in predicted_pos_tags:
    if i == predicted_pos_tags[0]:
        cnt += 1
print(cnt, len(predicted_pos_tags))

5435 47377


In [269]:
predicted_pos_tags[:20]

['``',
 'NNP',
 'NNP',
 'VBZ',
 '``',
 'NN',
 'VBD',
 'PRP',
 'VBN',
 'DT',
 'JJ',
 'NN',
 '``',
 'PRP$',
 'NN',
 'IN',
 'NNP',
 'NNP',
 'TO',
 'VB']

In [270]:
predicted_pos_tags[0] == '``'

True

In [271]:
cnttt = 0
for i in predicted_pos_tags:
    if i == tmp:
        cnttt += 1
print(cnttt)

5435


In [260]:
i = 0
for word in test_corpus:
    if predicted_pos_tags[i] == tmp:
        predicted_pos_tags[i] = rule_based_tagger.tag([word])[0][1]
    i += 1
    

In [261]:
predicted_pos_tags[:20]

['NNP',
 'NNP',
 'NNP',
 'VBZ',
 'NNP',
 'NN',
 'VBD',
 'PRP',
 'VBN',
 'DT',
 'JJ',
 'NN',
 'VBG',
 'PRP$',
 'NN',
 'IN',
 'NNP',
 'NNP',
 'TO',
 'VB']

In [180]:
f_read = open("test_data_tagged.txt", "r")
data = []
for line in f_read:
    line = line.strip()
    data.append(line)
f_read.close()

In [181]:
f_write = open("test_data_tagged.txt", "w")
j = 0
for line in data:
    if line == '':
        f_write.writelines(line + '\n')
        continue
    else:
        f_write.writelines(line + " " + predicted_pos_tags[j] + '\n')
    j += 1
f_write.close()

In [185]:
cnt = 0
for i in predicted_pos_tags:
    if i == predicted_pos_tags[0]:
        cnt += 1
print(cnt, len(predicted_pos_tags))

9 252


In [224]:
tmp

'``'

In [223]:
tmp = predicted_pos_tags[0]

In [None]:
'``'