## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries
import nltk

In [3]:
# nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/shakeeb/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [4]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [6]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(nltk_data, test_size=0.05)

In [27]:
train_set[0]

[('Paul', 'NOUN'),
 ('Sandifer', 'NOUN'),
 (',', '.'),
 ('director', 'NOUN'),
 ('of', 'ADP'),
 ('testing', 'NOUN'),
 ('for', 'ADP'),
 ('the', 'DET'),
 ('South', 'NOUN'),
 ('Carolina', 'NOUN'),
 ('department', 'NOUN'),
 ('of', 'ADP'),
 ('education', 'NOUN'),
 (',', '.'),
 ('says', 'VERB'),
 ('0', 'X'),
 ('Mr.', 'NOUN'),
 ('Cannell', 'NOUN'),
 ("'s", 'PRT'),
 ('allegations', 'NOUN'),
 ('of', 'ADP'),
 ('cheating', 'NOUN'),
 ('``', '.'),
 ('are', 'VERB'),
 ('purely', 'ADV'),
 ('without', 'ADP'),
 ('foundation', 'NOUN'),
 (',', '.'),
 ("''", '.'),
 ('and', 'CONJ'),
 ('based', 'VERB'),
 ('on', 'ADP'),
 ('unfair', 'ADJ'),
 ('inferences', 'NOUN'),
 ('.', '.')]

In [16]:
# list of tagged words
tagged_words = [tup for sent in train_set for tup in sent]
len(tagged_words)

95780

In [20]:
# list of tokens
tokens = [word for word, tags in tagged_words]
v = len(set(tokens))
v

12084

In [33]:
# list of tags
tags = set([tag for word, tag in tagged_words])
t = len(tags)
t

12

### Build the vanilla Viterbi based POS tagger

In [40]:
import numpy as np
import pandas as pd

In [25]:
# computing P(w/t) and storing in T x V matrix
w_given_t = np.zeros((t, v))

In [28]:
# compute word given tag: Emission Probability
def word_given_tag(iword, itag, train_bag = tagged_words):
    tag_list = [(word, tag) for word, tag in train_bag if tag == itag]
    count_tag = len(tag_list)
    w_given_tag_list = [word for word, tag in tag_list if word == iword]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [29]:
word_given_tag('Carolina', 'NOUN')

(13, 27502)

In [30]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability
def t2_given_t1(t2, t1, train_bag = tagged_words):
    tags = [tag for word, tag in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [31]:
t2_given_t1('NOUN', 'ADP')

(3029, 9378)

In [34]:
tags_matrix = np.zeros((t, t), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [46]:
tags_matrix.shape

(12, 12)

In [42]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
tags_df.head()

Unnamed: 0,ADP,NOUN,VERB,ADV,PRT,NUM,ADJ,PRON,.,DET,CONJ,X
ADP,0.016848,0.32299,0.008424,0.014075,0.00128,0.06302,0.105993,0.069098,0.040094,0.323523,0.000853,0.033803
NOUN,0.176169,0.265908,0.146389,0.017162,0.043924,0.009599,0.011854,0.004872,0.239401,0.013235,0.04247,0.029016
VERB,0.090592,0.110259,0.169725,0.081843,0.031436,0.022687,0.064576,0.035695,0.034533,0.134417,0.005188,0.219048
ADV,0.12037,0.030754,0.343915,0.080357,0.013889,0.032738,0.127646,0.01455,0.136243,0.069444,0.006614,0.023479
PRT,0.021207,0.246656,0.400653,0.01044,0.001958,0.05677,0.085155,0.018597,0.042088,0.101142,0.002284,0.013051


In [51]:
# Viterbi Heuristic
def Viterbi(words):
    state = []
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in tags:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = list(tags)[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [53]:
Viterbi(['Carolina'])

[('Carolina', 'NOUN')]

In [58]:
test_tagged_words = [tup for sent in test_set for tup in sent]
len(test_tagged_words)

4896

In [59]:
import time
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start
print(f'Time taken to tag test seq: {difference}secs')

806.738972902298

In [62]:
test_run_base = [tag for word, tag in test_tagged_words]
# accuracy
correctly_tagged = [predicted_tag for predicted_tag, actual_tag in zip(tagged_seq, test_run_base) 
                                if predicted_tag == actual_tag] 
accuracy = len(correctly_tagged)/len(tagged_seq)
print(f'Accuracy: {accuracy}')

Accuracy: 0.0


### Solve the problem of unknown words

#### Evaluating tagging accuracy

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications