## POS tagging using modified Viterbi

### Data Preparation

In [109]:
#Importing libraries
import nltk

In [110]:
# nltk.download('treebank')
# nltk.download('universal_tagset')

In [111]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [112]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(nltk_data, test_size=0.05, random_state=1)

In [113]:
train_set[0]

[('He', 'PRON'),
 ('has', 'VERB'),
 ('promised', 'VERB'),
 ('stiffer', 'ADJ'),
 ('fines', 'NOUN'),
 (',', '.'),
 ('though', 'ADP'),
 ('the', 'DET'),
 ('size', 'NOUN'),
 ('of', 'ADP'),
 ('penalties', 'NOUN'),
 ('sought', 'VERB'),
 ('*', 'X'),
 ('by', 'ADP'),
 ('OSHA', 'NOUN'),
 ('have', 'VERB'),
 ('been', 'VERB'),
 ('rising', 'VERB'),
 ('in', 'ADP'),
 ('recent', 'ADJ'),
 ('years', 'NOUN'),
 ('even', 'ADV'),
 ('before', 'ADP'),
 ('he', 'PRON'),
 ('took', 'VERB'),
 ('office', 'NOUN'),
 ('this', 'DET'),
 ('year', 'NOUN'),
 ('.', '.')]

In [114]:
# list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95668

In [140]:
# list of tokens
train_words = [word for word, tags in train_tagged_words]
v = len(set(train_words))
v

12097

In [116]:
# list of tags
train_tags = set([tag for word, tag in train_tagged_words])
t = len(tags)
t

12

### Build the vanilla Viterbi based POS tagger

In [117]:
import numpy as np
import pandas as pd

In [118]:
# computing P(w/t) and storing in T x V matrix
w_given_t = np.zeros((t, v))

In [119]:
# compute word given tag: Emission Probability
def word_given_tag(iword, itag, train_bag = train_tagged_words):
    tag_list = [(word, tag) for word, tag in train_bag if tag == itag]
    count_tag = len(tag_list)
    w_given_tag_list = [word for word, tag in tag_list if word == iword]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [120]:
word_given_tag('Carolina', 'NOUN')

(12, 27474)

In [121]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [tag for word, tag in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [122]:
t2_given_t1('NOUN', 'ADP')

(3032, 9339)

In [123]:
tags_matrix = np.zeros((t, t), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [124]:
tags_matrix.shape

(12, 12)

In [125]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(train_tags), index=list(train_tags))
tags_df.head()

Unnamed: 0,ADP,ADV,VERB,PRT,X,NUM,CONJ,NOUN,PRON,DET,.,ADJ
ADP,0.017025,0.013813,0.008352,0.001285,0.033837,0.061141,0.000857,0.32466,0.069172,0.323803,0.040261,0.105793
ADV,0.11655,0.078921,0.346653,0.013986,0.022977,0.031635,0.006327,0.031635,0.015318,0.068931,0.138528,0.128538
VERB,0.090874,0.081468,0.168688,0.03125,0.216807,0.022621,0.005442,0.111085,0.03607,0.135728,0.03537,0.064599
PRT,0.019544,0.010098,0.400326,0.001954,0.014007,0.056026,0.00228,0.246254,0.018241,0.101629,0.043648,0.085993
X,0.143538,0.025433,0.203942,0.185344,0.074869,0.002702,0.010491,0.062947,0.055158,0.055317,0.163726,0.016532


In [127]:
# Viterbi Heuristic
def Viterbi(words, tags_df = tags_df):
    state = []
    tags_list = tags_df.index
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in tags_list:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = tags_list[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [128]:
Viterbi(['Carolina'])

[('Carolina', 'NOUN')]

In [129]:
test_tagged_words = [tup for sent in test_set for tup in sent]
len(test_tagged_words)

5008

In [130]:
test_tagged_words[0:2]

[('While', 'ADP'), ('the', 'DET')]

In [131]:
test_words = [word for word, tag in test_tagged_words]
test_words[0:2]

['While', 'the']

In [132]:
import os
def whatsapp(msg):
    os.system(f'/home/shakeeb/Documents/twilio-whatsapp/run.sh "{msg}"')

In [133]:
import time
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_words)
end = time.time()
difference = end-start

In [134]:
msg = f'Training Complete. Time taken to tag test seq: {difference}secs'
print(msg)
whatsapp(msg)

Training Complete. Time taken to tag test seq: 1354.3533408641815secs


In [135]:
tagged_seq[0:10]

[('While', 'ADP'),
 ('the', 'DET'),
 ('new', 'ADJ'),
 ('proposal', 'NOUN'),
 ('might', 'VERB'),
 ('appeal', 'NOUN'),
 ('to', 'PRT'),
 ('the', 'DET'),
 ('dirtiest', 'ADJ'),
 ('utilities', 'NOUN')]

In [220]:
def accuracy(tagged_seq, test_tagged_words):
    predicted_tags = [tag for word, tag in tagged_seq]
    actual_tags = [tag for word, tag in test_tagged_words]
    # accuracy
    correctly_tagged = [predicted_tag for predicted_tag, actual_tag in zip(predicted_tags, actual_tags) 
                                    if predicted_tag == actual_tag] 
    accuracy = len(correctly_tagged)/len(actual_tags)
    return round(100*accuracy, 2)

In [204]:
accuracy(tagged_seq, test_tagged_words)

0.9071485623003195

In [170]:
def get_incorrect_tagged(tagged_seq, test_tagged_words):
    actual_tags = [tag for word, tag in test_tagged_words]
    return [(word, predicted_tag, actual_tag) for (word, predicted_tag), actual_tag in zip(tagged_seq, actual_tags) 
                                if predicted_tag != actual_tag]

In [171]:
incorrectly_tagged = get_incorrect_tagged(tagged_seq, test_tagged_words)
incorrectly_tagged[0:10]

[('appeal', 'NOUN', 'VERB'),
 ('cleanup', 'ADP', 'NOUN'),
 ('burn', 'ADP', 'VERB'),
 ('cleaner-burning', 'ADP', 'ADJ'),
 ('fuels', 'ADP', 'NOUN'),
 ('elaborate', 'VERB', 'ADJ'),
 ('*-94', 'ADP', 'X'),
 ('that', 'ADP', 'DET'),
 ('*T*-117', 'ADP', 'X'),
 ('uptick', 'NOUN', 'VERB')]

### Solve the problem of unknown words

In [233]:
def get_unknown_tagged_incorrectly(tagged_seq, test_tagged_words, train_words):
    print('Tags not in train set\nword - tag - actual_tag')
    return [(word, tag, actual_tag) 
            for (word, tag), (_, actual_tag) in zip(tagged_seq, test_tagged_words) 
            if word not in train_words and tag != actual_tag]

In [234]:
# lets figure out the unknown words
unknown_tagged_words = get_unknown_tagged_incorrectly(tagged_seq, test_tagged_words, train_words)
unknown_tagged_words[0:10]

Tags not in train set
word - tag - actual_tag


[('cleanup', 'ADP', 'NOUN'),
 ('burn', 'ADP', 'VERB'),
 ('cleaner-burning', 'ADP', 'ADJ'),
 ('fuels', 'ADP', 'NOUN'),
 ('*-94', 'ADP', 'X'),
 ('*T*-117', 'ADP', 'X'),
 ('Jennison', 'ADP', 'NOUN'),
 ('bell-ringing', 'ADP', 'ADJ'),
 ('Ancient', 'ADP', 'NOUN'),
 ('Youths', 'ADP', 'NOUN')]

In [177]:
len(unknown_tagged_words)

322

In [143]:
# see if the unknown words have same tags
tags = [tag for word, tag in unknown_tagged_words]
list(set(tags))

['ADP']

In [244]:
# build a rule based tagger
patterns = [
    (r'\*+', 'X'),                   # if there are * in the words, mark it X
    (r'.*', 'NN')                    # nouns
]
rule_based_tagger = nltk.RegexpTagger(patterns)

In [265]:
# merge viterbi and rule based tagger
def extended_viterbi(viterbi_tagged_seq, new_tagger, test_tagged_words = test_tagged_words):
    new_tagged = new_tagger.tag(test_words)
    
    merged_tag_seq = []
    i = 0
    for (word, viterbi_tag), (_, rule_tag) in zip(tagged_seq, new_tagged):
        if word in unknown_words:
            merged_tag_seq.append((word, rule_tag))
        else:
            merged_tag_seq.append((word, viterbi_tag))
            
    print(f'Accuracy of new tagger:              {accuracy(new_tagged, test_tagged_words)}')
    print(f'Accuracy of original viterbi tagger: {accuracy(viterbi_tagged_seq, test_tagged_words)}')    
    print(f'Accuracy of extended viterbi tagger: {accuracy(merged_tag_seq, test_tagged_words)}')

    return merged_tag_seq

In [266]:
merged_tag_seq = extended_viterbi(tagged_seq, rule_based_tagger, test_tagged_words)

Accuracy of new tagger:              38.1
Accuracy of original viterbi tagger: 90.71
Accuracy of extended viterbi tagger: 95.01


In [267]:
# lets identify any other mismatches
unknown_tagged_words = get_unknown_tagged_incorrectly(merged_tag_seq, test_tagged_words, train_words)
unknown_tagged_words[0:10]

Tags not in train set
word - tag - actual_tag


[('burn', 'NOUN', 'VERB'),
 ('male-only', 'NOUN', 'ADJ'),
 ('sole', 'NOUN', 'ADJ'),
 ('complaining', 'ADJ', 'VERB'),
 ('opposite', 'NOUN', 'ADJ'),
 ('loathsome', 'NOUN', 'ADJ'),
 ('propelling', 'ADJ', 'VERB'),
 ('serves', 'NOUN', 'VERB'),
 ('sitting', 'ADJ', 'VERB'),
 ('apologize', 'NOUN', 'VERB')]

In [268]:
# build a rule based tagger
patterns = [
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers
    (r'\*+', 'X'),                   # if there are * in the words, mark it X
    (r'.*', 'NN')                    # nouns
]
rule_based_tagger = nltk.RegexpTagger(patterns)
merged_tag_seq = extended_viterbi(tagged_seq, rule_based_tagger, test_tagged_words)

Accuracy of new tagger:              7.73
Accuracy of original viterbi tagger: 90.71
Accuracy of extended viterbi tagger: 91.71


In [269]:
# lets identify any other mismatches
unknown_tagged_words = get_unknown_tagged_incorrectly(merged_tag_seq, test_tagged_words, train_words)
unknown_tagged_words[0:10]

Tags not in train set
word - tag - actual_tag


[('cleanup', 'NN', 'NOUN'),
 ('burn', 'NN', 'VERB'),
 ('cleaner-burning', 'NN', 'ADJ'),
 ('fuels', 'NN', 'NOUN'),
 ('Jennison', 'NN', 'NOUN'),
 ('bell-ringing', 'NN', 'ADJ'),
 ('Ancient', 'NN', 'NOUN'),
 ('Youths', 'NN', 'NOUN'),
 ('male-only', 'NN', 'ADJ'),
 ('galling', 'NN', 'ADJ')]

In [270]:
# build a rule based tagger
patterns = [
    (r'.*ing$', 'ADJ'),              # gerund
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers
    (r'\*+', 'X'),                   # if there are * in the words, mark it X
    (r'.*', 'NOUN')                    # nouns
]
rule_based_tagger = nltk.RegexpTagger(patterns)
merged_tag_seq = extended_viterbi(tagged_seq, rule_based_tagger, test_tagged_words)

Accuracy of new tagger:              34.94
Accuracy of original viterbi tagger: 90.71
Accuracy of extended viterbi tagger: 94.57


In [271]:
# lets identify any other mismatches
unknown_tagged_words = get_unknown_tagged_incorrectly(merged_tag_seq, test_tagged_words, train_words)
unknown_tagged_words[0:10]

Tags not in train set
word - tag - actual_tag


[('burn', 'NOUN', 'VERB'),
 ('male-only', 'NOUN', 'ADJ'),
 ('sole', 'NOUN', 'ADJ'),
 ('complaining', 'ADJ', 'VERB'),
 ('nullified', 'NOUN', 'VERB'),
 ('opposite', 'NOUN', 'ADJ'),
 ('loathsome', 'NOUN', 'ADJ'),
 ('cautious', 'NOUN', 'ADJ'),
 ('propelling', 'ADJ', 'VERB'),
 ('serves', 'NOUN', 'VERB')]

In [272]:
# build a rule based tagger
patterns = [
    (r'(.*ing|.*ous)$', 'ADJ'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers
    (r'\*+', 'X'),                   # if there are * in the words, mark it X
    (r'.*', 'NOUN')                    # nouns
]
rule_based_tagger = nltk.RegexpTagger(patterns)
merged_tag_seq = extended_viterbi(tagged_seq, rule_based_tagger, test_tagged_words)

Accuracy of new tagger:              38.1
Accuracy of original viterbi tagger: 90.71
Accuracy of extended viterbi tagger: 95.01


In [273]:
# lets identify any other mismatches
unknown_tagged_words = get_unknown_tagged_incorrectly(merged_tag_seq, test_tagged_words, train_words)
unknown_tagged_words[0:10]

Tags not in train set
word - tag - actual_tag


[('burn', 'NOUN', 'VERB'),
 ('male-only', 'NOUN', 'ADJ'),
 ('sole', 'NOUN', 'ADJ'),
 ('complaining', 'ADJ', 'VERB'),
 ('opposite', 'NOUN', 'ADJ'),
 ('loathsome', 'NOUN', 'ADJ'),
 ('propelling', 'ADJ', 'VERB'),
 ('serves', 'NOUN', 'VERB'),
 ('sitting', 'ADJ', 'VERB'),
 ('apologize', 'NOUN', 'VERB')]

In [274]:
# now apply unigram and bigram tagger
# lexicon backed up by the rule-based tagger
lexicon_tagger = nltk.UnigramTagger(train_set, backoff=rule_based_tagger)

#lexicon based bigram tagger
bigram_tagger = nltk.BigramTagger(train_set, backoff=lexicon_tagger)

#### Evaluation

In [275]:
merged_tag_seq = extended_viterbi(tagged_seq, bigram_tagger, test_tagged_words)

Accuracy of new tagger:              95.21
Accuracy of original viterbi tagger: 90.71
Accuracy of extended viterbi tagger: 95.01


### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [284]:
# incorrectly tagged words from tagged_seq which were corrected in merged_tag_seq
corrected_tags = \
[(word, viterbi_tag, viterbi_extend_tag, actual_tag)
 for (word, actual_tag), (_, viterbi_tag), (_, viterbi_extend_tag) in 
 zip(test_tagged_words, tagged_seq, merged_tag_seq)
 if (viterbi_tag != actual_tag) and (viterbi_extend_tag == actual_tag)]

In [333]:
print('Tags corrected with extended viterbi tagger')
print(f'--------------------------------------------------------------------------')
print(f'{"Word":<15s} | {"vanilla viterbi tag"} | extended viterbi tag | acutal tag')
print(f'{"-"*16}|{"-"*21}|{"-"*22}|{"-"*12}')
for word, viterbi_tag, viterbi_ex_tag, actual_tag in corrected_tags[0:10]:
    print(f'{word:<15s} | {viterbi_tag:19s} | {viterbi_ex_tag:20s} | {actual_tag}')

Tags corrected with extended viterbi tagger
--------------------------------------------------------------------------
Word            | vanilla viterbi tag | extended viterbi tag | acutal tag
----------------|---------------------|----------------------|------------
cleanup         | ADP                 | NOUN                 | NOUN
cleaner-burning | ADP                 | ADJ                  | ADJ
fuels           | ADP                 | NOUN                 | NOUN
*-94            | ADP                 | X                    | X
*T*-117         | ADP                 | X                    | X
Jennison        | ADP                 | NOUN                 | NOUN
bell-ringing    | ADP                 | ADJ                  | ADJ
Ancient         | ADP                 | NOUN                 | NOUN
Youths          | ADP                 | NOUN                 | NOUN
1637            | ADP                 | NUM                  | NUM
