In [1]:
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

# Request the two NLTK resources used below (the recorded output shows both
# were already up to date when this cell was last run).
nltk.download('treebank')
nltk.download('universal_tagset')
# Read the treebank tagged sentences with tagset='universal' and copy the
# result into a plain Python list so it can be sliced and split later.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
print(nltk_data[:5]) # preview: first five sentences as lists of (word, tag) pairs

[nltk_data] Downloading package treebank to C:\Users\Sakshi
[nltk_data]     Verma\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to C:\Users\Sakshi
[nltk_data]     Verma\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], [('Rudolph', 'NOUN'), ('Agnew', 'NOUN'), (',', '.'), ('55', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), ('and', 'CONJ'), ('former', 'ADJ'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Consolidated', 'NOUN'), ('Gold', 'NOUN'), ('Fields', 'NOUN'), ('PLC', 'NOUN'), (',', '.'), ('was', 'VERB'), ('named', 'VERB'), ('*-1', 'X'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('British', 'ADJ'), ('industrial', 'ADJ'), ('

In [2]:
# split data into training and validation set in the ratio 80:20
# (the split is made over whole sentences; random_state is pinned so the
# same partition is produced on every run)
train_set,test_set =train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 101)

In [3]:
# Flatten each split from a list of sentences into one flat list of
# (word, tag) pairs, keeping the original sentence order.
train_tagged_words = [pair for sent in train_set for pair in sent]
test_tagged_words = [pair for sent in test_set for pair in sent]
# Report how many tagged tokens ended up in each split.
print(len(train_tagged_words))
print(len(test_tagged_words))

80310
20366


In [4]:
# Sanity check: show the first five (word, tag) pairs of each flattened list.
print(train_tagged_words[:5])
print(test_tagged_words[:5])

[('Drink', 'NOUN'), ('Carrier', 'NOUN'), ('Competes', 'VERB'), ('With', 'ADP'), ('Cartons', 'NOUN')]
[('The', 'DET'), ('company', 'NOUN'), ('said', 'VERB'), ('0', 'X'), ('it', 'PRON')]


In [5]:
# Distinct tags that occur in the training data.
tags = {pair_tag for _, pair_tag in train_tagged_words}
print(len(tags))
print(tags)

# Distinct word forms seen in the training data (the vocabulary).
vocab = {token for token, _ in train_tagged_words}

12
{'ADJ', 'VERB', 'CONJ', 'NUM', 'NOUN', 'X', 'DET', 'ADP', 'PRT', 'ADV', 'PRON', '.'}


In [6]:
# Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    """Return the raw counts behind the emission estimate P(word | tag).

    Args:
        word: the word to look for.
        tag: the tag to condition on.
        train_bag: iterable of pairs where item 0 is a word and item 1 its tag.

    Returns:
        A tuple (count_w_given_tag, count_tag): the number of pairs that carry
        `tag` and whose word equals `word`, and the number of pairs that carry
        `tag` at all. The caller divides the first by the second.
    """
    count_tag = 0
    count_w_given_tag = 0
    for pair in train_bag:
        if pair[1] == tag:
            count_tag += 1
            # only pairs with the requested tag can contribute to the joint count
            if pair[0] == word:
                count_w_given_tag += 1
    return (count_w_given_tag, count_tag)

In [7]:
# Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    """Return the raw counts behind the transition estimate P(t2 | t1).

    Args:
        t2: the tag that follows.
        t1: the tag that comes first.
        train_bag: sequence of pairs where item 1 is the tag; consecutive
            pairs are treated as consecutive positions.

    Returns:
        A tuple (count_t2_t1, count_t1): how many times `t1` is immediately
        followed by `t2` in the tag sequence, and how many times `t1` occurs
        overall. The caller divides the first by the second.
    """
    tag_sequence = [pair[1] for pair in train_bag]
    count_t1 = tag_sequence.count(t1)
    # pair every tag with its immediate successor
    count_t2_t1 = sum(
        1
        for current_tag, next_tag in zip(tag_sequence, tag_sequence[1:])
        if current_tag == t1 and next_tag == t2
    )
    return (count_t2_t1, count_t1)

In [8]:
# Matrix(i, j) gives P(jth tag after the ith tag)
# Row i / column j follow the iteration order of the `tags` set, captured once
# here so both axes are guaranteed to use the same ordering.
tag_order = list(tags)
tags_matrix = np.zeros((len(tag_order), len(tag_order)), dtype='float32')
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        # t2_given_t1 scans the whole training list, so call it once per cell
        # and reuse both counts instead of calling it twice.
        pair_count, t1_count = t2_given_t1(t2, t1)
        tags_matrix[i, j] = pair_count / t1_count
print(tags_matrix)

[[6.33009672e-02 1.14563107e-02 1.68932043e-02 2.17475723e-02
  6.96893215e-01 2.09708735e-02 5.24271838e-03 8.05825219e-02
  1.14563107e-02 5.24271838e-03 1.94174761e-04 6.60194159e-02]
 [6.63904250e-02 1.67955801e-01 5.43278083e-03 2.28360966e-02
  1.10589318e-01 2.15930015e-01 1.33609578e-01 9.23572779e-02
  3.06629837e-02 8.38858187e-02 3.55432779e-02 3.48066315e-02]
 [1.13611415e-01 1.50384188e-01 5.48847427e-04 4.06147093e-02
  3.49066973e-01 9.33040585e-03 1.23490669e-01 5.59824370e-02
  4.39077942e-03 5.70801310e-02 6.03732169e-02 3.51262353e-02]
 [3.53445187e-02 2.07068902e-02 1.42806144e-02 1.84219927e-01
  3.51660132e-01 2.02427700e-01 3.57015361e-03 3.74866128e-02
  2.60621198e-02 3.57015361e-03 1.42806140e-03 1.19243130e-01]
 [1.25838192e-02 1.49133503e-01 4.24540639e-02 9.14395228e-03
  2.62344331e-01 2.88252197e-02 1.31063312e-02 1.76826611e-01
  4.39345129e-02 1.68945398e-02 4.65906132e-03 2.40094051e-01]
 [1.76821072e-02 2.06419379e-01 1.03786280e-02 3.07514891e-03
  6

In [9]:
# Wrap the transition matrix in a DataFrame labelled by tag on both axes:
# row label = previous tag, column label = next tag (matching how
# tags_matrix[i, j] was filled). `tags` has not been modified since the matrix
# was built, so list(tags) yields the same order that was used to fill it.
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,ADJ,VERB,CONJ,NUM,NOUN,X,DET,ADP,PRT,ADV,PRON,.
ADJ,0.063301,0.011456,0.016893,0.021748,0.696893,0.020971,0.005243,0.080583,0.011456,0.005243,0.000194,0.066019
VERB,0.06639,0.167956,0.005433,0.022836,0.110589,0.21593,0.13361,0.092357,0.030663,0.083886,0.035543,0.034807
CONJ,0.113611,0.150384,0.000549,0.040615,0.349067,0.00933,0.123491,0.055982,0.004391,0.05708,0.060373,0.035126
NUM,0.035345,0.020707,0.014281,0.18422,0.35166,0.202428,0.00357,0.037487,0.026062,0.00357,0.001428,0.119243
NOUN,0.012584,0.149134,0.042454,0.009144,0.262344,0.028825,0.013106,0.176827,0.043935,0.016895,0.004659,0.240094
X,0.017682,0.206419,0.010379,0.003075,0.061695,0.075726,0.05689,0.142226,0.185086,0.025754,0.0542,0.160869
DET,0.206411,0.040247,0.000431,0.022855,0.635906,0.045134,0.006037,0.009918,0.000287,0.012074,0.003306,0.017393
ADP,0.107062,0.008479,0.001012,0.063275,0.323589,0.034548,0.320931,0.016958,0.001266,0.014553,0.069603,0.038724
PRT,0.082975,0.401174,0.002348,0.056751,0.250489,0.012133,0.10137,0.019569,0.001174,0.009393,0.017613,0.04501
ADV,0.130721,0.339022,0.006982,0.029868,0.032196,0.022886,0.071373,0.119472,0.01474,0.081458,0.012025,0.139255


In [10]:
# Viterbi algorithm
def Viterbi(words, train_bag = train_tagged_words):
    """Tag `words` left to right, picking the best-scoring tag for each word.

    Each candidate tag is scored as
        emission(word | tag) * transition(tag | previous tag)
    where the previous tag is the one already chosen for the preceding word
    ('.' is used as the previous tag for the first word). Only the single best
    tag is carried forward, so this is a greedy decoder: earlier choices are
    never revisited.

    Args:
        words: sequence of word strings to tag.
        train_bag: list of (word, tag) pairs. It supplies the candidate tag set
            and the emission counts. Transition probabilities are always read
            from the module-level `tags_df`.

    Returns:
        A list of (word, tag) tuples, one per input word, in input order.

    Raises:
        ValueError: if `words` is non-empty and `train_bag` is empty (there is
            no candidate tag to choose from).
    """
    # Count tags and (word, tag) pairs in ONE pass over the training data.
    # Emission counts therefore come from `train_bag` itself, and the full
    # training list is no longer rescanned for every (word, tag) combination.
    tag_counts = {}
    word_tag_counts = {}
    for pair in train_bag:
        pair_key = (pair[0], pair[1])
        tag_counts[pair[1]] = tag_counts.get(pair[1], 0) + 1
        word_tag_counts[pair_key] = word_tag_counts.get(pair_key, 0) + 1
    # Candidate tags in first-seen order (stable from run to run).
    T = list(tag_counts)

    state = []
    for key, word in enumerate(words):
        # sentence start is modelled as "the previous tag was '.'"
        prev_tag = '.' if key == 0 else state[-1]
        transitions = [tags_df.loc[prev_tag, tag] for tag in T]
        emissions = [word_tag_counts.get((word, tag), 0) / tag_counts[tag] for tag in T]
        if any(emissions):
            p = [emission_p * transition_p
                 for emission_p, transition_p in zip(emissions, transitions)]
        else:
            # Word never seen in training: every product would be 0 and the
            # winner would just be whichever tag is listed first. Fall back to
            # the most likely tag after prev_tag instead.
            p = transitions
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(words, state))

In [11]:
# Selecting some random sentences
random.seed(500)
# choose 50 random sentence indices. random.randint includes BOTH endpoints,
# so the valid range is 0 .. len(test_set)-1; an upper bound of len(test_set)
# could raise IndexError below, and a lower bound of 1 would never pick the
# first sentence. Indices are drawn independently, so repeats are possible.
rndom = [random.randint(0, len(test_set) - 1) for x in range(50)]
# list of 50 sents on which we test the model
test_run = [test_set[i] for i in rndom]

# list of tagged words: the gold (word, tag) pairs of the sampled sentences
test_run_base = [ tag_word for sentence in test_run for tag_word in sentence ]

# list of untagged words
# NOTE: this rebinds test_tagged_words. It previously held (word, tag) pairs for
# the whole test set; from here on it holds only the bare words of the sample.
test_tagged_words = [ tag_word[0] for sentence in test_run for tag_word in sentence ]

In [None]:
# Tag the sampled test words and score the predictions against the gold pairs
tagged_seq = Viterbi(test_tagged_words)
# keep each predicted (word, tag) pair that equals the gold pair at the same position
check = [predicted for predicted, gold in zip(tagged_seq, test_run_base) if predicted == gold]
# fraction of positions tagged correctly, reported as a percentage
accuracy = len(check) / len(tagged_seq)
print('Viterbi Algorithm Accuracy: ', accuracy * 100)

In [None]:
# Tag a hand-written sentence. str.split() with no arguments separates on
# whitespace only, so any punctuation would stay attached to its word.
test="The fans watch the race"
pred_tags= Viterbi(test.split())
print(pred_tags)