In [2]:
# Downloading Corpus from NLTK

import nltk
from nltk.corpus import treebank

# Download the NLTK resources used below: the `treebank` corpus and the
# `universal_tagset` resource (the corpus is later read with tagset='universal').
# The second call is the cell's last expression, so its return value is what
# the notebook displays as this cell's output.
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
from sklearn.model_selection import train_test_split

# Materialise the Treebank tagged sentences, requested with the universal tagset
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# Flatten the per-sentence lists into one long sequence of (word, tag) pairs
tagged_words = [pair for sentence in nltk_data for pair in sentence]


In [4]:
# Distinct tags that occur in the flattened (word, tag) pairs
tags = {pair[1] for pair in tagged_words}

# Distinct word forms that occur in the pairs (no case folding is applied)
vocab = {pair[0] for pair in tagged_words}

In [5]:
import numpy as np

# compute Emission Probability
def word_given_tag(word, tag, train_bag = tagged_words):
    """Return the raw counts behind the emission probability P(word | tag).

    Parameters
    ----------
    word : the token to look for.
    tag : the tag to condition on.
    train_bag : iterable of (word, tag) pairs; defaults to ``tagged_words``.

    Returns
    -------
    tuple
        ``(count_w_given_tag, count_tag)``: the number of pairs equal to
        (word, tag), and the number of pairs carrying ``tag`` at all.
        Callers divide the first by the second to get the probability.
    """
    count_tag = 0
    count_w_given_tag = 0
    for pair in train_bag:
        if pair[1] == tag:
            # every pair with the requested tag contributes to the denominator
            count_tag += 1
            if pair[0] == word:
                count_w_given_tag += 1

    return (count_w_given_tag, count_tag)

# compute Transition Probability
def t2_given_t1(t2, t1, train_bag = tagged_words):
    """Return the raw counts behind the transition probability P(t2 | t1).

    Parameters
    ----------
    t2 : the tag that should come second.
    t1 : the tag that should come first.
    train_bag : iterable of (word, tag) pairs; defaults to ``tagged_words``.

    Returns
    -------
    tuple
        ``(count_t2_t1, count_t1)``: how many times ``t1`` is immediately
        followed by ``t2`` in the tag sequence of ``train_bag``, and how many
        times ``t1`` occurs in total.

    Notes
    -----
    Adjacency is taken over the flat tag sequence, so when ``train_bag`` is a
    flattened corpus the last tag of one sentence is treated as preceding the
    first tag of the next.
    """
    tag_sequence = [pair[1] for pair in train_bag]
    count_t1 = sum(1 for current in tag_sequence if current == t1)
    # pair each tag with its immediate successor
    count_t2_t1 = sum(
        1
        for current, following in zip(tag_sequence, tag_sequence[1:])
        if current == t1 and following == t2
    )
    return (count_t2_t1, count_t1)

# Build the t x t transition matrix, where t is the number of distinct tags.
# tags_matrix[i, j] holds P(j-th tag follows the i-th tag), with rows and
# columns both ordered as list(tags).

# Fix the tag order once; iterating an unmodified set always yields the same
# order, so this matches any later list(tags) used to label the matrix.
tag_order = list(tags)
tags_matrix = np.zeros((len(tag_order), len(tag_order)), dtype='float32')
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        # t2_given_t1 walks the whole corpus, so call it once per cell and
        # unpack both counts rather than calling it twice.
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        tags_matrix[i, j] = count_t2_t1 / count_t1
 

In [6]:
import pandas as pd

# Wrap the transition matrix in a DataFrame labelled by tag on both axes:
# the row label is the preceding tag, the column label is the following tag.
tags_df = pd.DataFrame(data=tags_matrix, index=list(tags), columns=list(tags))
display(tags_df)

Unnamed: 0,.,VERB,X,CONJ,PRON,NUM,ADJ,DET,ADV,NOUN,PRT,ADP
.,0.093385,0.088775,0.026889,0.058301,0.065898,0.080922,0.043875,0.17388,0.052582,0.221852,0.00239,0.091165
VERB,0.035019,0.168977,0.217782,0.005382,0.035535,0.022855,0.065173,0.134326,0.081834,0.110366,0.031407,0.091345
X,0.163617,0.204748,0.074853,0.010283,0.055497,0.002722,0.016936,0.054589,0.025707,0.061999,0.184636,0.144413
CONJ,0.03532,0.156733,0.008389,0.000442,0.05872,0.041501,0.117439,0.119205,0.054746,0.349669,0.004857,0.05298
PRON,0.040555,0.485568,0.092802,0.005115,0.007673,0.007307,0.073073,0.009499,0.033979,0.209353,0.012422,0.022653
NUM,0.116751,0.018331,0.210378,0.013536,0.00141,0.184997,0.033277,0.003384,0.00282,0.353074,0.027073,0.034969
ADJ,0.064874,0.012037,0.020947,0.016883,0.000625,0.020791,0.066437,0.004846,0.00469,0.69939,0.010786,0.077693
DET,0.01765,0.039656,0.045616,0.000458,0.003668,0.02212,0.204928,0.005501,0.012607,0.638281,0.000229,0.009284
ADV,0.136235,0.344686,0.023021,0.006938,0.015137,0.031536,0.129612,0.068748,0.07947,0.031851,0.014191,0.118575
NOUN,0.239963,0.146881,0.029064,0.042575,0.004711,0.009457,0.012298,0.013164,0.017009,0.264246,0.043891,0.176742


In [7]:
def Viterbi(words, train_bag = tagged_words):
    """Tag ``words`` left to right, keeping the best-scoring tag for each word.

    Each candidate tag is scored as
    ``emission(word | tag) * transition(tag | previous tag)`` and the tag with
    the highest score is chosen.  For the first word the previous tag is taken
    to be ``'.'``.  Only the single chosen tag is carried forward to the next
    word; alternative paths are not kept or revisited.

    Parameters
    ----------
    words : sequence of str
        The tokens to tag, in order.
    train_bag : iterable of (word, tag) pairs, optional
        Supplies the candidate tag set and the emission counts.  Defaults to
        ``tagged_words``.  Transition probabilities are always read from the
        module-level ``tags_df``, so every tag in ``train_bag`` must be a
        label of that frame.

    Returns
    -------
    list of (word, tag) tuples, one per input word.

    Notes
    -----
    A word that never occurs in ``train_bag`` has zero emission for every
    tag, so all scores tie at 0 and the first tag in the candidate list is
    returned for it.
    """
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    for key, word in enumerate(words):
        # the tag we are transitioning from: sentence boundary for the first
        # word, otherwise the tag chosen for the previous word
        previous_tag = '.' if key == 0 else state[-1]

        # probability column for this observation, aligned with T
        p = []
        for tag in T:
            transition_p = tags_df.loc[previous_tag, tag]

            # Use the caller's train_bag for the emission counts (it was
            # previously ignored here), and scan it once per (word, tag)
            # instead of twice.
            count_w_given_tag, count_tag = word_given_tag(word, tag, train_bag)
            emission_p = count_w_given_tag / count_tag

            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        # first tag reaching the maximum score wins
        state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(words, state))

In [8]:
# Demo: tag a short example. Splitting on whitespace keeps punctuation
# attached to the preceding word (e.g. 'Sudeeksha.' is a single token).
test_sent = "My name is Sudeeksha. I like Coffee."
pred_tags_rule = Viterbi(words=test_sent.split())
print(pred_tags_rule)

[('My', 'PRON'), ('name', 'NOUN'), ('is', 'VERB'), ('Sudeeksha.', '.'), ('I', 'PRON'), ('like', 'VERB'), ('Coffee.', '.')]
