# POS Tagging Using Hidden Markov Model
- **Hiddden Markov Model is a chain of invisible state. Each state emits observable outputs**

In [1]:
import numpy as np
from collections import Counter, defaultdict
import pomegranate
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
from pprint import pprint

In [2]:
# Training the data
sent1 = "venkat plays in parks"
sent1_pos = "NOUN VERB MODIFIER NOUN"
sent2 = "Robert parks car at nights"
sent2_pos = "NOUN VERB NOUN MODIFIER NOUN"

In [3]:
s1 = sent1.split()  # splitting the words from a sentence
s2 = sent2.split()
s1_pos = sent1_pos.split()
s2_pos = sent2_pos.split()

In [4]:
# sentence 1 and pos
print(s1)
print(s1_pos)
print("----------------------------------------------------------------**")

# Sentence 2 and pos
print(s2)
print(s2_pos)

['venkat', 'plays', 'in', 'parks']
['NOUN', 'VERB', 'MODIFIER', 'NOUN']
----------------------------------------------------------------**
['Robert', 'parks', 'car', 'at', 'nights']
['NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']


In [6]:
word_list = s1+s2
print(word_list)
print("--------------------------------------------------------------------------------**")

pos_list = s1_pos + s2_pos
print(pos_list)

['venkat', 'plays', 'in', 'parks', 'Robert', 'parks', 'car', 'at', 'nights']
--------------------------------------------------------------------------------**
['NOUN', 'VERB', 'MODIFIER', 'NOUN', 'NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']


## Hidden Markov Model

In [7]:
# Build a Hidden Markov Model
hmm_model = HiddenMarkovModel(name = 'POS_Tagger_HMM')

### Add the Hidden States with their Emmision probabilities(observation states) to the model

### Hidden states with their emmision counts
- **Keys will be Hidden states(here, pos_list are keys)**
- **value of each key will be emmision from the hidden state(a dictionary of word frequency for that POS)**

In [8]:
print(pos_list)
print(word_list)

['NOUN', 'VERB', 'MODIFIER', 'NOUN', 'NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']
['venkat', 'plays', 'in', 'parks', 'Robert', 'parks', 'car', 'at', 'nights']


In [9]:
POS_wordsfreq = defaultdict(lambda: defaultdict(int))
for POS, word in zip(pos_list, word_list):
    POS_wordsfreq[POS][word] += 1
pprint(POS_wordsfreq)

defaultdict(<function <lambda> at 0x000001EDAF68C310>,
            {'MODIFIER': defaultdict(<class 'int'>, {'in': 1, 'at': 1}),
             'NOUN': defaultdict(<class 'int'>,
                                 {'Robert': 1,
                                  'car': 1,
                                  'nights': 1,
                                  'parks': 1,
                                  'venkat': 1}),
             'VERB': defaultdict(<class 'int'>, {'plays': 1, 'parks': 1})})


In [10]:
POS_wordsfreq.items()

dict_items([('NOUN', defaultdict(<class 'int'>, {'venkat': 1, 'parks': 1, 'Robert': 1, 'car': 1, 'nights': 1})), ('VERB', defaultdict(<class 'int'>, {'plays': 1, 'parks': 1})), ('MODIFIER', defaultdict(<class 'int'>, {'in': 1, 'at': 1}))])

In [11]:
POS_wordsfreq.values()

dict_values([defaultdict(<class 'int'>, {'venkat': 1, 'parks': 1, 'Robert': 1, 'car': 1, 'nights': 1}), defaultdict(<class 'int'>, {'plays': 1, 'parks': 1}), defaultdict(<class 'int'>, {'in': 1, 'at': 1})])

## Calculate emmision probabilities and add each POS 'State' object to the model

In [13]:
to_states = []
for POS, WordsFreq in POS_wordsfreq.items():     # creating(POs, WordsFreq) two variables in POS_wordsfreq.items 
    total = float(sum(WordsFreq.values()))    # count of how many nouns, verbs, modifiers (pos tags names)
    print("-----------------------------------------------")
    print(POS, 'total:',total)
    emmision_prob = {word: count/total for word, count in WordsFreq.items()}     # count of nouns, verbs,modifiers with names
    print(emmision_prob)                                                           # with probability
    
    POS_state = State(DiscreteDistribution(emmision_prob), name = POS)
    print(POS_state)
    to_states.append(POS_state)        # adding Discrete distribution 

-----------------------------------------------
NOUN total: 5.0
{'venkat': 0.2, 'parks': 0.2, 'Robert': 0.2, 'car': 0.2, 'nights': 0.2}
{
    "class" : "State",
    "distribution" : {
        "class" : "Distribution",
        "dtype" : "str",
        "name" : "DiscreteDistribution",
        "parameters" : [
            {
                "venkat" : 0.2,
                "parks" : 0.2,
                "Robert" : 0.2,
                "car" : 0.2,
                "nights" : 0.2
            }
        ],
        "frozen" : false
    },
    "name" : "NOUN",
    "weight" : 1.0
}
-----------------------------------------------
VERB total: 2.0
{'plays': 0.5, 'parks': 0.5}
{
    "class" : "State",
    "distribution" : {
        "class" : "Distribution",
        "dtype" : "str",
        "name" : "DiscreteDistribution",
        "parameters" : [
            {
                "plays" : 0.5,
                "parks" : 0.5
            }
        ],
        "frozen" : false
    },
    "name" : "VERB",
    

In [14]:
input("any key")

any keyshivani


'shivani'

In [16]:
print(type(to_states[0]))
print(len(to_states))
print([state.name for state in to_states])
print(to_states[2])     # Here, I set index 2 to see Modifier u can check 0,1 indexes to see noun and verb

<class 'pomegranate.base.State'>
3
['NOUN', 'VERB', 'MODIFIER']
{
    "class" : "State",
    "distribution" : {
        "class" : "Distribution",
        "dtype" : "str",
        "name" : "DiscreteDistribution",
        "parameters" : [
            {
                "in" : 0.5,
                "at" : 0.5
            }
        ],
        "frozen" : false
    },
    "name" : "MODIFIER",
    "weight" : 1.0
}


## Add Start and End Tag Transition probabilities

##### Counts for transition from start tag and transition to end tag 

In [17]:
pos_list

['NOUN',
 'VERB',
 'MODIFIER',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'MODIFIER',
 'NOUN']

In [18]:
from_front = s1_pos[0],s1_pos[1],s1_pos[2],s1_pos[3]
from_reverse = s1_pos[-1],s1_pos[-2],s1_pos[-3],s1_pos[-4]
print(from_front)
print(from_reverse)

('NOUN', 'VERB', 'MODIFIER', 'NOUN')
('NOUN', 'MODIFIER', 'VERB', 'NOUN')


In [20]:
print(s1_pos)
print(s2_pos)

print("------------------------------------------------------------------")
start_pos_list = [s1_pos[0],s2_pos[0]]
print("start Pos list:",start_pos_list)

print("-------------------------------------------------------------------")

end_pos_list = [s1_pos[-1],s2_pos[-1]]
print("End Pos list:",end_pos_list)

print("-------------------------------------------------------------------")

start_pos_count = Counter(start_pos_list)
print("Start POS Count:", start_pos_count)

print("--------------------------------------------------------------------")

end_pos_count = Counter(end_pos_list)
print("End Pos Count:",end_pos_count)

print("--------------------------------------------------------------------")

pos_count_unigrams = Counter(pos_list)
print("POS Unigram counts:",pos_count_unigrams)

['NOUN', 'VERB', 'MODIFIER', 'NOUN']
['NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']
------------------------------------------------------------------
start Pos list: ['NOUN', 'NOUN']
-------------------------------------------------------------------
End Pos list: ['NOUN', 'NOUN']
-------------------------------------------------------------------
Start POS Count: Counter({'NOUN': 2})
--------------------------------------------------------------------
End Pos Count: Counter({'NOUN': 2})
--------------------------------------------------------------------
POS Unigram counts: Counter({'NOUN': 5, 'VERB': 2, 'MODIFIER': 2})


### Start and End Probability for each POS Tag
- **Finding the probability from starting to Noun, verb, modifier and LIke wise finding the end probability from ending to Noun, verb and modifier**

In [26]:
start_prob, end_prob = {},{}        
for ps in pos_count_unigrams:
    start_prob[ps] = start_pos_count[ps]/pos_count_unigrams[ps]
    end_prob[ps] = end_pos_count[ps]/pos_count_unigrams[ps]
    
print(start_prob)
print(end_prob)

{'NOUN': 0.4, 'VERB': 0.0, 'MODIFIER': 0.0}
{'NOUN': 0.4, 'VERB': 0.0, 'MODIFIER': 0.0}


## Add Start & End probabilities to the model

In [27]:
for Pos_states in to_states:
    hmm_model.add_transition(hmm_model.start, Pos_states, start_prob[Pos_states.name])
    hmm_model.add_transition(hmm_model.end, Pos_states, end_prob[Pos_states.name])

### Add tranition probabilities between POS states

### Hidden State transition counts- Using POS_List Bigrams

In [28]:
pos_list

['NOUN',
 'VERB',
 'MODIFIER',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'MODIFIER',
 'NOUN']

In [33]:
bigram = [(pos_list[i],pos_list[i+1]) for i in range(0, len(pos_list)-1,1)]
pprint(bigram)

print("---------------------------------------------------------")

pos_count_bag = Counter(bigram)     # how many  times bigrams are repeating
pprint(pos_count_bag)

[('NOUN', 'VERB'),
 ('VERB', 'MODIFIER'),
 ('MODIFIER', 'NOUN'),
 ('NOUN', 'NOUN'),
 ('NOUN', 'VERB'),
 ('VERB', 'NOUN'),
 ('NOUN', 'MODIFIER'),
 ('MODIFIER', 'NOUN')]
---------------------------------------------------------
Counter({('NOUN', 'VERB'): 2,
         ('MODIFIER', 'NOUN'): 2,
         ('VERB', 'MODIFIER'): 1,
         ('NOUN', 'NOUN'): 1,
         ('VERB', 'NOUN'): 1,
         ('NOUN', 'MODIFIER'): 1})


## Tansition Probabilities for each POS tag pair

In [34]:
pos_count_bag.keys()

dict_keys([('NOUN', 'VERB'), ('VERB', 'MODIFIER'), ('MODIFIER', 'NOUN'), ('NOUN', 'NOUN'), ('VERB', 'NOUN'), ('NOUN', 'MODIFIER')])

In [40]:
pos_count_unigrams.keys()

dict_keys(['NOUN', 'VERB', 'MODIFIER'])

In [46]:
# Getting the transition probability 
transition_probability_pos_word = {}
for key in pos_count_bag.keys():
    transition_probability_pos_word[key] = pos_count_bag.get(key)/pos_count_unigrams[key[0]]
    
transition_probability_pos_word

{('NOUN', 'VERB'): 0.4,
 ('VERB', 'MODIFIER'): 0.5,
 ('MODIFIER', 'NOUN'): 1.0,
 ('NOUN', 'NOUN'): 0.2,
 ('VERB', 'NOUN'): 0.5,
 ('NOUN', 'MODIFIER'): 0.2}

#### If a certain pair of POS don't occur in training set, make them ZEREOS. Since our training set is very less, this issue occurs 

In [47]:
# If a certain pair of POS don't occur in traning set, make them ZEROES. Since our training set is very less, this issue occurs
transition_probability_pos_word[('VERB','VERB')] = 0
transition_probability_pos_word[('MODIFIER','VERB')] = 0
transition_probability_pos_word[('MODIFIER','MODIFIER')] = 0

In [48]:
transition_probability_pos_word

{('NOUN', 'VERB'): 0.4,
 ('VERB', 'MODIFIER'): 0.5,
 ('MODIFIER', 'NOUN'): 1.0,
 ('NOUN', 'NOUN'): 0.2,
 ('VERB', 'NOUN'): 0.5,
 ('NOUN', 'MODIFIER'): 0.2,
 ('VERB', 'VERB'): 0,
 ('MODIFIER', 'VERB'): 0,
 ('MODIFIER', 'MODIFIER'): 0}

In [49]:
# Add transition probabilities to all POS
for Pos_states in to_states:
    for next_pos_state in to_states:
        hmm_model.add_transition(Pos_states, next_pos_state,transition_probability_pos_word[(Pos_states.name,next_pos_state.name)])

In [50]:
hmm_model.bake()

### Decode POS for a new sentence

In [51]:
# Decode POS for a new sentence
def pos_decoding(sentence,model):
    decode,state_path = model.viterbi(sentence)
    return [state[1].name for state in state_path[1:-1]]

In [52]:
print(s1)
print(s2)

['venkat', 'plays', 'in', 'parks']
['Robert', 'parks', 'car', 'at', 'nights']


In [55]:
s3 = ('venkat','parks','car','in','parks')
print(s3)
pos_tags = pos_decoding(s3, hmm_model)
print(str(pos_tags))

('venkat', 'parks', 'car', 'in', 'parks')
['NOUN', 'VERB', 'NOUN', 'MODIFIER']


In [56]:
s3 = ['venkat','plays','at','nights','in','parks']
print(s3)

pos_tags = pos_decoding(s3, hmm_model)
print(str(pos_tags))

['venkat', 'plays', 'at', 'nights', 'in', 'parks']
['NOUN', 'VERB', 'MODIFIER', 'NOUN', 'MODIFIER']


In [58]:
s4 = ['venkat', 'plays', 'in', 'parks']
print(s4)
pos_tags1 = pos_decoding(s4, hmm_model)
print(str(pos_tags1))           # showing one time pos tags after decoding 

['venkat', 'plays', 'in', 'parks']
['NOUN', 'VERB', 'MODIFIER']


In [60]:
s5 = ['Robert', 'parks', 'car', 'at', 'nights']
print(s5)
pos_tags2 = pos_decoding(s5, hmm_model)
print(str(pos_tags2))

['Robert', 'parks', 'car', 'at', 'nights']
['NOUN', 'VERB', 'NOUN', 'MODIFIER']


In [57]:
# original 
sent1 = "venkat plays in parks"
sent1_pos = "NOUN VERB MODIFIER NOUN"
sent2 = "Robert parks car at nights"
sent2_pos = "NOUN VERB NOUN MODIFIER NOUN"