# Hidden Markov: POS Tagging
Sam Keyser, Carter Shavitz, John Paul Bunn

CS 2400 - Introduction to AI

## Experiment
### Set Up

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.util import ngrams
from nltk.corpus import brown, treebank, conll2000

# Fetch every NLTK resource this notebook uses (three corpora plus the
# universal tagset mapping), in the same order as before.
for resource in ('treebank', 'brown', 'conll2000', 'universal_tagset'):
    nltk.download(resource)

# Treebank sentences with their tags mapped onto the universal tagset.
treebank_corpus = treebank.tagged_sents(tagset='universal')

In [None]:
print(treebank_corpus)

# Take the first tagged sentence as a small test case and unzip its
# (word, tag) pairs: X receives the words, y the matching tags.
tokens, labels = zip(*treebank_corpus[0])
X, y = list(tokens), list(labels)
print('Sentence:', X)
print('Tags:', y)

## Probability Counting
Now that we've got a set of test sentences and tags, we need to start constructing the transition and emission probabilities. These counts should be a function of *N*, the length of the *N*-gram we use to keep track of the previous states up to the current one.

### Playing around with Splitting Sentences into *N*-grams

In [None]:
# Default N-gram length.
N = 3
# Sentence-start marker: a string chosen to be very unlikely to appear as a
# real token or tag.
start_tag = '!@#$%^&*()_+START+_)(*&^%$#@!'

Example of splitting a sentence into *N*-grams using `ngrams` from `nltk.util`:

In [None]:
print(X)
# Slide a window of N consecutive tokens over the sentence and print the
# resulting N-grams side by side.
x_ngrams = ngrams(X, N)
print(*x_ngrams)

Putting everything together to actually do some probability counting, plus an implementation of the Viterbi algorithm for determining the best walk through tag space to label each observation/word.

Hard-coded to use tag bigrams (N = 2, i.e. each tag is conditioned only on the previous tag) for simplicity.

In [None]:
# Sentence-boundary pseudo-tags. Each one is inserted below both as a tag and
# as its own "word", so every sentence is wrapped in START ... END.
# Note: this rebinds the start_tag defined in the earlier cell.
start_tag = 'START'
end_tag = 'END'

# Flatten the Brown corpus into one long list of (tag, word) pairs.
# Only the first two characters of each Brown tag are kept (presumably to
# collapse fine-grained tag variants into a coarser tag set).
tag_words = []
for sentence in brown.tagged_sents():
    tag_words.append((start_tag, start_tag))
    tag_words.extend((tag[:2], word) for word, tag in sentence)
    tag_words.append((end_tag, end_tag))

# Emission model: P(word | tag), maximum-likelihood estimate from the counts.
cfd_tag_words = nltk.ConditionalFreqDist(tag_words)
cpd_tag_words = nltk.ConditionalProbDist(cfd_tag_words, nltk.MLEProbDist)

#print("The probability of an adjective (JJ) being 'smart' is", cpd_tag_words["JJ"].prob("smart"))
#print("The probability of a verb (VB) being 'try' is", cpd_tag_words["VB"].prob("try"))

# Transition model: P(tag | previous tag), estimated from consecutive tag
# pairs (bigrams) of the flattened tag stream.
tags = [pair[0] for pair in tag_words]
cfd_tags = nltk.ConditionalFreqDist(nltk.ngrams(tags, 2))
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

#print('The probability of DT occuring after NN is : ', cpd_tags["NN"].prob("DT"))
#print('The probability of VB occuring after NN is : ', cpd_tags["NN"].prob("VB"))

# Every tag seen in training, including the START/END pseudo-tags.
distinct_tags = set(tags)

# Sentence to tag in the decoding step.
sample = "Doctor Powers is a very neat chap".split(' ')


###
# putting things together:
# what is the probability of the tag sequence "PP VB NN" for the word sequence "I love food"?
# It is
# P(START) * P(PP | START) * P(I | PP) *
#            P(VB | PP)    * P(love | VB) *
#            P(NN | VB)    * P(food | NN) *
#            P(END | NN)
#
# We leave aside P(START) for now.
#
# Each line pairs one transition probability (from cpd_tags) with the
# emission probability (from cpd_tag_words) of the word under the tag just
# entered; the last factor is the transition into the end tag.
prob_tagsequence = (
    cpd_tags[start_tag].prob("PP") * cpd_tag_words["PP"].prob("I")
    * cpd_tags["PP"].prob("VB") * cpd_tag_words["VB"].prob("love")
    # "food" is tagged NN in this sequence, so its emission must come from NN
    # (the previous version mistakenly used the PP emission distribution).
    * cpd_tags["VB"].prob("NN") * cpd_tag_words["NN"].prob("food")
    * cpd_tags["NN"].prob(end_tag)
)

#print("The probability of sentence 'I love food' having the tag sequence 'START PP VB NN END' is : ", prob_tagsequence)

# --- Viterbi decoding of `sample` with the bigram HMM estimated above ---
#
# viterbi_main[i][tag]     : probability of the best tag path that ends in
#                            `tag` at word i of `sample`.
# backpointer_main[i][tag] : the tag chosen for word i-1 on that best path
#                            (start_tag for i == 0).
#
# NOTE: raw probabilities are multiplied, so scores can only shrink with each
# word; for long inputs they may underflow to 0.0 (log-probabilities would
# avoid that). A word never seen in training has MLE emission probability 0
# for every tag, in which case all paths tie at 0 and the result is arbitrary.

viterbi_tag = {}
viterbi_backpointer = {}

# Initialisation: every path leaves start_tag and emits the first word.
for tag in distinct_tags:
    # Compare strings by value; `is` only worked as long as the set happened
    # to hold the very same str object as start_tag.
    if tag == start_tag:
        continue
    viterbi_tag[tag] = cpd_tags[start_tag].prob(tag) * cpd_tag_words[tag].prob(sample[0])
    viterbi_backpointer[tag] = start_tag

viterbi_main = [viterbi_tag]
backpointer_main = [viterbi_backpointer]

curr_best = max(viterbi_tag, key=viterbi_tag.get)

# Recursion: extend the best paths one word at a time.
for index, word in enumerate(sample[1:], start=1):
    curr_viterbi = {}
    curr_backpointer = {}
    prev_viterbi = viterbi_main[-1]

    for tag in distinct_tags:
        if tag == start_tag:
            continue

        # The emission probability does not depend on the predecessor, so it
        # is computed once here instead of inside the argmax below.
        emission = cpd_tag_words[tag].prob(word)

        # Best predecessor: maximise (score so far) * P(tag | predecessor).
        prev_best = max(
            prev_viterbi,
            key=lambda prevtag: prev_viterbi[prevtag] * cpd_tags[prevtag].prob(tag),
        )

        curr_viterbi[tag] = prev_viterbi[prev_best] * cpd_tags[prev_best].prob(tag) * emission
        curr_backpointer[tag] = prev_best

    curr_best = max(curr_viterbi, key=curr_viterbi.get)

    viterbi_main.append(curr_viterbi)
    backpointer_main.append(curr_backpointer)

# Termination: pick the final tag whose path is most likely to be followed by
# the end tag.
prev_viterbi = viterbi_main[-1]
prev_best = max(prev_viterbi, key=lambda prev_tag: prev_viterbi[prev_tag] * cpd_tags[prev_tag].prob(end_tag))
prob_tag_seq = prev_viterbi[prev_best] * cpd_tags[prev_best].prob(end_tag)

# Backtrace: follow the backpointers from the last word to the first. The
# sequence is built back-to-front (END, last tag, ..., first tag, START) and
# reversed afterwards. backpointer_main is reversed in place, as before.
best_tag_seq = [end_tag, prev_best]
backpointer_main.reverse()

curr_best = prev_best
for backpointer in backpointer_main:
    curr_best = backpointer[curr_best]
    best_tag_seq.append(curr_best)

best_tag_seq.reverse()
print(" ".join(sample))
# Strip the START/END pseudo-tags so exactly one tag per word is shown.
print('tags:', best_tag_seq[1:-1])