# Hidden Markov: POS Tagging
Sam Keyser, Carter Shavitz, John Paul Bunn

CS 2400 - Introduction to AI

## Experiment
### Set Up

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.util import ngrams
from nltk.corpus import brown, treebank, conll2000

# Fetch every NLTK resource this notebook asks for; packages that are
# already present are reported as up-to-date by nltk.
for resource in ('treebank', 'brown', 'conll2000', 'universal_tagset'):
    nltk.download(resource)

# Tagged Treebank sentences, with tags mapped onto the universal tagset
treebank_corpus = treebank.tagged_sents(tagset='universal')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\keysers\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\keysers\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\keysers\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\keysers\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [2]:
# Quick look at the (truncated) corpus representation
print(treebank_corpus)

# Unzip the first tagged sentence into two parallel lists:
# X holds the words, y holds the matching tags.
X, y = (list(column) for column in zip(*treebank_corpus[0]))
print('Sentence:', X)
print('Tags:', y)

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], ...]
Sentence: ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
Tags: ['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']


## Probability Counting
Now that we've got a set of test sentences and tags, we need to start constructing the transition and emission probabilities. The counting should be a function of *N*, the length of the *N*-gram used to keep track of the previous states up to and including the current one.

### Playing around with Splitting Sentences into *N*-grams

In [3]:
# Default N-gram length
N = 3

# Sentinel used as the start tag: a string of characters very unlikely to
# occur in the wild, so it cannot be confused with a real tag.
start_tag = '!@#$%^&*()_+START+_)(*&^%$#@!'

Example of splitting a sentence into *N*-grams using `ngrams` from `nltk.util`:

In [4]:
# Show the sentence, then every length-N window over it on a single line
print(X)
print(' '.join(str(gram) for gram in ngrams(X, N)))

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
('Pierre', 'Vinken', ',') ('Vinken', ',', '61') (',', '61', 'years') ('61', 'years', 'old') ('years', 'old', ',') ('old', ',', 'will') (',', 'will', 'join') ('will', 'join', 'the') ('join', 'the', 'board') ('the', 'board', 'as') ('board', 'as', 'a') ('as', 'a', 'nonexecutive') ('a', 'nonexecutive', 'director') ('nonexecutive', 'director', 'Nov.') ('director', 'Nov.', '29') ('Nov.', '29', '.')


Counting probability based on the article [here](https://www.freecodecamp.org/news/a-deep-dive-into-part-of-speech-tagging-using-viterbi-algorithm-17c8de32e8bc).


We run over the dataset in `X` to fill in the conditional probabilities for transitions and emissions, using naive Laplace (add-λ) smoothing.

The following section assumes we're using an *N* = 3 for simplicity.
The actual implementation should be generic as to *N*.

In [9]:
# From this cell on, X is the whole tagged corpus (a sequence of sentences,
# each a sequence of (word, tag) pairs) rather than one sentence's words.
X = treebank_corpus

# Tag sequence of every sentence, then the set of distinct tags across them
tmp = [[pair_tag for _, pair_tag in sentence] for sentence in X]
tags = set().union(*tmp)
print('tags:', tags)

# Lookup tables filled in by the counting cells below:
# C receives raw counts, q and e receive the derived probabilities.
C, q, e = {}, {}, {}

# Laplace smoothing: lamda is the additive constant, V the number of
# distinct tags (it scales lamda in the smoothing denominator).
lamda = 1
V = len(tags)

# Convenience methods
def increment_dict_val(dict, val):
    """Add one to the counter stored under key `val` in `dict`.

    A key that is not present yet is treated as having a count of 0, so it
    ends up with the value 1. The mapping is modified in place and nothing
    is returned.
    """
    if val in dict:
        dict[val] = dict[val] + 1
    else:
        dict[val] = 1

def safe_get(dict, val):
    """Return the value stored under key `val`, or 0 when the key is absent.

    The mapping is never modified by the lookup.
    """
    return dict[val] if val in dict else 0

tags: {'ADJ', 'VERB', 'NUM', 'DET', 'CONJ', 'X', 'ADP', 'PRON', 'PRT', 'NOUN', 'ADV', '.'}


In [10]:
%%time
# Conditional prob counting for a trigram (N = 3) HMM.
#   x = observation = word
#   s = state = tag
#
# Fixes relative to the previous version of this cell:
#   * n-grams are taken over the start-padded TAG sequence, not over the
#     (word, tag) pairs of the sentence;
#   * q and e are derived from the counts in C (they used to read from
#     themselves, which were always empty) and are keyed by the full event,
#     q[(u, v, s)] and e[(x, s)], instead of overwriting one entry per tag;
#   * tag unigram counts C[(s,)] are accumulated so e has a denominator;
#   * probabilities are computed once, after ALL sentences have been
#     counted, instead of from partial counts inside the sentence loop;
#   * (word, tag) counts live in their own dict: as 2-tuples inside C they
#     would collide with tag-bigram keys (e.g. word '.' tagged '.' versus
#     the tag bigram ('.', '.'));
#   * the global `tags` set is no longer overwritten by a per-sentence list.

# Start from empty tables so that re-running this cell does not double-count.
C.clear()
q.clear()
e.clear()
emission_counts = {}

# Pass 1: raw counts.
#   C[(u, v, s)]            tag trigram
#   C[(u, v)]               tag bigram that is the history of some trigram
#   C[(s,)]                 tag unigram
#   emission_counts[(x, s)] word x observed with tag s
for sentence in X:
    # Two start tags give the first real tag a full two-tag history.
    padded_tags = [start_tag, start_tag] + [s for x, s in sentence]

    for trigram in ngrams(padded_tags, 3):
        increment_dict_val(C, trigram)
        increment_dict_val(C, trigram[:-1])

    for x, s in sentence:
        increment_dict_val(emission_counts, (x, s))
        increment_dict_val(C, (s,))

# Pass 2: Laplace-smoothed conditional probabilities.
#   q[(u, v, s)] = (c(u, v, s) + lamda) / (c(u, v) + lamda * V)
#   e[(x, s)]    = (c(s -> x)  + lamda) / (c(s)    + lamda * V)
# NOTE(review): V is the number of distinct tags. For the emission term a
# textbook add-lamda estimate would scale lamda by the number of distinct
# words instead; V is kept here to match the original formula - confirm.
for key, count in list(C.items()):
    if len(key) == 3:  # only trigram keys define a transition
        q[key] = (count + lamda) / (safe_get(C, key[:-1]) + lamda * V)

for (x, s), count in emission_counts.items():
    e[(x, s)] = (count + lamda) / (safe_get(C, (s,)) + lamda * V)

Wall time: 1.91 s


In [18]:
from viterbi import viterbi
from viterbi import viterbi1

# Rebuild the tables from scratch on a single sentence as a small test case.
#   x = observation = word
#   s = state = tag
sentence = X[0]

C = {}
q = {}
e = {}
# (word, tag) counts are kept apart from C: as 2-tuples they would collide
# with tag-bigram keys such as ('.', '.').
emission_counts = {}

words = [x for x, s in sentence]
tags = [s for x, s in sentence]

# Pad with two start tags so the first real tag has a full two-tag history.
# NOTE: after this, `tags` is two elements longer than `words`.
tags.insert(0, start_tag)
tags.insert(0, start_tag)

# Raw counts. The n-grams are taken over the padded TAG sequence; the
# previous version iterated over the (word, tag) pairs of `sentence`, so
# the padding was never used and no tag n-gram was ever counted.
for trigram in ngrams(tags, 3):
    increment_dict_val(C, trigram)
    increment_dict_val(C, trigram[:-1])

for x, s in sentence:
    increment_dict_val(emission_counts, (x, s))
    increment_dict_val(C, (s,))  # tag unigram count, denominator of e

# Laplace-smoothed probabilities, read from the counts (the previous version
# read from q / e themselves, which were empty) and keyed by the full event
# so that entries for different trigrams / words do not overwrite each other.
#   q[(u, v, s)] = (c(u, v, s) + lamda) / (c(u, v) + lamda * V)
#   e[(x, s)]    = (c(s -> x)  + lamda) / (c(s)    + lamda * V)
for trigram in ngrams(tags, 3):
    q[trigram] = (safe_get(C, trigram) + lamda) / (safe_get(C, trigram[:-1]) + lamda*V)

for x, s in sentence:
    e[(x, s)] = (safe_get(emission_counts, (x, s)) + lamda) / (safe_get(C, (s,)) + lamda*V)


# NOTE(review): `tags` is a list with repeats, so these values do not sum
# to 1 over the distinct tags - left as in the original pending the TODO.
start_p = {k: 1.0/len(tags) for k in tags} # TODO: this would be a trainable parameter

# Special-case entries for the start tag, kept exactly as before.
start_p[start_tag] = 1.0
q[start_tag] = 1.0
e[start_tag] = 0.0

'Pierre'

In [None]:
# Build the word/tag table straight from `sentence`. The module-level `tags`
# list carries two extra start-tag entries, so pairing it with `words` gave
# columns of different lengths and pd.DataFrame raised ValueError.
data = {'Word': [x for x, s in sentence],
        'Tag': [s for x, s in sentence]}
df = pd.DataFrame(data)

# Store the result under its own name: assigning it back to `viterbi1`
# replaced the imported function and made this cell fail when re-run.
viterbi1_result = viterbi1(words, treebank_corpus, df)
