In [1]:
# extract words and their POS tags from the WSJ corpus
with open('./corpus/WSJ_02-21.pos') as file:
    data = file.readlines()

words = ['<s>']
pos_tags = ['BEGIN_SENT']

for line in data:
    if len(line) > 1:
        word, pos = line.split()
        words.append(word)
        pos_tags.append(pos)
    else:
        words.extend(['</s>', '<s>'])
        pos_tags.extend(['END_SENT', 'BEGIN_SENT'])

print(words[:10])
print(pos_tags[:10])
print(len(words))
print(len(pos_tags))

['<s>', 'In', 'an', 'Oct.', '19', 'review', 'of', '``', 'The', 'Misanthrope']
['BEGIN_SENT', 'IN', 'DT', 'NNP', 'CD', 'NN', 'IN', '``', 'DT', 'NN']
1029693
1029693


In [2]:
# a table of frequencies of words that occur with that POS tag
from collections import defaultdict

likelihoods = defaultdict(lambda: defaultdict(int))

for word, pos in zip(words, pos_tags):
    likelihoods[pos][word] += 1

print(likelihoods['DT'])
print(likelihoods['BEGIN_SENT'])
print(likelihoods['END_SENT'])

defaultdict(<class 'int'>, {'an': 3142, 'The': 6795, 'the': 41098, 'a': 19264, 'A': 817, 'some': 1274, 'this': 1897, 'any': 721, 'those': 505, 'Both': 99, 'Some': 314, 'no': 606, 'An': 141, 'Either': 3, 'This': 398, 'these': 417, 'another': 351, 'that': 1168, 'That': 412, 'each': 356, 'every': 171, 'all': 842, 'No': 82, 'both': 336, 'These': 139, 'Another': 72, 'Those': 66, 'Each': 57, 'Any': 18, 'All': 82, 'THE': 35, 'either': 50, 'many': 14, 'Every': 20, 'neither': 18, 'NO': 2, 'half': 31, 'Many': 2, 'Neither': 13, 'nary': 1, 'AN': 6, 'them': 1, 'la': 2, 'Half': 1, 'THOSE': 1, 'del': 1, 'BOTH': 1})
defaultdict(<class 'int'>, {'<s>': 39833})
defaultdict(<class 'int'>, {'</s>': 39832})


In [3]:
# a table of frequencies of following states
transitions = defaultdict(lambda: defaultdict(int))

for i in range(len(pos_tags) - 1):
    transitions[pos_tags[i]][pos_tags[i + 1]] += 1

print(transitions['BEGIN_SENT'])
print(transitions['END_SENT'])
print(transitions['DT'])

defaultdict(<class 'int'>, {'IN': 5050, 'NNP': 8036, 'DT': 8648, 'NNS': 1669, 'PRP': 2428, 'JJ': 1671, '``': 3003, 'RB': 2263, 'CC': 2269, 'WP': 123, 'VBG': 482, 'VBN': 230, 'CD': 440, 'NN': 1598, 'PRP$': 320, 'WDT': 31, 'VBD': 33, '(': 144, 'VBZ': 56, 'FW': 7, 'NNPS': 79, 'WRB': 249, 'JJR': 60, 'JJS': 100, 'VB': 115, 'LS': 28, 'TO': 138, 'PDT': 29, 'RBR': 84, '$': 28, 'MD': 27, 'RBS': 22, 'EX': 168, 'VBP': 14, ':': 102, 'UH': 23, 'SYM': 46, "''": 16, ')': 1, 'WP$': 1, '#': 1})
defaultdict(<class 'int'>, {'BEGIN_SENT': 39832})
defaultdict(<class 'int'>, {'NNP': 9044, 'NN': 38873, 'JJ': 17850, 'NNPS': 416, 'VBN': 689, 'NNS': 5988, 'CD': 1876, 'WP': 66, '(': 43, 'RBR': 141, 'VBG': 628, 'JJR': 465, 'RB': 844, 'WRB': 1, 'IN': 791, 'RBS': 228, 'DT': 129, 'VBZ': 657, '``': 454, 'MD': 181, 'JJS': 762, 'TO': 25, 'VBP': 145, ',': 191, 'PRP$': 59, '.': 131, 'VBD': 180, '$': 757, 'FW': 21, 'END_SENT': 7, 'CC': 60, 'VB': 21, 'PRP': 40, 'WDT': 16, "''": 3, '#': 13, ':': 33, 'POS': 3, 'RP': 6, ')': 

In [4]:
# converting to probabilities by dividing by the total
for pos in likelihoods:
    total = sum(likelihoods[pos].values())
    for word in likelihoods[pos]:
        likelihoods[pos][word] /= total

for pos in transitions:
    total = sum(transitions[pos].values())
    for next_pos in transitions[pos]:
        transitions[pos][next_pos] /= total

print(likelihoods['DT'])
print(transitions['BEGIN_SENT'])

defaultdict(<class 'int'>, {'an': 0.03839104616211725, 'The': 0.08302583025830258, 'the': 0.5021627037462427, 'a': 0.2353803670487036, 'A': 0.009982649495369126, 'some': 0.015566579506854672, 'this': 0.023178807947019868, 'any': 0.008809657633000172, 'those': 0.006170425942670023, 'Both': 0.0012096478580679846, 'Some': 0.003836660883165123, 'no': 0.007404511131204028, 'An': 0.0017228317978544023, 'Either': 3.665599569902984e-05, 'This': 0.004863028762737959, 'these': 0.005095183402165147, 'another': 0.004288751496786491, 'that': 0.014271400992155616, 'That': 0.0050340900760000975, 'each': 0.0043498448229515405, 'every': 0.002089391754844701, 'all': 0.010288116126194374, 'No': 0.0010019305491068156, 'both': 0.0041054715182913416, 'These': 0.0016983944673883825, 'Another': 0.0008797438967767161, 'Those': 0.0008064319053786564, 'Each': 0.0006964639182815669, 'Any': 0.00021993597419417904, 'All': 0.0010019305491068156, 'THE': 0.00042765328315534813, 'either': 0.0006109332616504973, 'many':

In [27]:
vocabularies = set(words)
unique_pos_tags = list(transitions.keys())
len(vocabularies), len(unique_pos_tags)

(44391, 47)

In [33]:
# implement a vertebi HMM POS tagger
sample_sentence = "<s> life . </s>".split()

import numpy as np 

# initialize the viterbi matrix
viterbi = np.zeros((len(unique_pos_tags), len(sample_sentence)))

for j, token in enumerate(sample_sentence):
    for i, pos in enumerate(unique_pos_tags):
        if j == 0 and pos == 'BEGIN_SENT':
            viterbi[i, j] = 1
        elif j == 1:
            viterbi[i, j] = transitions['BEGIN_SENT'][pos] * likelihoods[pos][token]
        else:
            viterbi[i, j] = max(viterbi[k, j - 1] * transitions[unique_pos_tags[k]][pos] * likelihoods[pos][token] for k in range(len(unique_pos_tags)))
        
# backtrace to find the best path
best_path = [np.argmax(viterbi[:, -1])]
for j in range(len(sample_sentence) - 1, 0, -1):
    best_path.append(np.argmax([viterbi[i, j - 1] * transitions[unique_pos_tags[i]][unique_pos_tags[best_path[-1]]] for i in range(len(unique_pos_tags))]))
best_path = best_path[::-1]


# print the best path
for i, token in enumerate(sample_sentence):
    print(f'{token}/{pos_tags[best_path[i]]}', end=' ')
print()

<s>/BEGIN_SENT life/NN ./NNS </s>/VBP 
