In [1]:
import nltk
nltk.download('treebank')


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [2]:
from nltk.corpus import brown
import re
import numpy as np

In [3]:
removeNonWords = re.compile(r'\W')

In [16]:
states = {}
observables = {}

In [17]:
countStates = 1
countObservables = 0

for taggedSentence in brown.tagged_sents(tagset='universal'):
    for token in taggedSentence:
        word = token[0]
        if removeNonWords.match(word):
            continue
        if states.get(token[1]) == None:
            states[token[1]] = countStates
            countStates += 1
        if observables.get(word) == None:
            observables[word] = countObservables
            countObservables += 1
    
print("countObservables: ", countObservables)
print("countStates: ", countStates)
#print("states: ",states)
#print("observables: ",observables)
    

countObservables:  55640
countStates:  11


In [None]:
countObservables:  56032
countStates:  456

In [18]:
removeNonWords.match("there's,")

In [19]:
transitionMatrix = np.zeros((countStates, countStates))
emissionMatrix = np.zeros((countStates, countObservables))

In [20]:
for taggedSentence in brown.tagged_sents(tagset='universal'):
    previousState = 0
    for token in taggedSentence:
        word = token[0]
        if removeNonWords.match(word):
            continue
        wordIndex = observables[word]
        stateIndex = states[token[1]]
        
        transitionMatrix[previousState][stateIndex] += 1
        emissionMatrix[stateIndex][wordIndex] += 1
        
        previousState = stateIndex
        

In [21]:
for tag in states.keys():
    print(tag, end="\t")

DET	NOUN	ADJ	VERB	ADP	ADV	CONJ	PRT	PRON	NUM	X	

In [22]:
for i in range(countStates):
    for j in range(countStates):
        print(emissionMatrix[i][j], end=",\t")
    print()

7258.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	3529.0,	0.0,	0.0,	0.0,	
0.0,	17.0,	85.0,	0.0,	4.0,	0.0,	60.0,	0.0,	43.0,	0.0,	4.0,	
0.0,	0.0,	0.0,	15.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	
0.0,	0.0,	0.0,	0.0,	0.0,	1943.0,	0.0,	0.0,	0.0,	0.0,	0.0,	
0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	36078.0,	0.0,	
0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	
0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	11.0,	0.0,	0.0,	0.0,	
0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	
0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	
0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	0.0,	
0.0,	0.0,	0.0,	3.0,	0.0,	0.0,	0.0,	2.0,	0.0,	2.0,	0.0,	


In [14]:
states.keys()

dict_keys(['DET', 'NOUN', 'ADJ', 'VERB', 'ADP', 'ADV', 'CONJ', 'PRT', 'PRON', 'NUM', 'X'])

In [1]:
def viterbi(states, observables, stationaryTrans, transitionProbability, emissionProbability, inputSent):
    
    N = len(states.keys())
    M = len(inputSent)
    
    t1 = np.zeros((N,M+1))
    t2 = np.zeros((N,M+1))
    
    y = lambda word:observables.get(word) in inputSent
    
    for i in range(N):
        t1[i][0] = stationaryTrans[i] * emissionProbability[i][y[0]]

    x = -1
    for j in range(1, M+1):
        for i in range(N):
            _max = -1000
            argmax = -1
            for k in range(N):
                abc = t1[k][j-1] * transitionProbability[k][i] * emissionProbability[i][y[j]]
                if _max < abc:
                    _max = abc
                    argmax = k
            t1[i][j] = _max 
            t2[i][j] = argmax
            x = argmax
            
    
    res = np.zeros((M,+1))
    for i in range(M+1, 0, -1):
        res[i] = t2[i][x]
        x = res[i]
        
    return res
        