In [1]:
# to download dataset from Kaggle
import opendatasets as od
import numpy as np
import re
import pandas as pd

# data preprocessing
from sklearn.model_selection import train_test_split

# for evaluating sequence model metrics
from seqeval.metrics import f1_score, classification_report

# simple example of Viterbi Algorithm

import numpy as np

obs = {
    "normal": 0, 
    "cold": 1, 
    "dizzy": 2
}

states = {
    "Healthy": 0,
    "Fever": 1
}

start_p = [0.6, 0.4]

trans_p = np.array([[0.7, 0.3], [0.4, 0.6]])

emit_p = np.array([[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]])

inputObs = ['normal', 'normal', 'dizzy']

res = viterbi(states, obs, start_p, trans_p, emit_p, inputObs)

print(res)
##### [0, 0, 1,]


In [3]:
od.download('https://www.kaggle.com/datasets/yingxuhe/pos-tagging?select=sents.answer')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: vasanth12
Your Kaggle Key: ········
Downloading pos-tagging.zip to .\pos-tagging


100%|██████████████████████████████████████████████████████████████████████████████████████████| 2.50M/2.50M [02:11<00:00, 19.9kB/s]







In [6]:
# import POS tagging dataset
data = open('pos-tagging/sents.train', encoding='utf-8')

In [2]:
def findStationaryDistrb(transitionProb, stationaryDist):
    
    for i in range(100):
        stationaryDist = np.dot(transitionProb, stationaryDist)
    return np.transpose(stationaryDist)
    

In [3]:
def findEmissionAndTransitionProb(
    sentences, tags, states, observables, tagCount, wordCount, transmissionDist, transmissionProb, emissionDist, emissionProb
):
    # Distribution matrices   
    for i in range(len(sentences)): 
        _ptag = 0
        for j in range(len(sentences[i])):
            _token = sentences[i][j]
            _tag = tags[i][j]
            
            emissionDist[_tag][_token] += 1
            transmissionDist[_ptag][_tag] += 1
            
            _ptag = _tag
            
    # Emission Probability martices
    for i in range(tagCount):
        _keys = list(states.keys())
        _idx = list(filter(lambda x:states[x]["index"] == i, _keys))[0]
        _count = states[_idx]["count"] 
        
        for j in range(tagCount):
            transmissionProb[j][i] = transmissionDist[j][i] / _count
            
    # Transmission Probability martices
    for i in range(wordCount):
        _keys = list(observables.keys())
        _idx = list(filter(lambda x:observables[x]["index"] == i, _keys))[0]
        _count = observables[_idx]["count"]
        
        for j in range(tagCount):
            emissionProb[j][i] = emissionDist[j][i] / _count 
            
            

In [4]:
def featureExtraction(filename, states, observables):    
    _sentences = []
    _tags = []
    
    tagCount, wordCount = 0, 0
    with open(filename) as data:
        sentences = list(data)
        
        for sentence in sentences:
            sentence = sentence.lower()
            s = []
            t = []
            
            for token in sentence.split(' '):
                wAt = re.sub(r'\n', '', token)
                wAt = wAt.split('/')
                word = wAt[0]
                if observables.get(word) == None:
                    observables[word] = {
                        "index": wordCount,
                        "count": 1
                    }
                    wordCount += 1
                else:
                    observables[word]["count"] += 1
                s.append(observables[word]["index"])
                
                tag = wAt[-1]
                if states.get(tag) == None:
                    states[tag] = {
                        "index": tagCount,
                        "count": 1
                    }
                    tagCount += 1
                else:
                    states[tag]["count"] += 1
                t.append(states[tag]["index"])
            _sentences.append(s)
            _tags.append(t)
                    
    return _sentences, _tags                

In [5]:
def featureExtractionForOOV(filename, states, observables):
    _sentences = []
    _tags = []
    
    tagCount, wordCount = 0, 0
    with open(filename) as data:
        sentences = list(data)
        
        for sentence in sentences:
            sentence = sentence.lower()
            s = []
            t = []
            
            for token in sentence.split(' '):
                wAt = re.sub(r'\n', '', token)
                wAt = wAt.split('/')
                word = wAt[0]
                if observables.get(word) == None:
                    word = "oov"
                observables[word]["count"] += 1
                s.append(observables[word]["index"])
                
                tag = wAt[-1]
                if states.get(tag) == None:
                    states[tag] = {
                        "index": tagCount,
                        "count": 1
                    }
                    tagCount += 1
                else:
                    states[tag]["count"] += 1
                t.append(states[tag]["index"])
            _sentences.append(s)
            _tags.append(t)
                    
    return _sentences, _tags                

In [8]:
# Declaration of variables
states = {}
observables = {}

# Extract features from data
sentences, tags = featureExtraction('./pos-tagging/sents.train', states, observables)
tagCount = len(states.keys())
wordCount = len(observables.keys())

# To handle Out Of Vocabulary
observables['oov'] = {'index': wordCount, 'count': 0}
_oovSentences, _oovTags = featureExtractionForOOV('./pos-tagging/sents.answer', states, observables)
tagCount = len(states.keys())
wordCount = len(observables.keys())

xTrain, xTest, yTrain, yTest = train_test_split(_oovSentences, _oovTags, test_size=0.4)

for i in range(len(xTrain)):
    sentences.append(xTrain[i])
    tags.append(yTrain[i])

# Probabilities Distribution
transmissionDist = np.zeros((tagCount, tagCount))
transmissionProb = np.zeros((tagCount, tagCount))
emissionDist = np.zeros((tagCount, wordCount))
emissionProb = np.zeros((tagCount, wordCount))

findEmissionAndTransitionProb(
    sentences, tags, states, observables, tagCount, wordCount, transmissionDist, transmissionProb, emissionDist, emissionProb
)
stationaryDist = transmissionProb

# Stationary Probability distribution
stationaryDist = findStationaryDistrb(transmissionProb, stationaryDist)

# Prediction
res = viterbi(states, observables, stationaryDist[0], transmissionProb, emissionProb, 'Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990 .')
print(res)

[2.0, 2.0, 10.0, 2.0, 15.0, 21.0, 19.0, 22.0, 2.0, 10.0, 17.0, 23.0, 24.0, 0.0, 0.0, 3.0, 10.0, 0.0, 3.0, 18.0]


In [7]:
def viterbi(states, observables, stationaryTrans, transitionProbability, emissionProbability, inputSent):
    
    N = len(states.keys())
    M = len(inputSent)
    
    t1 = np.zeros((N,M+1))
    t2 = np.zeros((N,M+1))
    
    #y = lambda word:observables.get(word) in inputSent
    y = []
    for obs in inputSent.split(' '):
        obs = obs.lower()
        if observables.get(obs) == None:
            obs = 'oov'
        y.append(observables[obs]["index"])
    
    for i in range(N):
        t1[i][0] = stationaryTrans[i] * emissionProbability[i][y[0]]

    x = -1
    for j in range(1, M+1):
        for i in range(N):
            _max = -1000
            argmax = -1
            for k in range(N):
                try:
                    abc = t1[k][j-1] * transitionProbability[k][i] * emissionProbability[i][y[j]]
                    if _max < abc:
                        _max = abc
                        argmax = k
                except:
                    abc = t1[k][j-1]
                    if _max < abc:
                        _max = abc
                        argmax = k
            t1[i][j] = _max 
            t2[i][j] = argmax
            x = argmax
            
    _inputLength = len(y)
    res = np.zeros(_inputLength)
    for i in range(_inputLength, 0, -1):
        res[i-1] = t2[x][i]
        x = int(res[i-1])
        
    return res.tolist()
        

In [9]:
idxToTag = {}
for _tag in states.keys():
    _idx = states[_tag]['index']
    idxToTag[_idx] = _tag 

In [10]:
idxToWord = {}
for _word in observables.keys():
    _idx = observables[_word]["index"]
    idxToWord[_idx] = _word

In [11]:
def convertIndextoTAG(res):
    tagRes = ''
    for idx in res:
        tagRes += idxToTag[idx] + ' '
    return tagRes

In [12]:
res = viterbi(states, observables, stationaryDist[0], transmissionProb, emissionProb, 'Vasanth is a great man')
print(res)
print(convertIndextoTAG(res))

[4.0, 19.0, 1.0, 24.0, 4.0]
nn vbz dt jj nn 


In [13]:
def modelMetricsEvaluation(yTest, pred):
    _f1Score = f1_score(yTest, pred)
    print("F1 Score of Viterbi algorithm: ", _f1Score)
    print("Classification Report: ", classification_report(yTest, pred))
    

In [15]:
_predY = []
for i in range(len(xTest)):
    _testStr = ' '.join(idxToWord[_idx] for _idx in xTest[i])
    res = viterbi(states, observables, stationaryDist[0], transmissionProb, emissionProb, _testStr)
    
    pred = [idxToTag[int(_idx)] for _idx in res]
    _predY.append(pred)
    
_trueY = []
for i in range(len(yTest)):
    _true = [idxToTag[int(_idx)] for _idx in yTest[i]]
    _trueY.append(_true)
    
modelMetricsEvaluation(_trueY, _predY)

Prediction is completed
Converstion is completed




F1 Score of Viterbi algorithm:  0.9167631545091985


  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:                precision    recall  f1-score   support

           '       1.00      1.00      1.00       126
           _       1.00      1.00      1.00      1257
           `       1.00      1.00      1.00       128
           b       0.89      0.88      0.88       938
          bd       0.95      0.94      0.95       575
          bg       0.98      0.81      0.89       297
          bn       0.91      0.79      0.85       417
          bp       0.96      0.81      0.88       245
          br       0.65      0.52      0.58        25
          bs       1.00      0.50      0.67         4
          bz       0.98      0.95      0.96       412
           c       0.99      0.99      0.99       429
           d       0.99      0.94      0.96       867
          dt       0.97      0.75      0.85       100
           h       1.00      0.50      0.67         2
           j       0.78      0.88      0.83      1040
          jr       0.93      0.75      0.83        91
   