In [1]:
import numpy as np
import copy
import pandas as pd

In [418]:
# process training data
df = pd.read_excel('data/GOV-ZA.50000ParallelleEnWoorde.xh.pos.full.xls')
df = df[df.POS != 'PUNCT'].fillna('SENTENCE_BOUNDARY')
data = [{'Token':'', 'POS':'SENTENCE_BOUNDARY'}]
df = pd.concat([pd.DataFrame(data), df], ignore_index=True).reset_index(drop = True)
df.head()

Unnamed: 0,Token,POS
0,,SENTENCE_BOUNDARY
1,Injongo,N09
2,ye-website,V
3,yaseMzantsi,LOC
4,Afrika,N05


In [443]:
len(df)*0.2

7771.8

In [459]:
# split into training and validation sets
train = df.iloc[7778:].reset_index()
validation = df.iloc[:7779].reset_index()

# get tags and vocabulary
states = train.POS.unique()
vocab = train.Token.unique()[1:]

In [593]:
# train HMM

transitions = {}
emissions = {}
discount = 0.1
trans_discount = 0.5
discount_totals = {}
trans_discount_totals = {}
POS_counts = {}
for i in range(1, len(train)):
    key = train.POS[i]
    POS_counts[key] = POS_counts.setdefault(key, 0) + 1      
    key = (train.POS[i], train.POS[i-1])
    transitions[key] = transitions.setdefault(key, 0) + 1
    if train.POS[i] != 'SENTENCE_BOUNDARY':
        key = (train.Token[i], train.POS[i])
        emissions[key] = emissions.setdefault(key, 0) + 1

# Normalize and smooth transition probabilities
for key in list(transitions.keys()):
    transitions[key] = (transitions[key]-trans_discount) / POS_counts[key[1]]
    trans_discount_totals[key[1]] = trans_discount_totals.setdefault(key[1], 0) + trans_discount
    
for key in list(trans_discount_totals.keys()):
    trans_discount_totals[key] = trans_discount_totals[key] / POS_counts[key]
    
unseen_counts = {}
unseen_keys = []
for s1 in states:
    for s2 in states:
        key = (s1, s2)
        if key not in transitions:
            transitions[key] = trans_discount_totals[s2]
            unseen_counts[s2] = unseen_counts.setdefault(s2, 0) + 1
            unseen_keys += [key]

for key in unseen_keys:
    transitions[key] = transitions[key] / unseen_counts[key[1]]

# Normalize emission probabilities
for key in list(emissions.keys()):
    emissions[key] = (emissions[key] - discount) / POS_counts[key[1]]
    discount_totals[key[1]] = discount_totals.setdefault(key[1], 0) + discount
    
for key in list(discount_totals.keys()):
    discount_totals[key] = discount_totals[key] / POS_counts[key]

In [599]:
# check transition probablities
pos_sum = 0
for key in transitions:  
    if key[1] == 'N05':
        pos_sum += transitions[key]
np.round(pos_sum, 12)

1.0

In [594]:
# absolute discount smoothing
def smooth(df, data):
    smoothed = copy.deepcopy(df)
    data_vocab = data.Token.unique()[1:]
    full_vocab = np.unique(np.concatenate((vocab, data_vocab)))
    unseen_counts = {}
    unseen_keys = []
    for s in states[1:]:
        for w in full_vocab:
            key = (w, s)
            if key not in smoothed:
                smoothed[key] = discount_totals[s]
                unseen_counts[s] = unseen_counts.setdefault(s, 0) + 1
                unseen_keys += [key]
                
    for key in unseen_keys:
        smoothed[key] = smoothed[key] / unseen_counts[key[1]]
    return smoothed

In [535]:
# process test data
df = pd.read_excel('data/GOV-ZA.Toetsteks.5000ParallelleEnWoorde.xh.pos.full.xls')
df = df[df.POS != 'PUNCT'].fillna('SENTENCE_BOUNDARY').reset_index(drop = True)
test = df

In [603]:
# smooth emission probablities over validation / test set

#target = validation
target = test

smoothed = smooth(emissions, target)

# check emission probabilities
pos_sum = 0
for key in smoothed:  
    if key[1] == 'N05':
        pos_sum += smoothed[key]
np.round(pos_sum, 12)

1.0

In [604]:
# create transition matrix
A = np.zeros((len(states), len(states)))
for i in range(len(states)):
    for j in range(len(states)):
        key = (states[i], states[j])
        if key in transitions:
            A[i,j] = transitions[key]
        else:
            A[i,j] = 0

In [605]:
# viterbi tagging for one sentence
def viterbi(data):   
    # initialization
    gammas = [np.zeros(len(states)-1)]
    psis = [[]]
    token = data[0]
    for s in range(1, len(states)):    
        gammas[0][s-1] = np.log(smoothed[(token, states[s])]) + np.log(A[s,0])
        
    # recursive step
    for i in range(1, len(data)):
        token = data[i]
        gammas += [np.zeros(len(states)-1)]
        psis += [np.zeros(len(states)-1, dtype = int)]
        for s in range(1, len(states)): 
            gammas[i][s-1] = np.log(smoothed[(token, states[s])]) + np.max(np.log(A[s,1:]) + gammas[i-1])
            psis[i][s-1] = np.argmax(np.log(A[s,1:]) + gammas[i-1])
            
    # termination
    z = np.argmax(np.log(A[0,1:]) + gammas[len(data)-1])
    
    # backtracking
    sequence = [z+1]
    for i in range(1, len(data)):
        sequence += [psis[len(data)-i][sequence[i-1]]+1]
    
    sequence.reverse()
    return sequence

In [606]:
# use HMM to tag data
row = 0
labels = []
predictions = []

while row < len(target):
    
    # split into sentences
    sentence = []
    while target.POS[row] != 'SENTENCE_BOUNDARY':
        sentence += [target.Token[row]]
        labels += [target.POS[row]]
        row += 1
    if sentence != []:
        # tag sentence
        predictions += viterbi(sentence)
    row += 1
    
# compute accuracy
(np.asarray([states[i] for i in predictions]) == np.asarray(labels)).mean()

0.7910487288135594