# Test and results.

In [1]:
from markovify.HMM import HMM
from markovify.ViterbiDecoder import ViterbiDecoder

from nltk.corpus import brown
from math import ceil, floor

import pandas as pd
import numpy as np

In [None]:
## Just a progress bar to know how a loop is progressing
## Code not related to the project.
## All credit to https://github.com/alexanderkuk/log-progress
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while showing a notebook progress widget.

    A label and an ``IntProgress`` bar are stacked in a ``VBox`` and displayed
    once; the label (and, when the length is known, the bar) is refreshed on
    the first item and on every ``every``-th item after that.

    Args:
        sequence: Any iterable. If it has no ``len()`` and ``size`` is not
            given, the total is shown as ``?`` and ``every`` must be supplied.
        every: Refresh interval in items. When omitted and the size is known,
            it is 1 for sizes up to 200 and ``int(size / 200)`` otherwise.
        size: Total number of items; defaults to ``len(sequence)``.
        name: Prefix used in the label text.

    Yields:
        Each record of ``sequence``, unchanged and in order.

    Raises:
        AssertionError: if the size is unknown and ``every`` was not given.
        Anything raised while iterating is re-raised after the bar has been
        switched to the ``'danger'`` style.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Work out the total, remembering whether it could not be determined.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No total available: show a full bar in the 'info' style instead.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            refresh_due = index == 1 or index % every == 0
            if refresh_due and unknown_length:
                label.value = '{name}: {index} / ?'.format(name=name, index=index)
            elif refresh_due:
                progress.value = index
                label.value = u'{name}: {index} / {size}'.format(
                    name=name, index=index, size=size
                )
            yield record
    except:
        # Bare except: whatever interrupts the loop turns the bar red and is
        # then propagated untouched.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        # index is still 0 when nothing was yielded; show '?' in that case.
        label.value = "{name}: {index}".format(name=name, index=str(index or '?'))

# Accuracy on news text (model trained and tested on the news category).

In [8]:
# Two views of the Brown "news" category: sentences with universal POS tags
# (gold standard) and the same category as plain token lists (decoder input).
# Assumes both views list the same sentences in the same order -- TODO confirm.
corpus_tagged = brown.tagged_sents(categories='news', tagset='universal')
corpus_raw = brown.sents(categories='news')
size = len(corpus_tagged)

# Sentence index separating the first 90% (training) from the rest (test).
split_at = floor(size*9/10)

corpus_train = corpus_tagged[:split_at]
corpus_test_tagged, corpus_test = corpus_tagged[split_at:], corpus_raw[split_at:]

# Report how many sentences ended up in each split.
print(len(corpus_train), len(corpus_test), sep='\n')

4160
463


In [6]:
# Fit a fresh HMM on the training split built in the previous cell.
model = HMM()
model.train(corpus_train)
# Debug aid: uncomment to dump the fitted model's `_a` / `b` attributes
# (presumably the transition and emission tables -- verify against HMM).
#print(model._a)
#print(model.b)

# Wrap the trained model in the Viterbi decoder used to tag the test sentences.
decoder = ViterbiDecoder(model)

      <s>       DET      NOUN       ADJ      VERB       ADP         .  \
<s>   0.0  0.097158  0.040371  0.027597  0.012128  0.034436  0.035664   
DET   0.0  0.004882  0.237429  0.404081  0.042024  0.007321  0.009316   
NOUN  0.0  0.033493  0.265166  0.080615  0.290537  0.532719  0.648349   
ADJ   0.0  0.002441  0.152106  0.062887  0.007493  0.038594  0.034441   
VERB  0.0  0.228200  0.059268  0.106540  0.204480  0.203543  0.077162   
ADP   0.0  0.478859  0.120898  0.138317  0.032368  0.017082  0.008563   
.     0.0  0.077336  0.054901  0.047332  0.071765  0.061009  0.082432   
<e>   0.0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
ADV   0.0  0.023045  0.005905  0.059207  0.061954  0.042390  0.035382   
CONJ  0.0  0.034860  0.030421  0.042315  0.032986  0.011750  0.003576   
PRT   0.0  0.015428  0.002863  0.005519  0.100734  0.018348  0.007904   
PRON  0.0  0.002929  0.000608  0.003345  0.136346  0.009671  0.013833   
NUM   0.0  0.001367  0.029705  0.022245  0.007030  

In [13]:
# Flatten the gold-standard test sentences into one list of (word, tag) pairs.
flat_list = [pair for sentence in corpus_test_tagged for pair in sentence]

# One row per token: the word and its gold tag.
results = pd.DataFrame.from_records(flat_list).set_axis(["Word", "Tag"], axis=1)
results.head(10)

Unnamed: 0,Word,Tag
0,But,CONJ
1,in,ADP
2,all,PRT
3,its,DET
4,175,NUM
5,years,NOUN
6,",",.
7,not,ADV
8,a,DET
9,single,ADJ


In [11]:
# Decode every raw test sentence with Viterbi; the progress widget is
# refreshed after each sentence (every=1).
tagged = [decoder.viterbi(sent) for sent in log_progress(corpus_test, every=1)]

VBox(children=(HTML(value=''), IntProgress(value=0, max=463)))

In [15]:
# Flatten the per-sentence predictions into one list, one entry per token.
flat_list = [tag for sentence in tagged for tag in sentence]

# Put the predictions next to the gold tags for a row-by-row comparison.
results = results.assign(Predicted=flat_list)
results.head(10)

Unnamed: 0,Word,Tag,Predicted
0,But,CONJ,CONJ
1,in,ADP,ADP
2,all,PRT,PRT
3,its,DET,DET
4,175,NUM,NOUN
5,years,NOUN,NOUN
6,",",.,.
7,not,ADV,ADV
8,a,DET,DET
9,single,ADJ,ADJ


In [23]:
# Token-level accuracy: share of rows whose predicted tag equals the gold tag.
# The columns are selected by label and compared in a single vectorized pass;
# integer keys on a label-indexed row Series (row[1], row[2]) are treated as
# positions only through a fallback that pandas has deprecated.
correct = int((results["Tag"] == results["Predicted"]).sum())
brown_news_accuracy = correct / results.shape[0]
print(brown_news_accuracy)

# Accuracy on news text when the model is trained on the adventure category.

In [24]:
# Cross-domain setup: train on the Brown "adventure" category, evaluate on "news".
corpus_tagged = brown.tagged_sents(categories='adventure', tagset='universal')
corpus_test_tagged = brown.tagged_sents(categories='news', tagset='universal')
corpus_test_raw = brown.sents(categories='news')

# Both split sizes are derived from the adventure (training) corpus.
size = len(corpus_tagged)
train_count = floor(size*9/10)
test_count = floor(size*1/10)

corpus_train = corpus_tagged[:train_count]
# NOTE(review): this takes the FIRST `test_count` news sentences, whereas the
# in-domain experiment earlier in this notebook evaluates on the LAST tenth of
# news, so the two accuracies are measured on different sentences. Confirm
# that this is intended before comparing them.
corpus_test_tagged = corpus_test_tagged[:test_count]
corpus_test = corpus_test_raw[:test_count]

# Report how many sentences ended up in each split.
print(len(corpus_train), len(corpus_test), sep='\n')

4173
463


In [25]:
# Re-train from scratch on the current training split and rebuild the decoder.
model = HMM()
model.train(corpus_train)
decoder = ViterbiDecoder(model)

# Tag every raw test sentence; the progress widget refreshes after each one.
tagged = [decoder.viterbi(sent) for sent in log_progress(corpus_test, every=1)]

VBox(children=(HTML(value=''), IntProgress(value=0, max=463)))

In [26]:
# Gold (word, tag) pairs of the test sentences, one row per token.
flat_list = [pair for sentence in corpus_test_tagged for pair in sentence]
results = pd.DataFrame.from_records(flat_list).set_axis(["Word", "Tag"], axis=1)

# Predicted tags, flattened the same way, placed next to the gold tags.
flat_list = [tag for sentence in tagged for tag in sentence]
results = results.assign(Predicted=flat_list)
results.head(10)

Unnamed: 0,Word,Tag,Predicted
0,The,DET,DET
1,Fulton,NOUN,ADJ
2,County,NOUN,NOUN
3,Grand,ADJ,ADP
4,Jury,NOUN,NOUN
5,said,VERB,VERB
6,Friday,NOUN,ADP
7,an,DET,DET
8,investigation,NOUN,NOUN
9,of,ADP,ADP


In [27]:
# Token-level accuracy: share of rows whose predicted tag equals the gold tag.
# The columns are selected by label and compared in a single vectorized pass;
# integer keys on a label-indexed row Series (row[1], row[2]) are treated as
# positions only through a fallback that pandas has deprecated.
correct = int((results["Tag"] == results["Predicted"]).sum())
brown_news_adventures_accuracy = correct / results.shape[0]
print(brown_news_adventures_accuracy)

0.8097132284921369
