# Test and results.

In [1]:
from markovify.markovify import markovify
from markovify.utils.CrossValidation import cross_val_score

from nltk.corpus import brown

import pandas as pd
import numpy as np

from markovify.utils.ProgressBar import log_progress

# Accuracy for news text.

In [2]:
# Tagged sentences of the Brown corpus "news" category, universal tagset.
corpus = brown.tagged_sents(categories='news', tagset='universal')

# Laplace-smoothed tagger (alpha=1), scored with the project's
# cross-validation helper; progress output is disabled.
model = markovify(smoothing='laplace', alpha=1)
brown_news_accuracy = cross_val_score(
    model,
    sentences=corpus,
    verbose=False,
)

# Show every score returned by the helper, then the highest one.
print(brown_news_accuracy)
best_news_accuracy = np.max(brown_news_accuracy)
print('Best accuracy: {}.'.format(best_news_accuracy))

[0.9037871033776868, 0.8948039721825747, 0.9044380503967216]
Best accuracy: 0.9044380503967216.


# Comparison of different alphas.

In [3]:
# Same corpus as the alpha=1 experiment: Brown "news", universal tagset.
corpus = brown.tagged_sents(categories='news', tagset='universal')

# --- alpha = 0.7 ---
model07 = markovify(smoothing='laplace', alpha=0.7)
brown_news_accuracy_0_7 = cross_val_score(
    model07,
    sentences=corpus,
    verbose=False,
)
best_accuracy_0_7 = np.max(brown_news_accuracy_0_7)
print('Best accuracy for alpha=0.7: {}.'.format(best_accuracy_0_7))

# --- alpha = 0.3 ---
model03 = markovify(smoothing='laplace', alpha=0.3)
brown_news_accuracy_0_3 = cross_val_score(
    model03,
    sentences=corpus,
    verbose=False,
)
best_accuracy_0_3 = np.max(brown_news_accuracy_0_3)
print('Best accuracy for alpha=0.3: {}.'.format(best_accuracy_0_3))

Best accuracy for alpha=0.7: 0.9044089865434359.
Best accuracy for alpha=0.3: 0.904539707387561.


# --- OUTDATED ---

# Accuracy on news text when the model is trained on adventure fiction.

In [None]:
# Train on the Brown "adventure" category and evaluate on the "news" category.
corpus_tagged = brown.tagged_sents(categories='adventure', tagset='universal')
news_tagged = brown.tagged_sents(categories='news', tagset='universal')
news_raw = brown.sents(categories='news')

# Both split sizes are derived from the adventure corpus: 9/10 of it is used
# for training, and a news sample 1/10 of its size is used for testing.
size = len(corpus_tagged)

# Integer floor division yields the same values as floor(size*9/10) and
# floor(size*1/10) without needing `floor`, which this notebook never imports
# (the original cell raised NameError on a fresh kernel).
train_size = size * 9 // 10
test_size = size // 10

corpus_train = corpus_tagged[:train_size]
# The tagged and raw news sentences are cut to the same length so the gold tags
# and the model input cover the same sentences (assumes both readers return
# sentences in the same order -- TODO confirm). Slicing from separately named,
# unsliced corpora keeps this cell idempotent on re-run.
corpus_test_tagged = news_tagged[:test_size]
corpus_test = news_raw[:test_size]

print(len(corpus_train))
print(len(corpus_test))

In [None]:
# Build a Laplace-smoothed (alpha=1) tagger and fit it on the training split.
model = markovify(smoothing='laplace', alpha=1).fit(corpus_train)

# Predict every raw test sentence; log_progress reports after each sentence.
tagged = [
    model.predict(sentence)
    for sentence in log_progress(corpus_test, every=1)
]

In [None]:
# Flatten the gold-standard test sentences into one record per token.
gold_records = []
for sentence in corpus_test_tagged:
    gold_records.extend(sentence)

results = pd.DataFrame.from_records(gold_records)
results.columns = ["Word", "Tag"]

# Flatten the per-sentence predictions the same way and attach them as a
# third column, aligned with the gold records by position.
predicted_items = []
for sentence in tagged:
    predicted_items.extend(sentence)

results['Predicted'] = predicted_items
results.head(10)

In [None]:
# Token-level accuracy: the share of rows whose "Predicted" value equals the
# gold "Tag" value.
total_words = len(results)

# Compare the two columns by label in one vectorized operation. The previous
# row[1] == row[2] lookup inside iterrows() relied on integer keys falling back
# to positions on a string-labelled Series, which pandas has deprecated.
# int() keeps `correct` a plain Python int, so an empty frame still raises
# ZeroDivisionError below instead of silently producing nan.
correct = int((results["Tag"] == results["Predicted"]).sum())

brown_news_adventures_accuracy = correct / total_words
print(brown_news_adventures_accuracy)

## Same experiment using $\alpha = 0.7$.

In [None]:
# Repeat the adventure -> news experiment with Laplace alpha = 0.7.
# `markovify` is the class this notebook imports; the previous `HMM` name is
# not defined anywhere in the notebook and raised NameError.
model = markovify(smoothing='laplace', alpha=0.7)
model = model.fit(corpus_train)

# Predict every raw test sentence; log_progress reports after each sentence.
tagged = []
for sent in log_progress(corpus_test, every=1):
    tagged.append(model.predict(sent))

# One record per gold token, with the flattened predictions attached by position.
gold_records = [item for sublist in corpus_test_tagged for item in sublist]
results = pd.DataFrame.from_records(gold_records)
results.columns = ["Word", "Tag"]
results['Predicted'] = [item for sublist in tagged for item in sublist]

# Token-level accuracy, compared by column label (positional row[1] / row[2]
# access on a string-labelled Series is deprecated in pandas). int() keeps the
# count a plain Python int, so an empty frame still raises ZeroDivisionError.
total_words = results.shape[0]
correct = int((results["Tag"] == results["Predicted"]).sum())
brown_news_adventures_accuracy_07 = correct / total_words
print(brown_news_adventures_accuracy_07)