In [0]:
import matplotlib.pyplot as plt
import numpy as np
from itertools import chain
from collections import Counter, defaultdict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
import nltk
from sklearn.model_selection import train_test_split

In [2]:
!pip install pomegranate

Collecting pomegranate
[?25l  Downloading https://files.pythonhosted.org/packages/85/31/398cb5b1c338017cc7b158049c636122fc18c8e096d55247d7b240208e53/pomegranate-0.12.0-cp36-cp36m-manylinux1_x86_64.whl (5.6MB)
[K     |████████████████████████████████| 5.6MB 75kB/s 
Installing collected packages: pomegranate
Successfully installed pomegranate-0.12.0


In [10]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

Downloading Brown Dataset

In [0]:
from nltk.corpus import brown

In [0]:
sents = list(brown.tagged_sents(tagset="universal"))
test = list(brown.sents())

In [13]:
test

[['The',
  'Fulton',
  'County',
  'Grand',
  'Jury',
  'said',
  'Friday',
  'an',
  'investigation',
  'of',
  "Atlanta's",
  'recent',
  'primary',
  'election',
  'produced',
  '``',
  'no',
  'evidence',
  "''",
  'that',
  'any',
  'irregularities',
  'took',
  'place',
  '.'],
 ['The',
  'jury',
  'further',
  'said',
  'in',
  'term-end',
  'presentments',
  'that',
  'the',
  'City',
  'Executive',
  'Committee',
  ',',
  'which',
  'had',
  'over-all',
  'charge',
  'of',
  'the',
  'election',
  ',',
  '``',
  'deserves',
  'the',
  'praise',
  'and',
  'thanks',
  'of',
  'the',
  'City',
  'of',
  'Atlanta',
  "''",
  'for',
  'the',
  'manner',
  'in',
  'which',
  'the',
  'election',
  'was',
  'conducted',
  '.'],
 ['The',
  'September-October',
  'term',
  'jury',
  'had',
  'been',
  'charged',
  'by',
  'Fulton',
  'Superior',
  'Court',
  'Judge',
  'Durwood',
  'Pye',
  'to',
  'investigate',
  'reports',
  'of',
  'possible',
  '``',
  'irregularities',
  "''",
 

In [14]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [0]:
tags_t = []
for x in sents:
    z = []
    z = [y[1] for y in x]
    tags_t.append(z)


In [0]:
X_train, X_test, y_train, y_test = train_test_split(test,tags_t , test_size = 0.2,random_state = 40)

In [0]:
tags = [tag for word, tag in brown.tagged_words(tagset="universal")]
words = [word for word, tag in brown.tagged_words(tagset="universal")]

In [0]:
def pair_counts(sequences_A, sequences_B):
    dcount = defaultdict(dict) 
    d = dict(Counter(list(zip(sequences_A, sequences_B))))
    for key,value in d.items():
        dcount[key[0]][key[1]] = value
    return dcount
emission_counts = pair_counts(tags, words)

In [0]:
import itertools
def pairwise(iterable):
    t, t_1 = itertools.tee(iterable)
    next(t_1, 'end')
    return zip(t, t_1)

def bigram_counts(sequences):
    return(dict(Counter(pairwise(sequences))))

tag_bigrams = bigram_counts(tags)

In [0]:
words_bigrams = bigram_counts(words)

In [0]:
def starting_counts(sequences):
    return(dict(Counter([i[0] for i in sequences])))
tag_starts = starting_counts([key for key,value in tag_bigrams.items()])

def ending_counts(sequences):
    return(dict(Counter([i[-1] for i in sequences])))
tag_ends = ending_counts([key for key,value in tag_bigrams.items()])

In [0]:
def unigrams(sequences):
    return (dict(Counter(sequences)))
tag_unigrams = unigrams(tags)

In [0]:
uni_tags = [key for key,value in tag_unigrams.items()]

In [0]:
basic_model = HiddenMarkovModel(name="base-hmm-tagger")
states = {}

for pos_tag in uni_tags:
    emission_probabilities = dict()
    for word, occurance in emission_counts[pos_tag].items(): 
        emission_probabilities[word] = occurance / tag_unigrams[pos_tag] 
    #print(emission_probabilities)
    tag_distribution = DiscreteDistribution(emission_probabilities) 
    state = State(tag_distribution, name=pos_tag)
    states[pos_tag] = state
    basic_model.add_state(state)

In [0]:
for pos_tag in uni_tags:
    
    state = states[pos_tag]
    start_probability = tag_starts[pos_tag] / sum(tag_starts.values())
    basic_model.add_transition(basic_model.start, state, start_probability)
    end_probability = tag_ends[pos_tag] / sum(tag_ends.values())
    basic_model.add_transition(state, basic_model.end, end_probability)



In [0]:
for tag_1 in uni_tags:
    
    state_1 = states[tag_1]
    sum_of_probabilities = 0
    
    for tag_2 in uni_tags:
        state_2 = states[tag_2]
        bigram = (tag_1, tag_2)
        try:
            transition_probability = tag_bigrams[bigram] / tag_unigrams[tag_1]
        except:
            transition_probability = 0
        sum_of_probabilities += transition_probability
        basic_model.add_transition(state_1, state_2, transition_probability)    
    


In [0]:
basic_model.bake()

In [28]:
print("Number of nodes or states: ", basic_model.node_count())
print("Number of edges: ", basic_model.edge_count())

Number of nodes or states:  14
Number of edges:  168


In [0]:
def simplify_decoding(X, model):
    _, state_path = model.viterbi(X)
    return [state[1].name for state in state_path[1:-1]]  
def accuracy(X, Y, model):
    correct = total_predictions = 0
    for observations, actual_tags in zip(X, Y):

        try:
            most_likely_tags = simplify_decoding(observations, model)
            correct += sum(p == t for p, t in zip(most_likely_tags, actual_tags))
        except:
            pass
        total_predictions += len(observations)
    return correct / total_predictions

In [30]:
hmm_training_acc = accuracy(X_train, y_train, basic_model)
print("training accuracy basic hmm model: {:.2f}%".format(100 * hmm_training_acc))

hmm_testing_acc = accuracy(X_test, y_test, basic_model)
print("testing accuracy basic hmm model: {:.2f}%".format(100 * hmm_testing_acc))

training accuracy basic hmm model: 97.50%
testing accuracy basic hmm model: 97.56%


In [0]:
from nltk.tag import pos_tag, map_tag
def accuracy_nltk(X,Y):
    correct = total_predictions = 0
    for observations, actual_tags in zip(X, Y):
        try:
            most_likely_tags = nltk.pos_tag(observations)
            most = [x[1] for x in most_likely_tags]
            simplifiedTags = [map_tag('en-ptb', 'universal', tag) for tag in most]
            correct += sum(p == t for p, t in zip(simplifiedTags, actual_tags))
        except:
            pass
        total_predictions += len(observations)
    return correct / total_predictions

In [0]:
nltk_accuracy_score = accuracy_nltk(X_train,y_train)
print("training accuracy nltk model: {:.2f}%".format(100 * nltk_accuracy_score))
nltk_accuracy_score = accuracy_nltk(X_test,y_test)
print("testing accuracy nltk model: {:.2f}%".format(100 * nltk_accuracy_score))

training accuracy nltk model: 91.81%
testing accuracy nltk model: 91.96%
