In [39]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict, namedtuple, OrderedDict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
import os
from io import BytesIO
from itertools import chain
import random

In [40]:
def read_data(filename):
    """Read tagged sentence data"""
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        sentence_lines = [l.split("\n") for l in f.read().split("\n\n")]
    return OrderedDict(((s[0], Sentence(*zip(*[l.strip().split("\t")
                        for l in s[1:]]))) for s in sentence_lines if s[0]))

def read_tags(filename):
    """Read a list of word tag classes"""
    with open(filename, 'r') as f:
        tags = f.read().split("\n")
    return frozenset(tags)

Sentence = namedtuple("Sentence", "words tags")

In [41]:
tag = read_tags("english2.txt")
sentence1 = read_data("hindi2.txt")
sentence1

OrderedDict([('b100-5507',
              Sentence(words=('इसके   ', 'अतिरिक्त', 'गुग्गुल', 'कुंड', 'भीम', 'गुफा ', 'तथा', 'भीमशिला', 'भी', 'दर्शनीय', 'है', 'इसके   ', 'अतिरिक्त', 'गुग्गुल', 'कुंड', 'भीम', 'गुफा ', 'तथा', 'भीमशिला', 'भी', 'दर्शनीय', 'है'), tags=('Apart', 'from this', 'Guggul', 'Kund', 'Bhima', 'Cave', 'and', 'Bhimshila', 'also worth', 'visiting', 'is', 'Apart', 'from this', 'Guggul', 'Kund', 'Bhima', 'Cave', 'and', 'Bhimshila', 'also worth', 'visiting', 'is'))),
             ('b100-935',
              Sentence(words=('इसके   ', 'अतिरिक्त', 'गुग्गुल', 'कुंड', 'भीम', 'गुफा ', 'तथा', 'भीमशिला', 'भी', 'दर्शनीय', 'है', 'आधा', 'किमी ', 'की ', 'दूरी ', 'पर ', 'भैरवनाथ ', 'मंदिर ', 'जहाँ ', 'केवल ', 'केदारनाथ ', 'के ', 'पट ', 'खुलने ', 'और ', 'बंद ', 'होने ', 'के ', 'दिन ', 'ही ', 'पूजन ', 'किया ', 'जाता ', 'है'), tags=('Apart', 'from this', 'Guggul', 'Kund', 'Bhima', 'Cave', 'and', 'Bhimshila', 'also worth', 'visiting', 'is', 'half', 'km', 'a', 'distance', 'at', 'Bhairavnath',

In [42]:
class Dataset(namedtuple("_Dataset", "sentences keys vocab X tagset Y training_set testing_set N stream")):
    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=112890):
        tagset = read_tags(tagfile)
        sentences = read_data(datafile)
        keys = tuple(sentences.keys())
        wordset = frozenset(chain(*[s.words for s in sentences.values()]))
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        N = sum(1 for _ in chain(*(s.words for s in sentences.values())))
        
        # split data into train/test sets
        _keys = list(keys)
        if seed is not None: random.seed(seed)
        random.shuffle(_keys)
        split = int(train_test_split * len(_keys))
        training_data = Subset(sentences, _keys[:split])
        testing_data = Subset(sentences, _keys[split:])
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, dict(sentences), keys, wordset, word_sequences, tagset,
                               tag_sequences, training_data, testing_data, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())
    
    
class Subset(namedtuple("BaseSet", "sentences keys vocab X tagset Y N stream")):
    def __new__(cls, sentences, keys):
        word_sequences = tuple([sentences[k].words for k in keys])
        tag_sequences = tuple([sentences[k].tags for k in keys])
        wordset = frozenset(chain(*word_sequences))
        tagset = frozenset(chain(*tag_sequences))
        N = sum(1 for _ in chain(*(sentences[k].words for k in keys)))
        stream = tuple(zip(chain(*word_sequences), chain(*tag_sequences)))
        return super().__new__(cls, {k: sentences[k] for k in keys}, keys, wordset, word_sequences,
                               tagset, tag_sequences, N, stream.__iter__)

    def __len__(self):
        return len(self.sentences)

    def __iter__(self):
        return iter(self.sentences.items())

In [43]:
data = Dataset("english2.txt", "hindi2.txt", train_test_split=0.8)

print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))

assert len(data) == len(data.training_set) + len(data.testing_set), \
       "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

There are 18 sentences in the corpus.
There are 14 sentences in the training set.
There are 4 sentences in the testing set.


In [44]:
for i in range(2):    
    print("Sentence {}:".format(i + 1), data.X[i])
    print()
    print("Labels {}:".format(i + 1), data.Y[i])
    print()

Sentence 1: ('इसके   ', 'अतिरिक्त', 'गुग्गुल', 'कुंड', 'भीम', 'गुफा ', 'तथा', 'भीमशिला', 'भी', 'दर्शनीय', 'है', 'इसके   ', 'अतिरिक्त', 'गुग्गुल', 'कुंड', 'भीम', 'गुफा ', 'तथा', 'भीमशिला', 'भी', 'दर्शनीय', 'है')

Labels 1: ('Apart', 'from this', 'Guggul', 'Kund', 'Bhima', 'Cave', 'and', 'Bhimshila', 'also worth', 'visiting', 'is', 'Apart', 'from this', 'Guggul', 'Kund', 'Bhima', 'Cave', 'and', 'Bhimshila', 'also worth', 'visiting', 'is')

Sentence 2: ('इसके   ', 'अतिरिक्त', 'गुग्गुल', 'कुंड', 'भीम', 'गुफा ', 'तथा', 'भीमशिला', 'भी', 'दर्शनीय', 'है', 'आधा', 'किमी ', 'की ', 'दूरी ', 'पर ', 'भैरवनाथ ', 'मंदिर ', 'जहाँ ', 'केवल ', 'केदारनाथ ', 'के ', 'पट ', 'खुलने ', 'और ', 'बंद ', 'होने ', 'के ', 'दिन ', 'ही ', 'पूजन ', 'किया ', 'जाता ', 'है')

Labels 2: ('Apart', 'from this', 'Guggul', 'Kund', 'Bhima', 'Cave', 'and', 'Bhimshila', 'also worth', 'visiting', 'is', 'half', 'km', 'a', 'distance', 'at', 'Bhairavnath', 'temple', 'where', 'only', 'Kedarnath', 'is', 'door', 'opening', 'and', 'closi

In [45]:
key = 'b100-5507'
print("Sentence: {}".format(key))
print("words:\n\t{!s}".format(data.sentences[key].words))
print("tags:\n\t{!s}".format(data.sentences[key].tags))

Sentence: b100-87000


KeyError: 'b100-87000'

In [46]:
words = [word for i, (word, tag) in enumerate(data.training_set.stream())]
tags = [tag for i, (word, tag) in enumerate(data.training_set.stream())]
words[0:4], tags[0:4]

(['सभी ', 'पर्व ', 'ओर ', 'त्योहार '], ['All', 'festivals', 'and', 'festival'])

In [174]:
def pair_counts(tags, words):
    d = defaultdict(lambda: defaultdict(int))
    for tag, word in zip(tags, words):
        d[tag][word] += 1
    return d
        
word_counts = pair_counts(words, tags)

In [175]:
mfc_table = dict((word, max(tags.keys(), key=lambda key: tags[key])) for word, tags in word_counts.items())

In [176]:
i = 0
for key, value in mfc_table.items():
    print(key, value)
    i += 1
    if i > 3: break

यो this
तो is
ईसामसीह Jesus Christ
के 's


In [177]:
FakeState = namedtuple('FakeState', 'name')

class MFCTagger:
    missing = FakeState(name = '<MISSING>')
    
    def __init__(self, table):
        self.table = defaultdict(lambda: MFCTagger.missing)
        self.table.update({word: FakeState(name=tag) for word, tag in table.items()})
        
    def viterbi(self, seq):
        """This method simplifies predictions by matching the Pomegranate viterbi() interface"""
        return 0., list(enumerate(["<start>"] + [self.table[w] for w in seq] + ["<end>"]))

In [178]:
# Using Most frequent POS tag
mfc_model = MFCTagger(mfc_table)

In [179]:
def replace_unknown(sequence):
    return [w if w in data.training_set.vocab else 'nan' for w in sequence]

def simplify_decoding(X, model):    
    _, state_path = model.viterbi(replace_unknown(X))
    return [state[1].name for state in state_path[1:-1]]

In [180]:
for key in data.testing_set.keys[:2]:
    print("Sentence Key: {}\n".format(key))
    print("Sentence: {}\n".format(data.sentences[key].words))
    print("Predicted labels:\n-----------------")
    print(simplify_decoding(data.sentences[key].words, mfc_model))
    print()
    print("Actual labels:\n--------------")
    print(data.sentences[key].tags)
    print("\n")

Sentence Key: b100-76546

Sentence: ('क्रिसमस', 'का', 'पर्व', 'या', 'त्योहार-पर्व', 'है', 'जिसे', 'न', 'केवल', 'ईसाई', 'धर्म', 'के', 'अनुयायी', 'समर्थक', 'ही', 'मनाते', 'है', 'अपितु', 'इसे', 'टी', 'विशव', 'के', 'प्रायः', 'सभी', 'धर्मो', 'ओर', 'सम्प्रदाय', 'के', 'लोग', 'मनाते', 'है')

Predicted labels:
-----------------
['Christmas', "'s", 'festival', 'or', 'festival', 'is', 'that', 'is not', 'only', 'christian', 'religion', "'s", 'followers', 'supporters', 'only', 'celebrated', 'is', 'but', 'this', 'all', 'world', "'s", 'before', 'all', 'religions', 'and', 'communities', "'s", 'people', 'celebrated', 'is']

Actual labels:
--------------
('Christmas', "'s", 'festival', 'or', 'festival', 'is', 'that', 'is not', 'only', 'christian', 'religion', 'is', 'followers', 'supporters', 'only', 'celebrated', 'by', 'but', 'this', 'all', 'world', 'in', 'before', 'all', 'religions', 'and', 'communities', "'s", 'people', 'celebrate', 'it')


Sentence Key: b100-76560

Sentence: ('यह ', 'ध्यान ', 'देने '

In [181]:
def accuracy(X, Y, model):
    
    correct = total_predictions = 0
    for observations, actual_tags in zip(X, Y):
        
        # The model.viterbi call in simplify_decoding will return None if the HMM
        # raises an error (for example, if a test sentence contains a word that
        # is out of vocabulary for the training set). Any exception counts the
        # full sentence as an error (which makes this a conservative estimate).
        try:
            most_likely_tags = simplify_decoding(observations, model)
            correct += sum(p == t for p, t in zip(most_likely_tags, actual_tags))
        except:
            pass
        total_predictions += len(observations)
    return correct / total_predictions

In [182]:
mfc_training_acc = accuracy(data.training_set.X, data.training_set.Y, mfc_model)
print("training accuracy mfc_model: {:.2f}%".format(100 * mfc_training_acc))

mfc_testing_acc = accuracy(data.testing_set.X, data.testing_set.Y, mfc_model)
print("testing accuracy mfc_model: {:.2f}%".format(100 * mfc_testing_acc))

training accuracy mfc_model: 91.03%
testing accuracy mfc_model: 82.87%


In [183]:
def unigram_counts(sequences):
    return Counter(sequences)

tags = [tag for i, (word, tag) in enumerate(data.training_set.stream())]
tag_unigrams = unigram_counts(tags)
tag_unigrams

Counter({'this': 19,
         'is': 38,
         'Jesus Christ': 5,
         "'s": 32,
         'subject': 5,
         'in': 34,
         'different': 5,
         'opinion': 5,
         'that': 13,
         'his': 5,
         'birth': 5,
         '25': 5,
         'december': 5,
         'night': 5,
         'at': 5,
         '12': 5,
         "o'clock": 5,
         'bethlum': 5,
         'city': 5,
         'one': 6,
         'cowshed': 5,
         'happened': 10,
         'angels': 5,
         'message': 6,
         'through': 5,
         'people': 11,
         'has': 6,
         'great man': 5,
         'face': 5,
         'accepted': 5,
         'as': 9,
         'a': 5,
         'Apart': 2,
         'from this': 2,
         'Guggul': 2,
         'Kund': 2,
         'Bhima': 2,
         'Cave': 2,
         'and': 11,
         'Bhimshila': 2,
         'also worth': 2,
         'visiting': 2,
         'Christmas': 6,
         'festival': 15,
         'or': 11,
         'is not': 6,
 

In [184]:
def bigram_counts(sequences):
    return Counter(sequences)

tags = [tag for i, (word, tag) in enumerate(data.stream())]
o = [(tags[i],tags[i+1]) for i in range(0,len(tags)-2,2)]
tag_bigrams = bigram_counts(o)
tag_bigrams 

Counter({('this', 'is'): 3,
         ('Jesus Christ', "'s"): 3,
         ('subject', 'in'): 3,
         ('different', 'opinion'): 3,
         ('is', 'that'): 7,
         ('his', 'birth'): 3,
         ('25', 'december'): 3,
         ("'s", 'night'): 3,
         ('at', '12'): 3,
         ("o'clock", 'bethlum'): 3,
         ('city', 'in'): 3,
         ('one', 'cowshed'): 3,
         ('in', 'happened'): 3,
         ('happened', 'angels'): 3,
         ("'s", 'message'): 3,
         ('through', 'people'): 3,
         ('has', 'this'): 3,
         ('great man', "'s"): 3,
         ('face', 'in'): 3,
         ('accepted', 'as'): 3,
         ('a', 'In this'): 2,
         ('way', 'without'): 3,
         ('discrimination', 'in'): 3,
         ('whole', 'world'): 4,
         ('in', 'celebrated'): 3,
         ('is', 'is'): 3,
         ('this', 'without'): 3,
         ('any', 'Christmas'): 2,
         ("'s", 'festival'): 4,
         ('or', 'festival'): 4,
         ('is not', 'only'): 4,
         ('chri

In [185]:
def starting_counts(sequences):
    return Counter(sequences)

tags = [tag for i, (word, tag) in enumerate(data.stream())]
starts_tag = [i[0] for i in data.Y]
tag_starts = starting_counts(starts_tag)
tag_starts

Counter({'this': 6,
         'In this': 4,
         'Christmas': 7,
         'Apart': 2,
         'half': 1,
         'Bhairav': 2,
         'All': 1,
         'it': 3,
         'Iraq': 1,
         'terrorism': 1})

In [186]:
def ending_counts(sequences):    
    return Counter(sequences)

end_tag = [i[len(i)-1] for i in data.Y]
tag_ends = ending_counts(end_tag)
tag_ends

Counter({'a': 6,
         'any': 4,
         'it': 10,
         'is': 4,
         'done': 1,
         'does': 1,
         'said': 1,
         'has': 1})

In [187]:
end_tag = [i[len(i)-2] for i in data.Y]
tag_ends = ending_counts(end_tag)
tag_ends

Counter({'as': 6,
         'without': 4,
         'celebrate': 7,
         'visiting': 2,
         'is': 1,
         'important': 2,
         'get': 1,
         'have': 3,
         'them': 1,
         'kept': 1})

In [197]:
hmm_model = HiddenMarkovModel(name="base-hmm-tagger")

tags = [tag for i, (word, tag) in enumerate(data.stream())]
words = [word for i, (word, tag) in enumerate(data.stream())]

tags_count = unigram_counts(tags)
tag_words_count = pair_counts(tags, words)

starting_tag_list = [i[0] for i in data.Y]
#ending_tag_list = [i[-1] if len(i)==1 else i[-2] for i in data.Y]
#ending_tag_list = [i[-1] for i in data.Y]
ending_tag_list = [i[len(i)-1] for i in data.Y]

starting_tag_count = starting_counts(starting_tag_list) #the number of times a tag occured at the start
ending_tag_count = ending_counts(ending_tag_list)       #the number of times a tag occured at the end

tag_words_count

defaultdict(<function __main__.pair_counts.<locals>.<lambda>()>,
            {'this': defaultdict(int, {'यो': 6, 'इन्हें': 6, 'इसे': 11}),
             'is': defaultdict(int,
                         {'तो': 6,
                          'है': 21,
                          'जाता': 4,
                          'के': 7,
                          'के ': 4,
                          'जाता ': 1,
                          'का ': 2,
                          'की ': 3,
                          'हैं': 3}),
             'Jesus Christ': defaultdict(int, {'ईसामसीह': 6}),
             "'s": defaultdict(int, {'के': 25, 'की': 6, 'का': 7}),
             'subject': defaultdict(int, {'विषय': 6}),
             'in': defaultdict(int,
                         {'मे': 6,
                          'की': 6,
                          'में': 16,
                          'के': 11,
                          'में ': 1,
                          'जिसमें': 1}),
             'different': defaultdict(int, {'अनेक': 6}),

In [198]:
to_pass_states = []
for tag, words_dict in tag_words_count.items():
    total = float(sum(words_dict.values()))
    distribution = {word: count/total for word, count in words_dict.items()}
    tag_emissions = DiscreteDistribution(distribution)
    tag_state = State(tag_emissions, name=tag)
    to_pass_states.append(tag_state)

In [199]:
distribution

{'रखा': 1.0}

In [200]:
to_pass_states

[{
     "class" : "State",
     "distribution" : {
         "class" : "Distribution",
         "dtype" : "str",
         "name" : "DiscreteDistribution",
         "parameters" : [
             {
                 "\u092f\u094b" : 0.2608695652173913,
                 "\u0907\u0928\u094d\u0939\u0947\u0902" : 0.2608695652173913,
                 "\u0907\u0938\u0947" : 0.4782608695652174
             }
         ],
         "frozen" : false
     },
     "name" : "this",
     "weight" : 1.0
 }, {
     "class" : "State",
     "distribution" : {
         "class" : "Distribution",
         "dtype" : "str",
         "name" : "DiscreteDistribution",
         "parameters" : [
             {
                 "\u0924\u094b" : 0.11764705882352941,
                 "\u0939\u0948" : 0.4117647058823529,
                 "\u091c\u093e\u0924\u093e" : 0.0784313725490196,
                 "\u0915\u0947" : 0.13725490196078433,
                 "\u0915\u0947 " : 0.0784313725490196,
                 "\u091c\u09

In [201]:
start_prob={}

for tag in tags:
    start_prob[tag] = starting_tag_count[tag] / tags_count[tag]

for tag_state in to_pass_states :
    hmm_model.add_transition(hmm_model.start, tag_state, start_prob[tag_state.name]) 

In [202]:
end_prob={}

for tag in tags:
    end_prob[tag] = ending_tag_count[tag]/tags_count[tag]
    
for tag_state in to_pass_states :
    hmm_model.add_transition(tag_state, hmm_model.end, end_prob[tag_state.name])

In [203]:
transition_prob_pair={}

for key in tag_bigrams.keys():
    transition_prob_pair[key] = tag_bigrams.get(key)/tags_count[key[0]]
    
for tag_state in to_pass_states:
    for next_tag_state in to_pass_states:
        if((tag_state.name,next_tag_state.name) in transition_prob_pair):
            hmm_model.add_transition(tag_state, next_tag_state, transition_prob_pair[(tag_state.name, next_tag_state.name)])

In [204]:
hmm_model.bake()

In [205]:
hmm_training_acc = accuracy(data.training_set.X, data.training_set.Y, hmm_model)
print("training accuracy basic hmm model: {:.2f}%".format(100 * hmm_training_acc))

hmm_testing_acc = accuracy(data.testing_set.X, data.testing_set.Y, hmm_model)
print("testing accuracy basic hmm model: {:.2f}%".format(100 * hmm_testing_acc))

training accuracy basic hmm model: 72.76%
testing accuracy basic hmm model: 46.96%
