In [124]:
import random
import re
import math
from collections import defaultdict, deque, Counter
import os
import sys
import numpy as np
import pickle
from random import shuffle
script_path=os.path.dirname(os.path.abspath('__file__'))
START_SYMBOL = '*'
STOP_SYMBOL = 'STOP'
RARE_SYMBOL = '_RARE_'
RARE_WORD_MAX_FREQ = 3
LOG_PROB_OF_ZERO = -1000

In [125]:
def calc_ngrams(sent_tags, n):
    ngrams = list(zip(*[sent_tags[i:] for i in range(n)]))
    return ngrams


def deleted_interpolation(unigram_c, bigram_c, trigram_c):
    lambda1 = 0
    lambda2 = 0
    lambda3 = 0
    for a, b, c in trigram_c.keys():
        v = trigram_c[(a, b, c)]
        if v > 0:
            c1 = (v - 1) / (bigram_c[(a, b)])
            c2 = (bigram_c[(a, b)] - 1) / (unigram_c[(a, )] - 1)
            c3 = (unigram_c[(a, )] - 1) / (sum(unigram_c.values()) - 1)
            clist = [c1, c2, c3]
            m = np.argmax(clist)
            if m == 0:
                lambda1 += v
            if m == 1:
                lambda2 += v
            if m == 2:
                lambda3 += v
    weights = [lambda1, lambda2, lambda3]
    weights = [a / sum(weights) for a in weights]
    return weights


def calc_ngram_counts(corpus_tags):
    unigram_c = defaultdict(int)
    bigram_c = defaultdict(int)
    trigram_c = defaultdict(int)

    for sent_tags in corpus_tags:
        unigram_tags = calc_ngrams(sent_tags, 1)
        bigram_tags = calc_ngrams(sent_tags, 2)
        trigram_tags = calc_ngrams(sent_tags, 3)

        for unigram in unigram_tags:
            unigram_c[unigram] += 1

        for bigram in bigram_tags:
            bigram_c[bigram] += 1

        for trigram in trigram_tags:
            trigram_c[trigram] += 1

    return unigram_c, bigram_c, trigram_c

def calc_ngram_probs(corpus_tags, unigram_c, bigram_c, trigram_c):
    unigram_total = sum(unigram_c.values())
    unigram_p = {(a, ):math.log(unigram_c[(a, )], 2) - math.log(unigram_total, 2) for a, in unigram_c}

    unigram_c[START_SYMBOL] = len(corpus_tags)
    bigram_p = {(a, b):math.log(bigram_c[(a, b)], 2) - math.log(unigram_c[(a, )], 2) for a, b in bigram_c}

    bigram_c[(START_SYMBOL, START_SYMBOL)] = len(corpus_tags)
    trigram_p = {(a, b,c): math.log(trigram_c[(a, b, c)], 2) - math.log(bigram_c[(a, b)], 2) for a, b, c in trigram_c}
    return unigram_p, bigram_p, trigram_p


def calc_known(corpus_words):
    known_words = set()
    word_c = defaultdict(int)

    for sent_words in corpus_words:
        for word in sent_words:
            word_c[word] += 1

    for word, count in word_c.items():
        if count > RARE_WORD_MAX_FREQ:
            known_words.add(word)
    return known_words


def replace_rare(corpus_words, known_words):
    for i, sent_words in enumerate(corpus_words):
        for j, word in enumerate(sent_words):
            if word not in known_words:
                corpus_words[i][j] = RARE_SYMBOL
    return corpus_words

def calc_emission(corpus_words_rare, corpus_tags):
    '''bigram_corpus_words_rare = []
    for sent_words in corpus_words_rare:
        bigram_words = calc_ngrams(sent_words[1:], 2)
        bigram_corpus_words_rare.append(bigram_words)
    '''
    e_values_c = defaultdict(int)
    #e_values_bc = defaultdict(int)
    tag_c = defaultdict(int)
    #tag_bc = defaultdict(int)
    for sent_words, sent_tags in zip(corpus_words_rare, corpus_tags):
        for word, tag in zip(sent_words, sent_tags):
            e_values_c[(word, tag)] += 1
            tag_c[tag] += 1
    '''for sent_words, sent_tags in zip(bigram_corpus_words_rare, corpus_tags):
        for word, tag in zip(sent_words, sent_tags):
            e_values_bc[(word[0],word[1],tag)] += 1
            tag_bc[tag] += 1'''
    
    #e_values_bi = {(w1,w2, tag): math.log(e_values_c[(w1,w2, tag)], 2) - math.log(tag_bc[tag], 2) for w1,w2, tag in e_values_bc}
    e_values = {(word, tag): math.log(e_values_c[(word, tag)], 2) - math.log(tag_c[tag], 2) for word, tag in e_values_c}
    taglist = set(tag_c)
    #taglist_bi = set(tag_bc)
    return e_values,taglist



def viterbi(corpus_test_words, taglist, known_words, q_values, e_values):
    tagged = []
    best_scores = []
    # pi[(k, u, v)]: max probability of a tag sequence ending in tags u, v at position k
    # bp[(k, u, v)]: backpointers to recover the argmax of pi[(k, u, v)]
    pi = defaultdict(float)
    #pi={}
    bp = {}
    # Initialization
    pi[(0, START_SYMBOL, START_SYMBOL)] = 1

    # Define tagsets S(k)
    def S(k):
        if k in (-1, 0):
            return {START_SYMBOL}
        else:
            return taglist

    # The Viterbi algorithm
    for sent_words_actual in corpus_test_words:
        sent_words = [word if word in known_words else RARE_SYMBOL for word in sent_words_actual[2:-2]]
        n = len(sent_words)
        for k in range(1, n + 1):
            for u in S(k - 1):
                for v in S(k):
                    max_score = float('-Inf')
                    max_tag = None
                    for w in S(k - 2):
                        if e_values.get((sent_words[k - 1], v), 0) != 0:
                            score = pi.get((k - 1, w, u),LOG_PROB_OF_ZERO) + q_values.get((w, u, v),LOG_PROB_OF_ZERO) + e_values.get((sent_words[k - 1], v))
                            if score > max_score:
                                max_score = score
                                max_tag = w
                    pi[(k, u, v)] = max_score
                    bp[(k, u, v)] = max_tag

        max_score = float('-Inf')
        u_max, v_max = None, None
        for u in S(n - 1):
            for v in S(n):
                score = pi.get((n, u, v), LOG_PROB_OF_ZERO) + q_values.get((u, v, STOP_SYMBOL), LOG_PROB_OF_ZERO)
                if score > max_score:
                    max_score = score
                    u_max = u
                    v_max = v
        best_scores.append(max_score)
        tags = deque()
        tags.append(v_max)
        tags.append(u_max)
        for i, k in enumerate(range(n - 2, 0, -1)):
            tags.append(bp[(k + 2, tags[i + 1], tags[i])])
        tags.reverse()

        tagged_sentence = deque()
        for j in range(0, n):
            tagged_sentence.append(sent_words_actual[2:-2][j] + '____' + tags[j])
        tagged.append([' '.join(tagged_sentence)])

    return best_scores, tagged


In [126]:
data_path=script_path+"/Data"
train_data=data_path+"/train.txt"
test_data=data_path+"/dev.txt"
test_errors=data_path+"/dev_results.txt"
correct_data = data_path+"/dev_correction_results.txt"
tr=open(train_data).read().split("\n\n")
te=open(test_data).read().split("\n\n")
te_err = open(test_errors).read().split("\n\n")
te_correction=open(correct_data).read().split("\n\n")

In [127]:
def extarct_data(data):
    words=[]
    errors=[]
    correct=[]
    trial=[]
    span_track=[]
    for sentence in data:
        temp1=[]
        temp2=[]
        temp3=[]
        temp4=[]
        tokens = sentence.split("\n")
        for token in tokens:
            l = len(token.split())
            if l!=0:
                if l!=1:
                    temp1.append(token.split()[0])
                    temp2.extend([token.split()[1:l-1]])
                    temp3.append(token.split()[l-1])
                else:
                    temp1.append(token.split()[0])
                    temp2.append([token.split()[0]])
                    temp3.append("no-error")
        cnt=0
        index=-1
        fl=1
        for i in range(len(temp2)-1):
            if len(temp2[i])==len(temp2[i+1]) and temp2[i]==temp2[i+1]:
                cnt+=1
                if fl==1:
                    index=i
                    fl=0
            else:
                temp4.append((index,cnt))
                cnt=0
                index=-1
                fl=1
        words.append(temp1)
        errors.append(temp3)
        correct.append(temp2)
        span_track.append(temp4)
    return words,errors,correct,span_track


In [128]:
tr_words,tr_errors,tr_correct,tr_span_track = extarct_data(tr)
te_words , te_errors , te_correct , te_span_track = extarct_data(te_correction)
try:
    assert len(tr_words)==len(tr_errors)==len(tr_correct)==len(tr_span_track)
    assert len(te_words)==len(te_errors)==len(te_correct)==len(te_span_track)
    print(len(tr_words))
    print(len(tr_errors))
    print(len(tr_correct))
    print(len(tr_span_track))
    print(len(te_words))
    print(len(te_errors))
    print(len(te_correct))
    print(len(te_span_track))
except AssertionError:
    print("Lengths are not equal")
    print(len(tr_words))
    print(len(tr_errors))
    print(len(tr_correct))
    print(len(tr_span_track))
    print(len(te_words))
    print(len(te_errors))
    print(len(te_correct))
    print(len(te_span_track))

17470
17470
17470
17470
2184
2184
2184
2184


In [129]:
print(tr_words[0:5])
print(tr_errors[0:5])
print(tr_correct[0:5])
print(tr_span_track[0:5])

[['so', 'vhtr', 'is', 'environment', 'friendly', 'and', 'the', 'most', 'optimized', 'solution', 'towards', 'green', 'house', 'effect', '.'], ['retrieved', 'on', '5/09/2009.http', ':', '//www.purdue.edu/uns/x/2007a/070314agrawalbiomass.html', ')'], ['for', 'instance', ',', 'an', 'insurance', 'company', 'will', 'refuse', 'to', 'provide', 'any', 'insurance', 'coverage', 'for', 'an', 'individual', 'who', 'may', 'have', 'had', 'been', 'tested', 'positive', 'of', 'a', 'genetic', 'disorder', 'or', 'has', 'a', 'high', 'probability', 'of', 'acquiring', 'a', 'genetic', 'disorder', ',', 'since', 'the', 'insurance', 'company', 'finds', 'it', 'a', 'loss', 'to', 'make', 'a', 'payout', 'to', 'an', "'already", 'or', 'potentially-sick', "'", 'individual', '.'], ['in', 'brief', ',', 'with', 'the', 'advancement', 'of', 'medical', 'technologies', ',', 'the', 'robotic', 'surgery', 'systems', 'are', 'definitely', 'benefit', 'both', 'the', 'patients', 'and', 'the', 'surgeons', '.'], ['shantakumar', '1999', '

In [130]:
def append_symbols(data_words,data_errors):
    for x in range(len(data_words)):
        temp1=[]
        temp1.append(START_SYMBOL)
        temp1.append(START_SYMBOL)
        temp1.extend(data_words[x])
        temp1.append(STOP_SYMBOL)
        data_words[x]=temp1  
        temp2=[]
        temp2.append(START_SYMBOL)
        temp2.append(START_SYMBOL)
        temp2.extend(data_errors[x])
        temp2.append(STOP_SYMBOL)
        data_errors[x]=temp2
    return data_words,data_errors

tr_words,tr_errors = append_symbols(tr_words,tr_errors)
te_words,te_errors = append_symbols(te_words,te_errors)
print(tr_words[0:5])
print(tr_errors[0:5])        

[['*', '*', 'so', 'vhtr', 'is', 'environment', 'friendly', 'and', 'the', 'most', 'optimized', 'solution', 'towards', 'green', 'house', 'effect', '.', 'STOP'], ['*', '*', 'retrieved', 'on', '5/09/2009.http', ':', '//www.purdue.edu/uns/x/2007a/070314agrawalbiomass.html', ')', 'STOP'], ['*', '*', 'for', 'instance', ',', 'an', 'insurance', 'company', 'will', 'refuse', 'to', 'provide', 'any', 'insurance', 'coverage', 'for', 'an', 'individual', 'who', 'may', 'have', 'had', 'been', 'tested', 'positive', 'of', 'a', 'genetic', 'disorder', 'or', 'has', 'a', 'high', 'probability', 'of', 'acquiring', 'a', 'genetic', 'disorder', ',', 'since', 'the', 'insurance', 'company', 'finds', 'it', 'a', 'loss', 'to', 'make', 'a', 'payout', 'to', 'an', "'already", 'or', 'potentially-sick', "'", 'individual', '.', 'STOP'], ['*', '*', 'in', 'brief', ',', 'with', 'the', 'advancement', 'of', 'medical', 'technologies', ',', 'the', 'robotic', 'surgery', 'systems', 'are', 'definitely', 'benefit', 'both', 'the', 'pati

In [131]:
error_set = set([error for sent in tr_errors for error in sent if error!=START_SYMBOL and error!=STOP_SYMBOL])
print(len(error_set))
print(error_set)
error_map = dict([(error, error) for error in error_set])

29
{'Pform', 'WOinc', 'Wtone', 'Smod', 'Sfrag', 'Cit', 'Prep', 'Um', 'Vm', 'Rloc-', 'Srun', 'Pref', 'Npos', 'Ssub', 'Nn', 'Wa', 'Mec', 'WOadv', 'SVA', 'Vt', 'Vform', 'Wci', 'Wform', 'Spar', 'no-error', 'Others', 'ArtOrDet', 'V0', 'Trans'}


In [132]:
split_len = 90
l_train = int(9*len(tr_words)/10)
cp_tr_words,cp_tr_errors,cp_tr_correct,cp_tr_span_track = tr_words,tr_errors,tr_correct,tr_span_track
tr_words,tr_errors,tr_correct,tr_span_track = cp_tr_words[:l_train],cp_tr_errors[:l_train],cp_tr_correct[:l_train],cp_tr_span_track[:l_train]
dev_words , dev_errors , dev_correct , dev_span_track = cp_tr_words[l_train:],cp_tr_errors[l_train:],cp_tr_correct[l_train:],cp_tr_span_track[l_train:]
print(tr_words[0:5])
print(tr_errors[0:5])
print(tr_correct[0:5])
print(tr_span_track[0:5])

[['*', '*', 'so', 'vhtr', 'is', 'environment', 'friendly', 'and', 'the', 'most', 'optimized', 'solution', 'towards', 'green', 'house', 'effect', '.', 'STOP'], ['*', '*', 'retrieved', 'on', '5/09/2009.http', ':', '//www.purdue.edu/uns/x/2007a/070314agrawalbiomass.html', ')', 'STOP'], ['*', '*', 'for', 'instance', ',', 'an', 'insurance', 'company', 'will', 'refuse', 'to', 'provide', 'any', 'insurance', 'coverage', 'for', 'an', 'individual', 'who', 'may', 'have', 'had', 'been', 'tested', 'positive', 'of', 'a', 'genetic', 'disorder', 'or', 'has', 'a', 'high', 'probability', 'of', 'acquiring', 'a', 'genetic', 'disorder', ',', 'since', 'the', 'insurance', 'company', 'finds', 'it', 'a', 'loss', 'to', 'make', 'a', 'payout', 'to', 'an', "'already", 'or', 'potentially-sick', "'", 'individual', '.', 'STOP'], ['*', '*', 'in', 'brief', ',', 'with', 'the', 'advancement', 'of', 'medical', 'technologies', ',', 'the', 'robotic', 'surgery', 'systems', 'are', 'definitely', 'benefit', 'both', 'the', 'pati

In [133]:
try:
    assert len(tr_words)==len(tr_errors)==len(tr_correct)==len(tr_span_track)
    assert len(dev_words)==len(dev_errors)==len(dev_correct)==len(dev_span_track)
    print(len(tr_words))
    print(len(tr_errors))
    print(len(tr_correct))
    print(len(tr_span_track))
    print(len(dev_words))
    print(len(dev_errors))
    print(len(dev_correct))
    print(len(dev_span_track))
except AssertionError:
    print("Lengths are not equal")
    print(len(tr_words))
    print(len(tr_errors))
    print(len(tr_correct))
    print(len(tr_span_track))
    print(len(dev_words))
    print(len(dev_errors))
    print(len(dev_correct))
    print(len(dev_span_track))

15723
15723
15723
15723
1747
1747
1747
1747


In [134]:
print("Number of Training sentences: ", len(tr_words))
print("Number of Testing sentences: ", len(te_words))
print("Number of Development sentences: ", len(dev_words))

Number of Training sentences:  15723
Number of Testing sentences:  2184
Number of Development sentences:  1747


In [135]:
'''unigram_c, bigram_c, trigram_c = calc_ngram_counts(tr_errors)
unigram_p, bigram_p, trigram_p = calc_ngram_probs(tr_errors, unigram_c,bigram_c, trigram_c)
weights = deleted_interpolation(
    calc_ngram_counts(dev_errors)[0],
    calc_ngram_counts(dev_errors)[1],
    calc_ngram_counts(dev_errors)[2])
print(weights)'''

'unigram_c, bigram_c, trigram_c = calc_ngram_counts(tr_errors)\nunigram_p, bigram_p, trigram_p = calc_ngram_probs(tr_errors, unigram_c,bigram_c, trigram_c)\nweights = deleted_interpolation(\n    calc_ngram_counts(dev_errors)[0],\n    calc_ngram_counts(dev_errors)[1],\n    calc_ngram_counts(dev_errors)[2])\nprint(weights)'

In [136]:
'''trigram_d = {}
for a, b, c in trigram_c:
    t1 = weights[0] * 2**trigram_p[(a, b, c)]
    t2 = weights[1] * 2**bigram_p[(b, c)]
    t3 = weights[2] * 2**unigram_p[(c, )]
    try:
        trigram_d[(a, b, c)] = math.log(t1 + t2 + t3, 2)
    except ValueError:
        print(t1, t2, t3, a, b, c)
q_values = trigram_d'''
#pickle.dump(q_values, open('saved_models/q_values.pkl', 'wb'))
q_values = pickle.load(open('saved_models/q_values.pkl','rb'))
temp = sorted(q_values.items(), key=lambda kv: -kv[1])
for trigram, q_val in temp:
    output = ' '.join(
        ['TRIGRAM', trigram[0], trigram[1], trigram[2],
         str(q_val)])
    print(output + "\n")

TRIGRAM Wtone ArtOrDet no-error -0.026803697065232935

TRIGRAM WOinc ArtOrDet no-error -0.026803697065232935

TRIGRAM Ssub ArtOrDet no-error -0.026803697065232935

TRIGRAM Nn ArtOrDet no-error -0.026803697065232935

TRIGRAM Pref ArtOrDet no-error -0.026803697065232935

TRIGRAM Trans ArtOrDet no-error -0.026803697065232935

TRIGRAM Pform ArtOrDet no-error -0.026803697065232935

TRIGRAM SVA ArtOrDet no-error -0.026803697065232935

TRIGRAM Srun ArtOrDet no-error -0.026803697065232935

TRIGRAM Vm ArtOrDet no-error -0.026803697065232935

TRIGRAM Mec Prep no-error -0.036394161286407906

TRIGRAM Um Prep no-error -0.036394161286407906

TRIGRAM Srun Prep no-error -0.036394161286407906

TRIGRAM WOinc Prep no-error -0.036394161286407906

TRIGRAM Prep Prep no-error -0.036394161286407906

TRIGRAM Spar Prep no-error -0.036394161286407906

TRIGRAM WOadv Prep no-error -0.036394161286407906

TRIGRAM Wform SVA no-error -0.03808277803919105

TRIGRAM Sfrag SVA no-error -0.03808277803919105

TRIGRAM * SVA 

TRIGRAM * Pform no-error -1.1581819690349766

TRIGRAM Prep Pform Pform -1.1635679362319826

TRIGRAM Wci Pform Pform -1.1635679362319826

TRIGRAM Rloc- Pform Pform -1.1635679362319826

TRIGRAM Wform Um no-error -1.1640777699538414

TRIGRAM Trans Vm Vm -1.164567048824153

TRIGRAM Others Vm Vm -1.164567048824153

TRIGRAM * Vm Vm -1.164567048824153

TRIGRAM * Npos Npos -1.1648535976460057

TRIGRAM Trans Trans no-error -1.165961473331625

TRIGRAM Npos Npos no-error -1.1727085562008521

TRIGRAM Wci Rloc- Rloc- -1.1737122576656953

TRIGRAM no-error Wtone no-error -1.1771609810885677

TRIGRAM * Wci no-error -1.1874017635925134

TRIGRAM Smod Mec Mec -1.1901811652691092

TRIGRAM Pref Mec Mec -1.1901811652691092

TRIGRAM Spar Pref Pref -1.1914381626167148

TRIGRAM Vform Rloc- no-error -1.1922623054629007

TRIGRAM Pform Pform Pform -1.1978551980094412

TRIGRAM Pform Vt Vt -1.1984613199953835

TRIGRAM Srun Vt Vt -1.1984613199953835

TRIGRAM Prep Rloc- Rloc- -1.201804827847737

TRIGRAM Um Vform Vfor


TRIGRAM no-error Others Rloc- -6.761599177725004

TRIGRAM Prep ArtOrDet Vform -6.769683829767072

TRIGRAM no-error WOinc STOP -6.778188437589336

TRIGRAM no-error Wtone Wci -6.794080186588064

TRIGRAM Others no-error Rloc- -6.794085940525285

TRIGRAM * Rloc- ArtOrDet -6.7976882439470385

TRIGRAM no-error V0 Wci -6.801914277069843

TRIGRAM Srun no-error Nn -6.806581923440196

TRIGRAM Wci Wci STOP -6.808384691053663

TRIGRAM no-error Trans STOP -6.809028980599387

TRIGRAM Nn Nn Vt -6.815540952643379

TRIGRAM Vt no-error Rloc- -6.8166525641156035

TRIGRAM no-error Wci Prep -6.817586519098607

TRIGRAM no-error Srun Mec -6.8367478163516875

TRIGRAM Mec no-error Rloc- -6.837053685673077

TRIGRAM Pform no-error SVA -6.843046556313788

TRIGRAM no-error Trans Rloc- -6.846711641437814

TRIGRAM Trans no-error Wci -6.852496453877897

TRIGRAM Wci no-error Nn -6.853100239483483

TRIGRAM no-error Wform Prep -6.8546188678524675

TRIGRAM no-error Vt Rloc- -6.855867158302573

TRIGRAM Vm Vm Prep -6.8586

In [137]:
known_words = calc_known(tr_words)
tr_words_rare = replace_rare(tr_words, known_words)
for sentence in tr_words_rare:
    output = ' '.join(sentence[2:-1])
    print(output + "\n")

so vhtr is environment friendly and the most _RARE_ solution towards green house effect .

retrieved on _RARE_ : _RARE_ )

for instance , an insurance company will refuse to provide any insurance coverage for an individual who may have had been tested positive of a genetic disorder or has a high probability of _RARE_ a genetic disorder , since the insurance company finds it a loss to make a payout to an _RARE_ or _RARE_ ' individual .

in brief , with the advancement of medical technologies , the robotic surgery systems are definitely benefit both the patients and the surgeons .

_RARE_ 1999 , _RARE_ 1996 and 1998 , lee 1999 and _RARE_ , have pointed out the lack of _RARE_ coverage and the lack of _RARE_ of funds for retirement security in this cpf scheme , especially for the low-income elderly ( as cited in david r. _RARE_ and _RARE_ _RARE_ .

through these - engineers can predict the impact of a possible _RARE_ in current and _RARE_ situation ; study about the leak before break ( _RA

if potential criminals - aware of this fact , there is a high possibility that crimes can be greatly _RARE_ .

in the long run the price of the energy generated by coal and oil would _RARE_ as the tax for carbon emission - increase _RARE_ .

aging population is a global phenomenon that affects everyone .

throughout history , we , mankind had always _RARE_ to improve the quality of our lives .

that 's the best solution in comparison with other inventions from the scientists .

`` it is an affordable severe disability insurance scheme which provides basic financial protection to those who need long term care , especially in old age .

however , in the consideration that electric cars could not travel far distance , and it was not convenient to get - battery charged , electric cars finally lost out in the market , due to the restriction of _RARE_ .

so should we abandon these people when they are not as strong and _RARE_ as they were before ?

for instance , education is the most import


in view of the collapse of nicoll highway back in april 2004 , hdb has put up very strict requirements on the _RARE_ process and the materials used , like grade 30 concrete should be used in _RARE_ .

in conclusion , the government has shown much concern to the elderly by investing a lot of money in their cause .

some might think is the individual - choice that whether to tell the result out .

hospital stays are also generally shorter . ''

singapore - being a small country and yet having one of the _RARE_ economies in asia - has always been the target its neighboring countries .

when someone _RARE_ a product that is tagged with rfid , the radio wave can still be detected as long as the chip is still active and in - range of detection .

furthermore , helium is used as the coolant in the vhtr system .

however , we should not just blindly put these money into subsidies , - instead , we can use it to research for better policies or systems that will maximize the utility of the budge

nowadays , many _RARE_ foreign immigrants are working in service sectors , and even _RARE_ many customers in their mother _RARE_ .

secondly , mark l. _RARE_ of _RARE_ university argues that _RARE_ high number of young people in a society along with high rates of unemployment creates strong _RARE_ against current political conditions and little _RARE_ in society .

however , the idea was not applied to mobile phones because of the lack of - computing power at that time .

all in all , aging is part of the natural life cycle of an _RARE_ .

beside the effect on financial problem , outdated technology systems used in malaysia also will cause the research process either _RARE_ for a long period or _RARE_ there .

although there are radio stations to report on traffic congestion , however some _RARE_ may not _RARE_ into radio stations or may not know of other alternative ways to _RARE_ their paths .

have present studies , development and policies helped to relieve the aging process ?

in 


scheme ; which offer grants up to $ _RARE_ to companies who are interested to re- employ older workers ( ntuc _RARE_ , 2006 ) .

even though , it is a journey of _RARE_ for chinese innovations in the field of engineering design , which really suffers quite a lot at the moment , the future for chinese engineering innovations is still bright with the government _RARE_ to achieve the goal for a healthy environment of engineering supervision .

in spite of the controversies , the `` very-high-temperature reactors `` which is independent of reprocessing nuclear waste and generates hydrogen as by-product , is the most feasible one among the six .

according to the information from wikipedia `` the _RARE_ project - cost 2 billion dollar , this includes the cost of the two bombs used as well as the test bomb .

in this way , the elderly _RARE_ appropriate care at the same time lowers the healthcare cost as well as _RARE_ up beds in the acute hospitals for more critical cases which need to be 

in summary , convention usually has enough capital and technology , large scale and good environment .

however , in vhtr , as mentioned earlier , it produces wastes and researchers are trying to find solution to it .

however , base on research , - usage of the electronic health record system can help to reduce _RARE_ and error in medical prescription - resulting in a cost reduction - as much as usd $ 50 billion per year .

it can be placed not only under the _RARE_ , but also under the _RARE_ ; - it can even be portable .

it can be produced and never limited .

engineers who are _RARE_ with these `` no future '' products form - a group to work out the products , and this kind of groups are called `` skunk works '' .

moreover , this research also showed that exercises reduced _RARE_ _RARE_ rate by 25 % in _RARE_ women .

despite the difference in both technologies , there is actually one similarity between them .

_RARE_ policy has been implemented since - early 1990s - yet the effe


_RARE_ ( 2008 ) _RARE_ that the _RARE_ of laws such as the genetic information _RARE_ act 2008 in the united states would prohibit insurance companies to reject policies due to genetic information and _RARE_ companies to use genetic information for hiring decisions for individuals .

if designers have designed an _RARE_ to show the process of cooling circuit , then operators may know that the valve had not been closed , and this disaster may be prevented .

( _RARE_ _RARE_ ' giving sight to blind , 2009 ) `` they can not necessarily read a book but they can read a sign '' , said by the development company .

with time passing by , more and more new technology have been discovered or created , these technologies can be classified in two main patterns , one is the technologies which are developed by research , another type is the technologies which are resulted from serendipitous discoveries .

the elderly need not worry too much about their health but what they have to do as a personal

eventually , this technology gives a great advantageous for society .

_RARE_ of this new , compulsory scheme , named - lifelong income for the elderly - ( life ) will receive lifelong monthly _RARE_ from their cpf accounts .

till then let the innovations keep pouring in .

secondly they may not have sufficient money or large piece of time for their _RARE_ .

it helps in coping with global warming by not producing _RARE_ gases unlike the conventional power plants .

population aging , as stated in the introduction of the report world population aging : _RARE_ ( 2002 ) - is the process by which older individuals become a _RARE_ larger share of the total population .

elders can now communicate with each others , their children and relatives regardless of _RARE_ distance .

we can see the increasing speed is very fast .

they claim that nuclear power is _RARE_ .

as we know that the aged in singapore might not be as tech savvy as the youngsters .

the last and perhaps the most significa

In [138]:
#e_values,errorlist= calc_emission(tr_words_rare, tr_errors)
#pickle.dump(e_values, open('saved_models/e_values.pkl', 'wb'))
#pickle.dump(errorlist, open('saved_models/errorlist.pkl', 'wb'))
e_values = pickle.load(open('saved_models/e_values.pkl','rb'))
errorlist = pickle.load(open('saved_models/errorlist.pkl','rb'))
temp = sorted(e_values.items(), key=lambda kv: -kv[1])
for item, e_val in temp:
    output = ' '.join([item[0], item[1], str(e_val)])
    print(output + "\n")

* * 0.0

STOP STOP 0.0

- V0 -0.6148520726400823

- ArtOrDet -0.9981539408346283

- Npos -1.284614936419307

- Others -1.3526248921102795

- Mec -1.812098683145079

the ArtOrDet -1.9426588282429247

_RARE_ Wa -2.2854022188622487

, Srun -2.35112573545865

- Ssub -2.5243361417696546

the Pref -2.5296800652199662

- Srun -2.535550306596078

of Prep -2.6075544913649935

- Spar -2.6880559936852597

in Prep -2.8190585965587056

- Trans -2.821710215034675

_RARE_ Cit -2.8314003960242555

will Vm -2.9103215010087906

it Pform -3.028196891830653

_RARE_ Wform -3.0418201756946264

it Pref -3.045803689613124

is SVA -3.1329646441487764

on Prep -3.1925169920861505

- Pref -3.1926450779423963

to Prep -3.2510691744412634

, Mec -3.3929095405957694

, Trans -3.476881718037233

are SVA -3.5056235411570684

's Npos -3.517275693209582

for Prep -3.5434241539452778

can Vm -3.547751421624083

- Prep -3.5539165277629206

- Vm -3.6302135818160552

_RARE_ Mec -3.6509776935362135

- Wa -3.7004397181410926


includes SVA -7.7535510546006545

more Wci -7.763027890752402

can Um -7.770188346547823

elderly Um -7.770188346547823

from Rloc- -7.777255315191924

's Rloc- -7.777255315191924

working Wform -7.78135971352466

that Wform -7.78135971352466

in Vt -7.787358454146103

government Mec -7.791458917086043

an WOadv -7.8008998999203065

people WOadv -7.8008998999203065

japan WOadv -7.8008998999203065

most WOadv -7.8008998999203065

policies WOadv -7.8008998999203065

population WOadv -7.8008998999203065

important WOadv -7.8008998999203065

still WOadv -7.8008998999203065

limited WOadv -7.8008998999203065

children WOadv -7.8008998999203065

's WOadv -7.8008998999203065

they WOadv -7.8008998999203065

system WOadv -7.8008998999203065

countries WOadv -7.8008998999203065

spending Um -7.814582465906277

from Wci -7.817475674774778

invest Vform -7.827819024617321

make Vform -7.827819024617321

will Vform -7.827819024617321

reducing Vform -7.827819024617321

provide Vform -7.827819024

field Nn -9.111657346263133

home Nn -9.111657346263133

car Nn -9.111657346263133

laptop Nn -9.111657346263133

physician Nn -9.111657346263133

helps Nn -9.111657346263133

numbers Nn -9.111657346263133

has Nn -9.111657346263133

case Nn -9.111657346263133

technologies Nn -9.111657346263133

demand Nn -9.111657346263133

being Nn -9.111657346263133

experiences Nn -9.111657346263133

all Um -9.114142747765184

social Um -9.114142747765184

has Um -9.114142747765184

work Um -9.114142747765184

nuclear Um -9.114142747765184

some no-error -9.129782234627978

aid Vform -9.149747119504683

long Vform -9.149747119504683

avoid Vform -9.149747119504683

generates Vform -9.149747119504683

needed Vform -9.149747119504683

carry Vform -9.149747119504683

focusing Vform -9.149747119504683

proved Vform -9.149747119504683

development Vform -9.149747119504683

bring Vform -9.149747119504683

providing Vform -9.149747119504683

researched Vform -9.149747119504683

gives Vform -9.14974711950


there Others -9.476746203939467

information Others -9.476746203939467

first Others -9.476746203939467

but Others -9.476746203939467

made Others -9.476746203939467

therefore no-error -9.481983996228115

most no-error -9.487995234261977

future Um -9.492654371018915

take Um -9.492654371018915

current Um -9.492654371018915

limited Um -9.492654371018915

much Um -9.492654371018915

energy Um -9.492654371018915

help Um -9.492654371018915

world Um -9.492654371018915

its Um -9.492654371018915

still Um -9.492654371018915

policies Um -9.492654371018915

this Cit -9.50382573799575

world Cit -9.50382573799575

provident Cit -9.50382573799575

; Cit -9.50382573799575

supports Cit -9.50382573799575

friedland Cit -9.50382573799575

n. Cit -9.50382573799575

case Cit -9.50382573799575

information Cit -9.50382573799575

2 Cit -9.50382573799575

ministry Cit -9.50382573799575

from Cit -9.50382573799575

medical Cit -9.50382573799575

summer Cit -9.50382573799575

somehow Cit -9.50382

concerns Srun -10.164906926675688

dollar Srun -10.164906926675688

citizens Srun -10.164906926675688

death Srun -10.164906926675688

even Srun -10.164906926675688

means Srun -10.164906926675688

he Srun -10.164906926675688

keeps Srun -10.164906926675688

eradicate Srun -10.164906926675688

prevented Srun -10.164906926675688

immediate Srun -10.164906926675688

company Srun -10.164906926675688

`` Srun -10.164906926675688

disease Srun -10.164906926675688

mainly Srun -10.164906926675688

continuing Srun -10.164906926675688

bank Srun -10.164906926675688

usa Srun -10.164906926675688

leading Srun -10.164906926675688

nowadays Srun -10.164906926675688

advance Srun -10.164906926675688

money Srun -10.164906926675688

development Srun -10.164906926675688

group Srun -10.164906926675688

employer Srun -10.164906926675688

plane Srun -10.164906926675688

again Srun -10.164906926675688

testing Srun -10.164906926675688

director Srun -10.164906926675688

do Srun -10.164906926675688

giv


factor Others -10.476746203939467

nature Others -10.476746203939467

phones Others -10.476746203939467

asia Others -10.476746203939467

suit Others -10.476746203939467

looked Others -10.476746203939467

obligation Others -10.476746203939467

occurring Others -10.476746203939467

: Others -10.476746203939467

costs Others -10.476746203939467

over Others -10.476746203939467

required Others -10.476746203939467

neighbourhood Others -10.476746203939467

moral Others -10.476746203939467

hence Others -10.476746203939467

2004 Others -10.476746203939467

about Others -10.476746203939467

additional Others -10.476746203939467

ensure Others -10.476746203939467

closely Others -10.476746203939467

resource Others -10.476746203939467

consideration Others -10.476746203939467

low Others -10.476746203939467

adult Others -10.476746203939467

effects Others -10.476746203939467

scwr Others -10.476746203939467

hope Others -10.476746203939467

survey Others -10.476746203939467

patients Othe

every Wci -10.932952892194715

rises Wci -10.932952892194715

aid Wci -10.932952892194715

makes Wci -10.932952892194715

got Wci -10.932952892194715

according Wci -10.932952892194715

inevitable Wci -10.932952892194715

example Wci -10.932952892194715

days Wci -10.932952892194715

different Wci -10.932952892194715

under Wci -10.932952892194715

compared no-error -10.948933584149428

burden no-error -10.957248852361431

further no-error -10.957248852361431

improve no-error -10.957248852361431

psychology no-error -10.957248852361431

every no-error -10.965612325296059

how no-error -10.974024565112257

before no-error -10.982486143860454

& no-error -10.982486143860454

had no-error -10.990997643715971

policy no-error -10.990997643715971

same no-error -10.990997643715971

2004 no-error -11.008172787524103

product no-error -11.008172787524103

citizens no-error -11.008172787524103

2007 no-error -11.008172787524103

privacy no-error -11.016837648651231

companies no-error -11.016


helium WOinc -11.31741261376487

diabetes WOinc -11.31741261376487

generally WOinc -11.31741261376487

give WOinc -11.31741261376487

general WOinc -11.31741261376487

planet WOinc -11.31741261376487

observed WOinc -11.31741261376487

although WOinc -11.31741261376487

prevent WOinc -11.31741261376487

medisave WOinc -11.31741261376487

improve WOinc -11.31741261376487

must WOinc -11.31741261376487

safer WOinc -11.31741261376487

excessive WOinc -11.31741261376487

economic WOinc -11.31741261376487

discourage WOinc -11.31741261376487

team WOinc -11.31741261376487

fighting WOinc -11.31741261376487

within WOinc -11.31741261376487

protect WOinc -11.31741261376487

effect WOinc -11.31741261376487

factories WOinc -11.31741261376487

breach WOinc -11.31741261376487

believe WOinc -11.31741261376487

great WOinc -11.31741261376487

pay WOinc -11.31741261376487

again WOinc -11.31741261376487

newspaper WOinc -11.31741261376487

service WOinc -11.31741261376487

indicate WOinc -11.3

decades Rloc- -11.777255315191924

save Rloc- -11.777255315191924

mirror Rloc- -11.777255315191924

truth Rloc- -11.777255315191924

ideas Rloc- -11.777255315191924

deeper Rloc- -11.777255315191924

water Rloc- -11.777255315191924

implemented Rloc- -11.777255315191924

approach Rloc- -11.777255315191924

opinion Rloc- -11.777255315191924

knowledge Rloc- -11.777255315191924

added Rloc- -11.777255315191924

mother Rloc- -11.777255315191924

limit Rloc- -11.777255315191924

respective Rloc- -11.777255315191924

projects Rloc- -11.777255315191924

capable Rloc- -11.777255315191924

day Rloc- -11.777255315191924

develop Rloc- -11.777255315191924

channel Rloc- -11.777255315191924

another Rloc- -11.777255315191924

benefit Rloc- -11.777255315191924

sectors Rloc- -11.777255315191924

easing Rloc- -11.777255315191924

2006 Rloc- -11.777255315191924

improving Rloc- -11.777255315191924

nothing Rloc- -11.777255315191924

conducted Rloc- -11.777255315191924

tag Rloc- -11.777255315191924


reactors Wci -12.51791539291587

gradually Wci -12.51791539291587

roof Wci -12.51791539291587

identity Wci -12.51791539291587

increment Wci -12.51791539291587

administration Wci -12.51791539291587

green Wci -12.51791539291587

building Wci -12.51791539291587

structured Wci -12.51791539291587

dying Wci -12.51791539291587

heard Wci -12.51791539291587

levels Wci -12.51791539291587

contribution Wci -12.51791539291587

lifestyle Wci -12.51791539291587

layer Wci -12.51791539291587

signals Wci -12.51791539291587

upgrades Wci -12.51791539291587

lies Wci -12.51791539291587

designs Wci -12.51791539291587

livings Wci -12.51791539291587

enables Wci -12.51791539291587

followed Wci -12.51791539291587

carrying Wci -12.51791539291587

welfare Wci -12.51791539291587

innovate Wci -12.51791539291587

hold Wci -12.51791539291587

environmentally Wci -12.51791539291587

leave Wci -12.51791539291587

repercussion Wci -12.51791539291587

confidentiality Wci -12.51791539291587

consists W

culture Rloc- -12.777255315191924

telephone Rloc- -12.777255315191924

born Rloc- -12.777255315191924

hire Rloc- -12.777255315191924

emotional Rloc- -12.777255315191924

safest Rloc- -12.777255315191924

virtual Rloc- -12.777255315191924

minds Rloc- -12.777255315191924

criteria Rloc- -12.777255315191924

invasion Rloc- -12.777255315191924

prevented Rloc- -12.777255315191924

professionals Rloc- -12.777255315191924

hold Rloc- -12.777255315191924

implications Rloc- -12.777255315191924

goes Rloc- -12.777255315191924

personal Rloc- -12.777255315191924

invention Rloc- -12.777255315191924

facilitate Rloc- -12.777255315191924

weapon Rloc- -12.777255315191924

mankind Rloc- -12.777255315191924

counterparts Rloc- -12.777255315191924

company Rloc- -12.777255315191924

controlled Rloc- -12.777255315191924

progress Rloc- -12.777255315191924

nicoll Rloc- -12.777255315191924

outcome Rloc- -12.777255315191924

replacement Rloc- -12.777255315191924

adult Rloc- -12.777255315191924

e


hiroshima Um -12.814582465906277

alone Um -12.814582465906277

retain Um -12.814582465906277

load Um -12.814582465906277

quite Um -12.814582465906277

independent Um -12.814582465906277

opportunity Um -12.814582465906277

abilities Um -12.814582465906277

instilled Um -12.814582465906277

constitutes Um -12.814582465906277

refrigerators Um -12.814582465906277

oldsters Um -12.814582465906277

misuse Um -12.814582465906277

gives Um -12.814582465906277

except Um -12.814582465906277

appeal Um -12.814582465906277

request Um -12.814582465906277

aim Um -12.814582465906277

advanced Um -12.814582465906277

five Um -12.814582465906277

stimulus Um -12.814582465906277

introduce Um -12.814582465906277

complete Um -12.814582465906277

pace Um -12.814582465906277

acquire Um -12.814582465906277

conducive Um -12.814582465906277

sleep Um -12.814582465906277

willingness Um -12.814582465906277

reference Um -12.814582465906277

practice Um -12.814582465906277

measures Um -12.814582465

regard no-error -15.221952078555844

press no-error -15.221952078555844

facebook no-error -15.221952078555844

injured no-error -15.221952078555844

segment no-error -15.221952078555844

accidentally no-error -15.221952078555844

singaporean no-error -15.221952078555844

relying no-error -15.221952078555844

tv no-error -15.221952078555844

laura no-error -15.221952078555844

switch no-error -15.221952078555844

wage no-error -15.221952078555844

upgraded no-error -15.221952078555844

worsen no-error -15.221952078555844

negligence no-error -15.221952078555844

unemployed no-error -15.221952078555844

whereabouts no-error -15.221952078555844

worst no-error -15.221952078555844

thoughts no-error -15.221952078555844

aviation no-error -15.221952078555844

streets no-error -15.221952078555844

divided no-error -15.221952078555844

agencies no-error -15.221952078555844

spectrum no-error -15.221952078555844

categories no-error -15.221952078555844

diffusion no-error -15.221952078555844



user-friendly no-error -16.069948985110795

morale no-error -16.069948985110795

hypertension no-error -16.069948985110795

drinkable no-error -16.069948985110795

projected no-error -16.069948985110795

excavation no-error -16.069948985110795

humanity no-error -16.069948985110795

tedious no-error -16.069948985110795

ford no-error -16.069948985110795

e-bike no-error -16.069948985110795

geothermal no-error -16.069948985110795

accomplish no-error -16.069948985110795

billions no-error -16.069948985110795

campaign no-error -16.069948985110795

prototyping no-error -16.069948985110795

rbmk no-error -16.069948985110795

graduates no-error -16.069948985110795

style no-error -16.069948985110795

owing no-error -16.069948985110795

german no-error -16.069948985110795

stressed no-error -16.069948985110795

satellites no-error -16.069948985110795

altered no-error -16.069948985110795

book no-error -16.069948985110795

guilty no-error -16.069948985110795

infection no-error -16.069948

In [139]:
#best_score, viterbi_tagged = viterbi(te_words, errorlist, known_words,q_values, e_values)

In [140]:
#pickle.dump(best_score, open('saved_models/best_score.pkl', 'wb'))
#pickle.dump(viterbi_tagged, open('saved_models/viterbi_tagged.pkl', 'wb'))
best_score = pickle.load(open('saved_models/best_score.pkl','rb'))
viterbi_tagged = pickle.load(open('saved_models/viterbi_tagged.pkl','rb'))

In [141]:
test_error_seq = []
for i in range(len(te_words)):
    temp = []
    for j in range(2, len(te_words[i]) - 2):
        temp.append(te_words[i][j] + "____" + te_errors[i][j])
    test_error_seq.append(temp)

In [142]:
word_level_acc = 0
sentence_level_acc = 0
w = 0
for best_score, pred_error_seq, actual_error_seq in zip(best_score, viterbi_tagged,test_error_seq):
    print("Optimum path probability: ", best_score)
    print("Predicted tag seq.: ", pred_error_seq)
    print("Actual tag seq.: ", actual_error_seq)
    print("\n")
    if len([i for i, j in zip(pred_error_seq[0].split(), actual_error_seq) if i == j]) == len(actual_error_seq):
        sentence_level_acc += 1
    word_level_acc += len(list(set(pred_error_seq[0].split()) & set(actual_error_seq)))
    w += len(actual_error_seq)

Optimum path probability:  -172.73839917017014
Predicted tag seq.:  ['the____no-error life____no-error expectancy____no-error of____no-error individuals____no-error has____no-error increased____no-error due____no-error to____no-error better____no-error accessibility____no-error to____no-error healthcare____no-error services____no-error and____no-error facilities____no-error than____no-error before____no-error']
Actual tag seq.:  ['the____no-error', 'life____no-error', 'expectancy____Nn', 'of____no-error', 'individuals____no-error', 'has____SVA', 'increased____no-error', 'due____no-error', 'to____no-error', 'better____no-error', 'accessibility____no-error', 'to____no-error', 'healthcare____no-error', 'services____no-error', 'and____no-error', 'facilities____no-error', 'than____Rloc-', 'before____Rloc-']


Optimum path probability:  -267.23393378471377
Predicted tag seq.:  ['in____no-error addition____no-error ,____no-error a____no-error better____no-error health____no-error care____no-e

Optimum path probability:  -191.74021716876857
Predicted tag seq.:  ['despite____no-error this____no-error ,____no-error the____no-error crux____no-error of____no-error the____no-error issue____no-error here____no-error lies____no-error in____no-error how____no-error much____no-error should____no-error this____no-error spending____no-error on____no-error the____no-error aged____no-error be____no-error capped____no-error at____no-error']
Actual tag seq.:  ['despite____Um', 'this____Um', ',____no-error', 'the____no-error', 'crux____no-error', 'of____no-error', 'the____no-error', 'issue____no-error', 'here____no-error', 'lies____no-error', 'in____no-error', 'how____no-error', 'much____no-error', 'should____WOinc', 'this____WOinc', 'spending____WOinc', 'on____WOinc', 'the____WOinc', 'aged____WOinc', 'be____no-error', 'capped____no-error', 'at____no-error']


Optimum path probability:  -148.36528521316094
Predicted tag seq.:  ['this____no-error is____no-error because____no-error different__

Predicted tag seq.:  ['for____no-error example____no-error ,____no-error a____no-error small____no-error device____no-error called____no-error hearing____no-error aid____no-error is____no-error a____no-error solution____no-error for____no-error the____no-error hearing____no-error loss____no-error problem____no-error of____no-error the____no-error elderly____no-error']
Actual tag seq.:  ['for____no-error', 'example____no-error', ',____no-error', 'a____Rloc-', 'small____Rloc-', 'device____Nn', 'called____Wci', 'hearing____no-error', 'aid____Nn', 'is____SVA', 'a____no-error', 'solution____no-error', 'for____no-error', 'the____no-error', 'hearing____WOinc', 'loss____WOinc', 'problem____WOinc', 'of____WOinc', 'the____WOinc', 'elderly____WOinc']


Optimum path probability:  -96.46067417622449
Predicted tag seq.:  ["he____no-error does____no-error n't____no-error have____no-error the____no-error right____no-error -____ArtOrDet to____no-error disclose____no-error it____no-error"]
Actual tag se

Optimum path probability:  -248.42883981783424
Predicted tag seq.:  ['although____no-error there____no-error is____no-error no____no-error nuclear____no-error incidence____Wci happen____no-error in____no-error recent____no-error years____no-error ,____no-error it____no-error is____no-error too____no-error biased____no-error to____no-error say____no-error that____no-error the____no-error engineering____no-error psychology____no-error can____no-error solved____no-error the____no-error problems____no-error by____no-error itself____no-error']
Actual tag seq.:  ['although____no-error', 'there____no-error', 'is____no-error', 'no____ArtOrDet', 'nuclear____no-error', 'incidence____Nn', 'happen____Vt', 'in____no-error', 'recent____no-error', 'years____no-error', ',____no-error', 'it____no-error', 'is____no-error', 'too____no-error', 'biased____no-error', 'to____no-error', 'say____no-error', 'that____no-error', 'the____no-error', 'engineering____no-error', 'psychology____no-error', 'can____no-er

Predicted tag seq.:  ['but____no-error ,____no-error the____no-error aging____no-error problem____no-error may____no-error be____no-error exaggerated____no-error ,____no-error due____no-error to____no-error a____no-error variety____no-error of____no-error reasons____no-error']
Actual tag seq.:  ['but____Wtone', ',____Wtone', 'the____Wtone', 'aging____Wtone', 'problem____Wtone', 'may____Wtone', 'be____Wtone', 'exaggerated____no-error', ',____no-error', 'due____no-error', 'to____no-error', 'a____no-error', 'variety____no-error', 'of____no-error', 'reasons____no-error']


Optimum path probability:  -180.6075147454964
Predicted tag seq.:  ['it____no-error is____no-error now____no-error widespread____no-error and____no-error ranging____no-error from____no-error testing____no-error the____no-error congenital____no-error defect____no-error of____no-error a____no-error foetus____no-error to____no-error finding____no-error the____no-error existence____no-error of____no-error the____no-error onc

Optimum path probability:  -153.3998129722469
Predicted tag seq.:  ['instead____no-error ,____no-error the____no-error government____no-error should____no-error allow____no-error its____no-error public____no-error spending____no-error on____no-error the____no-error aged____no-error to____no-error hang____no-error around____no-error a____no-error certain____no-error percentage____no-error']
Actual tag seq.:  ['instead____no-error', ',____no-error', 'the____no-error', 'government____no-error', 'should____no-error', 'allow____no-error', 'its____no-error', 'public____no-error', 'spending____no-error', 'on____no-error', 'the____no-error', 'aged____no-error', 'to____no-error', 'hang____Wtone', 'around____Wtone', 'a____Wci', 'certain____no-error', 'percentage____no-error']


Optimum path probability:  -266.7769690404275
Predicted tag seq.:  ['the____no-error findings____no-error were____no-error used____no-error for____no-error prevention____no-error to____no-error these____no-error illness__

Predicted tag seq.:  ['when____no-error judge____no-error a____no-error new____no-error energy____no-error ,____no-error the____no-error engineers____no-error have____no-error to____no-error think____no-error about____no-error the____no-error questions____no-error ,____no-error such____no-error as____no-error -____ArtOrDet ``____no-error is____no-error the____no-error energy____no-error able____no-error to____no-error replace____no-error petrol____no-error']
Actual tag seq.:  ['when____no-error', 'judge____Wci', 'a____no-error', 'new____no-error', 'energy____no-error', ',____no-error', 'the____ArtOrDet', 'engineers____no-error', 'have____no-error', 'to____no-error', 'think____no-error', 'about____no-error', 'the____ArtOrDet', 'questions____no-error', ',____no-error', 'such____no-error', 'as____no-error', '-____Cit', '``____no-error', 'is____no-error', 'the____no-error', 'energy____no-error', 'able____no-error', 'to____no-error', 'replace____no-error', 'petrol____no-error']


Optimum pa

Optimum path probability:  -196.88829188727328
Predicted tag seq.:  ['first____no-error ,____no-error vhtr____no-error is____no-error performed____no-error under____no-error very____no-error high____no-error temperature____no-error ,____no-error therefore____no-error allowing____no-error for____no-error more____no-error efficient____no-error conversion____no-error of____no-error heat____no-error to____no-error electricity____no-error']
Actual tag seq.:  ['first____no-error', ',____no-error', 'vhtr____no-error', 'is____Vform', 'performed____Vform', 'under____no-error', 'very____no-error', 'high____no-error', 'temperature____no-error', ',____no-error', 'therefore____Wci', 'allowing____no-error', 'for____no-error', 'more____no-error', 'efficient____no-error', 'conversion____no-error', 'of____no-error', 'heat____no-error', 'to____no-error', 'electricity____no-error']


Optimum path probability:  -488.6517147378924
Predicted tag seq.:  ["in____no-error the____no-error event____no-error that

Predicted tag seq.:  ['compared____no-error to____no-error the____no-error structure____no-error of____no-error a____no-error conventional____no-error automobile____no-error ,____no-error -____ArtOrDet electric____no-error vehicle____no-error has____no-error a____no-error similar____no-error structure____no-error but____no-error different____no-error components____no-error']
Actual tag seq.:  ['compared____no-error', 'to____no-error', 'the____no-error', 'structure____no-error', 'of____no-error', 'a____no-error', 'conventional____no-error', 'automobile____no-error', ',____no-error', '-____ArtOrDet', 'electric____no-error', 'vehicle____no-error', 'has____no-error', 'a____no-error', 'similar____no-error', 'structure____no-error', 'but____no-error', 'different____no-error', 'components____no-error']


Optimum path probability:  -160.80762909616143
Predicted tag seq.:  ['the____no-error above____no-error mentioned____no-error facts____no-error state____no-error that____no-error the____no-er

Optimum path probability:  -265.90846205442904
Predicted tag seq.:  ["in____no-error addition____no-error ,____no-error -____ArtOrDet government____no-error can____no-error also____no-error invest____no-error in____no-error areas____no-error where____no-error the____no-error profit____no-error and____no-error social____no-error welfare____no-error can____no-error be____no-error improved____no-error so____no-error that____no-error the____no-error citizens____no-error '____no-error standard____no-error of____no-error living____no-error will____no-error be____no-error increased____no-error"]
Actual tag seq.:  ['in____no-error', 'addition____no-error', ',____no-error', '-____ArtOrDet', 'government____no-error', 'can____no-error', 'also____no-error', 'invest____no-error', 'in____no-error', 'areas____no-error', 'where____no-error', 'the____ArtOrDet', 'profit____no-error', 'and____no-error', 'social____no-error', 'welfare____no-error', 'can____no-error', 'be____no-error', 'improved____no-erro

Predicted tag seq.:  ['with____no-error this____no-error idea____no-error of____no-error -____ArtOrDet solution____no-error ,____no-error i____no-error would____no-error build____no-error a____no-error 3d____no-error replica____no-error of____no-error this____no-error system____no-error to____no-error test____no-error on____no-error the____no-error effectiveness____no-error of____no-error the____no-error reducing____no-error traffic____no-error congestion____no-error']
Actual tag seq.:  ['with____no-error', 'this____no-error', 'idea____no-error', 'of____no-error', '-____ArtOrDet', 'solution____no-error', ',____no-error', 'i____no-error', 'would____no-error', 'build____no-error', 'a____no-error', '3d____no-error', 'replica____no-error', 'of____no-error', 'this____no-error', 'system____no-error', 'to____no-error', 'test____no-error', 'on____no-error', 'the____no-error', 'effectiveness____no-error', 'of____Wci', 'the____Wci', 'reducing____no-error', 'traffic____no-error', 'congestion____n

Optimum path probability:  -100.53336578278237
Predicted tag seq.:  ['safety____no-error of____no-error bystanders____no-error is____no-error also____no-error considered____no-error where____no-error design____no-error safety____no-error is____no-error concerned____no-error']
Actual tag seq.:  ['safety____ArtOrDet', 'of____no-error', 'bystanders____no-error', 'is____no-error', 'also____no-error', 'considered____no-error', 'where____no-error', 'design____no-error', 'safety____no-error', 'is____no-error', 'concerned____no-error']


Optimum path probability:  -92.19370599969525
Predicted tag seq.:  ['highly____no-error radioactive____no-error spent____no-error fuel____no-error requires____no-error constant____no-error cooling____no-error']
Actual tag seq.:  ['highly____Trans', 'radioactive____no-error', 'spent____no-error', 'fuel____no-error', 'requires____no-error', 'constant____no-error', 'cooling____no-error']


Optimum path probability:  -261.1435651065744
Predicted tag seq.:  ['to___

Predicted tag seq.:  ['the____no-error government____no-error estimates____no-error that____no-error by____no-error the____no-error year____no-error 2030____no-error ,____no-error 23____no-error %____no-error of____no-error its____no-error population____no-error will____no-error be____no-error over____no-error the____no-error age____no-error of____no-error 65____no-error (____no-error wijaya____no-error ,____no-error 2009____no-error )____no-error']
Actual tag seq.:  ['the____no-error', 'government____no-error', 'estimates____no-error', 'that____no-error', 'by____no-error', 'the____no-error', 'year____no-error', '2030____no-error', ',____no-error', '23____no-error', '%____no-error', 'of____no-error', 'its____no-error', 'population____no-error', 'will____no-error', 'be____no-error', 'over____no-error', 'the____no-error', 'age____Wci', 'of____Wci', '65____Wci', '(____no-error', 'wijaya____no-error', ',____no-error', '2009____no-error', ')____no-error']


Optimum path probability:  -237.9

In [143]:
print("Number of words that are correctly tagged: ", word_level_acc / w)
print("Number of sentences that are correctly tagged: ",sentence_level_acc / len(test_error_seq))

Number of words that are correctly tagged:  0.748258914169687
Number of sentences that are correctly tagged:  0.10027472527472528


In [144]:
score_per_error = {}
for error in errorlist:
    if error not in score_per_error:
        score_per_error[error] = []
for i in range(len(test_error_seq)):
    for j in range(len(test_error_seq[i])):
        #print(test_error_seq[i][j])
        score_per_error[test_error_seq[i][j].split("____")[1]].append((test_error_seq[i][j].split("____")[1],
             viterbi_tagged[i][0].split()[j].split("____")[1]))
for error in score_per_error:
    cnt = 0
    if len(score_per_error[error]) > 0:
        for j in range(len(score_per_error[error])):
            if score_per_error[error][j][0] == score_per_error[error][j][1]:
                cnt += 1
        print("Number of words correctly classified with error" + " " + "'"+ error + "'" + " " + "is :", cnt/ len(score_per_error[error]))

Number of words correctly classified with error 'V0' is : 0.0
Number of words correctly classified with error 'WOadv' is : 0.0
Number of words correctly classified with error 'Spar' is : 0.0
Number of words correctly classified with error 'SVA' is : 0.005847953216374269
Number of words correctly classified with error 'Pform' is : 0.0
Number of words correctly classified with error 'Vform' is : 0.0
Number of words correctly classified with error 'Wtone' is : 0.08571428571428572
Number of words correctly classified with error 'Smod' is : 0.0
Number of words correctly classified with error 'Wci' is : 0.001321003963011889
Number of words correctly classified with error 'Sfrag' is : 0.0
Number of words correctly classified with error 'Wform' is : 0.004545454545454545
Number of words correctly classified with error 'Cit' is : 0.4564102564102564
Number of words correctly classified with error 'Um' is : 0.05714285714285714
Number of words correctly classified with error 'Vm' is : 0.0
Number of

In [145]:
most_probable_words = {}
for error in errorlist:
    if error!=START_SYMBOL and error!=STOP_SYMBOL and error not in most_probable_words:
        most_probable_words[error] = [(word,e_values[(word,word_error)]) for word,word_error in e_values if word_error==error]
        most_probable_words[error] = sorted(most_probable_words[error], key=lambda x: float(x[1]), reverse=True)

data=[]
for error in most_probable_words:
    uniq = []
    #uniq.append([tag])
    for word, prob in most_probable_words[error]:
        if word not in uniq and word != START_SYMBOL and word != STOP_SYMBOL and word != RARE_SYMBOL and word.lower(
        ).islower() == True:
            uniq.append(word)
    if len(uniq)>=51:
        k=51
    else:
        k=len(uniq)
    print("Top words emitted by error" + " " + "'" + error + "'" + " " + "is :")
    print(uniq[0:k])
    print("\n")

Top words emitted by error 'WOadv' is :
['the', 'in', 'can', 'of', 'also', 'is', 'to', 'be', 'a', 'for', 'and', 'now', 'only', 'are', 'has', 'their', 'that', 'have', 'elderly', 'up', 'on', 'not', 'singapore', 'will', 'currently', 'too', 'it', 'even', 'an', 'people', 'japan', 'most', 'policies', 'population', 'important', 'still', 'limited', 'children', "'s", 'they', 'system', 'countries', 'fall', 'other', 'easily', 'problem', 'always', 'lot', 'this', 'already', 'just']


Top words emitted by error 'Spar' is :
['the', 'to', 'and', 'of', 'in', 'also', 'as', 'a', 'having', 'they', 'their', 'is', 'one', 'for', 'provide', 'be', 'are', 'case', 'well', 'will', "'s", 'improve', 'being', 'it', 'able', 'build', 'or', 'have', 'not', 'aspect', 'providing', 'at', 'power', 'services', 'hospitals', 'temperature', 'medical', 'time', 'core', 'these', 'only', 'introducing', 'subsidy', 'more', 'healthcare', 'an', 'generating', 'reactor', 'education', 'that', 'designer']


Top words emitted by error 'SVA'

Top words emitted by error 'no-error' is :
['the', 'to', 'of', 'and', 'in', 'is', 'a', 'that', 'be', 'for', 'as', 'are', 'it', 'on', 'not', 'will', 'can', 'their', 'with', 'this', 'have', 'more', 'by', 'they', 'elderly', 'nuclear', 'people', 'which', 'should', 'from', "'s", 'has', 'also', 'government', 'such', 'or', 'an', 'development', 'other', 'energy', 'there', 'aged', 'technology', 'public', 'one', 'country', 'aging', 'however', 'may', 'process', 'spending']


Top words emitted by error 'Trans' is :
['and', 'the', 'in', 'as', 'but', 'to', 'so', 'on', 'of', 'also', 'other', 'that', 'it', 'thus', 'however', 'with', 'then', 'therefore', 'for', 'besides', 'or', 'they', 'hand', 'which', 'this', 'despite', 'well', 'due', 'not', 'at', 'while', 'a', 'contrary', 'by', 'when', 'since', 'be', 'such', 'than', 'where', 'more', 'even', 'second', 'first', 'side', 'are', 'last', 'healthcare', 'beside', 'additionally', 'all']


Top words emitted by error 'Npos' is :
["'s", 'the', 'their', 'it', 'co