# Use knlm

In [1]:
# add to $y$tem path the parent directory
import os
import sys
notebook_dir = os.path.abspath(os.path.dirname(''))
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))
sys.path.append(parent_dir)

In [2]:
import pickle
with open(os.path.join(notebook_dir, '..','models','saved_model_knlm2'),'rb') as inputfile:
    kn_lm2 = pickle.load(inputfile) 

In [3]:
sample_sentences = ['हरेक [MASK] नेपामको संविधानक पालना गर्नुपर्छ ।' ,
                    'म पुस्तकालयबाट थुलो किताब पढ्न चाहन्छ ।',
                    'तर उस समयमा पनि स्वस्थ राजनैतिक वातावरनको अभावले गर्दा देश विकासतर्फ विशेष प्रगति हुन  सकेन।',
                   'नेपालमा आधुनिक रुपमा आर्थक विकाससम्बन्धी कार्यरू प्रारम्भ भएको हालै मात्र हो।',
                   'हार धुनुहोस् र स्वास्थ जीवन जिउनुहोस्।',
                   'जब प्रवीधिहरू एकीकृत हुन सूरु गर्छन् अर्थतन्त्र तथ सँस्कृति पनि निश्चितरूपमा विस्तारै एकीकृत हुने छ।',
                   'उद्देयहरुमा पनि कुनै एक उद्देश्य पूर्ति नहुँदै अर्को नयाँ  उद्देश्यको रुपमा लिइने परम्परा बस्यो।',
                   'लगानीकर्ताहरूको धयान तुरुन्त फेरियो , व्यापारीहरुले वताए ।',
                    'अति धेरै हिज्जे गलती भएका शब्दहरू । तपाईँले टाइप गर्दा हिज्जे जाँच अक्षम पारयो ।',
                    'लामो',
                    'म नेपाली राम्रोसँग बोल्दिन',
                    'तपाईंको उमर कति हो?',
                    'मलाइ एक्लै छोडनुहोस्',
                   'रुसी राष्ट्रपती पुटिनको प्रेम जावन त्य्ती सफल छैन ।']

In [3]:
#help(tokenizer)

# Functions to correct

In [14]:
import torch
import pickle
import BrillMoore
import regex as re
from utils import final_candidate_words, return_lexicon_dict
import math
import itertools
import numpy

def words(text): 
    text = re.sub(r'[\u0964]', r'\u0020\u0964\u0020', text)
    return re.findall(r'[\u0900-\u097F]+', text.lower())

final_lexicon_dict = return_lexicon_dict()

with open(os.path.join(notebook_dir, '..','models','bma_27dec.pickle'),'rb') as f:
    bma = pickle.load(f)

In [5]:
def words_bigram(text):   
    text = re.sub(r'[\u0964]', r'\u0020\u0964\u0020', text)
    return [tuple(x.split()) for x in re.findall
                                (r'\b[\u0900-\u097F]+\s[\u0900-\u097F]+',text.lower(), overlapped=True)]


def logprob(ngram,model,minimum):
    '''
    Calculate log probability
    
    
    '''    
    if ngram in model.lm[0]:
        return model.lm[0][ngram]
    return minimum

In [6]:
def likelihood_bm(sentence,candidate_sentence):
    '''
    Returns P(Possible Typo Sentence/Candidate Correct Sentence)
    
    Uses Naive approach to compute probability for sentence from individual words
    
    '''    
    
    prod = 1
    for word,candidate_word in zip(sentence.split(),candidate_sentence):          
        prod*= bma.likelihood(word,candidate_word)
        #print("prod:",prod)
    return prod

In [7]:
def correctize_entire_knlm(sentence, model,p_lambda = 1,prior='bigram',trie = False,likelihood = 'default'):
    "Corrects the given 'sentence' using minimum edit"

    tokens = words(sentence)

    candidates = []    
    for _ in tokens:
        candidates.append(final_candidate_words(_,use_trie = trie))
    
   
    candidate_sentences = list(itertools.product(*candidates))
    minimum = min(model.lm[0].values())
    
    if prior == 'trigram':
        pass
    
    if prior == 'bigram':
        bi_tokens = [words_bigram(' '.join(_)) for _ in candidate_sentences]
        bi_token_probab = []
   
        for row in bi_tokens:
            bi_token_probab.append([logprob(tuple(_),model,minimum) for _ in row])  

        if likelihood=='default':
            candidate_count = [len(_) for _ in candidates]  
            sentences_probab_post=[(sum(row)*p_lambda) +
                                   math.log(constant_distributive_likelihood(sentence,candidate_sentence,candidate_count)) 
                                   for row,candidate_sentence in zip(bi_token_probab,candidate_sentences)]
        elif likelihood=='bm':
            sentences_probab_post=[(sum(row)*p_lambda) + 
                                    math.log(likelihood_bm(sentence,candidate_sentence)) 
                                    for row,candidate_sentence in zip(bi_token_probab,candidate_sentences)]


        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
        
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted

In [8]:
def correctize_with_window_knlm(sentence,model,window = 5,p_lambda = 1,prior = 'bigram',trie = False,likelihood = 'default'):
    '''
    
    '''   
    
    tokens = words(sentence)
    if len(tokens) <= window:
        return [correctize_entire_knlm(sentence,model,p_lambda=p_lambda,prior = prior,trie = trie,likelihood = likelihood)]
    else:
        windows = [tokens[n:window+n] for n in range(0,len(tokens),window-1) if window+n <len(tokens)-1]    
        remaining = (window-1)*len(windows)
        windows.append(tokens[remaining:])
        corrects = []
        for _ in windows:
            d = correctize_entire_knlm(' '.join(_),model,p_lambda=p_lambda,prior = prior,trie = trie,likelihood = likelihood)
            corrects.append(d)
        return corrects


In [9]:
def return_choices2(sample_sentences,model,p_lambda = 1,trie = False,model_type ='knlm' ,likelihood = 'default'):
    
    if model_type =='knlm':
        d = correctize_with_window_knlm(sample_sentences,model,p_lambda =p_lambda,trie = trie,likelihood = likelihood)
        window_candidates = []
        window_probab = []
        for window in d:
            maxim = min(len(window[0]),10)
            top_candidates = window[0][:maxim]
            window_candidates.append(top_candidates)
            window_probab.append(window[1][:maxim])
        return window_candidates,window_probab
    
    if model_type == 'transformer':
        d = correctize_with_window_nn(sample_sentences,model,p_lambda =p_lambda,trie = trie,likelihood = likelihood)
        window_candidates = []
        window_probab = []
        for window in d:
            maxim = min(len(window[0]),10)
            top_candidates = window[0][:maxim]
            window_candidates.append(top_candidates)
            window_probab.append(window[1][:maxim])
        return window_candidates,window_probab
        
def extract_choices(sample_sentences,model,p_lambda = 1,trie = False,likelihood = 'default',model_type = 'knlm'):
    
    
    wc,wp = return_choices2(sample_sentences,model,p_lambda = p_lambda,trie = trie ,model_type = model_type,likelihood = likelihood)
#     choices_list=[set() for i in range(len(sample_sentences.split())+1)]
    choices_list=[[] for i in range(len(sample_sentences.split())+1)]
#     print(len(choices_list))

    const = 0
    for _ in wc:
        for sens in _:
            for i,w in enumerate(sens):
                index = i + const
                if w not in choices_list[index]:
                    choices_list[index].append(w)
        const += len(wc[0][0])-1
    if len(choices_list[len(choices_list)-1]) == 0:
        return choices_list[:len(choices_list)-1]
    return choices_list

# Example

In [12]:
extract_choices(sample_sentences[1],model=kn_lm2,p_lambda = 0.,trie = True,likelihood = 'bm')

[['म', 'क', 'मा', 'त', 'मि'],
 ['पुस्तकालयबाट'],
 ['थुलो', 'थलो', 'धुलो'],
 ['किताब'],
 ['पढ्न', 'पस्न', 'पर्न', 'बढ्न', 'पढ्ने', 'चढ्न'],
 ['चाहन्छ', 'चाहिन्छ', 'चाहन्छु', 'चाहन्छन्'],
 ['।']]

In [13]:
extract_choices(sample_sentences[2],model=kn_lm2,p_lambda = 0.8,trie = True,likelihood = 'bm')

[['पर', 'तर', 'तह', 'गर'],
 ['स', 'उप', 'उसो', 'आस'],
 ['समयमा', 'समयको', 'समयका'],
 ['पनि'],
 ['स्वस्थ', 'अस्वस्थ', 'स्वच्छ'],
 ['राजनीतिक', 'राजनैतिक', 'राजनीति'],
 ['वातावरणको'],
 ['अभावले', 'अभावमा'],
 ['गर्दा', 'गर्न', 'गर्दै', 'पर्दा'],
 ['देश'],
 ['विकासतर्फ'],
 ['विशेष'],
 ['प्रगति', 'प्रति', 'प्रालि', 'प्रगतिको', 'प्रावि', 'प्रगाढ'],
 ['हुन', 'हुने', 'हुनै'],
 ['सकेन', 'सकेनन्', 'सकेका', 'सकिन', 'सक्न', 'सकेमा', 'सकेर'],
 ['।']]

In [28]:
sample_sentences[2]

'तर उस समयमा पनि स्वस्थ राजनैतिक वातावरनको अभावले गर्दा देश विकासतर्फ विशेष प्रगति हुन  सकेन।'

# Evaluation

In [10]:
from eval import gather_dataset,WER,word_accuracy,char_accuracy

dataset_file = os.path.join(notebook_dir, '..','data','eval_data2.pic')

In [11]:
dataset = gather_dataset(dataset_file)[:400]

In [12]:
correct_tokens = [words(t[0]) for t in dataset]
error_tokens = [words(t[1]) for t in dataset]
error_sentences = [t[1] for t in dataset] 

### Bm

In [None]:
Gap = 0.2
a = 10
def find_p_lambda(error_sentences):
    predicted_tokens = []
    for j in range(10):
        for i,s in enumerate(error_sentences[:a]):
            c = extract_choices(s,model=kn_lm2,p_lambda =Gap*j,trie = True,likelihood = 'bm')
        #     print(c)
            c = [t[0] for t in c]
            predicted_tokens.append(c)
            if i%2 == 0:
                print(i)
#         word_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a])
        print(Gap*j,' : ',word_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a]),'WER1: ',WER(correct_tokens[:a],error_tokens[:a] ),'WER2: ',WER(correct_tokens[:a],predicted_tokens[:a] ))
find_p_lambda(error_sentences)

In [15]:
Gap = 0.2
a = 10
def find_p_lambda(error_sentences):
    predicted_tokens = []
    for j in range(10):
        for i,s in enumerate(error_sentences[:a]):
            c = extract_choices(s,model=kn_lm2,p_lambda =Gap*j,trie = True,likelihood = 'bm')
        #     print(c)
            c = [t[0] for t in c]
            predicted_tokens.append(c)
            if i%2 == 0:
                print(i)
#         word_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a])
        print(Gap*j,' : ',word_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a]),'WER1: ',WER(correct_tokens[:a],error_tokens[:a] ),'WER2: ',WER(correct_tokens[:a],predicted_tokens[:a] ))
find_p_lambda(error_sentences)

0
2
4
6
8
0.0  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
0.2  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
0.4  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
0.6000000000000001  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
0.8  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.0  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.2000000000000002  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.4000000000000001  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.6  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.8  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727


In [34]:
predicted_tokens = []
for i,s in enumerate(error_sentences):
    c = extract_choices(s,model=kn_lm2,p_lambda =0.8 ,trie = True,likelihood = 'bm')
#     print(c)
    c = [t[0] for t in c]
    predicted_tokens.append(c)
    if i%2 == 0:
        print(i)
    if i==323:
        l = len(predicted_tokens)
        print(WER(correct_tokens[:l],error_tokens[:l] ),WER(correct_tokens[:l],predicted_tokens[:l] ),word_accuracy(correct_tokens[:l],predicted_tokens[:l],error_tokens[:l]),char_accuracy(correct_tokens[:l],predicted_tokens[:l],error_tokens[:l]))

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
66
68
70
72
74
76
78
80
82
84
86
88
90
92
94
96
98
100
102
104
106
108
110
112
114
116
118
120
122
124
126
128
130
132
134
136
138
140
142
144
146
148
150
152
154
156
158
160
162
164
166
168
170
172
174
176
178
180
182
184
186
188
190
192
194
196
198
200
202
204
206
208
210
212
214
216
218
220
222
224
226
228
230
232
234
236
238
240
242
244
246
248
250
252
254
256
258
260
262
264
266
268
270
272
274
276
278
280
282
284
286
288
290
292
294
296
298
300
302
304
306
308
310
312
314
316
318
320
322
0.25042111173498033 0.18697361033127458 (0.6472346786248132, 866, 1338) (0.682741116751269, 1076, 1576)
324
326
328
330
332
334
336
338
340
342
344
346
348
350
352
354
356
358
360
362
364
366
368
370
372
374
376
378
380
382
384
386
388
390
392
394
396
398


In [20]:
a = len(predicted_tokens)
WER(correct_tokens[:l],error_tokens[:a] ),WER(correct_tokens[:a],predicted_tokens[:a] )

(0.25042111173498033, 0.18697361033127458)

In [22]:
word_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a])

(0.6472346786248132, 866, 1338)

In [33]:
char_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a])

(0.6706852791878173, 1057, 1576)

### Constant Distributive likelihood

In [25]:
alpha = 0.65
def constant_distributive_likelihood(sentence,candidate_sentence,candidate_count):
    prod = 1    
    i = 0
    #print(sentence.split(),candidate_sentence)
    
    for word,candidate_word in zip(sentence.split(),candidate_sentence):        
        if word==candidate_word:
            prod*= alpha
        else:
            N = candidate_count[i]
            prod*= (1-alpha)/N
        i+=1
    return prod

In [26]:
Gap = 0.2
a = 10
def find_p_lambda(error_sentences):
    predicted_tokens = []
    for j in range(10):
        for i,s in enumerate(error_sentences[:a]):
            c = extract_choices(s,model=kn_lm2,p_lambda =Gap*j,trie = True,likelihood = 'default')
        #     print(c)
            c = [t[0] for t in c]
            predicted_tokens.append(c)
            if i%2 == 0:
                print(i)
#         word_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a])
        print(Gap*j,' : ',word_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a]),'WER1: ',WER(correct_tokens[:a],error_tokens[:a] ),'WER2: ',WER(correct_tokens[:a],predicted_tokens[:a] ))
find_p_lambda(error_sentences)

0
2
4
6
8
0.0  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
0.2  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
0.4  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
0.6000000000000001  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
0.8  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.0  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.2000000000000002  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.4000000000000001  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.6  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727
0
2
4
6
8
1.8  :  (0.0, 0, 42) WER1:  0.2727272727272727 WER2:  0.2727272727272727


In [27]:
predicted_tokens = []
for i,s in enumerate(error_sentences):
    c = extract_choices(s,model=kn_lm2,p_lambda =0.8 ,trie = True,likelihood = 'default')
#     print(c)
    c = [t[0] for t in c]
    predicted_tokens.append(c)
    if i%2 == 0:
        print(i)

0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
66
68
70
72
74
76
78
80
82
84
86
88
90
92
94
96
98
100
102
104
106
108
110
112
114
116
118
120
122
124
126
128
130
132
134
136
138
140
142
144
146
148
150
152
154
156
158
160
162
164
166
168
170
172
174
176
178
180
182
184
186
188
190
192
194
196
198
200
202
204
206
208
210
212
214
216
218
220
222
224
226
228
230
232
234
236
238
240
242
244
246
248
250
252
254
256
258
260
262
264
266
268
270
272
274
276
278
280
282
284
286
288
290
292
294
296
298
300
302
304
306
308
310
312
314
316
318
320
322
324
326
328
330
332
334
336
338
340
342
344
346
348
350
352
354
356
358
360
362
364
366
368
370
372
374
376
378
380
382
384
386
388
390
392
394
396
398


In [30]:
a = 324
WER(correct_tokens[:a],error_tokens[:a] ),WER(correct_tokens[:a],predicted_tokens[:a] )

(0.25042111173498033, 0.20082350739285046)

In [31]:
word_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a])

(0.601644245142003, 805, 1338)

In [32]:
char_accuracy(correct_tokens[:a],predicted_tokens[:a],error_tokens[:a])

(0.6706852791878173, 1057, 1576)