In [1]:
import regex as re
from collections import Counter
import itertools
import math
import pickle
import numpy


sample_factor = 0.01
sample_factor2 = 0.05


alpha = 0.65

def words(text): 
    text = re.sub(r'[\u0964]', r'\u0020\u0964\u0020', text)
    return re.findall(r'[\u0900-\u097F]+', text.lower())


def words_bigram(text):   
    text = re.sub(r'[\u0964]', r'\u0020\u0964\u0020', text)
    return [tuple(x.split()) for x in re.findall
                                (r'\b[\u0900-\u097F]+\s[\u0900-\u097F]+',text.lower(), overlapped=True)]

def words_trigram(text):
    text = re.sub(r'[\u0964]', r'\u0020\u0964\u0020', text)
    return re.findall(
    r'\b[\u0900-\u097F]+\s[\u0900-\u097F]+\s[\u0900-\u097F]+', text.lower(),
    overlapped=True)


def words_bigram_from_list(sentence):
    return [(sentence[i],sentence[i+1]) for i in range(len(sentence)-1)]


# List of all Nepali characters
char_vocab = []
for _ in range(2304, 2432):
    if _ not in range(2406,2416):
        char_vocab += [chr(_)]


def change_keys(d):
    return dict([(k[0], v) for k, v in d.items()])

with open('data/saved_words_counter1','rb') as inputfile:
    WORDS = pickle.load(inputfile) 
    WORDS = Counter(change_keys(WORDS))

with open('data/saved_words_counter2','rb') as inputfile:
    WORDS_bigram = pickle.load(inputfile) 

WORDS_trigram = WORDS_bigram


#Words Partition(splits)
WORDS_full = WORDS
WORDS2 = Counter(dict(WORDS.most_common(int(sample_factor2*len(WORDS)))))
WORDS = Counter(dict(WORDS.most_common(int(sample_factor*len(WORDS)))))




#Token Probability
def probability(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return (WORDS[word] + 1)/ N

def probability_bigram(bi_word, N=sum(WORDS_bigram.values())):
    "Probability of `two words` given as a tuple."
    return (WORDS_bigram[bi_word]+1) / N

def probability_trigram(tri_word, N=sum(WORDS_trigram.values())):
    "Probability of `two words` given as a tuple."
    return (WORDS_trigram[tri_word]+1) / N


words_list = list(WORDS)


#Likelihood models
def likelihood(sentence,N=len(words_list)):
    prod = 1    
    for word in sentence:
        if word not in WORDS:
            prod*= 0.95
        else:
            word_index = words_list.index(word)
            proportional_word = words_list[-word_index+N-1]
            prod*= 0.05*probability(proportional_word)
    return prod




def constant_distributive_likelihood(sentence,candidate_sentence,candidate_count):
    prod = 1    
    i = 0
    #print(sentence.split(),candidate_sentence)
    
    for word,candidate_word in zip(sentence.split(),candidate_sentence):        
        if word==candidate_word:
            prod*= alpha
        else:
            N = candidate_count[i]
            prod*= (1-alpha)/N
        i+=1
    return prod




        
    




def edits1(word):
    "All edits that are one edit away from `word`."
    letters = char_vocab
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1))



def edits2_(word):
    for e1 in known_from_full(edits1(word)):
        s = set( e2 for  e2 in edits1(e1))
        return known_from_full(s)
# Isn't exact


def edits3(word):
    "All edits that are two edits away from `word`."
    return set(e3 for e2 in known(edits2(word)) for e3 in edits1(e2))


def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def known_from_WORDS2(words):
    return set(w for w in words if (w,) in WORDS2 or w in WORDS2)
    

def known_from_full(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if (w,) in WORDS_full or w in WORDS_full)


def candidates_ordered(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])


def candidates_all(word):
    "Generate possible spelling corrections for word."
    return (set.union(known([word]), known(edits1(word)), known(edits2(word)) ,[word]))

def candidates_all_from_full(word):
    "Generate possible spelling corrections for word."
    return (set.union(known_from_full([word]), known(edits1(word)), edits2_(word) ,[word]))

def candidates_all_within1(word):
    
    "Generate possible spelling corrections for word."
    return set.union(known_from_full([word]), known(edits1(word)),[word])

def candidates_all_within1_full(word):
    
    "Generate possible spelling corrections for word."
    return set.union(known_from_full([word]), known(edits1(word)),[word])


def candidates_all_within1_full_expanded(word):
    
    "Generate possible spelling corrections for word."
   
    return set.union(known_from_full([word]), known_from_WORDS2(edits1(word)),[word])


def correction(word):
    "Most probable spelling correction for word."
    return max(candidates_ordered(word), key=probability)


def correctize(sentence, prior='bigram'):
    "Corrects the given 'sentence' using minimum edit"
    tokens = words(sentence)
    candidates = []    
    for _ in tokens:
        candidates.append(list(candidates_all_within1(_)))
    candidate_sentences = list(itertools.product(*candidates))
    #candidate_count = [len(_) for _ in candidate_sentences]
    
    if prior == 'trigram':
        #trigram tokens for possible sentences
        tri_tokens = [words_trigram(' '.join(sentence)) for sentence in candidate_sentences]
        tri_token_probab = []

        for row in tri_tokens:
            tri_token_probab.append([probability_trigram(_) for _ in row])
            
        sentence_likelihood = likelihood(sentence)
        sentences_probab_post = [math.prod(row)*sentence_likelihood for row in tri_token_probab]
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)

        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    
    if prior == 'bigram':
        #bigram tokens for possible sentences
        bi_tokens = [words_bigram(' '.join(sentence)) for sentence in candidate_sentences]
        bi_token_probab = []
        for row in bi_tokens:
            bi_token_probab.append([probability_bigram(_) for _ in row])  
        sentence_likelihood = likelihood(sentence)
        
        sentences_probab_post = [math.prod(row)*sentence_likelihood for row in bi_token_probab]
        #sentences_probab_post = [math.prod(row)*likelihood2(sentence,candidate_sentence,candidate_count) for row,candidate_sentence in zip(bi_token_probab,candidate_sentences)]
        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
    #return candidate_sentences[sentences_probab.index(max(sentences_probab))]
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted

def correctize_entire_with_time(sentence, p_lambda = 1,prior='bigram',tokenized = False):
    "Corrects the given 'sentence' using minimum edit"
    import time

    t_start = time.time()
    tokens = words(sentence)
    start1 = time.time()
    candidates = []    
    for _ in tokens:
        candidates.append(list(filter(lambda word: word in tokens or WORDS2[word]>1000 ,list(candidates_all_within1_full_expanded(_)))))
    candidate_count = [len(_) for _ in candidates]  
    print(candidate_count[0:len(candidates)])      
    end1 = time.time()
    print("Time passed", end1-start1,"sec")
    
    start1 = time.time()
    candidate_sentences = list(itertools.product(*candidates))
    end1 = time.time()
    print("Time passed", end1-start1,"sec")


    
    if prior == 'trigram':
        #bigram tokens for possible sentences
        tri_tokens = [words_trigram(' '.join(_)) for _ in candidate_sentences]
        tri_token_probab = []

        for row in tri_tokens:
            tri_token_probab.append([probability_trigram(_) for _ in row])
            
        #sentence_likelihood = likelihood(sentence)
        #sentences_probab = [math.prod(row) for row in tri_token_probab]
        sentences_probab_post = [math.prod(row)*constant_distributive_likelihood(sentence,candidate_sentence,candidate_count) for row,candidate_sentence in zip(tri_token_probab,candidate_sentences)]
        # sorted_index = numpy.argsort(sentences_probab)
        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
    #return candidate_sentences[sentences_probab.index(max(sentences_probab))]
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    
    if prior == 'bigram':
        start1 = time.time()
        #bigram tokens for possible sentences
        bi_tokens = [words_bigram(' '.join(_)) for _ in candidate_sentences]
        #bi_tokens = [[a,b for zip(_[:-1],_[1:])] for _ in candidate_sentences]
        end1 = time.time()
        print("Time passed", end1-start1,"sec")
        
        bi_token_probab = []
        start1 = time.time()
        for row in bi_tokens:
            bi_token_probab.append([probability_bigram(_) for _ in row])  
            #sentence_likelihood_ = likelihood2(sentence,candidate_sentences)
        end1 = time.time()
        print("Time passed", end1-start1,"sec")
        #sentence_likelihood = likelihood(sentence)
        
        start1 = time.time()
        # sentences_probab_post = [math.prod(row)*sentence_likelihood for row in bi_token_probab]
        sentences_probab_post = [math.log((math.prod(row)**p_lambda)*constant_distributive_likelihood(sentence,candidate_sentence,candidate_count)) for row,candidate_sentence in zip(bi_token_probab,candidate_sentences)]
        #sentences_log_probab = [math.ln(m) for m in sentences_probab_post]
        end1 = time.time()
        print("Time passed", end1-start1,"sec")
        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
        

        t_end = time.time()
        print("Total Time passed", t_end-t_start,"sec")
    #return candidate_sentences[sentences_probab.index(max(sentences_probab))]
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    

def correctize_entire(sentence, p_lambda = 1,prior='bigram',tokenized = False):
    "Corrects the given 'sentence' using minimum edit"

    tokens = words(sentence)

    candidates = []    
    for _ in tokens:
        candidates.append(list(filter(lambda word: word in tokens or WORDS2[word]>1000 ,list(candidates_all_within1_full_expanded(_)))))
    candidate_count = [len(_) for _ in candidates]  
   
    candidate_sentences = list(itertools.product(*candidates))



    
    if prior == 'trigram':
        #bigram tokens for possible sentences
        tri_tokens = [words_trigram(' '.join(_)) for _ in candidate_sentences]
        tri_token_probab = []

        for row in tri_tokens:
            tri_token_probab.append([probability_trigram(_) for _ in row])
            
        #sentence_likelihood = likelihood(sentence)
        #sentences_probab = [math.prod(row) for row in tri_token_probab]
        sentences_probab_post = [math.prod(row)*constant_distributive_likelihood(sentence,candidate_sentence,candidate_count) for row,candidate_sentence in zip(tri_token_probab,candidate_sentences)]
        # sorted_index = numpy.argsort(sentences_probab)
        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
    #return candidate_sentences[sentences_probab.index(max(sentences_probab))]
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    
    if prior == 'bigram':

        #bigram tokens for possible sentences
        bi_tokens = [words_bigram(' '.join(_)) for _ in candidate_sentences]
        #bi_tokens = [[a,b for zip(_[:-1],_[1:])] for _ in candidate_sentences]

  
        
        bi_token_probab = []
   
        for row in bi_tokens:
            bi_token_probab.append([probability_bigram(_) for _ in row])  
            #sentence_likelihood_ = likelihood2(sentence,candidate_sentences)

        #sentence_likelihood = likelihood(sentence)

        # sentences_probab_post = [math.prod(row)*sentence_likelihood for row in bi_token_probab]
        sentences_probab_post = [math.log((math.prod(row)**p_lambda)*constant_distributive_likelihood(sentence,candidate_sentence,candidate_count)) for row,candidate_sentence in zip(bi_token_probab,candidate_sentences)]
        #sentences_log_probab = [math.ln(m) for m in sentences_probab_post]

        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
        

    #return candidate_sentences[sentences_probab.index(max(sentences_probab))]
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted

def tupler(x):
        return tuple(x.split())
    
def logprob(ngram,kn_lm2,minimum):
    if ngram in kn_lm2.lm[0]:
        return kn_lm2.lm[0][ngram]
    return minimum

def score_sent(sent):
    """
    Return log prob of the sentence.

    Params:
        sent [tuple->string] The words in the unpadded sentence.
    """
    padded = (
        ('<s>',) * (2 - 1) + sent)
    sent_logprob = 0
    for i in range(len(sent) - 2 + 1):
        ngram = sent[i:i+2]
        sent_logprob += logprob(ngram)
    return sent_logprob

def correctize_entire_knlm(sentence, model,p_lambda = 1,prior='bigram',tokenized = False):
    "Corrects the given 'sentence' using minimum edit"

    tokens = words(sentence)

    candidates = []    
    for _ in tokens:
        candidates.append(list(filter(lambda word: word in tokens or WORDS2[word]>5000,list(candidates_all(_)))))
    candidate_count = [len(_) for _ in candidates]  
   
    candidate_sentences = list(itertools.product(*candidates))


    minimum = min(model.lm[0].values())
    
    if prior == 'trigram':
        #bigram tokens for possible sentences
        tri_tokens = [words_trigram(' '.join(_)) for _ in candidate_sentences]
        tri_token_probab = []

        for row in tri_tokens:
            tri_token_probab.append([logprob(tuple(_),model,minumum) for _ in row])
        sentences_probab_post=[(sum(row)*p_lambda)+math.log(constant_distributive_likelihood(sentence,candidate_sentence,candidate_count)) for row,candidate_sentence in zip(tri_token_probab,candidate_sentences)]
        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    
    if prior == 'bigram':

        #bigram tokens for possible sentences
        bi_tokens = [words_bigram(' '.join(_)) for _ in candidate_sentences]
        #bi_tokens = [[a,b for zip(_[:-1],_[1:])] for _ in candidate_sentences]
        bi_token_probab = []
   
        for row in bi_tokens:
            bi_token_probab.append([logprob(tuple(_),model,minimum) for _ in row])  
            
        sentences_probab_post=[(sum(row)*p_lambda)+math.log(constant_distributive_likelihood(sentence,candidate_sentence,candidate_count)) 
                               for row,candidate_sentence in zip(bi_token_probab,candidate_sentences)]

        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
        
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    
def correctize_with_window(sentence,window = 5,p_lambda = 1,prior = 'bigram'):
    tokens = words(sentence)
    if len(tokens) <= window:
        return correctize_entire(sentence,p_lambda=p_lambda,prior = prior)
    else:
        windows = [tokens[n:window+n] for n in range(0,len(tokens),window-1) if window+n <len(tokens)-1]    
        remaining = (window-1)*len(windows)
        windows.append(tokens[remaining:])
        corrects = []
        for _ in windows:
            #corrects.append(correctize3(' '.join(_)))
            d = correctize_entire(' '.join(_),p_lambda=p_lambda,prior = prior)
            corrects.append(d)
        return corrects
    
def correctize_with_window_knlm(sentence,model,window = 5,p_lambda = 1,prior = 'bigram'):
    '''
    
    '''   
    
    tokens = words(sentence)
    if len(tokens) <= window:
        return correctize_entire_knlm(sentence,model,p_lambda=p_lambda,prior = prior)
    else:
        windows = [tokens[n:window+n] for n in range(0,len(tokens),window-1) if window+n <len(tokens)-1]    
        remaining = (window-1)*len(windows)
        windows.append(tokens[remaining:])
        corrects = []
        for _ in windows:
            #corrects.append(correctize3(' '.join(_)))
            d = correctize_entire_knlm(' '.join(_),model,p_lambda=p_lambda,prior = prior)
            corrects.append(d)
        return corrects
    
def return_choices(sample_sentences,model):
    d = correctize_with_window_knlm(sample_sentences,model)
    top_choice,other_choices = print_corrected_sentence(d)


    choices_list=[set() for i in range(len(sample_sentences.split())+1)]
    print(len(choices_list))
    
    const = 0
    for _ in other_choices:
        for sens in _:
            for i,w in enumerate(sens):
                index = i + const
                choices_list[index].add(w)
        const += len(other_choices[0][0])-1

    return choices_list  

    
def print_corrected_sentence(d,j = 0):
    s = ''
    k = []
    if(len(d)>1):
        for i in range(len(d)-1):
            s += ' '.join(d[i][0][j][0:4])
            s+=' '
            k.append(d[i][0][0:5])
    s+=' '.join(d[len(d)-1][0][j])
    k.append(d[len(d)-1][0][0:5])
    return s,k
    #return bi_token_probab
    
    
    

def timer(fun,args):
    import time
    s = time.time()
    k = fun(args)
    e = time.time()
    print("Time taken, : ",e-s," sec")
    return k 

In [476]:
def likelihood_bm(sentence,candidate_sentence,bm):
    prod =1 
    for word,candidate_word in zip(sentence.split(),candidate_sentence): 
        prod*=bm.likelihood(word,candidate_word)
        
    return prod

In [2]:
candidates_all('प्काश')

{'अवकाश',
 'आकाश',
 'पक्का',
 'पदका',
 'पाका',
 'प्काश',
 'प्याक',
 'प्याड',
 'प्रकार',
 'प्रकाश',
 'प्रकाशन',
 'प्रा',
 'प्राण',
 'प्राय',
 'प्लान',
 'प्वाल'}

In [3]:
import pickle
with open('data/saved_model_knlm2','rb') as inputfile:
    kn_lm2 = pickle.load(inputfile) 

In [91]:
import textdistance


num = [chr(_) for _ in range(2406,2416)]
char_as_word = [chr(_) for _ in range(2362,2383)]
def filterer(w):
    
    #Remove all the words not seperated with '।'
    if len(w)>1 and '।' in w:
        return False
        
    if w in char_as_word:
        return False
    
    
    #Filter all words with characters not needed
    for char in num:
        if char in w:
            return False
        
    return True

#Filter all words with characters not needed


# WORDS_ordered = list(dict(WORDS_full.most_common()).keys())

# WORDS_filtered = filter(filterer,WORDS_ordered)
# lis = list(WORDS_filtered)



with open('vocab_list.pickle','rb') as vl:
    lis = pickle.load(vl)
l = len(lis)


depth_dict = {0:lis[0:int(0.01*l)],1:lis[int(0.01*l):int(0.02*l)],2:lis[int(0.02*l):int(0.05*l)] , 3:lis[int(0.05*l):int(0.1*l)] , 4:lis[int(0.1*l):int(0.5*l)],
5:lis[int(0.5*l):l]}


def check_distance(w, depth = 1,edit_distance = 2,candidates = []):
    '''
    
    
    '''
    count = 0
    #candidates = []
    words = depth_dict[depth]
    for word in words:
        if(textdistance.levenshtein.distance(w, word)) <= edit_distance:
            count+=1
            candidates.append(word)
    return (candidates,count)



def check_distance2(w, depth = 1,edit_distance = 2):
    '''
    
    
    
    '''
    words = depth_dict[depth]
    candidates = list(filter(lambda word:textdistance.levenshtein.distance(w, word) <= edit_distance,words))        
    return (candidates,len(candidates))

In [325]:
# with open('vocab_list.pickle','wb') as vl:
#     pickle.dump(lis,vl)
    


In [362]:
0.05*1171236

58561.8

In [270]:
print(l)

def candidate_words(word,minimum = 1,start_depth = 0):
    '''
    
    '''
    c = None
    #If word length is less than 3 than only use edit distance of 1 or less
    if len(word)<=3:
        c,c_ = check_distance(word,depth = start_depth,edit_distance = 1,candidates = [])
        for i in range(len(depth_dict)-1):
            if c_ < minimum:
                c,c_ = check_distance(word, depth = start_depth+i+1,edit_distance = 1,candidates = c)
                
    #If word length is more than 3 than use edit distance of 2 or less
    else:        
        c,c_ = check_distance(word, depth = start_depth,candidates = [])
        for i in range(len(depth_dict)-1):
            if c_ < minimum:
                print("Entered depth, ",i+1)
                c,c_ = check_distance(word, depth = start_depth+i+1,candidates = c)
    
    #Filter 2 edit of type delete
    c = list(filter(lambda w: len(w)>=len(word)-1,c))
    if word not in c:
        c.append(word)
    return c

1171236


In [94]:
#म पुस्तकलयबाटे थुलो किताब पढ्न चाहन्छु ।

word_list = candidate_words('थुलो')
len(word_list)

31

In [95]:
from metaphone import doublemetaphone


doublemetaphone(sanscript.transliterate('थुलो', sanscript.DEVANAGARI, sanscript.ITRANS))[0]


'0L'

In [96]:
# %pip install romanize
# %pip install metaphone

# %pip install indic-transliteration
# #from transliterate import translit

from indic_transliteration import sanscript



def phonetic_distance(word,word_list,top = 5,include_metaphone = False):
    english_text = sanscript.transliterate(word, sanscript.DEVANAGARI, sanscript.ITRANS)
    m = []
    m1 = doublemetaphone(english_text)
    for w in word_list:
        english_text2 = sanscript.transliterate(w, sanscript.DEVANAGARI, sanscript.ITRANS)    
        
        if include_metaphone!=True:
            m.append(textdistance.levenshtein.distance(english_text.lower(),english_text2.lower()))
        else:
            m2 = doublemetaphone(english_text2)
            m.append(textdistance.levenshtein.distance(m1,m2))
            
        #m.append(textdistance.sorensen_dice(english_text.lower(),english_text2.lower()))        
    sorted_list = list(sorted(zip(m,word_list)))
    top_list = [x for _,x in sorted_list]
    if len(top_list)<top:
        return_list = top_list
    else:
        top_dis = sorted_list[top-1][0]
        return_list = [x for _,x in sorted_list if _<=top_dis ]
    return return_list

In [177]:
import textdistance

class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

class Trie:
    def __init__(self):
        self.root = TrieNode()


    def insert(self, word):
        current = self.root
        for char in word:
            if char not in current.children:
                current.children[char] = TrieNode()
            current = current.children[char]
        current.is_end_of_word = True
        
    def insert_list(self,lis):
        for w in lis:
            self.insert(w)
        return self

    def search(self, word, max_distance):
        current = self.root
        queue = [(current, "", 0)]
        found_words = []
        
        while queue:
            current, current_word, distance = queue.pop(0)
            if current.is_end_of_word:
                if distance <= max_distance:
                    found_words.append(current_word)
#                 else:
#                     candidate_words.append(current_word)
            for char, node in current.children.items():
                new_word = current_word + char
                new_distance = textdistance.levenshtein.distance(word, new_word)
                if new_distance <= int(abs(len(new_word) - len(word)))+2:
                    queue.append((node, new_word, new_distance))
            #print(list(queue))
        return found_words
    
# t = Trie()
# for w in lis[:110000]:
#     t.insert(w)
trie_depth = {0:Trie().insert_list(lis[0:int(0.01*l)]),
               1:Trie().insert_list(lis[int(0.01*l):int(0.02*l)]),
               2:Trie().insert_list(lis[int(0.02*l):int(0.05*l)]),
               3:Trie().insert_list(lis[int(0.05*l):int(0.1*l)]),
               4:Trie().insert_list(lis[int(0.1*l):int(0.2*l)]),
             5:Trie().insert_list(lis[int(0.1*l):int(0.5*l)]),
             6:Trie().insert_list(lis[int(0.5*l):])}

In [216]:
t = Trie()
for w in lis[:110000]:
    t.insert(w)

In [640]:
def check_distance_trie(w, depth = 1,edit_distance = 2,candidates = []):
    '''
    
    
    '''
    count = 0
    #candidates = []
    candidates = list(trie_depth[depth].search(w,edit_distance))

    return (candidates,len(candidates))

In [641]:
def candidate_words_trie(word,minimum = 1,start_depth = 0,edit_probabs = None):
    '''
    
    '''
    if edit_probabs == None:
        ed1 = 1
        ed2 = 2
    else:
        ed1 = edit_probabs[0]
        ed2 = edit_probabs[1]
    
    c = None
    #If word length is less than 3 than only use edit distance of 1 or less
    if len(word)<=3:
        c,c_ = check_distance_trie(word,depth = start_depth,edit_distance = ed1,candidates = [])
        for i in range(len(depth_dict)-1):
            if c_ < minimum:
                c,c_ = check_distance_trie(word, depth = start_depth+i+1,edit_distance = ed1,candidates = c)
                
    #If word length is more than 3 than use edit distance of 2 or less
    else:        
        c,c_ = check_distance_trie(word, depth = start_depth,edit_distance = ed2,candidates = [])
        for i in range(len(depth_dict)-1):
            if c_ < minimum:
                print("Entered depth, ",i+1)
                c,c_ = check_distance_trie(word, depth = start_depth+i+1,edit_distance = ed2,candidates = c)
    
    #Filter 2 edit of type delete
    c = list(filter(lambda w: len(w)>=len(word)-1,c))
    if word not in c:
        c.append(word)
    return c

In [222]:
doublemetaphone(sanscript.transliterate('हात', sanscript.DEVANAGARI, sanscript.ITRANS))[0]

'HT'

In [642]:
def final_candidate_words(word,minimum =1,top = 5,start_depth =0 ,use_trie = False):
    
    import time
    s = time.time()
    
    c = candidate_words(word,minimum = minimum,start_depth = start_depth) if use_trie == False else candidate_words_trie(word,minimum = minimum,start_depth = start_depth)
      
    
    if len(c) <6:
        e = time.time()
        print("time passed fc-: ",e-s)
        print(c)
        return c
    else:
        e = time.time()
        print("time passed fc: ",e-s)
        print(phonetic_distance(word,c,top = top))
        return phonetic_distance(word,c,top = top)
    e = time.time()

def final_candidate_words_trie(word,minimum =1,top = 5,start_depth =0 ):

    import time
    s = time.time()

    c = list(t.search(word,2))


    if len(c) <6:
        e = time.time()
        print("time passed fc: ",e-s)
        print(c)
        return c
    else:
        e = time.time()
        print("time passed fc: ",e-s)
        print(phonetic_distance(word,c,top = 10))
        return phonetic_distance(word,c,top = top)
    e = time.time()

    

In [209]:
list(t.search(word,2))

['रो',
 'मे',
 'मो',
 'मर',
 'परो',
 'गरो',
 'हेर',
 'नेर',
 'एरो',
 'करो',
 'बेर',
 'बरो',
 'मार',
 'मेल',
 'मेड',
 'मेस',
 'मेच',
 'मेट',
 'मेक',
 'मेन',
 'मेघ',
 'में',
 'मेष',
 'मेङ',
 'मेख',
 'महर',
 'मनो',
 'मतो',
 'मोर',
 'मर्',
 'मरे',
 'मरि',
 'मरी',
 'मरु',
 'मरण',
 'मरो',
 'मगर',
 'मदर',
 'मकर',
 'मको',
 'मीर',
 'मटर',
 'वेर',
 'सरो',
 'सेर',
 'दरो',
 'धेर',
 'चरो',
 'जरो',
 'फेर',
 'शेर',
 'टेर',
 'खरो',
 'खेर',
 'डेर',
 'झरो',
 'रेको',
 'रेनो',
 'रमेर',
 'छोरो',
 'छारो',
 'छेको',
 'छेलो',
 'प्रो',
 'पहरो',
 'पारो',
 'पेसो',
 'पेरु',
 'पेरी',
 'पीरो',
 'पिरो',
 'भेरी',
 'भेडो',
 'लेदो',
 'लहरो',
 'लौरो',
 'गोरो',
 'गारो',
 'गमेर',
 'गेसो',
 'गेडो',
 'गेरु',
 'गेरी',
 'हिरो',
 'हेर्',
 'हेरे',
 'हेरि',
 'हेरौ',
 'हेरी',
 'हेरक',
 'हेरर',
 'हेरै',
 'हेलो',
 'हेको',
 'युरो',
 'यूरो',
 'येरी',
 'उमेर',
 'तेसो',
 'तेरो',
 'तेर्',
 'तेरा',
 'तेरी',
 'तेरे',
 'तेरै',
 'तारो',
 'तिरो',
 'थारो',
 'थ्रो',
 'थेगो',
 'नेको',
 'नेटो',
 'नेरी',
 'नेरु',
 'नेरा',
 'कुरो',
 'केहो',
 'केटो'

In [181]:
len(final_candidate_words('हात'))

time passed fc:  0.2910580635070801
['हात', 'अत', 'खाता', 'गत', 'छाता', 'जात', 'दाता', 'नत', 'नाता', 'पात', 'बाट', 'भात', 'मत', 'माता', 'रात', 'सात', 'साता', 'हक', 'हद', 'हब', 'हल', 'हाते', 'हार', 'हाल', 'हावा', 'हिट', 'हित']


27

In [182]:
len(final_candidate_words('हात',use_trie = True))

time passed fc:  0.19330644607543945
['हात', 'अत', 'खाता', 'गत', 'छाता', 'जात', 'दाता', 'नत', 'नाता', 'पात', 'बाट', 'भात', 'मत', 'माता', 'रात', 'सात', 'साता', 'हक', 'हद', 'हब', 'हल', 'हाते', 'हार', 'हाल', 'हावा', 'हिट', 'हित']


27

In [183]:
final_candidate_words('पुस्तकलयबाटे')

Entered depth,  1
Entered depth,  2
Entered depth,  3
time passed fc:  9.418993949890137
['पुस्तकालयबाट', 'पुस्तकलयबाटे']


['पुस्तकालयबाट', 'पुस्तकलयबाटे']

In [184]:
final_candidate_words('पुस्तकलयबाटे',use_trie = True)

Entered depth,  1
Entered depth,  2
Entered depth,  3
time passed fc:  4.098645448684692
['पुस्तकालयबाट', 'पुस्तकलयबाटे']


['पुस्तकालयबाट', 'पुस्तकलयबाटे']

In [146]:
phonetic_distance('थुलो',word_list)

['ठुलो', 'ठूलो', 'थुलो', 'ठुला', 'थलो', 'धुलो', 'धूलो', 'कुलो', 'खुला', 'झुटो']

In [147]:
english_text2 = sanscript.transliterate('नीलो', sanscript.DEVANAGARI, sanscript.IAST) 
e3 = sanscript.transliterate('हुँ', sanscript.DEVANAGARI, sanscript.ITRANS)
e4 = sanscript.transliterate('hu.n',  sanscript.ITRANS,sanscript.DEVANAGARI)
english_text2, e3,e4 , 'hu.N'.lower()


('nīlo', 'hu.N', 'हुं', 'hu.n')

In [77]:
s = [x for _,x in sorted(zip(m,word_list))]

In [78]:
sorted(zip(m,word_list))

[(0, 'थुलो'),
 (2, 'कुलो'),
 (2, 'ठुलो'),
 (2, 'थलो'),
 (2, 'धुलो'),
 (4, 'कालो'),
 (4, 'किलो'),
 (4, 'कुरो'),
 (4, 'कुल'),
 (4, 'खुला'),
 (4, 'जालो'),
 (4, 'जुडो'),
 (4, 'झुटो'),
 (4, 'ठुला'),
 (4, 'ठूलो'),
 (4, 'ढिलो'),
 (4, 'थाले'),
 (4, 'थियो'),
 (4, 'धूलो'),
 (4, 'नीलो'),
 (4, 'नौलो'),
 (4, 'पालो'),
 (4, 'पुल'),
 (4, 'पुलको'),
 (4, 'फलो'),
 (4, 'फुल'),
 (4, 'मुल'),
 (4, 'युरो'),
 (4, 'सुलभ'),
 (4, 'हलो'),
 (4, 'हिलो')]

In [493]:
del sum

In [558]:
bm.edit_dict

defaultdict(int,
            {(None, 'ि'): 0.03423718960219251,
             ('फ', 'प'): 0.25680933852140075,
             ('व', 'म'): 0.01584953508030431,
             ('ध', 'म'): 0.09346846846846847,
             ('अ', 'आ'): 0.07934443288241415,
             ('ी', 'ि'): 0.32407647272162066,
             ('म', None): 0.14282997337390643,
             (None, 'ी'): 0.09774935636575036,
             ('ा', 'ो'): 0.4745404199676948,
             (None, 'ा'): 0.16578772527198737,
             (None, 'अ'): 0.03791213354372561,
             ('ल', 'अ'): 0.010130571814497974,
             (None, '्'): 0.12052570384519558,
             (None, 'य'): 0.016651440910223404,
             ('ि', 'ी'): 0.8480255285201436,
             ('ा', None): 0.384816552572879,
             ('क', 'त'): 0.040561095149569036,
             ('ी', None): 0.5893217967980934,
             (None, 'न'): 0.07910472552113612,
             ('ऊ', 'उ'): 1.0,
             ('स', 'श'): 0.350964891885608,
             ('अ', None): 0

In [887]:
sorted(bm.edit_dict.values())

[0.0006228718544971348,
 0.0006436342496470393,
 0.0006994456245790374,
 0.0007474462253965618,
 0.000751256411584892,
 0.0008030671985907466,
 0.0008030671985907466,
 0.0008927829914458932,
 0.0009844049531112379,
 0.0010768402430582262,
 0.0010769844435580375,
 0.0010796445477950336,
 0.0011152988231674487,
 0.0011692605435045561,
 0.001249899201677284,
 0.0013075917237135605,
 0.00141117651802274,
 0.001491815176195468,
 0.0015556441962504986,
 0.0015987044265426459,
 0.0016320397906844206,
 0.001640229216842455,
 0.0017559833506763788,
 0.0017740504798000162,
 0.0017855659828917864,
 0.0018133775452049117,
 0.0018270907731915953,
 0.0018315018315018315,
 0.0019019442096365174,
 0.0019501950195019502,
 0.0019501950195019502,
 0.0019723865877712033,
 0.0019931899343908314,
 0.002056285783404564,
 0.0020762395149904493,
 0.002097001910140354,
 0.0021140805743917034,
 0.002159289095590067,
 0.002175217521752175,
 0.0022019584477488215,
 0.002331485415263458,
 0.002357390808766385,
 0.0

In [1060]:
# del sum
def likelihood_bm(sentence,candidate_sentence):
    prod = 1
    for word,candidate_word in zip(sentence.split(),candidate_sentence):          
        prod*= bma.likelihood(word,candidate_word)
    #return 1
    return prod

def correctize_entire_knlm(sentence, model,p_lambda = 1,prior='bigram',tokenized = False,trie = False,likelihood = 'default'):
    "Corrects the given 'sentence' using minimum edit"

    tokens = words(sentence)

    candidates = []    
    for _ in tokens:
        #candidates.append(list(filter(lambda word: word in tokens or WORDS2[word]>5000,list(candidates_all(_)))))
        candidates.append(final_candidate_words(_,use_trie = trie))
    candidate_count = [len(_) for _ in candidates]  
   
    candidate_sentences = list(itertools.product(*candidates))


    minimum = min(model.lm[0].values())
    
    if prior == 'trigram':
        #bigram tokens for possible sentences
        tri_tokens = [words_trigram(' '.join(_)) for _ in candidate_sentences]
        tri_token_probab = []

        for row in tri_tokens:
            tri_token_probab.append([logprob(tuple(_),model,minumum) for _ in row])
        sentences_probab_post=[(sum(row)*p_lambda)+math.log(constant_distributive_likelihood(sentence,candidate_sentence,candidate_count)) for row,candidate_sentence in zip(tri_token_probab,candidate_sentences)]
        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    
    if prior == 'bigram':

        #bigram tokens for possible sentences
        bi_tokens = [words_bigram(' '.join(_)) for _ in candidate_sentences]
        #bi_tokens = [[a,b for zip(_[:-1],_[1:])] for _ in candidate_sentences]
        bi_token_probab = []
   
        for row in bi_tokens:
            bi_token_probab.append([logprob(tuple(_),model,minimum) for _ in row])  
            
            
        for row in bi_tokens:
            bi_token_probab.append([logprob(tuple(_),model,minimum) for _ in row])  
 

#         sentences_probab_post=[(sum(row)*p_lambda)+math.log(constant_distributive_likelihood(sentence,candidate_sentence,candidate_count)) for row,candidate_sentence in zip(bi_token_probab,candidate_sentences)]


        
 
        if likelihood=='default':
            sentences_probab_post=[(sum(row)*p_lambda) +
                                   math.log(constant_distributive_likelihood(sentence,candidate_sentence,candidate_count)) 
                                   for row,candidate_sentence in zip(bi_token_probab,candidate_sentences)]
        elif likelihood=='bm':
            sentences_probab_post=[(sum(row)*p_lambda) + 
                                    math.log(likelihood_bm(sentence,candidate_sentence)) 
                                    for row,candidate_sentence in zip(bi_token_probab,candidate_sentences)]


        
        sorted_index = numpy.argsort(sentences_probab_post)
        sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
        
        return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    
def correctize_with_window(sentence,window = 5,p_lambda = 1,prior = 'bigram'):
    tokens = words(sentence)
    if len(tokens) <= window:
        return correctize_entire(sentence,p_lambda=p_lambda,prior = prior)
    else:
        windows = [tokens[n:window+n] for n in range(0,len(tokens),window-1) if window+n <len(tokens)-1]    
        remaining = (window-1)*len(windows)
        windows.append(tokens[remaining:])
        corrects = []
        for _ in windows:
            #corrects.append(correctize3(' '.join(_)))
            d = correctize_entire(' '.join(_),p_lambda=p_lambda,prior = prior)
            corrects.append(d)
        return corrects
    
def correctize_with_window_knlm(sentence,model,window = 5,p_lambda = 1,prior = 'bigram',trie = False,likelihood = 'default'):
    '''
    
    '''   
    
    tokens = words(sentence)
    if len(tokens) <= window:
        return correctize_entire_knlm(sentence,model,p_lambda=p_lambda,prior = prior,trie = trie,likelihood = likelihood)
    else:
        windows = [tokens[n:window+n] for n in range(0,len(tokens),window-1) if window+n <len(tokens)-1]    
        remaining = (window-1)*len(windows)
        windows.append(tokens[remaining:])
        corrects = []
        for _ in windows:
            #corrects.append(correctize3(' '.join(_)))
            d = correctize_entire_knlm(' '.join(_),model,p_lambda=p_lambda,prior = prior,trie = trie,likelihood = likelihood)
            corrects.append(d)
        return corrects

In [1061]:
sample_sentences = ['हरेक सेपालीले नेपामको संविधानक पालना गर्नुपर्छ ।' ,
                    'म पुस्तकलयबाटे थुलो किताब पढ्न चाहन्छु ।',
                    'तर उस समयमा पनि स्वस्थ राजनैतिक वातावरणको अभावले गर्दा देश विकासतर्फ विशेष प्रगति हुन  सकेन।',
                   'नेपालमा आधुनिक रुपमा आर्थक विकाससम्बन्धी कार्यरू प्रारम्भ भएको हालै मात्र हो।',
                   'हार धुनुहोस् र स्वास्थ जीवन जिउनुहोस्।',
                   'जब प्रवीधिहरू एकीकृत हुन सूरु गर्छन् अर्थतन्त्र तथ सँस्कृति पनि निश्चितरूपमा विस्तारै एकीकृत हुने छ।',
                   'उद्देयहरुमा पनि कुनै एक उद्देश्य पूर्ति नहुँदै अर्को नयाँ  उद्देश्यको रुपमा लिइने परम्परा बस्यो।',
                   'लगानीकर्ताहरूको धयान तुरुन्त फेरियो , व्यापारीहरूले वताए ।']

In [1066]:
wc,wp = return_choices2(sample_sentences[4],model = kn_lm2,p_lambda = 0.8,likelihood = 'bm',trie = True)

time passed fc:  0.25834131240844727
['हार', 'कार', 'चार', 'तार', 'धार', 'पार', 'बार', 'भार', 'मार', 'सार', 'हात', 'हाल', 'हेर']
Entered depth,  1
time passed fc-:  0.4378623962402344
['दिनुहोस्', 'धुनुहोस्']
time passed fc:  0.4358210563659668
['र', 'अ', 'आ', 'आर', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'म', 'रु', 'रे', 'र्', 'ल', 'व', 'स']
time passed fc-:  0.2124311923980713
['स्वस्थ', 'अस्वस्थ', 'स्वार्थ', 'स्वास्थ्य', 'स्वास्थ']
time passed fc:  0.08077573776245117
['जिवन', 'जीवन', 'आजीवन', 'जवान', 'जीवन्त']
time passed fc:  0.07698297500610352
['जिवन', 'जीवन', 'आजीवन', 'जवान', 'जीवन्त']
Entered depth,  1
time passed fc-:  0.5449841022491455
['दिनुहोस्', 'जिउनुहोस्']
time passed fc:  0.4259011745452881
['।', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ओ']


In [1067]:
list(zip(wc,wp))

[([('हात', 'धुनुहोस्', 'र', 'स्वस्थ', 'जीवन'),
   ('हात', 'धुनुहोस्', 'र', 'स्वास्थ', 'जीवन'),
   ('हात', 'धुनुहोस्', 'र', 'स्वास्थ्य', 'जीवन'),
   ('हात', 'धुनुहोस्', 'र', 'अस्वस्थ', 'जीवन'),
   ('हार', 'धुनुहोस्', 'र', 'स्वस्थ', 'जीवन'),
   ('हात', 'धुनुहोस्', 'र', 'स्वस्थ', 'जिवन'),
   ('हार', 'धुनुहोस्', 'र', 'स्वास्थ', 'जीवन'),
   ('हाल', 'धुनुहोस्', 'र', 'स्वस्थ', 'जीवन'),
   ('धार', 'धुनुहोस्', 'र', 'स्वस्थ', 'जीवन'),
   ('बार', 'धुनुहोस्', 'र', 'स्वस्थ', 'जीवन')],
  [-66.23554771478263,
   -69.30151037576965,
   -73.15026525775431,
   -73.39893732276612,
   -74.60941926250624,
   -76.06717291599435,
   -77.67538192349325,
   -79.98112004004228,
   -80.01980570701275,
   -80.18057892760051]),
 ([('जीवन', 'जिउनुहोस्', '।'),
   ('जिवन', 'जिउनुहोस्', '।'),
   ('आजीवन', 'जिउनुहोस्', '।'),
   ('जीवन्त', 'जिउनुहोस्', '।'),
   ('जीवन', 'दिनुहोस्', '।'),
   ('जवान', 'जिउनुहोस्', '।'),
   ('जिवन', 'दिनुहोस्', '।'),
   ('आजीवन', 'दिनुहोस्', '।'),
   ('जीवन्त', 'दिनुहोस्', '।'),
   ('जवान'

In [855]:
'सँस्कृति' in lines

True

In [962]:
bm.edit_dict

defaultdict(int,
            {(None, 'ि'): 0.006847437920438501,
             ('फ', 'प'): 0.05136186770428014,
             ('व', 'म'): 0.0031699070160608614,
             ('ध', 'म'): 0.01869369369369369,
             ('अ', 'आ'): 0.015868886576482825,
             ('ी', 'ि'): 0.06481529454432412,
             ('म', None): 0.02856599467478128,
             (None, 'ी'): 0.01954987127315007,
             ('ा', 'ो'): 0.09490808399353894,
             (None, 'ा'): 0.03315754505439747,
             (None, 'अ'): 0.00758242670874512,
             ('ल', 'अ'): 0.0020261143628995944,
             (None, '्'): 0.02410514076903911,
             (None, 'य'): 0.00333028818204468,
             ('ि', 'ी'): 0.16960510570402867,
             ('ा', None): 0.07696331051457578,
             ('क', 'त'): 0.008112219029913806,
             ('ी', None): 0.11786435935961866,
             (None, 'न'): 0.01582094510422722,
             ('ऊ', 'उ'): 0.19999999999999996,
             ('स', 'श'): 0.07019297837712157,


In [953]:
bm.likelihood('पुस्तकलयबाटे','पुस्तकालयबाट')

0.002598238292258348

In [960]:
bm.likelihood_from_list((None,),['ा'])

[(0.03315754505439747, 'ा')]

In [1055]:
extract_choices(sample_sentences[1],model = kn_lm2,p_lambda=0.3,likelihood = 'bm',trie = True)

time passed fc:  0.7268452644348145
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे', 'मै', 'र', 'ल', 'व', 'स']
Entered depth,  1
Entered depth,  2
Entered depth,  3
time passed fc-:  7.492818117141724
['पुस्तकालयबाट', 'पुस्तकलयबाटे']
time passed fc:  0.15712642669677734
['ठुलो', 'ठूलो', 'थुलो', 'ठुला', 'थलो', 'धुलो', 'धूलो']
time passed fc:  0.22457361221313477
['किताब', 'किटान', 'कतार', 'किताबमा', 'किनार', 'किरात', 'किसान', 'पिसाब', 'हिसाब']
time passed fc:  0.18586492538452148
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 'पर्न', 'पवन', 'पस्न', 'पाउन', 'पाएन', 'पाटन', 'पान', 'पालन', 'बच्न', 'बढ्नु', 'बढ्ने', 'लड्न']
time passed fc:  0.19008088111877441
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 'पर्न', 'पवन', 'पस्न', 'पाउन', 'पाएन', 'पाटन', 'पान', 'पालन', 'बच्न', 'बढ्नु', 'बढ्ने', 'लड्न']
time passed fc:  0.32

[{'क', 'ज', 'न', 'म', 'मा', 'मि', 'र', 'व', 'स'},
 {'पुस्तकलयबाटे'},
 {'थुलो', 'धुलो'},
 {'किताब'},
 {'पढाउन', 'पढ्न', 'पढ्ने', 'पस्न', 'बढ्न', 'लड्न'},
 {'चाहन्छ', 'चाहन्छन्', 'चाहन्छु'},
 {'ः', '।'},
 set()]

In [799]:
wc,wp= return_choices2(sample_sentences[5],model = kn_lm2,p_lambda=0.4,likelihood = 'default',trie = True)

time passed fc:  0.5874612331390381
['जब', 'अब', 'जग', 'जन', 'जय', 'जल', 'जस', 'तब', 'सब', 'हब']
Entered depth,  1
Entered depth,  2
time passed fc-:  1.23466157913208
['प्रविधिहरू', 'प्रवीधिहरू']
time passed fc-:  0.1077280044555664
['एकीकृत', 'एकीकरण']
time passed fc:  0.13761591911315918
['हुन', 'कुन', 'छुन', 'जुन', 'धुन', 'नुन', 'पुन', 'रुन', 'सुन', 'हुनु', 'हुने', 'हुनै', 'हुन्']
time passed fc:  0.11174654960632324
['सुरु', 'सुरू', 'सूरु', 'गुरु', 'शुरु']
time passed fc:  0.10272908210754395
['सुरु', 'सुरू', 'सूरु', 'गुरु', 'शुरु']
time passed fc:  0.13962650299072266
['गर्छन्', 'गर्छन', 'गर्छिन्', 'गर्छौ', 'पर्छन्']
time passed fc-:  0.2652902603149414
['अर्थतन्त्र', 'अर्थमन्त्री', 'अर्थतन्त्रको', 'अर्थतन्त्रका', 'अर्थतन्त्रमा']
time passed fc:  0.5764565467834473
['तथ', 'तथा', 'तह', 'रथ', 'तब', 'तय', 'तर', 'तल']
time passed fc-:  0.2124319076538086
['संस्कृत', 'संस्कृति', 'साँस्कृतिक', 'सँस्कृति']
time passed fc-:  0.20142722129821777
['संस्कृत', 'संस्कृति', 'साँस्कृतिक', 'सँस्

In [391]:
d = correctize_with_window_knlm(sample_sentences[1],kn_lm2)
top_choice,other_choices = print_corrected_sentence(d)
print("corrected:",top_choice)
print(other_choices,"\n")
    

time passed fc:  0.1595466136932373
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे', 'मै', 'र', 'ल', 'व', 'स']
Entered depth,  1
Entered depth,  2
Entered depth,  3
time passed fc:  9.314621686935425
['पुस्तकालयबाट', 'पुस्तकलयबाटे']
time passed fc:  0.3435852527618408
['ठुलो', 'ठूलो', 'थुलो', 'ठुला', 'थलो', 'धुलो', 'धूलो', 'कुलो', 'खुला', 'झुटो', 'ढिलो', 'थाले', 'थियो', 'नौलो', 'फलो', 'फुल', 'हलो', 'हिलो']
time passed fc:  0.39594030380249023
['किताब', 'किटान', 'कतार', 'किताबमा', 'किनार', 'किरात', 'किसान', 'पिसाब', 'हिसाब', 'किरा', 'पिता']
time passed fc:  0.33409857749938965
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 'पर्न', 'पवन', 'पस्न', 'पाउन', 'पाएन', 'पाटन', 'पान', 'पार्न', 'पालन', 'पाल्न', 'बच्न', 'बढ्नु', 'बढ्ने', 'लड्न']
time passed fc:  0.33011603355407715
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 

In [645]:
def return_choices(sample_sentences,model,trie = False,likelihood = 'default'):
    import time
    
    s = time.time()
    d = correctize_with_window_knlm(sample_sentences,model,trie = trie,likelihood = 'default')
    top_choice,other_choices = print_corrected_sentence(d)


    choices_list=[set() for i in range(len(sample_sentences.split())+1)]
    print(len(choices_list))

    const = 0
    for _ in other_choices:
        for sens in _:
            for i,w in enumerate(sens):
                index = i + const
                choices_list[index].add(w)
        const += len(other_choices[0][0])-1
    e = time.time()
    
    print("Time Passed:", e-s)
    return choices_list

In [724]:
def return_choices2(sample_sentences,model,p_lambda = 1,trie = False,likelihood = 'default'):
    d = correctize_with_window_knlm(sample_sentences,model,p_lambda =p_lambda,trie = trie,likelihood = likelihood)
    window_candidates = []
    window_probab = []
    for window in d:
        maxim = min(len(window[0]),10)
        top_candidates = window[0][:maxim]
        window_candidates.append(top_candidates)
        window_probab.append(window[1][:maxim])
    return window_candidates,window_probab
        
        
    

In [766]:
def extract_choices(sample_sentences,model,p_lambda = 1,trie = False,likelihood = 'default'):
    wc,wp = return_choices2(sample_sentences,model,p_lambda = p_lambda,trie = trie ,likelihood = likelihood)
    
    choices_list=[set() for i in range(len(sample_sentences.split())+1)]
    print(len(choices_list))

    const = 0
    for _ in wc:
        for sens in _:
            for i,w in enumerate(sens):
                index = i + const
                choices_list[index].add(w)
        const += len(wc[0][0])-1
    return choices_list
        
    

In [651]:
phonetic_distance('उस',candidate_words_trie('उस'))

['उस', 'आस', 'उप', 'उसो', 'एस', 'स']

In [628]:
d = correctize_with_window_knlm(sample_sentences[2],model=kn_lm2,window = 5,p_lambda = 1,prior = 'bigram')

time passed fc:  0.22739076614379883
['तर', 'तार', 'आर', 'कर', 'गर', 'डर', 'तब', 'तय', 'तल', 'तह', 'तिर', 'थर', 'दर', 'पर', 'सर']
time passed fc:  0.21043658256530762
['उस', 'आस', 'उप', 'उसो', 'एस', 'स']
time passed fc:  0.4029214382171631
['समयमा', 'समयका', 'समयमै', 'समाजमा', 'संघमा', 'सडकमा', 'सतहमा', 'सदनमा', 'समयको', 'समयलाई', 'समयले', 'समयसीमा', 'समूहमा', 'सम्ममा', 'सहरमा']
time passed fc:  0.290743350982666
['पनि', 'अनि', 'पति', 'उनि', 'नि', 'पछि', 'भनि']
time passed fc:  0.4647557735443115
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'स्वार्थ', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc:  0.44738245010375977
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'स्वार्थ', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc-:  0.5833919048309326
['राजनीतिक', 'राजनीति', 'राजनीतिको', 'राजनैतिक', 'राजनीतिका']
time passed fc-:  0.6463034152984619
['वातावरणीय', 'वातावरणमा', 'वातावरणको']
time passed fc-:  0.4727017879486084
['अभावमा', 'अभावले', 'अभावका', 'सभाले', 'अभावको']
time passed fc:  0.411896944

In [833]:

extract_choices(sample_sentences[2],model=kn_lm2,p_lambda = 0.22,trie = True,likelihood = 'bm')

time passed fc:  0.5520272254943848
['तर', 'तार', 'आर', 'कर', 'गर', 'डर', 'तब', 'तय', 'तल', 'तह', 'तिर', 'थर', 'दर', 'पर', 'सर']
time passed fc:  0.5505266189575195
['उस', 'आस', 'उप', 'उसो', 'एस', 'स']
time passed fc:  0.12270736694335938
['समयमा', 'समयका', 'समयमै', 'संघमा', 'सडकमा', 'सतहमा', 'सदनमा', 'समयको', 'समयलाई', 'समयले', 'समयसीमा', 'सम्ममा', 'सहरमा']
time passed fc:  0.1914536952972412
['पनि', 'अनि', 'पति', 'उनि', 'नि', 'पछि', 'भनि']
time passed fc:  0.14162039756774902
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc:  0.13763117790222168
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc-:  0.21642065048217773
['राजनीति', 'राजनीतिक', 'राजनैतिक', 'राजनीतिको', 'राजनीतिका']
time passed fc-:  0.2433490753173828
['वातावरणीय', 'वातावरणमा', 'वातावरणको']
time passed fc-:  0.15209746360778809
['सभाले', 'अभावमा', 'अभावले', 'अभावका', 'अभावको']
time passed fc:  0.1406235694885254
['गर्दा', 'गर्दै', 'गर्न', 'गर्

[{'आर'},
 {'एस'},
 {'संघमा', 'समयमा', 'समयलाई', 'समयले', 'सहरमा'},
 {'पनि'},
 {'अस्वस्थ', 'स्वच्छ', 'स्वर्ण', 'स्वस्थ'},
 {'राजनीति', 'राजनीतिक'},
 {'वातावरणको'},
 {'अभावमा', 'अभावले'},
 {'गर्दा', 'गर्न', 'पर्दा'},
 {'देश'},
 {'विकासतर्फ'},
 {'विशेष'},
 {'प्रगति', 'प्रगतिको', 'प्रगाढ', 'प्रति', 'प्रालि', 'प्रावि'},
 {'हुन', 'हुने'},
 {'सकिएन', 'सकेका', 'सकेन', 'सकेनन्', 'सकेमा'},
 {'।'}]

In [831]:
list(zip(wc,wb))

[([('आर', 'एस', 'समयले', 'पनि', 'स्वर्ण'),
   ('आर', 'एस', 'समयले', 'पनि', 'स्वस्थ'),
   ('आर', 'एस', 'समयलाई', 'पनि', 'स्वर्ण'),
   ('आर', 'एस', 'समयलाई', 'पनि', 'स्वस्थ'),
   ('आर', 'एस', 'सहरमा', 'पनि', 'स्वर्ण'),
   ('आर', 'एस', 'समयले', 'पनि', 'स्वच्छ'),
   ('आर', 'एस', 'सहरमा', 'पनि', 'स्वस्थ'),
   ('आर', 'एस', 'समयमा', 'पनि', 'स्वर्ण'),
   ('आर', 'एस', 'संघमा', 'पनि', 'स्वर्ण'),
   ('आर', 'एस', 'समयमा', 'पनि', 'स्वस्थ')],
  [-20.049237233790468,
   -20.07569618557374,
   -20.148348087347888,
   -20.174807039131164,
   -20.208202901840966,
   -20.210243514869074,
   -20.23466185362424,
   -20.26536002108137,
   -20.281103291239628,
   -20.291818972864643]),
 ([('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावमा', 'गर्न'),
   ('अस्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('अस्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावमा', 'गर्न'),
   ('स्वच्छ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('स्वस्थ', 'राजनीतिक', 'वातावर

In [760]:
wc,wb = return_choices2(sample_sentences[2],model=kn_lm2,p_lambda = 0.3,trie = True,likelihood = 'bm')

time passed fc:  0.595526933670044
['तर', 'तार', 'आर', 'कर', 'गर', 'डर', 'तब', 'तय', 'तल', 'तह', 'तिर', 'थर', 'दर', 'पर', 'सर']
time passed fc:  0.5685725212097168
['उस', 'आस', 'उप', 'उसो', 'एस', 'स']
time passed fc:  0.12167477607727051
['समयमा', 'समयका', 'समयमै', 'संघमा', 'सडकमा', 'सतहमा', 'सदनमा', 'समयको', 'समयलाई', 'समयले', 'समयसीमा', 'सम्ममा', 'सहरमा']
time passed fc:  0.19451165199279785
['पनि', 'अनि', 'पति', 'उनि', 'नि', 'पछि', 'भनि']
time passed fc:  0.14957261085510254
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc:  0.141265869140625
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc-:  0.2263927459716797
['राजनीति', 'राजनीतिक', 'राजनैतिक', 'राजनीतिको', 'राजनीतिका']
time passed fc-:  0.24437928199768066
['वातावरणीय', 'वातावरणमा', 'वातावरणको']
time passed fc-:  0.152557373046875
['सभाले', 'अभावमा', 'अभावले', 'अभावका', 'अभावको']
time passed fc:  0.1346721649169922
['गर्दा', 'गर्दै', 'गर्न', 'गर्ला',

In [767]:
extract_choices(sample_sentences[2],model=kn_lm2,p_lambda = 0.3,trie = True,likelihood = 'bm')

time passed fc:  0.5460422039031982
['तर', 'तार', 'आर', 'कर', 'गर', 'डर', 'तब', 'तय', 'तल', 'तह', 'तिर', 'थर', 'दर', 'पर', 'सर']
time passed fc:  0.5475728511810303
['उस', 'आस', 'उप', 'उसो', 'एस', 'स']
time passed fc:  0.13367652893066406
['समयमा', 'समयका', 'समयमै', 'संघमा', 'सडकमा', 'सतहमा', 'सदनमा', 'समयको', 'समयलाई', 'समयले', 'समयसीमा', 'सम्ममा', 'सहरमा']
time passed fc:  0.1871500015258789
['पनि', 'अनि', 'पति', 'उनि', 'नि', 'पछि', 'भनि']
time passed fc:  0.14461827278137207
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc:  0.13763117790222168
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc-:  0.2174515724182129
['राजनीति', 'राजनीतिक', 'राजनैतिक', 'राजनीतिको', 'राजनीतिका']
time passed fc-:  0.24534201622009277
['वातावरणीय', 'वातावरणमा', 'वातावरणको']
time passed fc-:  0.15359044075012207
['सभाले', 'अभावमा', 'अभावले', 'अभावका', 'अभावको']
time passed fc:  0.1326451301574707
['गर्दा', 'गर्दै', 'गर्न', 'गर्

[{'आर', 'गर', 'तर', 'पर'},
 {'आस', 'उप', 'उसो', 'एस', 'स'},
 {'सदनमा', 'समयको', 'समयमा', 'सहरमा'},
 {'पनि'},
 {'अस्वस्थ', 'स्वच्छ', 'स्वस्थ'},
 {'राजनीति', 'राजनीतिक', 'राजनैतिक'},
 {'वातावरणको', 'वातावरणमा'},
 {'अभावमा', 'अभावले'},
 {'गर्दा', 'गर्दै', 'गर्न', 'पर्दा'},
 {'देश'},
 {'विकासतर्फ'},
 {'विशेष'},
 {'प्रगति', 'प्रगाढ', 'प्रति', 'प्रालि', 'प्रावि'},
 {'हुन', 'हुने', 'हुनै'},
 {'सकिएन', 'सकिन', 'सकेका', 'सकेन', 'सकेनन्', 'सकेर', 'सक्न'},
 {'।'}]

In [761]:
list(zip(wc,wb))

[([('तर', 'स', 'समयमा', 'पनि', 'स्वस्थ'),
   ('आर', 'एस', 'समयमा', 'पनि', 'स्वस्थ'),
   ('पर', 'स', 'समयमा', 'पनि', 'स्वस्थ'),
   ('तर', 'आस', 'समयमा', 'पनि', 'स्वस्थ'),
   ('तर', 'उप', 'समयमा', 'पनि', 'स्वस्थ'),
   ('गर', 'उसो', 'समयमा', 'पनि', 'स्वस्थ'),
   ('तर', 'स', 'सदनमा', 'पनि', 'स्वस्थ'),
   ('आर', 'एस', 'सदनमा', 'पनि', 'स्वस्थ'),
   ('तर', 'स', 'समयको', 'पनि', 'स्वस्थ'),
   ('तर', 'स', 'सहरमा', 'पनि', 'स्वस्थ')],
  [-31.391029201064697,
   -31.464902205496276,
   -32.51399739517944,
   -34.21268159987448,
   -34.5984770492736,
   -35.22173375891981,
   -35.2480037670744,
   -35.321876771505984,
   -35.346165280725614,
   -35.351170953874686]),
 ([('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('अस्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('स्वस्थ', 'राजनैतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्न'),
   ('स्वच्छ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभा

In [669]:
list(zip(wc,wb))

[([('आर', 'एस', 'समयमा', 'पनि', 'स्वस्थ'),
   ('पर', 'स', 'समयमा', 'पनि', 'स्वस्थ'),
   ('आर', 'एस', 'समयले', 'पनि', 'स्वस्थ'),
   ('आर', 'एस', 'समयमा', 'पनि', 'स्वर्ण'),
   ('आर', 'एस', 'समयलाई', 'पनि', 'स्वस्थ'),
   ('तर', 'स', 'समयमा', 'पनि', 'स्वस्थ'),
   ('आर', 'एस', 'सहरमा', 'पनि', 'स्वस्थ'),
   ('आर', 'एस', 'समयमा', 'पनि', 'स्वच्छ'),
   ('आर', 'एस', 'संघमा', 'पनि', 'स्वस्थ'),
   ('गर', 'उसो', 'समयमा', 'पनि', 'स्वस्थ')],
  [-100.12734345335336,
   -101.23073887362231,
   -102.32895571335338,
   -102.57202484816366,
   -102.77945959315984,
   -102.8777439786926,
   -103.05152693176474,
   -103.30387158033915,
   -103.38289233812229,
   -103.41156540378516]),
 ([('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('अस्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('स्वच्छ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
   ('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावमा', 'गर्न'),
   ('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्न'),
   ('अस्वस्थ', 'राजनीतिक', 'वातावर

In [655]:
wc

[[('आर', 'एस', 'समयमा', 'पनि', 'स्वस्थ'),
  ('पर', 'स', 'समयमा', 'पनि', 'स्वस्थ'),
  ('आर', 'एस', 'समयले', 'पनि', 'स्वस्थ'),
  ('आर', 'एस', 'समयमा', 'पनि', 'स्वर्ण'),
  ('तर', 'स', 'समयमा', 'पनि', 'स्वस्थ')],
 [('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
  ('अस्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
  ('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावमा', 'गर्न'),
  ('स्वच्छ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्दा'),
  ('स्वस्थ', 'राजनीतिक', 'वातावरणको', 'अभावले', 'गर्न')],
 [('गर्दा', 'देश', 'विकासतर्फ', 'विशेष', 'प्रजाति'),
  ('पर्दा', 'देश', 'विकासतर्फ', 'विशेष', 'प्रजाति'),
  ('गर्दै', 'देश', 'विकासतर्फ', 'विशेष', 'प्रजाति'),
  ('गर्न', 'देश', 'विकासतर्फ', 'विशेष', 'प्रजाति'),
  ('गर्दा', 'देश', 'विकासतर्फ', 'विशेष', 'प्रगति')],
 [('प्रगति', 'हुन', 'सकेन', '।'),
  ('प्रसूति', 'हुन', 'सकेन', '।'),
  ('प्रगति', 'हुन', 'सकेका', '।'),
  ('प्रगति', 'हुन', 'सकेनन्', '।'),
  ('प्रजाति', 'हुन', 'सकेन', '।')]]

In [629]:
d1 = correctize_with_window_knlm(sample_sentences[2],model=kn_lm2,window = 5,p_lambda = 1,prior = 'bigram',trie = True)

time passed fc:  0.5705416202545166
['तर', 'तार', 'आर', 'कर', 'गर', 'डर', 'तब', 'तय', 'तल', 'तह', 'तिर', 'थर', 'दर', 'पर', 'सर']
time passed fc:  0.5704402923583984
['उस', 'आस', 'उप', 'उसो', 'एस', 'स']
time passed fc:  0.1276233196258545
['समयमा', 'समयका', 'समयमै', 'संघमा', 'सडकमा', 'सतहमा', 'सदनमा', 'समयको', 'समयलाई', 'समयले', 'समयसीमा', 'सम्ममा', 'सहरमा']
time passed fc:  0.18999624252319336
['पनि', 'अनि', 'पति', 'उनि', 'नि', 'पछि', 'भनि']
time passed fc:  0.15054678916931152
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc:  0.14256906509399414
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc-:  0.22240376472473145
['राजनीति', 'राजनीतिक', 'राजनैतिक', 'राजनीतिको', 'राजनीतिका']
time passed fc-:  0.2533698081970215
['वातावरणीय', 'वातावरणमा', 'वातावरणको']
time passed fc-:  0.15358757972717285
['सभाले', 'अभावमा', 'अभावले', 'अभावका', 'अभावको']
time passed fc:  0.1366281509399414
['गर्दा', 'गर्दै', 'गर्न', 'गर्

In [630]:
d2 = correctize_with_window_knlm(sample_sentences[2],model=kn_lm2,window = 5,p_lambda = 1,prior = 'bigram',trie = True,likelihood = 'bm')

time passed fc:  0.5774896144866943
['तर', 'तार', 'आर', 'कर', 'गर', 'डर', 'तब', 'तय', 'तल', 'तह', 'तिर', 'थर', 'दर', 'पर', 'सर']
time passed fc:  0.5684797763824463
['उस', 'आस', 'उप', 'उसो', 'एस', 'स']
time passed fc:  0.12569451332092285
['समयमा', 'समयका', 'समयमै', 'संघमा', 'सडकमा', 'सतहमा', 'सदनमा', 'समयको', 'समयलाई', 'समयले', 'समयसीमा', 'सम्ममा', 'सहरमा']
time passed fc:  0.19547629356384277
['पनि', 'अनि', 'पति', 'उनि', 'नि', 'पछि', 'भनि']
time passed fc:  0.1495988368988037
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc:  0.13961148262023926
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc-:  0.22890090942382812
['राजनीति', 'राजनीतिक', 'राजनैतिक', 'राजनीतिको', 'राजनीतिका']
time passed fc-:  0.23935961723327637
['वातावरणीय', 'वातावरणमा', 'वातावरणको']
time passed fc-:  0.15558338165283203
['सभाले', 'अभावमा', 'अभावले', 'अभावका', 'अभावको']
time passed fc:  0.13763213157653809
['गर्दा', 'गर्दै', 'गर्न', 'ग

In [257]:
print(return_choices(sample_sentences[2],kn_lm2))


KeyboardInterrupt



In [502]:
print(return_choices(sample_sentences[1],kn_lm2))

time passed fc:  0.1775510311126709
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे', 'मै', 'र', 'ल', 'व', 'स']
Entered depth,  1
Entered depth,  2
Entered depth,  3
time passed fc:  9.540229082107544
['पुस्तकालयबाट', 'पुस्तकलयबाटे']
time passed fc:  0.345076322555542
['ठुलो', 'ठूलो', 'थुलो', 'ठुला', 'थलो', 'धुलो', 'धूलो', 'कुलो', 'खुला', 'झुटो', 'ढिलो', 'थाले', 'थियो', 'नौलो', 'फलो', 'फुल', 'हलो', 'हिलो']
time passed fc:  0.41495203971862793
['किताब', 'किटान', 'कतार', 'किताबमा', 'किनार', 'किरात', 'किसान', 'पिसाब', 'हिसाब', 'किरा', 'पिता']
time passed fc:  0.3511202335357666
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 'पर्न', 'पवन', 'पस्न', 'पाउन', 'पाएन', 'पाटन', 'पान', 'पार्न', 'पालन', 'पाल्न', 'बच्न', 'बढ्नु', 'बढ्ने', 'लड्न']
time passed fc:  0.3320951461791992
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 'पर

In [525]:
print(return_choices(sample_sentences[1],kn_lm2,trie = True))

time passed fc:  0.43982434272766113
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे', 'मै', 'र', 'ल', 'व', 'स']
Entered depth,  1
Entered depth,  2
Entered depth,  3
time passed fc:  4.149160146713257
['पुस्तकालयबाट', 'पुस्तकलयबाटे']
time passed fc:  0.08480715751647949
['ठुलो', 'ठूलो', 'थुलो', 'ठुला', 'थलो', 'धुलो', 'धूलो', 'कुलो', 'खुला', 'झुटो', 'ढिलो', 'थाले', 'थियो', 'नौलो', 'फलो', 'फुल', 'हलो', 'हिलो']
time passed fc:  0.1251697540283203
['किताब', 'किटान', 'कतार', 'किताबमा', 'किनार', 'किरात', 'किसान', 'पिसाब', 'हिसाब', 'किरा', 'पिता']
time passed fc:  0.09773874282836914
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 'पर्न', 'पवन', 'पस्न', 'पाउन', 'पाएन', 'पाटन', 'पान', 'पालन', 'बच्न', 'बढ्नु', 'बढ्ने', 'लड्न']
time passed fc:  0.09574389457702637
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 'पर्न', 'पवन', 'प

In [344]:
print(return_choices(sample_sentences[0],kn_lm2))

time passed fc:  0.3480679988861084
['हरेक', 'गरेका', 'झरेका', 'परेका', 'मरेका', 'हारेको', 'हारेर', 'हेरेका', 'अनेक', 'खरेल', 'गरेकी', 'गरेकै', 'गरेको', 'गरेन', 'गरेर', 'झरेको', 'तर्क', 'परेकी', 'परेको', 'परेन', 'परेर', 'फरक', 'ब्रेक', 'भरेर', 'मरेको', 'सरिक', 'सरेको', 'हटेको', 'हरित', 'हेरेको', 'हेरेर']
time passed fc:  0.5595026016235352
['नेपालले', 'नेपालीले', 'सेपालीले']
time passed fc:  0.5206067562103271
['नेपालको', 'नेपालका', 'नेपालीको', 'नपाएको', 'नेताको', 'नेपालकै', 'नेकपाको', 'नेपालकी', 'नेपामको']
time passed fc:  0.5694758892059326
['संविधान', 'संवैधानिक', 'संविधानको', 'संविधानले', 'संविधानमा', 'संविधानका', 'संविधानक']
time passed fc:  0.39594054222106934
['पालन', 'पालना', 'पाटन', 'पालामा', 'पाल्न', 'कामना', 'कालमा', 'तालमा', 'तुलना', 'पसिना', 'पाउन', 'पाएन', 'पालिका', 'पालेका', 'पाल्ने', 'पाल्पा', 'पासमा', 'पाहुना', 'पुलमा', 'यातना', 'सामना', 'सालमा', 'हालका']
time passed fc:  0.3809802532196045
['पालन', 'पालना', 'पाटन', 'पालामा', 'पाल्न', 'कामना', 'कालमा', 'तालमा', 'तुलना'

In [517]:
print(return_choices(sample_sentences[3],kn_lm2))

time passed fc:  0.5246691703796387
['नेपालमा', 'नेपालका', 'नेपालमै', 'नेपालीमा', 'नेपालकी', 'नेपालकै', 'नेपालको', 'नेपाललाई', 'नेपालले', 'नेपालीका']
time passed fc:  0.46183252334594727
['आधुनिक']
time passed fc:  0.4059116840362549
['रुपमा', 'रूपमा', 'रुटमा', 'रुपमै', 'रूपमै', 'आरोपमा', 'कपमा', 'पुलमा', 'पुसमा', 'युगमा', 'रनमा', 'रातमा', 'रिटमा', 'रुपले', 'रुपैया']
time passed fc:  0.4009277820587158
['आर्थक', 'आर्थिक', 'सार्थक', 'अर्थ', 'दर्शक', 'आकर्षक', 'आर्जन', 'आर्ट', 'आर्यन', 'आस्था', 'समर्थक']
Entered depth,  1
time passed fc:  1.7852222919464111
['विकाससम्बन्धी']
Entered depth,  1
time passed fc:  1.7977774143218994
['विकाससम्बन्धी']
time passed fc:  0.5166184902191162
['कार्यरत', 'कार्यमा', 'कार्यदल', 'कार्यको', 'कार्यले', 'कार्यका', 'कार्यलय', 'कार्यहरू', 'कार्यरू']
time passed fc:  0.5684881210327148
['प्रारम्भिक', 'प्रारम्भ']
time passed fc:  0.3400862216949463
['भएको', 'खाएको', 'छाएको', 'भएका', 'भएकी', 'भनेको', 'आएको', 'उभिएको', 'गएको', 'गाएको', 'छोएको', 'झाको', 'नआएको',

In [516]:
return_choices(sample_sentences[3],kn_lm2,trie = True)

time passed fc:  0.20245742797851562
['नेपालमा', 'नेपालका', 'नेपालमै', 'नेपालीमा', 'नेपालकी', 'नेपालकै', 'नेपालको', 'नेपाललाई', 'नेपालले', 'नेपालीका']
time passed fc:  0.1356489658355713
['आधुनिक']
time passed fc:  0.11369562149047852
['रुपमा', 'रूपमा', 'रुटमा', 'रुपमै', 'रूपमै', 'कपमा', 'पुलमा', 'पुसमा', 'युगमा', 'रनमा', 'रातमा', 'रिटमा', 'रुपले', 'रुपैया']
time passed fc:  0.11768484115600586
['आर्थक', 'आर्थिक', 'अर्थ', 'दर्शक', 'आर्जन', 'आर्ट', 'आर्यन', 'आस्था', 'आर्य', 'गर्थे', 'तर्क']
Entered depth,  1
time passed fc:  1.125997543334961
['विकाससम्बन्धी']
Entered depth,  1
time passed fc:  1.1000561714172363
['विकाससम्बन्धी']
time passed fc:  0.2074449062347412
['कार्यका', 'कार्यको', 'कार्यरत', 'कार्यमा', 'कार्यले', 'कार्यलय', 'कार्यदल', 'कार्यहरू', 'कार्यरू']
time passed fc:  0.26628732681274414
['प्रारम्भ', 'प्रारम्भिक']
time passed fc:  0.07978653907775879
['भएको', 'भएका', 'भएकी', 'आएको', 'गएको', 'झाको', 'नभएको', 'भएँ', 'भएकाे', 'भएकै', 'भएकोमा', 'भएकोले', 'भएन', 'भएमा', 'भएर', 

[{'नेपालकी', 'नेपालकै', 'नेपालको', 'नेपालमा'},
 {'आधुनिक'},
 {'रुपमा', 'रूपमा'},
 {'आर्थिक'},
 {'विकाससम्बन्धी'},
 {'कार्यको', 'कार्यरत'},
 {'प्रारम्भ'},
 {'भएको'},
 {'चाल', 'हानि', 'हामी', 'हाल', 'हालै', 'होला'},
 {'मात्र', 'मात्रै'},
 {'यो', 'हो'},
 {'।'}]

In [837]:
print(extract_choices(sample_sentences[4],kn_lm2,trie = True, p_lambda = 0.22, likelihood = 'default'))

time passed fc:  0.25535011291503906
['हार', 'कार', 'चार', 'तार', 'धार', 'पार', 'बार', 'भार', 'मार', 'सार', 'हात', 'हाल', 'हेर']
Entered depth,  1
time passed fc-:  0.43939852714538574
['दिनुहोस्', 'धुनुहोस्']
time passed fc:  0.43483519554138184
['र', 'अ', 'आ', 'आर', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'म', 'रु', 'रे', 'र्', 'ल', 'व', 'स']
time passed fc-:  0.21442365646362305
['स्वस्थ', 'अस्वस्थ', 'स्वार्थ', 'स्वास्थ्य', 'स्वास्थ']
time passed fc:  0.08078598976135254
['जिवन', 'जीवन', 'आजीवन', 'जवान', 'जीवन्त']
time passed fc:  0.07579779624938965
['जिवन', 'जीवन', 'आजीवन', 'जवान', 'जीवन्त']
Entered depth,  1
time passed fc-:  0.550107479095459
['दिनुहोस्', 'जिउनुहोस्']
time passed fc:  0.4204447269439697
['।', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ओ']
7
[{'हात', 'हार'}, {'दिनुहोस्', 'धुनुहोस्'}, {'र'}, {'स्वस्थ', 'अस्वस्थ', 'स्वास्थ', 'स्वास्थ्य'}, {'जीवन्त', 'आजीवन', 'जीवन', 'जिवन', 'जवान'}, {'दिनुहोस्', 'जिउनुहोस्'}, {'।'}]


In [620]:
print(return_choices(sample_sentences[2],kn_lm2,likelihood = 'bm'))

time passed fc:  0.22343659400939941
['तर', 'तार', 'आर', 'कर', 'गर', 'डर', 'तब', 'तय', 'तल', 'तह', 'तिर', 'थर', 'दर', 'पर', 'सर']
time passed fc:  0.20546269416809082
['उस', 'आस', 'उप', 'उसो', 'एस', 'स']
time passed fc:  0.39993906021118164
['समयमा', 'समयका', 'समयमै', 'समाजमा', 'संघमा', 'सडकमा', 'सतहमा', 'सदनमा', 'समयको', 'समयलाई', 'समयले', 'समयसीमा', 'समूहमा', 'सम्ममा', 'सहरमा']
time passed fc:  0.287200927734375
['पनि', 'अनि', 'पति', 'उनि', 'नि', 'पछि', 'भनि']
time passed fc:  0.46871423721313477
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'स्वार्थ', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc:  0.44678759574890137
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'स्वार्थ', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc-:  0.5765368938446045
['राजनीतिक', 'राजनीति', 'राजनीतिको', 'राजनैतिक', 'राजनीतिका']
time passed fc-:  0.6283528804779053
['वातावरणीय', 'वातावरणमा', 'वातावरणको']
time passed fc-:  0.4557778835296631
['अभावमा', 'अभावले', 'अभावका', 'सभाले', 'अभावको']
time passed fc:  0.3971407

In [646]:
print(return_choices(sample_sentences[2],kn_lm2,trie = True,likelihood = 'bm'))

time passed fc:  0.5794477462768555
['तर', 'तार', 'आर', 'कर', 'गर', 'डर', 'तब', 'तय', 'तल', 'तह', 'तिर', 'थर', 'दर', 'पर', 'सर']
time passed fc:  0.5634937286376953
['उस', 'आस', 'उप', 'उसो', 'एस', 'स']
time passed fc:  0.12270879745483398
['समयमा', 'समयका', 'समयमै', 'संघमा', 'सडकमा', 'सतहमा', 'सदनमा', 'समयको', 'समयलाई', 'समयले', 'समयसीमा', 'सम्ममा', 'सहरमा']
time passed fc:  0.19045782089233398
['पनि', 'अनि', 'पति', 'उनि', 'नि', 'पछि', 'भनि']
time passed fc:  0.1426527500152588
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc:  0.14418506622314453
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc-:  0.23038315773010254
['राजनीति', 'राजनीतिक', 'राजनैतिक', 'राजनीतिको', 'राजनीतिका']
time passed fc-:  0.25232505798339844
['वातावरणीय', 'वातावरणमा', 'वातावरणको']
time passed fc-:  0.15658068656921387
['सभाले', 'अभावमा', 'अभावले', 'अभावका', 'अभावको']
time passed fc:  0.13763165473937988
['गर्दा', 'गर्दै', 'गर्न', 'ग

In [609]:
trie_depth[0].search('श्रुति',0)

['श्रुति']

In [611]:
'श्रुति' in lis[:int(0.1*l)]

True

In [613]:
textdistance.levenshtein.distance('श्रुति','प्रगति')

2

In [625]:
print(return_choices(sample_sentences[2],kn_lm2))

time passed fc:  0.22340130805969238
['तर', 'तार', 'आर', 'कर', 'गर', 'डर', 'तब', 'तय', 'तल', 'तह', 'तिर', 'थर', 'दर', 'पर', 'सर']
time passed fc:  0.20544958114624023
['उस', 'आस', 'उप', 'उसो', 'एस', 'स']
time passed fc:  0.40891528129577637
['समयमा', 'समयका', 'समयमै', 'समाजमा', 'संघमा', 'सडकमा', 'सतहमा', 'सदनमा', 'समयको', 'समयलाई', 'समयले', 'समयसीमा', 'समूहमा', 'सम्ममा', 'सहरमा']
time passed fc:  0.27977919578552246
['पनि', 'अनि', 'पति', 'उनि', 'नि', 'पछि', 'भनि']
time passed fc:  0.471724271774292
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'स्वार्थ', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc:  0.45081496238708496
['स्वस्थ', 'अस्वस्थ', 'स्वस्थ्य', 'स्वार्थ', 'ध्वस्त', 'स्वच्छ', 'स्वर्ग', 'स्वर्ण']
time passed fc-:  0.5705091953277588
['राजनीतिक', 'राजनीति', 'राजनीतिको', 'राजनैतिक', 'राजनीतिका']
time passed fc-:  0.6297867298126221
['वातावरणीय', 'वातावरणमा', 'वातावरणको']
time passed fc-:  0.461270809173584
['अभावमा', 'अभावले', 'अभावका', 'सभाले', 'अभावको']
time passed fc:  0.41094064

In [531]:
print(return_choices("हुन त त्यस बेलामा औद्योगि विकास तर्फ केही महत्वपूर्ण कार्यहरू पनि भएक थिए।",kn_lm2))

time passed fc:  0.28826141357421875
['हुन', 'कुन', 'छुन', 'जुन', 'धुन', 'नुन', 'पुन', 'रुन', 'सुन', 'हुनु', 'हुने', 'हुनै', 'हुन्']
time passed fc:  0.15216755867004395
['त', 'अ', 'अत', 'आ', 'क', 'ग', 'ज', 'ति', 'ती', 'त्', 'द', 'न', 'प', 'म', 'र', 'ल', 'व', 'स']
time passed fc:  0.3291177749633789
['त्यस', 'ग्यास', 'त्यसै', 'त्यसो', 'त्याग', 'त्रास', 'व्यास', 'क्रस', 'तीस', 'त्यति', 'त्यतै', 'त्यसका', 'त्यसको', 'त्यसमा', 'त्यसरी', 'त्यसले', 'त्यहि', 'त्यही', 'प्लस']
time passed fc:  0.4757249355316162
['बेलामा', 'जेलमा', 'बेलैमा', 'भेलामा', 'मेलामा', 'खेलमा', 'तलामा', 'नेतामा', 'पालामा', 'पेसामा', 'बिदामा', 'बेलाको', 'बेलुका', 'सेनामा', 'सेवामा']
time passed fc:  0.5082094669342041
['उद्योग', 'औद्योगिक', 'उद्योगी', 'औद्योगि']
time passed fc:  0.49366235733032227
['उद्योग', 'औद्योगिक', 'उद्योगी', 'औद्योगि']
time passed fc:  0.3999345302581787
['विकास', 'निकास', 'निकासा', 'बिकास', 'विकट', 'विकासे', 'निकाय', 'निकासी', 'निवास', 'विकले', 'विकासका', 'विकासको', 'विकासमा', 'विकासले', 'विकेट'

In [561]:
print(return_choices("हुन त त्यस बेलामा औद्योगि विकास तर्फ केही महत्वपूर्ण कार्यहरू पनि भएक थिए।",kn_lm2,trie = True))

time passed fc:  0.14050865173339844
['हुन', 'कुन', 'छुन', 'जुन', 'धुन', 'नुन', 'पुन', 'रुन', 'सुन', 'हुनु', 'हुने', 'हुनै', 'हुन्']
time passed fc:  0.43683862686157227
['त', 'अ', 'अत', 'आ', 'क', 'ग', 'ज', 'ति', 'ती', 'त्', 'द', 'न', 'प', 'म', 'र', 'ल', 'व', 'स']
time passed fc:  0.10072827339172363
['त्यस', 'ग्यास', 'त्यसै', 'त्यसो', 'त्याग', 'त्रास', 'व्यास', 'क्रस', 'तीस', 'त्यति', 'त्यतै', 'त्यसका', 'त्यसको', 'त्यसमा', 'त्यसरी', 'त्यसले', 'त्यहि', 'त्यही', 'प्लस']
time passed fc:  0.15658116340637207
['बेलामा', 'जेलमा', 'बेलैमा', 'भेलामा', 'मेलामा', 'खेलमा', 'तलामा', 'नेतामा', 'पालामा', 'पेसामा', 'बिदामा', 'बेलाको', 'बेलुका', 'सेनामा', 'सेवामा']
time passed fc:  0.16501545906066895
['उद्योग', 'उद्योगी', 'औद्योगिक', 'औद्योगि']
time passed fc:  0.14663958549499512
['उद्योग', 'उद्योगी', 'औद्योगिक', 'औद्योगि']
time passed fc:  0.13962769508361816
['विकास', 'निकास', 'निकासा', 'बिकास', 'विकट', 'विकासे', 'निकाय', 'निकासी', 'निवास', 'विकले', 'विकासका', 'विकासको', 'विकासमा', 'विकासले', 'वि

In [199]:
print(return_choices('त्यस निशान्तले भात खायो। ',kn_lm2))

time passed fc:  0.340146541595459
['त्यस', 'ग्यास', 'त्यसै', 'त्यसो', 'त्याग', 'त्रास', 'व्यास', 'क्रस', 'तीस', 'त्यति', 'त्यतै', 'त्यसका', 'त्यसको', 'त्यसमा', 'त्यसरी', 'त्यसले', 'त्यहि', 'त्यही', 'प्लस']
Entered depth,  1
Entered depth,  2
time passed fc:  3.513815402984619
['निशान्तले']
time passed fc:  0.28024983406066895
['भात', 'खाता', 'छाता', 'बाट', 'भय', 'भर', 'भाका', 'भाग', 'भाडा', 'भान', 'भार', 'भाव', 'भेट', 'भोट', 'हात']
time passed fc:  0.33809471130371094
['खायो', 'भयो', 'आयो', 'कालो', 'खाए', 'खाएको', 'खाका', 'खाजा', 'खाडी', 'खाता', 'खान', 'खाना', 'खानी', 'खानु', 'खाने', 'खालि', 'खाली', 'खाले', 'खास', 'गयो', 'चासो', 'छानो', 'छाया', 'झाको', 'ठाडो', 'थियो', 'धागो', 'पायो', 'यायो', 'रहयो']
time passed fc:  0.14047861099243164
['।', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ओ']


TypeError: 'float' object is not subscriptable

In [200]:
print(return_choices('त्यस निशान्तले भात खायो। ',kn_lm2,trie = True))

time passed fc:  0.1117391586303711
['त्यस', 'ग्यास', 'त्यसै', 'त्यसो', 'त्याग', 'त्रास', 'व्यास', 'क्रस', 'तीस', 'त्यति', 'त्यतै', 'त्यसका', 'त्यसको', 'त्यसमा', 'त्यसरी', 'त्यसले', 'त्यहि', 'त्यही', 'प्लस']
Entered depth,  1
Entered depth,  2
time passed fc:  1.43226957321167
['निशान्तले']
time passed fc:  0.18616127967834473
['भात', 'खाता', 'छाता', 'बाट', 'भय', 'भर', 'भाका', 'भाग', 'भाडा', 'भान', 'भार', 'भाव', 'भेट', 'भोट', 'हात']
time passed fc:  0.08498764038085938
['खायो', 'भयो', 'आयो', 'कालो', 'खाए', 'खाएको', 'खाका', 'खाजा', 'खाडी', 'खाता', 'खान', 'खाना', 'खानी', 'खानु', 'खाने', 'खालि', 'खाली', 'खाले', 'खास', 'गयो', 'चासो', 'छानो', 'छाया', 'झाको', 'ठाडो', 'थियो', 'धागो', 'पायो', 'यायो', 'रहयो']
time passed fc:  0.4180727005004883
['।', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ओ']


TypeError: 'float' object is not subscriptable

In [202]:
print(return_choices('हैन भो तेस्तो नगर म आफै राख्छु।',kn_lm2))

time passed fc:  0.30286598205566406
['हैन', 'ऐन', 'छैन', 'होइन', 'होईन', 'अन', 'आउन', 'आएन', 'इन', 'कान', 'किन', 'कैद', 'खान', 'गैर', 'चीन', 'चैत', 'छन', 'छिन', 'छैन्', 'जन', 'जान', 'झन', 'टन', 'डिन', 'तिन', 'तीन', 'थान', 'दान', 'दिन', 'धन', 'धान', 'पान', 'बन', 'भएन', 'भान', 'मन', 'मान', 'मौन', 'यौन', 'रन', 'लान', 'लिन', 'वन', 'वान', 'विन', 'सन', 'सान', 'हक', 'हद', 'हब', 'हल', 'हाई', 'हात', 'हानि', 'हानी', 'हाफ', 'हार', 'हाल', 'हिट', 'हित', 'हुदैन', 'हुन', 'है', 'हैरान']
time passed fc:  0.2046356201171875
['भो', 'भि', 'भू', 'शो', 'हो', 'को', 'जो', 'नो', 'पो', 'भइ', 'भई', 'भए', 'भयो', 'भै', 'भोक', 'भोज', 'भोट', 'यो', 'सो']
time passed fc:  0.4629197120666504
['तेस्तो', 'तेस्रो', 'कस्तो', 'जस्तो', 'तेश्रो', 'त्यस्तो', 'यस्तो', 'सस्तो', 'तस्तै', 'पेस्तोल']
time passed fc:  0.28403592109680176
['नगर', 'नगद', 'नगरी', 'नगरे', 'नगर्न', 'नजर', 'नहर', 'बगर', 'मगर', 'सागर']
time passed fc:  0.1439657211303711
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे'

In [587]:
print(return_choices('हैन भो तेस्तो नगर म आफै राख्छु।',kn_lm2))

time passed fc:  0.2802450656890869
['छैन', 'हुन', 'हैन', 'है']
time passed fc:  0.20744585990905762
['भो', 'भि', 'भू', 'शो', 'हो', 'को', 'जो', 'नो', 'पो', 'भइ', 'भई', 'भए', 'भयो', 'भै', 'भोक', 'भोज', 'भोट', 'यो', 'सो']
time passed fc:  0.4587380886077881
['तेस्तो', 'तेस्रो', 'कस्तो', 'जस्तो', 'तेश्रो', 'त्यस्तो', 'यस्तो', 'सस्तो', 'तस्तै', 'पेस्तोल']
time passed fc:  0.29720449447631836
['नगरी', 'नगर', 'नगद', 'मगर', 'गर', 'नगरे', 'नहर', 'नजर', 'बगर']
time passed fc:  0.15957212448120117
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे', 'मै', 'र', 'ल', 'व', 'स']
time passed fc:  0.14061260223388672
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे', 'मै', 'र', 'ल', 'व', 'स']
time passed fc:  0.2762923240661621
['आफू', 'आफैं', 'आफै', 'आजै', 'आफैँ', 'आफु']
time passed fc:  0.4526667594909668
['राख्छु', 'राख्छ', 'देख्छु', 'राख्छन्', 'राख्नु', 'जान्छु', 'राख्दा', 'राख्दै', 'राख्न', 'राख्ने', 'राख्यो']
time passed fc:  

In [None]:
'त्य', 'ते'

In [588]:
print(return_choices('हैन भो तेस्तो नगर म आफै राख्छु।',kn_lm2,trie = True))

time passed fc:  0.11372852325439453
['है', 'छैन', 'हुन', 'हैन']
time passed fc:  0.5710093975067139
['भो', 'भि', 'भू', 'शो', 'हो', 'को', 'जो', 'नो', 'पो', 'भइ', 'भई', 'भए', 'भयो', 'भै', 'भोक', 'भोज', 'भोट', 'यो', 'सो']
time passed fc:  0.14162063598632812
['यस्तो', 'तस्तै', 'कस्तो', 'सस्तो', 'जस्तो', 'तेस्रो', 'तेश्रो', 'पेस्तोल', 'तेस्तो']
time passed fc:  0.18350911140441895
['गर', 'नगर', 'नगद', 'नहर', 'नजर', 'बगर', 'मगर', 'नगरी', 'नगरे']
time passed fc:  0.4313504695892334
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे', 'मै', 'र', 'ल', 'व', 'स']
time passed fc:  0.389115571975708
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे', 'मै', 'र', 'ल', 'व', 'स']
time passed fc:  0.07380127906799316
['आफू', 'आफै', 'आफु', 'आजै', 'आफैं', 'आफैँ']
time passed fc:  0.14262056350708008
['राख्छु', 'राख्छ', 'देख्छु', 'राख्छन्', 'राख्नु', 'जान्छु', 'राख्दा', 'राख्दै', 'राख्न', 'राख्ने', 'राख्यो']
time passed fc:  0.406910657

In [586]:
trie_depth[0].search('त्यस्तो',)

['त्यस्तो']

In [584]:
trie_depth[0].root.children['त'].children['्'].children

<__main__.TrieNode at 0x1f751472d00>

In [589]:
len('त्य')

3

In [590]:
len("हिंदी")

5

In [577]:
align('त्यस्तो', 'तेस्तो')

[('त', 'त'),
 ('्', None),
 ('य', 'े'),
 ('स', 'स'),
 ('्', '्'),
 ('त', 'त'),
 ('ो', 'ो')]

In [564]:
final_candidate_words('तेस्तो',use_trie = True)

time passed fc:  0.16455936431884766
['यस्तो', 'तस्तै', 'कस्तो', 'सस्तो', 'जस्तो', 'तेस्रो', 'तेश्रो', 'पेस्तोल', 'तेस्तो']


['यस्तो',
 'तस्तै',
 'कस्तो',
 'सस्तो',
 'जस्तो',
 'तेस्रो',
 'तेश्रो',
 'पेस्तोल',
 'तेस्तो']

In [566]:
candidate_words_trie('तेस्तो',2)

['यस्तो',
 'तस्तै',
 'कस्तो',
 'सस्तो',
 'जस्तो',
 'तेस्रो',
 'तेश्रो',
 'पेस्तोल',
 'तेस्तो']

In [567]:
candidate_words('तेस्तो')

['यस्तो',
 'जस्तो',
 'तेस्रो',
 'कस्तो',
 'त्यस्तो',
 'सस्तो',
 'तेश्रो',
 'पेस्तोल',
 'तस्तै',
 'तेस्तो']

NameError: name 'trie_dict' is not defined

In [205]:
print(return_choices('मलाई पहिला बिश्वाश गर्दैन थियौ।',kn_lm2))

time passed fc:  0.3514537811279297
['मलाई', 'आमालाई', 'माला', 'उनलाई', 'उसलाई', 'कमाई', 'कला', 'कामलाई', 'गराई', 'जुलाई', 'टिमलाई', 'तपाई', 'तराई', 'तला', 'नपाई', 'नलिई', 'बनाई', 'मलको', 'मल्ल', 'माई', 'माग', 'मागलाई', 'मान', 'मार', 'मास', 'मेला', 'लगाई', 'लाइ', 'लाई', 'लाल', 'हवाई']
time passed fc:  0.39993953704833984
['पहिला', 'पहिले', 'पहिलो', 'पाइला', 'महिला', 'अहिले', 'कहिले', 'जहिले', 'ढिला', 'नहोला', 'पर्ला', 'पसिना', 'पहिरो', 'पहिलाको', 'महिना']
time passed fc:  0.539071798324585
['विश्वास', 'बिश्वाश']
time passed fc:  0.4567873477935791
['गर्दैन', 'गरिदैन', 'गर्दैनन्', 'गर्दैमा', 'पर्दैन', 'गरिदिन', 'गर्दछ', 'गर्दै', 'गर्दैछ', 'बन्दैन', 'सक्दैन']
time passed fc:  0.3410964012145996
['थियौ', 'थियौं', 'चिया', 'थिम', 'थियो', 'थियौँ', 'थिई', 'थिए', 'थिएँ', 'थिएन']
time passed fc:  0.15059447288513184
['।', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ओ']
6
Time Passed: 4.327080726623535
[{'नपाई', 'बनाई', 'मलाई'}, {'पहिला', 'पहिले', 'पहिलो'}, {'विश्वास'}, {'गर्दैन'}, {'थियो'}, {'।'}]

In [204]:
print(return_choices('मलाई पहिला बिश्वाश गर्दैन थियौ।',kn_lm2,trie = True))

time passed fc:  0.10835647583007812
['मलाई', 'माला', 'कमाई', 'कला', 'गराई', 'तपाई', 'तराई', 'तला', 'नपाई', 'नलिई', 'बनाई', 'मलको', 'मल्ल', 'माई', 'माग', 'मान', 'मार', 'मास', 'मेला', 'लगाई', 'लाइ', 'लाई', 'लाल', 'हवाई']
time passed fc:  0.12968802452087402
['पहिला', 'पहिले', 'पहिलो', 'पाइला', 'महिला', 'अहिले', 'कहिले', 'जहिले', 'ढिला', 'नहोला', 'पर्ला', 'पसिना', 'पहिरो', 'पहिलाको', 'महिना']
time passed fc:  0.19946694374084473
['विश्वास', 'बिश्वाश']
time passed fc:  0.14468884468078613
['गर्दैन', 'गरिदैन', 'गर्दैनन्', 'गर्दैमा', 'पर्दैन', 'गरिदिन', 'गर्दछ', 'गर्दै', 'गर्दैछ', 'बन्दैन', 'सक्दैन']
time passed fc:  0.07982063293457031
['थियौ', 'थियौं', 'चिया', 'थिम', 'थियो', 'थियौँ', 'थिई', 'थिए', 'थिएँ', 'थिएन']
time passed fc:  0.4208714962005615
['।', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ओ']
6
Time Passed: 2.76076340675354
[{'नपाई', 'लाइ', 'बनाई', 'मलाई'}, {'पहिला', 'पहिले'}, {'विश्वास'}, {'गर्दैन'}, {'थियो'}, {'।'}]


In [591]:
print(return_choices('उसले पहिले मलाई विश्वास गर्न प्रयोग गरेन।',kn_lm2))

time passed fc:  0.35056424140930176
['उसले', 'उनले', 'उसैले', 'असल', 'असली', 'आले', 'उनैले', 'उसका', 'उसको', 'उसलाई', 'एकले', 'कसले', 'खाले', 'घले', 'जसले', 'झाले', 'तले', 'थाले', 'दासले', 'बसले', 'भाले', 'माले', 'यसले', 'सके', 'हाले']
time passed fc:  0.3960449695587158
['पहिले', 'अहिले', 'कहिले', 'जहिले', 'पहिला', 'पहिलो', 'आदिले', 'कतिले', 'तहले', 'पहिरो', 'पहिरोले', 'पहिलेको', 'महिने', 'महिला']
time passed fc:  0.33909058570861816
['मलाई', 'आमालाई', 'माला', 'उनलाई', 'उसलाई', 'कमाई', 'कला', 'कामलाई', 'गराई', 'जुलाई', 'टिमलाई', 'तपाई', 'तराई', 'तला', 'नपाई', 'नलिई', 'बनाई', 'मलको', 'मल्ल', 'माई', 'माग', 'मागलाई', 'मान', 'मार', 'मास', 'मेला', 'लगाई', 'लाइ', 'लाई', 'लाल', 'हवाई']
time passed fc:  0.5231726169586182
['विश्वास', 'अविश्वास', 'विश्वका', 'विश्वमा', 'विश्राम', 'विश्वकै', 'विश्वको', 'विश्वमै', 'विश्वासको', 'विश्वासमा']
time passed fc:  0.3401322364807129
['गर्न', 'कर्ण', 'गरिन', 'गरेन', 'गर्दा', 'गर्नु', 'गर्ने', 'गर्नै', 'गर्ला', 'गर्व', 'गाउन', 'गार्ड', 'टार्न', 'पर्न', 'प

In [593]:
print(return_choices('उसले पहिले मलाई विश्वास गर्न प्रयोग गरेन।',kn_lm2,trie = True))

time passed fc:  0.11424422264099121
['उसले', 'उनले', 'उसैले', 'असल', 'असली', 'आले', 'उसका', 'उसको', 'उसलाई', 'एकले', 'कसले', 'खाले', 'घले', 'जसले', 'झाले', 'तले', 'थाले', 'बसले', 'भाले', 'माले', 'यसले', 'सके', 'हाले']
time passed fc:  0.12865567207336426
['पहिले', 'अहिले', 'कहिले', 'जहिले', 'पहिला', 'पहिलो', 'आदिले', 'कतिले', 'तहले', 'पहिरो', 'पहिरोले', 'पहिलेको', 'महिने', 'महिला']
time passed fc:  0.10728263854980469
['मलाई', 'माला', 'कमाई', 'कला', 'गराई', 'तपाई', 'तराई', 'तला', 'नपाई', 'नलिई', 'बनाई', 'मलको', 'मल्ल', 'माई', 'माग', 'मान', 'मार', 'मास', 'मेला', 'लगाई', 'लाइ', 'लाई', 'लाल', 'हवाई']
time passed fc:  0.20943737030029297
['विश्वास', 'अविश्वास', 'विश्वका', 'विश्वमा', 'विश्राम', 'विश्वकै', 'विश्वको', 'विश्वमै', 'विश्वासको', 'विश्वासमा']
time passed fc:  0.1326451301574707
['गर्न', 'कर्ण', 'गरिन', 'गरेन', 'गर्दा', 'गर्नु', 'गर्ने', 'गर्नै', 'गर्ला', 'गर्व', 'गाउन', 'गार्ड', 'पर्न', 'मर्न', 'वर्ण']
time passed fc:  0.13965988159179688
['गर्न', 'कर्ण', 'गरिन', 'गरेन', 'गर्दा',

In [594]:
candidate

'physical'

In [211]:
WORDS_full.most_common()[:-10000-1:-1]

[('विभिन्नस्थानहरूलाई', 1),
 ('टेरिटरीको', 1),
 ('सम्झौताहिन', 1),
 ('कार्यालयमाा', 1),
 ('स्क्र्याब', 1),
 ('कापीलगायत', 1),
 ('धोक्रोभरि', 1),
 ('फोहरधनी', 1),
 ('उद्धमशिलता', 1),
 ('मोटिवेशनल', 1),
 ('श्रृस्टी', 1),
 ('मेंखु', 1),
 ('तान्डुकार', 1),
 ('रेनि', 1),
 ('सिमृति', 1),
 ('आरटिबीमा', 1),
 ('आरसिवीबाट', 1),
 ('आरटिबी', 1),
 ('ईन्जिनियरहरुलाई', 1),
 ('पुर्नसर्भे', 1),
 ('परम्परादेखी', 1),
 ('लाभग्राहिको', 1),
 ('लाभग्राहिका', 1),
 ('जनशक्तिलाईलाई', 1),
 ('आवश्वकता', 1),
 ('बीत', 1),
 ('विद्यर्थीहरुको', 1),
 ('खार्दुमा', 1),
 ('रामाकोट', 1),
 ('विद्यथालय', 1),
 ('विवादमाथिको', 1),
 ('मोटरेल', 1),
 ('सरलकर्जाको', 1),
 ('बस्नथाल्यो', 1),
 ('आसामुखीमात्रै', 1),
 ('पार्दशितालाई', 1),
 ('कार्यान्वनलाई', 1),
 ('अनुशिक्षणपछि', 1),
 ('आफुहरुपनि', 1),
 ('पुर्वगृहमन्त्री', 1),
 ('गेडीखोलामा', 1),
 ('शेराबेसी', 1),
 ('आरसिआइपीले', 1),
 ('आरसिआइपी', 1),
 ('लुखुं', 1),
 ('मानवअधिकाको', 1),
 ('चियरबाट', 1),
 ('मुलुङ्', 1),
 ('समरमाथाको', 1),
 ('फर्किहाल्छ।', 1),
 ('।उत्तरपट्टीको', 1),
 ('नग

In [206]:
'पुस्तकलयबाटे' in WORDS_full

False

In [321]:
from collections import Counter

m = Counter({k: WORDS_full[k] for k in lis if WORDS_full[k] <=34}).most_common()

In [322]:
len(m),len(lis),len(WORDS_full)

(1080716, 1171236, 1229736)

In [323]:
m

[('यहाँहरू', 34),
 ('खिचें', 34),
 ('उड्सलाई', 34),
 ('राष्ट्रप्रेमी', 34),
 ('दिमागका', 34),
 ('रुद्रमणि', 34),
 ('पक्षहरुले', 34),
 ('ठाउँठाउँ', 34),
 ('फिर्दै', 34),
 ('एफएनसीसीआई', 34),
 ('प्रतितपत्र', 34),
 ('कटाउनु', 34),
 ('विजुलीका', 34),
 ('एमटीओडब्लू', 34),
 ('राईसँगै', 34),
 ('भूखण्डमा', 34),
 ('गैरनाफामुखी', 34),
 ('मेरोतर्फबाट', 34),
 ('अविकास', 34),
 ('मंगलबारमात्रै', 34),
 ('गुज्रन', 34),
 ('गरिहाल्छु', 34),
 ('ननिकाल्ने', 34),
 ('आनन्ददेव', 34),
 ('रावणलाई', 34),
 ('हामीभित्रको', 34),
 ('निहु', 34),
 ('जुद्धोदय', 34),
 ('मालिकहरूको', 34),
 ('कमिसनर', 34),
 ('कोच्ने', 34),
 ('बनाएजस्तै', 34),
 ('बुतामा', 34),
 ('केटीभी', 34),
 ('आह्लादित', 34),
 ('वाक्यांशले', 34),
 ('लोडसेडिङमा', 34),
 ('कम्बो', 34),
 ('पूर्वाग्राही', 34),
 ('नसच्याई', 34),
 ('खबरै', 34),
 ('माननीयहरुले', 34),
 ('सम्प्रदायबीचको', 34),
 ('गोरखपुरबाट', 34),
 ('घोक्ने', 34),
 ('अक्सिजनका', 34),
 ('कथाभन्दा', 34),
 ('निकैनै', 34),
 ('दिऔँ', 34),
 ('विमर्शले', 34),
 ('कम्युनिस्टसँग', 34),
 ('एनसीसीले', 34),


In [309]:
WORDS_full['निशान्तले']

4

In [20]:
choices_list=[set() for i in range(len(sample_sentences[1].split())+1)]
const=0
for _ in other_choices:
    for sens in _:
        for i,w in enumerate(sens):
            index = i + const
            choices_list[index].add(w)
    const += len(other_choices[0][0])-1

In [22]:
choices_list

[{'।'},
 {'पुस्तकलयबाट'},
 {'कुल', 'खुला', 'ठूलो'},
 {'किताब', 'हिसाब'},
 {'गर्न', 'थप्न', 'पढ्न', 'पढ्ने', 'पार्न', 'भन्न', 'मर्न'},
 {'चाहन्छु'},
 {'।'}]

In [25]:


candidates_all('थुलो')

{'कालो',
 'किलो',
 'कुरो',
 'कुल',
 'कुलो',
 'खुला',
 'जालो',
 'जुडो',
 'झुटो',
 'ठुला',
 'ठुलो',
 'ठूलो',
 'ढिलो',
 'थलो',
 'थाले',
 'थियो',
 'थुलो',
 'धुलो',
 'धूलो',
 'निलो',
 'नीलो',
 'नौलो',
 'पालो',
 'पुल',
 'पुलको',
 'फलो',
 'फुल',
 'मुल',
 'युरो',
 'सुलभ',
 'हलो',
 'हिलो'}

In [26]:
WORDS

Counter({'<s>': 6884825,
         '।': 4667400,
         'र': 1697969,
         'छ': 1321657,
         'पनि': 1021030,
         'भएको': 537320,
         'लागि': 532843,
         'भने': 512636,
         'छन्': 482185,
         'गर्न': 479006,
         'गरेको': 473950,
         'हो': 421229,
         'यो': 409301,
         'गर्ने': 399109,
         'उनले': 347540,
         'तथा': 332883,
         'छ।': 313896,
         'थियो': 300393,
         'नै': 299612,
         'तर': 273086,
         'हुने': 269133,
         'एक': 245702,
         'नेपाल': 242579,
         'कुनै': 239024,
         'गरेका': 225245,
         'काम': 220636,
         'रहेको': 219705,
         'को': 213629,
         'छैन': 212175,
         'भएका': 196702,
         'थिए': 183621,
         'बताए': 182977,
         'गरी': 179047,
         'भन्ने': 176396,
         'गरिएको': 174957,
         'अनुसार': 174601,
         '१': 173013,
         'प्रतिक्रिया': 172118,
         'नयाँ': 165712,
         'मा': 165051,
         'वा': 

In [9]:
!pip install textdistance

import textdistance

def find_words_v3(word, dictionary):
    result = set()
    for w in dictionary:
        if textdistance.levenshtein.distance(word,w) <= 2:
            result.add(w)
    return result

dictionary = {"bat", "cat", "rat", "sat", "pat", "that", "this", "fat", "rat", "mat", "pat"}
print(find_words_v3("cat", dictionary))

{'bat', 'fat', 'cat', 'sat', 'mat', 'rat', 'pat', 'that'}


In [8]:
%pip install textdistance

Collecting textdistance
  Downloading textdistance-4.5.0-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.5.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'F:\SpellChecker\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [262]:
import textdistance

In [23]:
textdistance.levenshtein.distance('नपाल','पागल')

2

In [11]:
num = [chr(_) for _ in range(2406,2416)]
def numerics(w):
    if len(w)>1 and w.endswith('।'):
        return False
    for char in num:
        if char in w:
            return False
    return True




#Filter all words with characters not needed
WORDS_filtered = filter(numerics,WORDS_full)

lis = list(WORDS_filtered)
l = len(lis)

depth_dict = {0:lis[0:int(0.01*l)],1:lis[int(0.01*l):int(0.02*l)],2:lis[int(0.02*l):int(0.05*l)]  , 3:lis[int(0.05*l):int(0.1*l)] , 4:lis[int(0.1*l):int(0.5*l)],
5:lis[0:l]}


def check_distance(w, depth = 1,edit_distance = 2):

    count = 0
    candidates = []
    words = depth_dict[depth]
    for word in words:
        if(textdistance.levenshtein.distance(w, word)) <= edit_distance:
            count+=1
            candidates.append(word)
    return (candidates,count)

In [112]:
l

1216082

In [113]:
1216082*0.05

60804.100000000006

In [59]:
depth = 0


for word in sample_sentences[2].split():
    if len(word)<3:
        c,c_ = check_distance(word,depth,edit_distance = 1)
        for i in range(len(depth_dict)-1):
            if c_ < 1:
                c,c_ = check_distance(word, depth = depth+i+1,edit_distance = 1)
    else:
        c,c_ = check_distance(word, depth = depth)
        for i in range(len(depth_dict)-1):
            if c_ < 1:
                c,c_ = check_distance(word, depth = depth+i+1)
    if word not in c:
        c.append(word)
    print(c_,c)

def candidate_words(word,minimum = 1):
    if len(word)<3:
        c,c_ = check_distance(word,depth,edit_distance = 1)
        for i in range(len(depth_dict)-1):
            if c_ < minimum:
                c,c_ = check_distance(word, depth = depth+i+1,edit_distance = 1)
    else:
        c,c_ = check_distance(word, depth = depth)
        for i in range(len(depth_dict)-1):
            if c_ < minimum:
                c,c_ = check_distance(word, depth = depth+i+1)
    if word not in c:
        c.append(word)
    return c

    
    
#     c,c_ = check_distance(word, depth = depth)
#     if c_ < 2:
#         print("Entered Inside")
#         c,c_ = check_distance(word, depth = depth+1)
#         if c_ < 2:
#             print("Entered Inside, inside")
#             c,c_ = check_distance(word, depth = depth+2)
#     print((c_,c))
    

22 ['र', 'तर', 'त', 'घर', 'तै', 'कर', 'तल', 'सर', 'ती', 'तय', 'पर', 'तिर', 'थर', 'दर', 'फर', 'तरङ', 'डर', 'भर', 'तीर', 'तब', 'तह', 'तरल']
11 ['बस', 'उ', 'उड', 'स', 'यस', 'एस', 'टस', 'दस', 'जस', 'उप', 'उसो', 'उस']
30 ['समाजमा', 'मनमा', 'समय', 'समयमा', 'लयमा', 'सम्मान', 'सालमा', 'सबैमा', 'सदनमा', 'जम्मा', 'सडकमा', 'सभामा', 'ममा', 'संघमा', 'समाना', 'सोचमा', 'सम्म', 'सीमा', 'सहरमा', 'समयमै', 'सेपमा', 'समयका', 'सुनमा', 'सयमा', 'समिम', 'साथमा', 'सीमामा', 'समूहमा', 'सतहमा', 'सेलमा']
93 ['गति', 'भने', 'पनि', 'अघि', 'मन', 'भन्', 'न', 'सपना', 'अनि', 'पति', 'पो', 'छन्', 'नै', 'पाना', 'पछि', 'नि', 'कि', 'पटक', 'पर्', 'परे', 'कति', 'आएपनि', 'भनी', 'जति', 'भएपनि', 'धनी', 'मनन', 'झन्', 'उनी', 'आदि', 'परी', 'जना', 'नं', 'यति', 'पहल', 'ने', 'पर', 'पानी', 'उनै', 'पु', 'छवि', 'बने', 'यिनि', 'झनै', 'रन', 'पेट', 'ओपनर', 'अनिल', 'कवि', 'पाए', 'पुनः', 'पता', 'छनक', 'नौ', 'पेस', 'सन्', 'पुल', 'गरि', 'एने', 'ऐन', 'भनिए', 'पेश', 'पुस', 'पाने', 'पिस', 'पार', 'भनि', 'पुन', 'बि', 'बडि', 'पुष', 'तापनि', 'नत', 'टन',

In [20]:
len(WORDS)

12297

In [27]:
candidates_all('नेपाल') ,len(candidates_all('नेपाल'))

({'कपाल',
  'गोपाल',
  'चेपाङ',
  'नपाई',
  'नपाए',
  'नेकपा',
  'नेकपाले',
  'नेता',
  'नेताले',
  'नेपाल',
  'नेपालका',
  'नेपालकी',
  'नेपालकै',
  'नेपालको',
  'नेपालमा',
  'नेपालमै',
  'नेपालले',
  'नेपाली',
  'नेवार',
  'नेशनल',
  'नेसनल',
  'पाल',
  'बनेपा'},
 23)

In [55]:
ord('९')

2415

In [69]:
help(filter)

Help on class filter in module builtins:

class filter(object)
 |  filter(function or None, iterable) --> filter object
 |  
 |  Return an iterator yielding those items of iterable for which function(item)
 |  is true. If function is None, return the items that are true.
 |  
 |  Methods defined here:
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __next__(self, /)
 |      Implement next(self).
 |  
 |  __reduce__(...)
 |      Return state information for pickling.
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.



In [56]:
hex(2415)

'0x96f'

In [65]:
char_vocab = []
for _ in range(2304, 2432):
    if _ not in range(2406,2416):
        char_vocab += [chr(_)]
        
char_vocab = [chr(_) for _ in range(2304,2432) if _ not in range(2406,2416) and ]

In [67]:
def edits1(word):
    "All edits that are one edit away from `word`."
    letters = char_vocab
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1))

In [66]:
char_vocab

['ऀ',
 'ँ',
 'ं',
 'ः',
 'ऄ',
 'अ',
 'आ',
 'इ',
 'ई',
 'उ',
 'ऊ',
 'ऋ',
 'ऌ',
 'ऍ',
 'ऎ',
 'ए',
 'ऐ',
 'ऑ',
 'ऒ',
 'ओ',
 'औ',
 'क',
 'ख',
 'ग',
 'घ',
 'ङ',
 'च',
 'छ',
 'ज',
 'झ',
 'ञ',
 'ट',
 'ठ',
 'ड',
 'ढ',
 'ण',
 'त',
 'थ',
 'द',
 'ध',
 'न',
 'ऩ',
 'प',
 'फ',
 'ब',
 'भ',
 'म',
 'य',
 'र',
 'ऱ',
 'ल',
 'ळ',
 'ऴ',
 'व',
 'श',
 'ष',
 'स',
 'ह',
 'ऺ',
 'ऻ',
 '़',
 'ऽ',
 'ा',
 'ि',
 'ी',
 'ु',
 'ू',
 'ृ',
 'ॄ',
 'ॅ',
 'ॆ',
 'े',
 'ै',
 'ॉ',
 'ॊ',
 'ो',
 'ौ',
 '्',
 'ॎ',
 'ॏ',
 'ॐ',
 '॑',
 '॒',
 '॓',
 '॔',
 'ॕ',
 'ॖ',
 'ॗ',
 'क़',
 'ख़',
 'ग़',
 'ज़',
 'ड़',
 'ढ़',
 'फ़',
 'य़',
 'ॠ',
 'ॡ',
 'ॢ',
 'ॣ',
 '।',
 '॥',
 '॰',
 'ॱ',
 'ॲ',
 'ॳ',
 'ॴ',
 'ॵ',
 'ॶ',
 'ॷ',
 'ॸ',
 'ॹ',
 'ॺ',
 'ॻ',
 'ॼ',
 'ॽ',
 'ॾ',
 'ॿ']

In [60]:
ord('ॽ')

2429

In [68]:
len('यो')

2

In [188]:
count = 0
for i in WORDS_full:
    print(i)
    count+=1
    
    if count == 20:
        break
    

<s>
साँवा
अक्षर
कखरा
मा
बाह्रखरी
मात्रा
मिल्नाले
भाषा
समृद्ध
र
अर्थपूर्ण
बनेजस्तै
देशको
समाजिक
राजनीतिक
आर्थिक
अन्य
क्षेत्रको
स्थूल


In [261]:
WORDS_full.most_common()

[('<s>', 6884825),
 ('।', 4667400),
 ('र', 1697969),
 ('छ', 1321657),
 ('पनि', 1021030),
 ('भएको', 537320),
 ('लागि', 532843),
 ('भने', 512636),
 ('छन्', 482185),
 ('गर्न', 479006),
 ('गरेको', 473950),
 ('हो', 421229),
 ('यो', 409301),
 ('गर्ने', 399109),
 ('उनले', 347540),
 ('तथा', 332883),
 ('छ।', 313896),
 ('थियो', 300393),
 ('नै', 299612),
 ('तर', 273086),
 ('हुने', 269133),
 ('एक', 245702),
 ('नेपाल', 242579),
 ('कुनै', 239024),
 ('गरेका', 225245),
 ('काम', 220636),
 ('रहेको', 219705),
 ('को', 213629),
 ('छैन', 212175),
 ('भएका', 196702),
 ('थिए', 183621),
 ('बताए', 182977),
 ('गरी', 179047),
 ('भन्ने', 176396),
 ('गरिएको', 174957),
 ('अनुसार', 174601),
 ('१', 173013),
 ('प्रतिक्रिया', 172118),
 ('नयाँ', 165712),
 ('मा', 165051),
 ('वा', 163343),
 ('केही', 163185),
 ('२', 160067),
 ('हजार', 159135),
 ('गर्दै', 158163),
 ('सय', 152931),
 ('नेपाली', 152540),
 ('हुन्छ', 150135),
 ('त', 148197),
 ('सरकारले', 146865),
 ('स्थानीय', 145138),
 ('दुई', 144531),
 ('हुन', 141285),
 ('मात्र',

In [395]:
count = 0
for w in WORDS_full:
    print(w)
    if count == 10:
        break
    count+=1

<s>
साँवा
अक्षर
कखरा
मा
बाह्रखरी
मात्रा
मिल्नाले
भाषा
समृद्ध
र


In [533]:
ws = WORDS_full.most_common(300000)

In [534]:
ws_dict = dict(ws)

In [None]:
t_small = Trie().insert_list()

In [305]:
with open('vocabulary-dictionary.txt',encoding="utf8") as f:
    lines = [line.rstrip() for line in f]

In [307]:
import random
random.shuffle(lines)

In [535]:
vocab_lines = [word for word in ws_dict if word in lines]

In [536]:
len(vocab_lines)

17421

In [311]:
WORDS_full['ल्वाङ']

105

In [336]:




triples_dict = {}
count = 1
for w in vocab_lines:
#     filtered_words = filter(lambda word: True if textdistance.levenshtein.distance(w[0],word[0]) <= 1 and 
#                             w[1]>10*word[1] else False,ws)



#Only look for length greater than 
    if len(w)>4:
        count_ = WORDS_full[w]
        #c = candidate_words_trie(w,edit_probabs =(1,1) )
        c = list(t.search(w,1))
        filtered_words = list(filter(lambda word: True if count_>10*WORDS_full[word] else False,c))
        if len(filtered_words)>0 and len(filtered_words)<=2:
            triples_dict[w] = list(zip(filtered_words,[WORDS_full.get(word) for word in filtered_words]))
            count+=1
        if count%100 == 0:
            print("Finished: ",count)
        if count == 300:
            break

Finished:  100
Finished:  200
Finished:  200
Finished:  300


In [334]:
WORDS_full['रजस्वला']

267

In [342]:
triples_dict2

{'प्रतिक्रिया': [('प्रतिक्रया', 41), ('फ्रतिक्रिया', 48)],
 'निर्माण': [('निर्वाण', 192), ('निर्धाण', 47)],
 'आर्थिक': [('अर्थिक', 158), ('आर्थीक', 106)],
 'प्रक्रिया': [('फ्रक्रिया', 66), ('प्रक्रियमा', 26)],
 'प्रहरी': [('प्रहर', 140)],
 'प्रयोग': [('प्रयाग', 98)],
 'कार्यालय': [('कार्यलय', 834)],
 'अडियो': [('डियो', 32), ('लडियो', 45)],
 'उल्लेख': [('उललेख', 26)],
 'कृपया': [('कृपा', 453)],
 'स्वास्थ्य': [('स्वास्थय', 92), ('स्वस्थ्य', 896)],
 'लगानी': [('लगानि', 71), ('लागानी', 54)],
 'क्षेत्र': [('क्षेक्र', 239), ('क्षेत्री', 2380)],
 'सम्पन्न': [('सम्पन्', 193), ('समपन्न', 59)],
 'पक्राउ': [('पक्राऊ', 28)],
 'कर्मचारी': [('कर्मचरी', 42), ('कार्मचारी', 26)],
 'प्रकाशित': [('प्रकासित', 176), ('अप्रकाशित', 94)],
 'जग्गा': [('जग्गे', 28)],
 'घोषणा': [('घोषण', 343)],
 'पछिल्लो': [('पछील्लो', 37)],
 'उद्योग': [('उध्योग', 31)],
 'सहभागी': [('सहभागि', 1538)],
 'केन्द्र': [('एकेन्द्र', 133), ('कलेन्द्र', 74)],
 'प्रदान': [('प्रदा', 34)],
 'खेलाडी': [('खेलाडि', 161), ('खिलाडी', 34)],
 'विद

In [338]:
triples_dict2 = {}
count = 1
for w in vocab_lines:
#     filtered_words = filter(lambda word: True if textdistance.levenshtein.distance(w[0],word[0]) <= 1 and 
#                             w[1]>10*word[1] else False,ws)



#Only look for length greater than 
    if len(w)>4:
        count_ = WORDS_full[w]
        #c = candidate_words_trie(w,edit_probabs =(1,1) )
        c = list(t.search(w,1))
        filtered_words = list(filter(lambda word: True if count_>10*WORDS_full[word] else False,c))
        if len(filtered_words)>0 and len(filtered_words) <= 4:
            triples_dict[w] = list(zip(filtered_words,[WORDS_full.get(word) for word in filtered_words]))
            count+=1
        if count%100 == 0:
            print("Finished: ",count)
        if count == 300:
            break

Finished:  100
Finished:  100
Finished:  200
Finished:  300


In [340]:
triples_dict

{'प्रतिक्रिया': [('प्रतिक्रया', 41), ('फ्रतिक्रिया', 48)],
 'निर्माण': [('निर्वाण', 192), ('निर्धाण', 47)],
 'आर्थिक': [('अर्थिक', 158), ('आर्थीक', 106)],
 'प्रक्रिया': [('फ्रक्रिया', 66), ('प्रक्रियमा', 26)],
 'प्रहरी': [('प्रहर', 140)],
 'प्रयोग': [('प्रयाग', 98)],
 'कार्यालय': [('कार्यलय', 834)],
 'अडियो': [('डियो', 32), ('लडियो', 45)],
 'उल्लेख': [('उललेख', 26)],
 'कृपया': [('कृपा', 453)],
 'स्वास्थ्य': [('स्वास्थय', 92), ('स्वस्थ्य', 896)],
 'लगानी': [('लगानि', 71), ('लागानी', 54)],
 'क्षेत्र': [('क्षेक्र', 239), ('क्षेत्री', 2380)],
 'सम्पन्न': [('सम्पन्', 193), ('समपन्न', 59)],
 'पक्राउ': [('पक्राऊ', 28)],
 'कर्मचारी': [('कर्मचरी', 42), ('कार्मचारी', 26)],
 'प्रकाशित': [('प्रकासित', 176), ('अप्रकाशित', 94)],
 'जग्गा': [('जग्गे', 28)],
 'घोषणा': [('घोषण', 343)],
 'पछिल्लो': [('पछील्लो', 37)],
 'उद्योग': [('उध्योग', 31)],
 'सहभागी': [('सहभागि', 1538)],
 'केन्द्र': [('एकेन्द्र', 133), ('कलेन्द्र', 74)],
 'प्रदान': [('प्रदा', 34)],
 'खेलाडी': [('खेलाडि', 161), ('खिलाडी', 34)],
 'विद

In [539]:
triples_dict3 = {}
count = 1
scan = 0
for w in vocab_lines:
#     filtered_words = filter(lambda word: True if textdistance.levenshtein.distance(w[0],word[0]) <= 1 and 
#                             w[1]>10*word[1] else False,ws)



#Only look for length greater than 
    if len(w)>4:
        count_ = WORDS_full[w]
        #c = candidate_words_trie(w,edit_probabs =(1,1) )
        c = list(t.search(w,1))
        filtered_words = list(filter(lambda word: True if count_>5*WORDS_full[word] else False,c))
        if len(filtered_words)>0 and len(filtered_words) <= 2:
            triples_dict3[w] = list(zip(filtered_words,[WORDS_full.get(word) for word in filtered_words]))
            count+=1
        
#             if count%100 == 0:
#                 print("Finished: ",count)
    scan +=1
    if scan%100 == 0:
        print("Finished iteration: ",scan,"Count: ",count)
#         if count == 300:
#             break

Finished iteration:  100 Count:  7
Finished iteration:  200 Count:  20
Finished iteration:  300 Count:  36
Finished iteration:  400 Count:  51
Finished iteration:  500 Count:  70
Finished iteration:  600 Count:  100
Finished iteration:  700 Count:  123
Finished iteration:  800 Count:  154
Finished iteration:  900 Count:  184
Finished iteration:  1000 Count:  215
Finished iteration:  1100 Count:  238
Finished iteration:  1200 Count:  272
Finished iteration:  1300 Count:  309
Finished iteration:  1400 Count:  342
Finished iteration:  1500 Count:  377
Finished iteration:  1600 Count:  410
Finished iteration:  1700 Count:  444
Finished iteration:  1800 Count:  478
Finished iteration:  1900 Count:  513
Finished iteration:  2000 Count:  544
Finished iteration:  2100 Count:  570
Finished iteration:  2200 Count:  597
Finished iteration:  2300 Count:  628
Finished iteration:  2400 Count:  659
Finished iteration:  2500 Count:  688
Finished iteration:  2600 Count:  726
Finished iteration:  2700 C

KeyboardInterrupt: 

# Run this

In [542]:
# with open('triples.pickle','wb') as f:
#     pickle.dump(triples_dict3,f)

with open('triples.pickle','rb') as f:
    triples_dict3 = pickle.load(f)

In [543]:
triples_dict3

{'प्रतिक्रिया': [('प्रतिक्रया', 41), ('फ्रतिक्रिया', 48)],
 'निर्माण': [('निर्वाण', 192), ('निर्धाण', 47)],
 'आर्थिक': [('अर्थिक', 158), ('आर्थीक', 106)],
 'प्रक्रिया': [('फ्रक्रिया', 66), ('प्रक्रियमा', 26)],
 'प्रहरी': [('प्रहर', 140)],
 'प्रयोग': [('प्रयाग', 98)],
 'कार्यालय': [('कार्यलय', 834)],
 'अडियो': [('डियो', 32), ('लडियो', 45)],
 'उल्लेख': [('उललेख', 26)],
 'कृपया': [('कृपा', 453)],
 'स्वास्थ्य': [('स्वास्थय', 92), ('स्वस्थ्य', 896)],
 'लगानी': [('लगानि', 71), ('लागानी', 54)],
 'क्षेत्र': [('क्षेक्र', 239), ('क्षेत्री', 2380)],
 'सम्पन्न': [('सम्पन्', 193), ('समपन्न', 59)],
 'पक्राउ': [('पक्राऊ', 28)],
 'कर्मचारी': [('कर्मचरी', 42), ('कार्मचारी', 26)],
 'प्रकाशित': [('प्रकासित', 176), ('अप्रकाशित', 94)],
 'जग्गा': [('जग्गे', 28)],
 'घोषणा': [('घोषण', 343)],
 'पछिल्लो': [('पछिल्ला', 6950), ('पछील्लो', 37)],
 'उद्योग': [('उध्योग', 31), ('उद्योगी', 4298)],
 'सहभागी': [('सहभागि', 1538)],
 'केन्द्र': [('एकेन्द्र', 133), ('कलेन्द्र', 74)],
 'प्रदान': [('प्रदा', 34)],
 'खेलाडी': [(

In [546]:
def make_triples(triples_dict):
    triples = []
    for iw in triples_dict:
        for ow in triples_dict[iw]:
            triples.append((ow[0],iw,ow[1]))    
    return triples
            

In [547]:
triples = make_triples(triples_dict3)

In [554]:
len(triples),triples

(3060,
 [('प्रतिक्रया', 'प्रतिक्रिया', 41),
  ('फ्रतिक्रिया', 'प्रतिक्रिया', 48),
  ('निर्वाण', 'निर्माण', 192),
  ('निर्धाण', 'निर्माण', 47),
  ('अर्थिक', 'आर्थिक', 158),
  ('आर्थीक', 'आर्थिक', 106),
  ('फ्रक्रिया', 'प्रक्रिया', 66),
  ('प्रक्रियमा', 'प्रक्रिया', 26),
  ('प्रहर', 'प्रहरी', 140),
  ('प्रयाग', 'प्रयोग', 98),
  ('कार्यलय', 'कार्यालय', 834),
  ('डियो', 'अडियो', 32),
  ('लडियो', 'अडियो', 45),
  ('उललेख', 'उल्लेख', 26),
  ('कृपा', 'कृपया', 453),
  ('स्वास्थय', 'स्वास्थ्य', 92),
  ('स्वस्थ्य', 'स्वास्थ्य', 896),
  ('लगानि', 'लगानी', 71),
  ('लागानी', 'लगानी', 54),
  ('क्षेक्र', 'क्षेत्र', 239),
  ('क्षेत्री', 'क्षेत्र', 2380),
  ('सम्पन्', 'सम्पन्न', 193),
  ('समपन्न', 'सम्पन्न', 59),
  ('पक्राऊ', 'पक्राउ', 28),
  ('कर्मचरी', 'कर्मचारी', 42),
  ('कार्मचारी', 'कर्मचारी', 26),
  ('प्रकासित', 'प्रकाशित', 176),
  ('अप्रकाशित', 'प्रकाशित', 94),
  ('जग्गे', 'जग्गा', 28),
  ('घोषण', 'घोषणा', 343),
  ('पछिल्ला', 'पछिल्लो', 6950),
  ('पछील्लो', 'पछिल्लो', 37),
  ('उध्योग', 'उद्योग', 

In [1004]:
class BrillMore:
    def __init__(self, N=2, max_candidates=10):
        self.N = N
        self.max_candidates = max_candidates
        self.triples = []
        self.edit_dict = defaultdict(int)
        self.count_dict = defaultdict(int)
        self.alphabet = set()
        
    
    def fit(self, triples,error_rate = 0.8):
        self.error_rate = error_rate
        self.triples = triples
        for x, w, count in triples:
            self.alphabet.update(set(x+w))
            alignment = self.align(x, w)
            edits = self._edits_from_alignment(alignment)
            #print(edits)            
            for edit in edits:
                self.edit_dict[edit] += count
        for edit in self.edit_dict.keys():
            self.count_dict[edit[0]] += self.edit_dict[edit]
        for edit in self.edit_dict.keys():
            self.edit_dict[edit] /= self.count_dict[edit[0]]
            self.edit_dict[edit]*= (1-self.error_rate)
        
    def _edits_from_alignment(self, alignment):
        edits = []
#        print(len(alignment))
        for a, b in alignment:           
            if a != b:
                edits.append((a,b))
        return edits
#         expanded_edits = []
#         for i, (a, b) in enumerate(edits):
#             for j in range(1, self.N+1):
#                 if i+j < len(edits):
#                     expanded_edits.append((a+edits[i+j][0], b+edits[i+j][1]))
#         return expanded_edits
    
    def likelihood(self,x,w):
        """
        
        
        """        
        prob = 1
        if (x == w):
            return self.error_rate
        alignment = self.align(x, w)        
        edits = self._edits_from_alignment(alignment)
        for edit in edits:
            prob*= self.edit_dict.get(edit,0.0005*(1-self.error_rate))            
        return prob
    
    
    
    
    def likelihood_from_list(self,x,W):
        likelihood_ordered_list = []
        for w in W:
            prob = 1
            if (x == w):
                prob = self.error_rate
            else:
                alignment = self.align(x, w)
                edits = self._edits_from_alignment(alignment)
                for edit in edits:
                    prob*= self.edit_dict.get(edit,0.0005*(1-self.error_rate))
            likelihood_ordered_list.append((prob,w))
        return sorted(likelihood_ordered_list)
            
        
        
    
    def transform(self, x):
        candidates = []
        candidate_probs = []
        for w in self.alphabet:
            alignment = self.align(x, w)
            edits = self._edits_from_alignment(alignment)
            prob = 1
            for a, b in edits:
                prob *= self.edit_dict.get((a,b), 0)
            if len(candidates) < self.max_candidates:
                candidates.append(w)
                candidate_probs.append(prob)
            elif prob > min(candidate_probs):
                min_idx = candidate_probs.index(min(candidate_probs))
                candidates[min_idx] = w
                candidate_probs[min_idx] = prob
        return candidates[candidate_probs.index(max(candidate_probs))]



    def align(self,x, w):
        m = len(x)
        n = len(w)
        dp = [[0 for j in range(n + 1)] for i in range(m + 1)]
        for i in range(m + 1):
            for j in range(n + 1):
                if i == 0:
                    dp[i][j] = j
                elif j == 0:
                    dp[i][j] = i
                elif x[i - 1] == w[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1]
                else:
                    dp[i][j] = min(dp[i][j - 1], dp[i - 1][j], dp[i - 1][j - 1]) + 1
        i = m
        j = n
        alignments = []
        while i > 0 and j > 0:
            if x[i - 1] == w[j - 1]:
                alignments.append((x[i - 1], w[j - 1]))
                i -= 1
                j -= 1
            elif dp[i][j] == dp[i - 1][j - 1] + 1:
                alignments.append((x[i - 1], w[j - 1]))
                i -= 1
                j -= 1
            elif dp[i][j] == dp[i][j - 1] + 1:
                alignments.append((None, w[j - 1]))
                j -= 1
            else:
                alignments.append((x[i - 1], None))
                i -= 1
        while i > 0:
            alignments.append((x[i - 1], None))
            i -= 1
        while j > 0:
            alignments.append((None, w[j - 1]))
            j -= 1
        return alignments[::-1]


In [1005]:
bm = BrillMore()
bm.fit(triples,error_rate = 0.65)

In [1006]:
bm.transform('ब्यावसायिक')

'ब'

In [1056]:
class BrillMoreAcc:
    def __init__(self, N=2, max_candidates=10):
        self.N = N
        self.max_candidates = max_candidates
        self.triples = []
        self.edit_dict = defaultdict(int)
        self.count_dict = defaultdict(int)
        self.alphabet = set()
        
    
    def fit(self, triples,error_rate = 0.97):
        self.error_rate = error_rate
        self.triples = triples
        for x, w, count in triples:
            self.alphabet.update(set(x+w))
            alignment = self.align(x, w)
            edits = self._edits_from_alignment(alignment)
            #print(edits)            
            for edit in edits:
                self.edit_dict[edit] += count
        for edit in self.edit_dict.keys():
            self.count_dict[edit[0]] += self.edit_dict[edit]
        for edit in self.edit_dict.keys():
            self.edit_dict[edit] /= self.count_dict[edit[0]]
        
    def _edits_from_alignment(self, alignment):
        edits = []
#        print(len(alignment))
        for a, b in alignment:           
            if a != b:
                edits.append((a,b))
        return edits
    
    
    
    def likelihood(self,x,w):
        """
        
        
        """        
        prob = 1
        if (x == w):
            return (self.error_rate)**len(x)
        alignment = self.align(x, w)        
        edits = self._edits_from_alignment(alignment)
        for edit in edits:
            prob*= self.edit_dict.get(edit,0.0005)            
        return prob* (1-((self.error_rate)**len(x)))
    
    
    
    
    def align(self,x, w):
        m = len(x)
        n = len(w)
        dp = [[0 for j in range(n + 1)] for i in range(m + 1)]
        for i in range(m + 1):
            for j in range(n + 1):
                if i == 0:
                    dp[i][j] = j
                elif j == 0:
                    dp[i][j] = i
                elif x[i - 1] == w[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1]
                else:
                    dp[i][j] = min(dp[i][j - 1], dp[i - 1][j], dp[i - 1][j - 1]) + 1
        i = m
        j = n
        alignments = []
        while i > 0 and j > 0:
            if x[i - 1] == w[j - 1]:
                alignments.append((x[i - 1], w[j - 1]))
                i -= 1
                j -= 1
            elif dp[i][j] == dp[i - 1][j - 1] + 1:
                alignments.append((x[i - 1], w[j - 1]))
                i -= 1
                j -= 1
            elif dp[i][j] == dp[i][j - 1] + 1:
                alignments.append((None, w[j - 1]))
                j -= 1
            else:
                alignments.append((x[i - 1], None))
                i -= 1
        while i > 0:
            alignments.append((x[i - 1], None))
            i -= 1
        while j > 0:
            alignments.append((None, w[j - 1]))
            j -= 1
        return alignments[::-1]            
        

In [1057]:
bma = BrillMoreAcc()

In [1058]:
bma.fit(triples)

In [1059]:
bma.likelihood('पुस्तकलयबाटे','पुस्तकालयबाट')

0.019886762528226548

In [1045]:
len('पुस्तकलयबाटे')

12

In [988]:
sorted(bm.edit_dict.values())

[0.00024914874179885393,
 0.00025745369985881575,
 0.000279778249831615,
 0.0002989784901586247,
 0.0003005025646339568,
 0.00032122687943629866,
 0.00032122687943629866,
 0.0003571131965783573,
 0.0003937619812444952,
 0.00043073609722329054,
 0.00043079377742321506,
 0.00043185781911801344,
 0.0004461195292669795,
 0.00046770421740182245,
 0.0004999596806709136,
 0.0005230366894854242,
 0.000564470607209096,
 0.0005967260704781872,
 0.0006222576785001995,
 0.0006394817706170584,
 0.0006528159162737683,
 0.0006560916867369821,
 0.0007023933402705516,
 0.0007096201919200066,
 0.0007142263931567146,
 0.0007253510180819647,
 0.0007308363092766382,
 0.0007326007326007326,
 0.000760777683854607,
 0.0007800780078007801,
 0.0007800780078007801,
 0.0007889546351084813,
 0.0007972759737563326,
 0.0008225143133618256,
 0.0008304958059961797,
 0.0008388007640561416,
 0.0008456322297566814,
 0.0008637156382360269,
 0.0008700870087008701,
 0.0008807833790995287,
 0.0009325941661053833,
 0.00094295

In [1046]:
bma.edit_dict

defaultdict(int,
            {(None, 'ि'): 0.0010271156880657763,
             ('फ', 'प'): 0.00770428015564203,
             ('व', 'म'): 0.0004754860524091297,
             ('ध', 'म'): 0.0028040540540540566,
             ('अ', 'आ'): 0.0023803329864724265,
             ('ी', 'ि'): 0.009722294181648628,
             ('म', None): 0.0042848992012171966,
             (None, 'ी'): 0.0029324806909725135,
             ('ा', 'ो'): 0.014236212599030856,
             (None, 'ा'): 0.004973631758159625,
             (None, 'अ'): 0.0011373640063117691,
             ('ल', 'अ'): 0.0003039171544349395,
             (None, '्'): 0.0036157711153558705,
             (None, 'य'): 0.0004995432273067025,
             ('ि', 'ी'): 0.02544076585560433,
             ('ा', None): 0.01154449657718638,
             ('क', 'त'): 0.0012168328544870722,
             ('ी', None): 0.017679653903942816,
             (None, 'न'): 0.0023731417656340857,
             ('ऊ', 'उ'): 0.030000000000000027,
             ('स', 'श'):

In [450]:
align('प्रतिक्रिया','प्रतिक्रया')

[('प', 'प'),
 ('्', '्'),
 ('र', 'र'),
 ('त', 'त'),
 ('ि', 'ि'),
 ('क', 'क'),
 ('्', '्'),
 ('र', 'र'),
 ('ि', None),
 ('य', 'य'),
 ('ा', 'ा')]

In [555]:
bm.likelihood('प्रतिक्रया','प्रतिक्रिया')

0.03423718960219251

In [556]:
bm.likelihood('बिवरण','विवरण')

0.9123054592371583

In [934]:
bm.likelihood('आ','न')

0.0005

In [469]:
final_candidate_words('छुन')

time passed fc:  0.497739315032959
['छुन', 'छन', 'छिन', 'छुट', 'छुने', 'धुन', 'हुन', 'कुन', 'छु', 'छैन', 'जुन', 'नुन', 'पुन', 'रुन', 'सुन']


['छुन', 'छन', 'छिन', 'छुट', 'छुने', 'धुन', 'हुन']

In [970]:
bm.likelihood_from_list('छुन', ['छुन', 'छन', 'छिन', 'छुट', 'छुने', 'धुन', 'हुन'])

NameError: name 'error_rate' is not defined

In [969]:
sum = 0
for k in bm.edit_dict:
    if k[0] == None:
        sum+=bm.edit_dict[k]
print(sum)
del sum

0.19999999999999996


In [396]:
for a,b in zip(*bm.align(*triples[0][:2])):
    print(a,b)

ValueError: too many values to unpack (expected 2)

In [393]:
a,b

(('प', '्', 'र', 'त', 'ि', 'क', '्', 'र', None, 'य', 'ा'),
 ('प', '्', 'र', 'त', 'ि', 'क', '्', 'र', 'ि', 'य', 'ा'))

In [375]:

b = align('विवरण','बिवरण')
b

[('व', 'ब'), ('ि', 'ि'), ('व', 'व'), ('र', 'र'), ('ण', 'ण')]

In [374]:
list(zip(*b))

[('व', 'ि', 'व', 'र', 'ण'), ('ब', 'ि', 'व', 'र', 'ण')]

In [366]:
align('व्यावसायिक','ब्यावसायिक')

[('व', 'ब'),
 ('्', '्'),
 ('य', 'य'),
 ('ा', 'ा'),
 ('व', 'व'),
 ('स', 'स'),
 ('ा', 'ा'),
 ('य', 'य'),
 ('ि', 'ि'),
 ('क', 'क')]

In [359]:
em = substring_error_model(triples)

In [351]:
error_model = bill_moore_model(triples,2)

In [329]:
WORDS_full['फैसाला']

29

In [293]:
len('गर्ने')

5

In [285]:
td 

{'<s>': [],
 '।': [('प', 2609),
  ('ल', 2748),
  ('उ', 2640),
  ('अ', 1919),
  ('व', 1488),
  ('स', 3943),
  ('च', 1265),
  ('ज', 886),
  ('घ', 854),
  ('औ', 806),
  ('ओ', 1016),
  ('इ', 1055),
  ('ः', 1967)],
 'र': [('प', 2609),
  ('ल', 2748),
  ('उ', 2640),
  ('अ', 1919),
  ('व', 1488),
  ('स', 3943),
  ('च', 1265),
  ('ज', 886),
  ('घ', 854),
  ('औ', 806),
  ('ओ', 1016),
  ('इ', 1055),
  ('ः', 1967),
  ('रक', 768),
  ('रस', 1059),
  ('रङ', 1854),
  ('रथ', 1363),
  ('पर', 3292),
  ('भर', 4075),
  ('गर', 3735),
  ('थर', 1163),
  ('सर', 2539),
  ('आर', 1457),
  ('फर', 2466)],
 'छ': [('प', 2609),
  ('ल', 2748),
  ('उ', 2640),
  ('अ', 1919),
  ('व', 1488),
  ('स', 3943),
  ('च', 1265),
  ('ज', 886),
  ('घ', 854),
  ('औ', 806),
  ('ओ', 1016),
  ('इ', 1055),
  ('ः', 1967),
  ('छौ', 3837),
  ('छठ', 1505)],
 'पनि': [('पति', 3112), ('भनि', 1287), ('उनि', 897)],
 'भएको': [],
 'लागि': [('लागे', 4574), ('लागु', 3512)],
 'भने': [('ने', 2456), ('भनि', 1287), ('भनेँ', 1417), ('भनें', 1239)],
 'छन्'

In [261]:
m = str(kn_lm2.lm[0].most_common(200000))

True

In [253]:

def get_context_words(word,model):
    lm = kn_lm2.lm[0].most_common()
    before = None
    after = None
    for tup in lm:
        if word == tup[0][0] and after == None:
            after = tup[0][1]
        elif word == tup[0][1] and before == None:
            before = tup[0][0]
    return before,after


In [258]:
len(kn_lm2.lm[0].most_common())

15558470

In [257]:
get_context_words('आफनो',kn_lm2)

('गाउनेहरुले', 'र')

In [264]:
WORDS_full['आफ्नै']

35266

In [233]:
triples_dict['आफ्नो']

['आफनो', 'आफ्नी', 'आआफ्नो', 'आआफ्ना']

In [234]:
candidate_words_trie('आफ्नो')

['आफनो', 'आफ्नो', 'आफ्नै', 'आफ्ना', 'आफ्नी', 'आआफ्नो', 'आआफ्ना']

In [235]:
candidate_words('आफ्नो')

['आफ्नो', 'आफ्नै', 'आफ्ना', 'आफ्नी', 'आफनो', 'आआफ्नो', 'आआफ्ना']

In [237]:
textdistance.levenshtein.distance('आफ्नो','आआफ्ना')

2

In [420]:
%pip install pygtrie

Collecting pygtrie
  Downloading pygtrie-2.5.0-py3-none-any.whl (25 kB)
Installing collected packages: pygtrie
Successfully installed pygtrie-2.5.0
Note: you may need to restart the kernel to use updated packages.


In [134]:
bool([])

textdistance.levenshtein.distance('गील','किताब')

5

In [33]:
t.root.is_end_of_word

False

In [108]:
'पुस्तकालयबाट' in lis[:110000]

True

In [36]:
t.root.children['ग'].children['र'].children['े'].children['क'].children['ो'].is_end_of_word

True

In [110]:
import time
word = 'मेरो'
s = time.time()
phonetic_distance(word,list(t.search(word,2)))
e = time.time()
print("End: ",e-s)
phonetic_distance(word,list(t.search(word,2)))

End:  0.4667494297027588


['मेरो',
 'एरो',
 'तेरो',
 'मरो',
 'मेको',
 'मेट्रो',
 'मेरा',
 'मेरी',
 'मेरे',
 'मेलो',
 'मेसो']

In [139]:
s = time.time()
t.search('पुस्तकलयबाटे',2)
e = time.time()
print("End: ",e-s)


t.search('पुस्तकलयबाटे',2)

End:  3.097593069076538


['पुस्तकालयबाट']

In [140]:
start_depth = 0
final_candidate_words('पुस्तकलयबाटे')

Entered depth,  1
Entered depth,  2
Entered depth,  3
time passed fc:  9.484205961227417
['पुस्तकालयबाट', 'पुस्तकलयबाटे']


['पुस्तकालयबाट', 'पुस्तकलयबाटे']

In [453]:
import networkx as nx
import matplotlib.pyplot as plt

class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        current = self.root
        for letter in word:
            if letter not in current.children:
                current.children[letter] = TrieNode()
            current = current.children[letter]
        current.is_end_of_word = True

    def find_words_within_distance_two(self, word):
        current = self.root
        queue = []
        for letter in word:
            if letter not in current.children:
                return []  # word not in trie
            queue.append(letter)
            current = current.children[letter]

        words_within_distance = []

        def _find_words_within_distance_two_helper(current, word):
            if current.is_end_of_word:
                if textdistance.levenshtein.distance(word,word)<=2:
                    words_within_distance.append(word)
            for letter, child_node in current.children.items():
                _find_words_within_distance_two_helper(child_node, word + letter)

        _find_words_within_distance_two_helper(current, "")
        return words_within_distance

    def visualize(self):
        G = nx.DiGraph()
        queue = [(self.root, None)]
        while queue:
            current, parent = queue.pop(0)
            for letter, child in current.children.items():
                G.add_edge(parent, child, letter=letter)
                queue.append((child, current))
        pos = nx.spring_layout(G)
        nx.draw_networkx_nodes(G, pos)
        nx.draw_networkx_edges(G, pos)
        nx.draw_networkx_edge_labels(G, pos)
        plt.show()


ModuleNotFoundError: No module named 'networkx'

In [439]:
from pygtrie import CharTrie
from string import ascii_lowercase

trie = CharTrie()
tr = ["hello", "world", "hi", "hell", "hey", "goodbye"]

for w in tr:
    trie[w] = True


In [440]:
trie

CharTrie([('hell', True), ('hello', True), ('hey', True), ('hi', True), ('world', True), ('goodbye', True)])

In [407]:
triples_dict.keys()

dict_keys([('<s>', 6884825), ('।', 4667400), ('र', 1697969), ('छ', 1321657), ('पनि', 1021030), ('भएको', 537320), ('लागि', 532843), ('भने', 512636), ('छन्', 482185), ('गर्न', 479006), ('गरेको', 473950), ('हो', 421229), ('यो', 409301), ('गर्ने', 399109), ('उनले', 347540), ('तथा', 332883), ('छ।', 313896), ('थियो', 300393), ('नै', 299612), ('तर', 273086), ('हुने', 269133), ('एक', 245702), ('नेपाल', 242579), ('कुनै', 239024), ('गरेका', 225245), ('काम', 220636), ('रहेको', 219705), ('को', 213629), ('छैन', 212175), ('भएका', 196702), ('थिए', 183621), ('बताए', 182977), ('गरी', 179047), ('भन्ने', 176396), ('गरिएको', 174957), ('अनुसार', 174601), ('१', 173013), ('प्रतिक्रिया', 172118), ('नयाँ', 165712), ('मा', 165051), ('वा', 163343), ('केही', 163185), ('२', 160067), ('हजार', 159135), ('गर्दै', 158163), ('सय', 152931), ('नेपाली', 152540), ('हुन्छ', 150135), ('त', 148197), ('सरकारले', 146865), ('स्थानीय', 145138), ('दुई', 144531), ('हुन', 141285), ('मात्र', 140135), ('आफ्नो', 138378), ('कारण', 13387

In [410]:
triples_dict[('नयाँ', 165712)]

[('नया', 1175)]

In [411]:
triples_dict[('पाँच', 37801)]

[('पाँचौ', 2655), ('पाँचै', 554)]

In [415]:
triples_dict[('मलाई', 42286)]

[('माई', 729), ('मनलाई', 665), ('मलाइ', 569)]

In [416]:
triples_dict[('भारतीय', 49175)]

[('भारतिय', 491), ('भारती', 482)]

In [417]:
triples_dict[('पानी', 40287)]

[('खानी', 3224),
 ('पारी', 2541),
 ('रानी', 2199),
 ('हानी', 1575),
 ('नानी', 983),
 ('जानी', 843),
 ('पान', 743),
 ('मानी', 697),
 ('पाटी', 640),
 ('सानी', 613),
 ('पनी', 601),
 ('पाना', 541),
 ('पाली', 367)]

In [418]:
triples_dict[('समिति', 37705)]

[]

In [413]:
triples_dict[('नीति', 40808)]

[('नाति', 983), ('नीजि', 634), ('गीति', 498)]

In [400]:
help(filter)

Help on class filter in module builtins:

class filter(object)
 |  filter(function or None, iterable) --> filter object
 |  
 |  Return an iterator yielding those items of iterable for which function(item)
 |  is true. If function is None, return the items that are true.
 |  
 |  Methods defined here:
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __next__(self, /)
 |      Implement next(self).
 |  
 |  __reduce__(...)
 |      Return state information for pickling.
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.

