In [35]:
import regex as re
from collections import Counter
import itertools
import math
import pickle
import numpy


# Fraction of the vocabulary (most frequent entries first) kept in WORDS.
sample_factor = 0.01
# Fraction of the vocabulary (most frequent entries first) kept in WORDS2.
sample_factor2 = 0.05


# Factor applied by constant_distributive_likelihood for every token that a
# candidate sentence leaves unchanged; the remaining 1-alpha is shared among
# the candidates of a changed token.
alpha = 0.65

def words(text):
    """Split `text` into tokens made of characters from U+0900-U+097F.

    Every danda (U+0964) is first surrounded by spaces, so a danda glued to
    a word is returned as a separate token. Characters outside the block act
    as separators and are dropped.
    """
    spaced = re.sub(r'[\u0964]', r'\u0020\u0964\u0020', text)
    lowered = spaced.lower()
    return re.findall(r'[\u0900-\u097F]+', lowered)


def words_bigram(text):
    """Return the token pairs of `text` as 2-tuples of strings.

    A pair is two U+0900-U+097F runs separated by exactly one whitespace
    character. Matching is overlapped (a feature of the `regex` package
    imported as `re`), so a token can end one pair and start the next.
    """
    spaced = re.sub(r'[\u0964]', r'\u0020\u0964\u0020', text)
    matches = re.findall(r'\b[\u0900-\u097F]+\s[\u0900-\u097F]+', spaced.lower(), overlapped=True)
    pairs = []
    for match in matches:
        pairs.append(tuple(match.split()))
    return pairs

def words_trigram(text):
    """Return the token triples of `text` as matched strings.

    Unlike words_bigram, each result is the raw matched text (three tokens
    joined by their original single whitespace characters), not a tuple.
    Matching is overlapped via the `regex` package imported as `re`.
    """
    spaced = re.sub(r'[\u0964]', r'\u0020\u0964\u0020', text)
    pattern = r'\b[\u0900-\u097F]+\s[\u0900-\u097F]+\s[\u0900-\u097F]+'
    return re.findall(pattern, spaced.lower(), overlapped=True)


def words_bigram_from_list(sentence):
    """Pair every element of the sequence `sentence` with its successor."""
    return list(zip(sentence, sentence[1:]))


# Characters used as the edit alphabet: code points 2304-2431 (U+0900-U+097F)
# except 2406-2415, the ten Devanagari digits.
char_vocab = [
    chr(code_point)
    for code_point in range(2304, 2432)
    if not 2406 <= code_point < 2416
]


def change_keys(d):
    """Return a copy of `d` in which each key is replaced by its first element."""
    return {key[0]: value for key, value in d.items()}

# Unigram counts. NOTE(review): pickle.load executes arbitrary code from the
# file, so these files must come from a trusted source.
with open('data/saved_words_counter1','rb') as inputfile:
    WORDS = pickle.load(inputfile) 
    # Keys are re-keyed to their first element (see change_keys).
    WORDS = Counter(change_keys(WORDS))

# Bigram counts; probability_bigram looks them up by tuple.
with open('data/saved_words_counter2','rb') as inputfile:
    WORDS_bigram = pickle.load(inputfile) 

# No separate trigram table is loaded: this is the same object as WORDS_bigram.
WORDS_trigram = WORDS_bigram


# Vocabulary partitions (most frequent entries first):
#   WORDS_full - every entry
#   WORDS2     - top sample_factor2 share of the entries
#   WORDS      - top sample_factor share of the entries
# WORDS2 must be computed before WORDS is rebound on the last line.
WORDS_full = WORDS
WORDS2 = Counter(dict(WORDS.most_common(int(sample_factor2*len(WORDS)))))
WORDS = Counter(dict(WORDS.most_common(int(sample_factor*len(WORDS)))))




#Token Probability
def probability(word, N=sum(WORDS.values())):
    """Add-one estimate for `word`: (count in WORDS + 1) / N.

    `N` defaults to the total of all counts in WORDS, computed once when
    this function is defined.
    """
    occurrences = WORDS[word]
    return (occurrences + 1) / N

def probability_bigram(bi_word, N=sum(WORDS_bigram.values())):
    """Add-one estimate for the bigram `bi_word` (a 2-tuple): (count + 1) / N.

    `N` defaults to the total of all counts in WORDS_bigram, computed once
    when this function is defined.
    """
    occurrences = WORDS_bigram[bi_word]
    return (occurrences + 1) / N

def probability_trigram(tri_word, N=sum(WORDS_trigram.values())):
    """Add-one estimate for the trigram `tri_word`: (count + 1) / N.

    Counts come from WORDS_trigram, which this notebook binds to the same
    object as WORDS_bigram. `N` defaults to the total of its counts,
    computed once when this function is defined.
    """
    occurrences = WORDS_trigram[tri_word]
    return (occurrences + 1) / N


# Keys of WORDS in the Counter's iteration order.
words_list = list(WORDS)


# Likelihood models
def likelihood(sentence,N=len(words_list)):
    """Multiply one factor per element of `sentence`.

    An element missing from WORDS contributes 0.95. An element found at
    index i of words_list contributes 0.05 * probability(words_list[N-1-i]),
    i.e. the probability of the word at the mirrored position.

    `sentence` is iterated directly, so a str argument is walked character
    by character. `N` defaults to len(words_list) at definition time.
    """
    score = 1
    for token in sentence:
        if token in WORDS:
            mirrored = words_list[N - 1 - words_list.index(token)]
            score *= 0.05*probability(mirrored)
        else:
            score *= 0.95
    return score

def constant_distributive_likelihood(sentence,candidate_sentence,candidate_count):
    """Channel score of `candidate_sentence` given the typed `sentence`.

    `sentence` is split on whitespace and zipped with `candidate_sentence`.
    A position where both agree contributes `alpha`; a position where they
    differ contributes (1 - alpha) / candidate_count[position].

    Returns the product of the per-position factors.
    """
    score = 1
    typed = sentence.split()
    for position, (word, candidate_word) in enumerate(zip(typed, candidate_sentence)):
        if word != candidate_word:
            score *= (1-alpha)/candidate_count[position]
        else:
            score *= alpha
    return score


def edits1(word):
    """Return the set of strings one edit away from `word`.

    An edit is a deletion, an adjacent transposition, a replacement by a
    character of char_vocab, or an insertion of a character of char_vocab.
    """
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            # deletion of the first character of the tail
            variants.add(head + tail[1:])
            # replacement of that character by every vocabulary character
            for ch in char_vocab:
                variants.add(head + ch + tail[1:])
        if len(tail) > 1:
            # swap of the two characters that open the tail
            variants.add(head + tail[1] + tail[0] + tail[2:])
        # insertion at the cut position
        for ch in char_vocab:
            variants.add(head + ch + tail)
    return variants


def edits2(word):
    """Return the set of strings obtained by applying edits1 twice to `word`."""
    variants = set()
    for once in edits1(word):
        variants.update(edits1(once))
    return variants



def edits2_(word):
    """Known words two edits from `word`, reached through a known intermediate.

    Every one-edit variant of `word` that is in the full vocabulary is edited
    once more, and the known results are collected. Previously the function
    returned from inside the loop, so only the first intermediate was
    expanded, and it returned None (breaking set.union in the caller) when
    no intermediate was known.

    Returns:
        A set of words; empty when no one-edit variant is known.
    """
    reachable = set()
    for e1 in known_from_full(edits1(word)):
        reachable |= known_from_full(edits1(e1))
    return reachable
# Approximation: two-edit words whose intermediate form is not itself a known
# word are not found.


def edits3(word):
    """Return strings one edit away from any known two-edit variant of `word`.

    Only the two-edit variants accepted by known() are edited a third time.
    """
    variants = set()
    for twice in known(edits2(word)):
        variants.update(edits1(twice))
    return variants


def known(words):
    """Return the members of `words` that are keys of WORDS, as a set."""
    return {candidate for candidate in words if candidate in WORDS}

def known_from_WORDS2(words):
    """Return the members of `words` found in WORDS2, as a set.

    A word counts as found when either the 1-tuple `(w,)` or `w` itself is
    a key of WORDS2.
    """
    found = set()
    for w in words:
        if (w,) in WORDS2 or w in WORDS2:
            found.add(w)
    return found
    

def known_from_full(words):
    """Return the members of `words` found in WORDS_full, as a set.

    A word counts as found when either the 1-tuple `(w,)` or `w` itself is
    a key of WORDS_full.
    """
    found = set()
    for w in words:
        if (w,) in WORDS_full or w in WORDS_full:
            found.add(w)
    return found


def candidates_ordered(word):
    """Return the closest non-empty group of corrections for `word`.

    Tried in order: the word itself if known, known one-edit variants,
    known two-edit variants. Falls back to `[word]` when all are empty.
    Later groups are only computed when the earlier ones are empty.
    """
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits1(word))
    if one_away:
        return one_away
    two_away = known(edits2(word))
    if two_away:
        return two_away
    return [word]


def candidates_all(word):
    """Return `word` together with every known variant within two edits."""
    pool = known([word])
    pool.update(known(edits1(word)))
    pool.update(known(edits2(word)))
    pool.add(word)
    return pool

def candidates_all_from_full(word):
    """Return `word` plus its known one-edit and (via edits2_) two-edit variants.

    The word itself is checked against the full vocabulary, one-edit
    variants against WORDS, and two-edit variants come from edits2_.
    edits2_ can return None when no one-edit variant is a known word; that
    is treated as "no two-edit candidates" instead of letting set.union
    raise TypeError.
    """
    two_away = edits2_(word) or set()
    return set.union(known_from_full([word]), known(edits1(word)), two_away, [word])

def candidates_all_within1(word):
    """Return `word` plus its known variants within one edit.

    The word itself is checked against WORDS_full, the one-edit variants
    against WORDS.
    """
    pool = known_from_full([word])
    pool.update(known(edits1(word)))
    pool.add(word)
    return pool

def candidates_all_within1_full(word):
    """Return `word` plus its known variants within one edit.

    The word itself is checked against WORDS_full, the one-edit variants
    against WORDS.

    NOTE(review): this is identical to candidates_all_within1; the name
    suggests the one-edit variants were meant to be checked against the
    full vocabulary - confirm before relying on it.
    """
    pool = known_from_full([word])
    pool.update(known(edits1(word)))
    pool.add(word)
    return pool


def candidates_all_within1_full_expanded(word):
    """Return `word` plus its known variants within one edit.

    The word itself is checked against WORDS_full, the one-edit variants
    against WORDS2.
    """
    pool = known_from_full([word])
    pool.update(known_from_WORDS2(edits1(word)))
    pool.add(word)
    return pool


def correction(word):
    """Return the candidate from candidates_ordered(word) with the highest probability."""
    pool = candidates_ordered(word)
    return max(pool, key=lambda candidate: probability(candidate))


def correctize(sentence, prior='bigram'):
    """Rank all one-edit corrections of `sentence` by n-gram prior * likelihood.

    Every token gets the candidates from candidates_all_within1; the
    Cartesian product of those lists is scored and sorted.

    Args:
        sentence: raw text; tokenised with words().
        prior: 'bigram' or 'trigram'. The trigram branch looks up the strings
            returned by words_trigram in WORDS_trigram, which this notebook
            binds to the bigram table.

    Returns:
        (candidate sentences best-first, their scores best-first), or None
        for any other `prior` value.
    """
    tokens = words(sentence)
    candidates = [list(candidates_all_within1(token)) for token in tokens]
    candidate_sentences = list(itertools.product(*candidates))

    if prior == 'trigram':
        tokenize, ngram_probability = words_trigram, probability_trigram
    elif prior == 'bigram':
        tokenize, ngram_probability = words_bigram, probability_bigram
    else:
        # Unrecognised priors produced no result before either.
        return None

    # likelihood() iterates its argument word by word; it used to receive the
    # raw string and therefore scored individual characters.
    sentence_likelihood = likelihood(tokens)

    sentences_probab_post = []
    for candidate in candidate_sentences:
        ngrams = tokenize(' '.join(candidate))
        prior_probability = math.prod(ngram_probability(ngram) for ngram in ngrams)
        sentences_probab_post.append(prior_probability*sentence_likelihood)

    sorted_index = numpy.argsort(sentences_probab_post)
    ranked = [candidate_sentences[k] for k in sorted_index[::-1]]
    return ranked, sorted(sentences_probab_post, reverse=True)

def correctize_entire_with_time(sentence, p_lambda = 1,prior='bigram',tokenized = False):
    """Timed variant of correctize_entire: same ranking, plus stage timings.

    Args:
        sentence: raw text; tokenised with words().
        p_lambda: weight of the n-gram prior (used by the bigram branch).
        prior: 'bigram' (scores are log values) or 'trigram' (scores are
            plain products).
        tokenized: unused.

    Returns:
        (candidate sentences best-first, their scores best-first), or None
        for any other `prior` value.

    Prints the per-token candidate counts, the time of each stage and the
    total time.
    """
    import time

    t_start = time.time()
    tokens = words(sentence)
    # The channel model zips whitespace-split words against the candidate
    # tuples, which are built from `tokens`. Re-joining the tokens keeps both
    # sides aligned even when the raw text has a danda glued to a word.
    observed = ' '.join(tokens)

    start1 = time.time()
    candidates = [
        [cand for cand in candidates_all_within1_full_expanded(token)
         if cand in tokens or WORDS2[cand] > 1000]
        for token in tokens
    ]
    candidate_count = [len(options) for options in candidates]
    print(candidate_count[0:len(candidates)])
    print("Time passed", time.time()-start1,"sec")

    start1 = time.time()
    candidate_sentences = list(itertools.product(*candidates))
    print("Time passed", time.time()-start1,"sec")

    if prior == 'trigram':
        sentences_probab_post = [
            math.prod(probability_trigram(ngram) for ngram in words_trigram(' '.join(cand)))
            * constant_distributive_likelihood(observed, cand, candidate_count)
            for cand in candidate_sentences
        ]
    elif prior == 'bigram':
        start1 = time.time()
        bi_tokens = [words_bigram(' '.join(cand)) for cand in candidate_sentences]
        print("Time passed", time.time()-start1,"sec")

        start1 = time.time()
        bi_token_probab = [[probability_bigram(ngram) for ngram in row] for row in bi_tokens]
        print("Time passed", time.time()-start1,"sec")

        start1 = time.time()
        # Sum logs instead of taking the log of a product: same value, but a
        # long product cannot underflow to 0 and make math.log raise.
        sentences_probab_post = [
            p_lambda*sum(math.log(p) for p in row)
            + math.log(constant_distributive_likelihood(observed, cand, candidate_count))
            for row, cand in zip(bi_token_probab, candidate_sentences)
        ]
        print("Time passed", time.time()-start1,"sec")
    else:
        # Unrecognised priors produced no result before either.
        return None

    sorted_index = numpy.argsort(sentences_probab_post)
    sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
    print("Total Time passed", time.time()-t_start,"sec")
    return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    

def correctize_entire(sentence, p_lambda = 1,prior='bigram',tokenized = False):
    """Rank whole-sentence corrections by n-gram prior and channel likelihood.

    Each token gets the candidates from candidates_all_within1_full_expanded,
    keeping only those that are tokens of the sentence or have a WORDS2
    count above 1000. Every combination is scored.

    Args:
        sentence: raw text; tokenised with words().
        p_lambda: weight of the n-gram prior (used by the bigram branch).
        prior: 'bigram' (scores are log values) or 'trigram' (scores are
            plain products).
        tokenized: unused.

    Returns:
        (candidate sentences best-first, their scores best-first), or None
        for any other `prior` value.
    """
    tokens = words(sentence)
    # The channel model zips whitespace-split words against the candidate
    # tuples, which are built from `tokens`. Re-joining the tokens keeps both
    # sides aligned even when the raw text has a danda glued to a word.
    observed = ' '.join(tokens)

    candidates = [
        [cand for cand in candidates_all_within1_full_expanded(token)
         if cand in tokens or WORDS2[cand] > 1000]
        for token in tokens
    ]
    candidate_count = [len(options) for options in candidates]
    candidate_sentences = list(itertools.product(*candidates))

    if prior == 'trigram':
        sentences_probab_post = [
            math.prod(probability_trigram(ngram) for ngram in words_trigram(' '.join(cand)))
            * constant_distributive_likelihood(observed, cand, candidate_count)
            for cand in candidate_sentences
        ]
    elif prior == 'bigram':
        # Sum logs instead of taking the log of a product: same value, but a
        # long product cannot underflow to 0 and make math.log raise.
        sentences_probab_post = [
            p_lambda*sum(math.log(probability_bigram(ngram)) for ngram in words_bigram(' '.join(cand)))
            + math.log(constant_distributive_likelihood(observed, cand, candidate_count))
            for cand in candidate_sentences
        ]
    else:
        # Unrecognised priors produced no result before either.
        return None

    sorted_index = numpy.argsort(sentences_probab_post)
    sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
    return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted

def tupler(x):
    """Split `x` on whitespace and return the pieces as a tuple."""
    pieces = x.split()
    return tuple(pieces)
    
def logprob(ngram,kn_lm2,minimum):
    """Look up `ngram` in `kn_lm2.lm[0]`, falling back to `minimum` when absent."""
    table = kn_lm2.lm[0]
    return table[ngram] if ngram in table else minimum

def score_sent(sent, model=None, minimum=None):
    """Return the summed bigram log-probability of `sent`.

    The previous version called logprob() with one of its three required
    arguments and therefore raised TypeError for any sentence of two or more
    words; it also built a padded copy of the sentence that was never used.

    Params:
        sent [tuple->string] The words in the unpadded sentence.
        model Object whose `lm[0]` maps n-gram tuples to log-probabilities.
            Defaults to the notebook-level `kn_lm2`.
        minimum Score for a bigram missing from the table. Defaults to the
            smallest value in the table, as correctize_entire_knlm does.

    Returns 0 for sentences shorter than two words.
    """
    if model is None:
        model = kn_lm2
    table = model.lm[0]
    if minimum is None:
        minimum = min(table.values())

    sent_logprob = 0
    for i in range(len(sent) - 2 + 1):
        # tuple() so a list sentence produces hashable lookup keys too
        ngram = tuple(sent[i:i+2])
        sent_logprob += table[ngram] if ngram in table else minimum
    return sent_logprob

def correctize_entire_knlm(sentence, model,p_lambda = 1,prior='bigram',tokenized = False):
    """Rank whole-sentence corrections with the language model `model`.

    Each token gets the candidates from candidates_all, keeping only those
    that are tokens of the sentence or have a WORDS2 count above 5000.
    Every combination is scored as
    p_lambda * sum(n-gram log-probabilities) + log(channel likelihood).

    Args:
        sentence: raw text; tokenised with words().
        model: object whose `lm[0]` maps n-gram tuples to log-probabilities.
        p_lambda: weight of the language-model term.
        prior: 'bigram' or 'trigram'.
        tokenized: unused.

    Returns:
        (candidate sentences best-first, their scores best-first), or None
        for any other `prior` value.
    """
    tokens = words(sentence)
    # The channel model zips whitespace-split words against the candidate
    # tuples, which are built from `tokens`. Re-joining the tokens keeps both
    # sides aligned even when the raw text has a danda glued to a word.
    observed = ' '.join(tokens)

    candidates = [
        [cand for cand in candidates_all(token) if cand in tokens or WORDS2[cand] > 5000]
        for token in tokens
    ]
    candidate_count = [len(options) for options in candidates]
    candidate_sentences = list(itertools.product(*candidates))

    # Score used for n-grams the model has never seen.
    minimum = min(model.lm[0].values())

    if prior == 'trigram':
        # words_trigram yields strings; split them into word tuples
        # (tuple(str) would produce a tuple of characters). The old code also
        # misspelled `minimum` here, which raised NameError.
        ngram_rows = [
            [tuple(ngram.split()) for ngram in words_trigram(' '.join(cand))]
            for cand in candidate_sentences
        ]
    elif prior == 'bigram':
        ngram_rows = [
            [tuple(ngram) for ngram in words_bigram(' '.join(cand))]
            for cand in candidate_sentences
        ]
    else:
        # Unrecognised priors produced no result before either.
        return None

    sentences_probab_post = [
        (sum(logprob(ngram, model, minimum) for ngram in row)*p_lambda)
        + math.log(constant_distributive_likelihood(observed, cand, candidate_count))
        for row, cand in zip(ngram_rows, candidate_sentences)
    ]

    sorted_index = numpy.argsort(sentences_probab_post)
    sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
    return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    
def correctize_with_window(sentence,window = 5,p_lambda = 1,prior = 'bigram'):
    """Correct `sentence` in overlapping windows of `window` tokens.

    A sentence of at most `window` tokens is handed to correctize_entire
    unchanged and that single result is returned. Otherwise windows start
    every `window - 1` tokens (consecutive windows share one token), the
    remaining tokens form a final window, and a list with one
    correctize_entire result per window is returned.
    """
    tokens = words(sentence)
    if len(tokens) <= window:
        return correctize_entire(sentence,p_lambda=p_lambda,prior = prior)

    stride = window - 1
    chunks = []
    for start in range(0, len(tokens), stride):
        if window + start < len(tokens) - 1:
            chunks.append(tokens[start:start + window])
    # Whatever follows the last full window becomes the final chunk.
    chunks.append(tokens[stride * len(chunks):])

    return [
        correctize_entire(' '.join(chunk), p_lambda=p_lambda, prior=prior)
        for chunk in chunks
    ]
    
def correctize_with_window_knlm(sentence,model,window = 5,p_lambda = 1,prior = 'bigram'):
    """Correct `sentence` in overlapping windows using the language model `model`.

    A sentence of at most `window` tokens is handed to
    correctize_entire_knlm unchanged and that single result is returned.
    Otherwise windows start every `window - 1` tokens (consecutive windows
    share one token), the remaining tokens form a final window, and a list
    with one correctize_entire_knlm result per window is returned.
    """
    tokens = words(sentence)
    if len(tokens) <= window:
        return correctize_entire_knlm(sentence,model,p_lambda=p_lambda,prior = prior)

    stride = window - 1
    chunks = []
    for start in range(0, len(tokens), stride):
        if window + start < len(tokens) - 1:
            chunks.append(tokens[start:start + window])
    # Whatever follows the last full window becomes the final chunk.
    chunks.append(tokens[stride * len(chunks):])

    return [
        correctize_entire_knlm(' '.join(chunk), model, p_lambda=p_lambda, prior=prior)
        for chunk in chunks
    ]
    
def return_choices(sample_sentences,model):
    """Collect, per token position, the words proposed by the top corrections.

    `sample_sentences` is one sentence string. It is corrected with
    correctize_with_window_knlm, and the top candidates of each window (as
    returned by print_corrected_sentence) are merged into one set per
    position. Positions of later windows are shifted by the length of the
    first window's best candidate minus one.

    Returns a list of sets with one more entry than the sentence has
    whitespace-separated words; its length is also printed.
    """
    windowed = correctize_with_window_knlm(sample_sentences,model)
    _top_choice, per_window = print_corrected_sentence(windowed)

    slots = [set() for _ in range(len(sample_sentences.split())+1)]
    print(len(slots))

    offset = 0
    for window_choices in per_window:
        for candidate in window_choices:
            for position, token in enumerate(candidate, start=offset):
                slots[position].add(token)
        offset += len(per_window[0][0])-1

    return slots

    
def print_corrected_sentence(d,j = 0, window = 5, top = 5):
    """Stitch per-window correction results into one sentence.

    Args:
        d: list with one (ranked_sentences, scores) pair per window, as
            returned by the correctize_with_window* functions. A single
            un-windowed (ranked_sentences, scores) tuple - what those
            functions return for short sentences - is accepted too; it used
            to raise TypeError.
        j: rank of the candidate to print from every window.
        window: window size used for the correction. Consecutive windows
            share one token, so only the first `window - 1` tokens of every
            non-final window are printed.
        top: number of best candidates kept per window in the second result.

    Returns:
        (sentence string, list holding the `top` best candidates per window).
    """
    # A bare result is a tuple whose first item is the candidate list; a
    # windowed result is a list of such tuples.
    if isinstance(d, tuple) and d and isinstance(d[0], list):
        d = [d]

    pieces = []
    k = []
    for entry in d[:-1]:
        ranked = entry[0]
        pieces.append(' '.join(ranked[j][0:window-1]))
        k.append(ranked[0:top])

    last_ranked = d[-1][0]
    pieces.append(' '.join(last_ranked[j]))
    k.append(last_ranked[0:top])
    return ' '.join(pieces), k
    
    
    

def timer(fun,args):
    """Call `fun(args)`, print the elapsed wall-clock time and return the result.

    `args` is passed to `fun` as a single positional argument.
    """
    import time
    started = time.time()
    result = fun(args)
    elapsed = time.time() - started
    print("Time taken, : ",elapsed," sec")
    return result

In [36]:
candidates_all('प्काश')

{'अवकाश',
 'आकाश',
 'पक्का',
 'पदका',
 'पाका',
 'प्काश',
 'प्याक',
 'प्याड',
 'प्रकार',
 'प्रकाश',
 'प्रकाशन',
 'प्रा',
 'प्राण',
 'प्राय',
 'प्लान',
 'प्वाल'}

In [37]:
import pickle
with open('data/saved_model_knlm2','rb') as inputfile:
    kn_lm2 = pickle.load(inputfile) 

In [324]:
import textdistance


# Code points 2406-2415: the Devanagari digits.
num = [chr(code_point) for code_point in range(2406,2416)]
# Code points 2362-2382; a word consisting of just one of these is rejected.
char_as_word = [chr(code_point) for code_point in range(2362,2383)]
def filterer(w):
    """Return True when `w` should be kept in the vocabulary.

    Rejected are: words longer than one character that contain a danda,
    words equal to one of the `char_as_word` characters, and words that
    contain any character of `num`.
    """
    has_embedded_danda = len(w)>1 and '।' in w
    if has_embedded_danda or w in char_as_word:
        return False
    return not any(digit in w for digit in num)

#Filter all words with characters not needed


# WORDS_ordered = list(dict(WORDS_full.most_common()).keys())

# WORDS_filtered = filter(filterer,WORDS_ordered)
# lis = list(WORDS_filtered)



# Vocabulary list used by check_distance.
# NOTE(review): presumably the frequency-ordered, filterer()-filtered keys of
# WORDS_full (see the commented-out code above) - confirm.
# pickle.load executes arbitrary code from the file; load trusted files only.
with open('vocab_list.pickle','rb') as vl:
    lis = pickle.load(vl)
l = len(lis)


# Search tiers as slices of `lis` by position:
#   0: first 1%    1: 1%-2%    2: 2%-5%
#   3: first 10%   4: 10%-50%  5: whole list
# Tiers 3 and 5 start at index 0, so they repeat the entries of earlier tiers.
depth_dict = {0:lis[0:int(0.01*l)],1:lis[int(0.01*l):int(0.02*l)],2:lis[int(0.02*l):int(0.05*l)] , 3:lis[0:int(0.1*l)] , 4:lis[int(0.1*l):int(0.5*l)],
5:lis[0:l]}


def check_distance(w, depth = 1,edit_distance = 2,candidates = None):
    """Collect the words of one vocabulary tier within `edit_distance` of `w`.

    Args:
        w: word to compare against.
        depth: key into depth_dict selecting the tier to scan.
        edit_distance: maximum Levenshtein distance (inclusive).
        candidates: list that matches are appended to. A fresh list is
            created when omitted; the old `[]` default was a single list
            shared by every call that relied on it.

    Returns:
        (candidates, count): the list - the very object passed in, if any -
        and the number of matches added by this call.
    """
    if candidates is None:
        candidates = []
    count = 0
    for word in depth_dict[depth]:
        if textdistance.levenshtein.distance(w, word) <= edit_distance:
            count += 1
            candidates.append(word)
    return (candidates,count)



def check_distance2(w, depth = 1,edit_distance = 2):
    """Return the words of tier `depth` within `edit_distance` of `w`.

    Returns (matches, number of matches); distances are Levenshtein
    distances and the bound is inclusive.
    """
    matches = [
        entry for entry in depth_dict[depth]
        if textdistance.levenshtein.distance(w, entry) <= edit_distance
    ]
    return (matches,len(matches))

In [325]:
# with open('vocab_list.pickle','wb') as vl:
#     pickle.dump(lis,vl)
    


In [337]:
print(l)

def candidate_words(word,minimum = 1,start_depth = 0):
    """Collect vocabulary words close to `word`, widening the search tier by tier.

    Tiers of depth_dict are scanned from `start_depth` upwards until at
    least `minimum` distinct candidates have been found or no tiers remain.
    Words shorter than 3 characters use edit distance 1, all others 2.

    Changes from the previous version:
      * the stop test uses the number of candidates found so far rather
        than the count of the last tier only;
      * words found again in an overlapping tier are not added twice;
      * the scan stops after the last tier instead of raising KeyError
        when `start_depth` is greater than 0.

    Returns:
        The candidates that are at most one character shorter than `word`,
        in discovery order, with `word` itself appended if missing.
    """
    short_word = len(word) < 3
    edit_distance = 1 if short_word else 2

    found = []
    seen = set()
    for step in range(len(depth_dict)):
        depth = start_depth + step
        if step:
            if len(found) >= minimum or depth not in depth_dict:
                break
            if not short_word:
                print("Entered depth, ",step)
        # A fresh list per tier keeps this independent of how
        # check_distance treats its `candidates` argument.
        matches, _ = check_distance(word, depth = depth,edit_distance = edit_distance,candidates = [])
        for match in matches:
            if match not in seen:
                seen.add(match)
                found.append(match)

    # Drop candidates that would need two deletions to reach.
    c = [w for w in found if len(w)>=len(word)-1]
    if word not in c:
        c.append(word)
    return c

1171236


In [67]:
#म पुस्तकलयबाटे थुलो किताब पढ्न चाहन्छु ।

word_list = candidate_words('थुलो')
len(word_list)

31

In [207]:
from metaphone import doublemetaphone


doublemetaphone(sanscript.transliterate('थुलो', sanscript.DEVANAGARI, sanscript.ITRANS))[0]


'0L'

In [338]:
# %pip install romanize
# %pip install metaphone

# %pip install indic-transliteration
# #from transliterate import translit

from indic_transliteration import sanscript



def phonetic_distance(word,word_list,top = 5,include_metaphone = False):
    """Return the entries of `word_list` that sound closest to `word`.

    Both sides are transliterated from Devanagari to ITRANS. The distance is
    the Levenshtein distance between the lower-cased transliterations or,
    with `include_metaphone`, between their primary Double Metaphone codes.
    (The metaphone mode used to compare the whole (primary, secondary)
    tuples, so the distance could only be 0, 1 or 2.)

    Args:
        word: reference word.
        word_list: candidate words.
        top: rank whose distance is the cut-off.
        include_metaphone: compare metaphone codes instead of spellings.

    Returns:
        Candidates sorted by (distance, word). When `word_list` has fewer
        than `top` entries all of them are returned; otherwise every entry
        whose distance does not exceed that of the `top`-th entry, so ties
        can make the result longer than `top`.
    """
    def romanize(text):
        return sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)

    def sound_key(text):
        romanized = romanize(text)
        if include_metaphone:
            return doublemetaphone(romanized)[0]
        return romanized.lower()

    reference = sound_key(word)
    scored = sorted(
        (textdistance.levenshtein.distance(reference, sound_key(w)), w)
        for w in word_list
    )

    if len(scored) < top or not scored:
        return [w for _, w in scored]
    cutoff = scored[top-1][0]
    return [w for distance, w in scored if distance <= cutoff]

In [222]:
doublemetaphone(sanscript.transliterate('हात', sanscript.DEVANAGARI, sanscript.ITRANS))[0]

'HT'

In [335]:
def final_candidate_words(word,minimum =1,top = 5,start_depth =0 ):
    """Return correction candidates for `word`, narrowed phonetically when many.

    Candidates come from candidate_words. Fewer than 10 are returned as they
    are; otherwise phonetic_distance picks the closest `top` (plus ties).
    Prints the time spent in candidate_words and a preview: the full list in
    the first case, the phonetic top 10 in the second.
    """
    import time
    started = time.time()
    pool = candidate_words(word,minimum = minimum,start_depth = start_depth)
    print("time passed fc: ",time.time()-started)

    if len(pool) <10:
        print(pool)
        return pool

    print(phonetic_distance(word,pool,top = 10))
    return phonetic_distance(word,pool,top = top)
    
    
    

In [333]:
len(final_candidate_words('हात'))

['हात', 'अत', 'खाता', 'गत', 'छाता', 'जात', 'दाता', 'नत', 'नाता', 'पात', 'बाट', 'भात', 'मत', 'माता', 'रात', 'सात', 'साता', 'हक', 'हद', 'हब', 'हल', 'हाते', 'हार', 'हाल', 'हावा', 'हिट', 'हित']


27

In [341]:
final_candidate_words('पुस्तकलयबाटे')

Entered depth,  1
Entered depth,  2
Entered depth,  3
time passed fc:  13.788103103637695
['पुस्तकालयबाट', 'पुस्तकलयबाटे']


['पुस्तकालयबाट', 'पुस्तकलयबाटे']

In [146]:
phonetic_distance('थुलो',word_list)

['ठुलो', 'ठूलो', 'थुलो', 'ठुला', 'थलो', 'धुलो', 'धूलो', 'कुलो', 'खुला', 'झुटो']

In [147]:
english_text2 = sanscript.transliterate('नीलो', sanscript.DEVANAGARI, sanscript.IAST) 
e3 = sanscript.transliterate('हुँ', sanscript.DEVANAGARI, sanscript.ITRANS)
e4 = sanscript.transliterate('hu.n',  sanscript.ITRANS,sanscript.DEVANAGARI)
english_text2, e3,e4 , 'hu.N'.lower()


('nīlo', 'hu.N', 'हुं', 'hu.n')

In [77]:
s = [x for _,x in sorted(zip(m,word_list))]

In [78]:
sorted(zip(m,word_list))

[(0, 'थुलो'),
 (2, 'कुलो'),
 (2, 'ठुलो'),
 (2, 'थलो'),
 (2, 'धुलो'),
 (4, 'कालो'),
 (4, 'किलो'),
 (4, 'कुरो'),
 (4, 'कुल'),
 (4, 'खुला'),
 (4, 'जालो'),
 (4, 'जुडो'),
 (4, 'झुटो'),
 (4, 'ठुला'),
 (4, 'ठूलो'),
 (4, 'ढिलो'),
 (4, 'थाले'),
 (4, 'थियो'),
 (4, 'धूलो'),
 (4, 'नीलो'),
 (4, 'नौलो'),
 (4, 'पालो'),
 (4, 'पुल'),
 (4, 'पुलको'),
 (4, 'फलो'),
 (4, 'फुल'),
 (4, 'मुल'),
 (4, 'युरो'),
 (4, 'सुलभ'),
 (4, 'हलो'),
 (4, 'हिलो')]

In [342]:



def correctize_entire_knlm(sentence, model,p_lambda = 1,prior='bigram',tokenized = False):
    """Rank whole-sentence corrections with the language model `model`.

    Candidates for each token come from final_candidate_words. Every
    combination is scored as
    p_lambda * sum(n-gram log-probabilities) + log(channel likelihood).

    Args:
        sentence: raw text; tokenised with words().
        model: object whose `lm[0]` maps n-gram tuples to log-probabilities.
        p_lambda: weight of the language-model term.
        prior: 'bigram' or 'trigram'.
        tokenized: unused.

    Returns:
        (candidate sentences best-first, their scores best-first), or None
        for any other `prior` value.
    """
    tokens = words(sentence)
    # The channel model zips whitespace-split words against the candidate
    # tuples, which are built from `tokens`. Re-joining the tokens keeps both
    # sides aligned even when the raw text has a danda glued to a word.
    observed = ' '.join(tokens)

    candidates = [final_candidate_words(token) for token in tokens]
    candidate_count = [len(options) for options in candidates]
    candidate_sentences = list(itertools.product(*candidates))

    # Score used for n-grams the model has never seen.
    minimum = min(model.lm[0].values())

    if prior == 'trigram':
        # words_trigram yields strings; split them into word tuples
        # (tuple(str) would produce a tuple of characters). The old code also
        # misspelled `minimum` here, which raised NameError.
        ngram_rows = [
            [tuple(ngram.split()) for ngram in words_trigram(' '.join(cand))]
            for cand in candidate_sentences
        ]
    elif prior == 'bigram':
        ngram_rows = [
            [tuple(ngram) for ngram in words_bigram(' '.join(cand))]
            for cand in candidate_sentences
        ]
    else:
        # Unrecognised priors produced no result before either.
        return None

    sentences_probab_post = [
        (sum(logprob(ngram, model, minimum) for ngram in row)*p_lambda)
        + math.log(constant_distributive_likelihood(observed, cand, candidate_count))
        for row, cand in zip(ngram_rows, candidate_sentences)
    ]

    sorted_index = numpy.argsort(sentences_probab_post)
    sentences_probab_post_sorted = sorted(sentences_probab_post,reverse = True)
    return [candidate_sentences[k] for k in sorted_index[::-1]],sentences_probab_post_sorted
    
def correctize_with_window(sentence,window = 5,p_lambda = 1,prior = 'bigram'):
    """Correct `sentence` in overlapping windows of `window` tokens.

    A sentence of at most `window` tokens is handed to correctize_entire
    unchanged and that single result is returned. Otherwise windows start
    every `window - 1` tokens (consecutive windows share one token), the
    remaining tokens form a final window, and a list with one
    correctize_entire result per window is returned.

    This cell re-defines the function of the same name from the first cell.
    """
    tokens = words(sentence)
    if len(tokens) <= window:
        return correctize_entire(sentence,p_lambda=p_lambda,prior = prior)

    stride = window - 1
    chunks = []
    for start in range(0, len(tokens), stride):
        if window + start < len(tokens) - 1:
            chunks.append(tokens[start:start + window])
    # Whatever follows the last full window becomes the final chunk.
    chunks.append(tokens[stride * len(chunks):])

    return [
        correctize_entire(' '.join(chunk), p_lambda=p_lambda, prior=prior)
        for chunk in chunks
    ]
    
def correctize_with_window_knlm(sentence,model,window = 5,p_lambda = 1,prior = 'bigram'):
    """Correct `sentence` in overlapping windows using the language model `model`.

    A sentence of at most `window` tokens is handed to
    correctize_entire_knlm unchanged and that single result is returned.
    Otherwise windows start every `window - 1` tokens (consecutive windows
    share one token), the remaining tokens form a final window, and a list
    with one correctize_entire_knlm result per window is returned.

    This cell re-defines the function of the same name from the first cell.
    """
    tokens = words(sentence)
    if len(tokens) <= window:
        return correctize_entire_knlm(sentence,model,p_lambda=p_lambda,prior = prior)

    stride = window - 1
    chunks = []
    for start in range(0, len(tokens), stride):
        if window + start < len(tokens) - 1:
            chunks.append(tokens[start:start + window])
    # Whatever follows the last full window becomes the final chunk.
    chunks.append(tokens[stride * len(chunks):])

    return [
        correctize_entire_knlm(' '.join(chunk), model, p_lambda=p_lambda, prior=prior)
        for chunk in chunks
    ]

In [343]:
sample_sentences = ['हरेक सेपालीले नेपामको संविधानक पालना गर्नुपर्छ ।' ,
                    'म पुस्तकलयबाटे थुलो किताब पढ्न चाहन्छु ।',
                    'तर उस समयमा पनि स्वस्थ राजनैतिक वातावरणको अभावले गर्दा देश विकासतर्फ विशेष प्रगति हुन  सकेन।',
                   'नेपालमा आधुनिक रुपमा आर्थक विकाससम्बन्धी कार्यरू प्रारम्भ भएको हालै मात्र हो।',
                   'हार धुनुहोस् र स्वास्थ जीवन जिउनुहोस्।']

In [151]:
d = correctize_with_window_knlm(sample_sentences[1],kn_lm2)
top_choice,other_choices = print_corrected_sentence(d)
print("corrected:",top_choice)
print(other_choices,"\n")
    

corrected: । पुस्तकलयबाटे खुला किताब पढ्न चाहन्छु ।
[[('।', 'पुस्तकलयबाटे', 'खुला', 'किताब', 'पढ्न'), ('।', 'पुस्तकालयबाट', 'खुला', 'किताब', 'पढ्न'), ('।', 'पुस्तकलयबाटे', 'ठूलो', 'किताब', 'पढ्न'), ('।', 'पुस्तकलयबाटे', 'खुला', 'किताब', 'गर्न'), ('।', 'पुस्तकलयबाटे', 'कुल', 'हिसाब', 'गर्न')], [('पढ्न', 'चाहन्छु', '।'), ('मर्न', 'चाहन्छु', '।'), ('भन्न', 'चाहन्छु', '।'), ('पार्न', 'चाहन्छु', '।'), ('थप्न', 'चाहन्छु', '।')]] 



In [254]:
def return_choices(sample_sentences,model):
    """Collect, per token position, every word suggested for one sentence.

    The sentence is corrected with ``correctize_with_window_knlm``, the result
    is passed through ``print_corrected_sentence``, and the candidate windows
    it returns are merged into a single list of sets.

    Parameters
    ----------
    sample_sentences : str
        A single sentence (the name is plural, but ``.split()`` is called on
        it directly).
    model
        Forwarded unchanged to ``correctize_with_window_knlm``.

    Returns
    -------
    list of set
        ``result[i]`` contains the words proposed at position ``i``. The list
        starts with ``len(sample_sentences.split()) + 1`` slots and is
        extended when the candidates cover more positions than that.

    Side effects
    ------------
    Prints the initial slot count and the elapsed wall-clock time.
    """
    import time

    s = time.time()
    d = correctize_with_window_knlm(sample_sentences,model)
    top_choice,other_choices = print_corrected_sentence(d)

    choices_list=[set() for i in range(len(sample_sentences.split())+1)]
    print(len(choices_list))

    # Offset of the current window inside the whole sentence.
    const = 0
    for window_choices in other_choices:
        for sens in window_choices:
            for i,w in enumerate(sens):
                index = i + const
                # The slot count above comes from a whitespace split, but the
                # candidates are indexed by tokenizer positions. Punctuation
                # that the tokenizer separates from a word can therefore push
                # `index` past the end; grow the list instead of raising
                # IndexError.
                while index >= len(choices_list):
                    choices_list.append(set())
                choices_list[index].add(w)
        # Advance by the length of the first window's first candidate minus
        # one, because consecutive windows share one token.
        const += len(other_choices[0][0])-1
    e = time.time()

    print("Time Passed:", e-s)
    return choices_list

In [16]:
candidates_all('पुस्तकलयबाटे')

{'पुस्तकलयबाटे'}

In [257]:
print(return_choices(sample_sentences[2],kn_lm2))


KeyboardInterrupt



In [347]:
print(return_choices(sample_sentences[1],kn_lm2))

time passed fc:  0.16459131240844727
['म', 'मा', 'अ', 'आ', 'आम', 'एम', 'ओम', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'मि', 'मे', 'मै', 'र', 'ल', 'व', 'स']
Entered depth,  1
Entered depth,  2
Entered depth,  3
time passed fc:  13.838687896728516
['पुस्तकालयबाट', 'पुस्तकलयबाटे']
time passed fc:  0.3430814743041992
['ठुलो', 'ठूलो', 'थुलो', 'ठुला', 'थलो', 'धुलो', 'धूलो', 'कुलो', 'खुला', 'झुटो', 'ढिलो', 'थाले', 'थियो', 'नौलो', 'फलो', 'फुल', 'हलो', 'हिलो']
time passed fc:  0.39095401763916016
['किताब', 'किटान', 'कतार', 'किताबमा', 'किनार', 'किरात', 'किसान', 'पिसाब', 'हिसाब', 'किरा', 'पिता']
time passed fc:  0.3420844078063965
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 'पर्न', 'पवन', 'पस्न', 'पाउन', 'पाएन', 'पाटन', 'पान', 'पार्न', 'पालन', 'पाल्न', 'बच्न', 'बढ्नु', 'बढ्ने', 'लड्न']
time passed fc:  0.3253650665283203
['पढ्न', 'पढ्दा', 'पढ्ने', 'बढ्न', 'चढ्न', 'पठन', 'पढाइ', 'पढाई', 'पढाउन', 'पढे', 'पढेर', 'पढ्दै', 'पतन', 'परेन', 

In [344]:
print(return_choices(sample_sentences[0],kn_lm2))

time passed fc:  0.3480679988861084
['हरेक', 'गरेका', 'झरेका', 'परेका', 'मरेका', 'हारेको', 'हारेर', 'हेरेका', 'अनेक', 'खरेल', 'गरेकी', 'गरेकै', 'गरेको', 'गरेन', 'गरेर', 'झरेको', 'तर्क', 'परेकी', 'परेको', 'परेन', 'परेर', 'फरक', 'ब्रेक', 'भरेर', 'मरेको', 'सरिक', 'सरेको', 'हटेको', 'हरित', 'हेरेको', 'हेरेर']
time passed fc:  0.5595026016235352
['नेपालले', 'नेपालीले', 'सेपालीले']
time passed fc:  0.5206067562103271
['नेपालको', 'नेपालका', 'नेपालीको', 'नपाएको', 'नेताको', 'नेपालकै', 'नेकपाको', 'नेपालकी', 'नेपामको']
time passed fc:  0.5694758892059326
['संविधान', 'संवैधानिक', 'संविधानको', 'संविधानले', 'संविधानमा', 'संविधानका', 'संविधानक']
time passed fc:  0.39594054222106934
['पालन', 'पालना', 'पाटन', 'पालामा', 'पाल्न', 'कामना', 'कालमा', 'तालमा', 'तुलना', 'पसिना', 'पाउन', 'पाएन', 'पालिका', 'पालेका', 'पाल्ने', 'पाल्पा', 'पासमा', 'पाहुना', 'पुलमा', 'यातना', 'सामना', 'सालमा', 'हालका']
time passed fc:  0.3809802532196045
['पालन', 'पालना', 'पाटन', 'पालामा', 'पाल्न', 'कामना', 'कालमा', 'तालमा', 'तुलना'

In [345]:
print(return_choices(sample_sentences[3],kn_lm2))

time passed fc:  0.5435607433319092
['नेपालमा', 'नेपालका', 'नेपालमै', 'नेपालीमा', 'नेपालकी', 'नेपालकै', 'नेपालको', 'नेपाललाई', 'नेपालले', 'नेपालीका']
time passed fc:  0.4507935047149658
['आधुनिक']
time passed fc:  0.41639041900634766
['रुपमा', 'रूपमा', 'रुटमा', 'रुपमै', 'रूपमै', 'आरोपमा', 'कपमा', 'पुलमा', 'पुसमा', 'युगमा', 'रनमा', 'रातमा', 'रिटमा', 'रुपले', 'रुपैया']
time passed fc:  0.39893198013305664
['आर्थक', 'आर्थिक', 'सार्थक', 'अर्थ', 'दर्शक', 'आकर्षक', 'आर्जन', 'आर्ट', 'आर्यन', 'आस्था', 'समर्थक']
Entered depth,  1
time passed fc:  1.7852222919464111
['विकाससम्बन्धी']
Entered depth,  1
time passed fc:  1.7622578144073486
['विकाससम्बन्धी']
time passed fc:  0.5196094512939453
['कार्यरत', 'कार्यमा', 'कार्यदल', 'कार्यको', 'कार्यले', 'कार्यका', 'कार्यलय', 'कार्यहरू', 'कार्यरू']
time passed fc:  0.5754599571228027
['प्रारम्भिक', 'प्रारम्भ']
time passed fc:  0.34108710289001465
['भएको', 'खाएको', 'छाएको', 'भएका', 'भएकी', 'भनेको', 'आएको', 'उभिएको', 'गएको', 'गाएको', 'छोएको', 'झाको', 'नआएको

In [346]:
print(return_choices(sample_sentences[4],kn_lm2))

time passed fc:  0.28324365615844727
['हार', 'आर', 'कर', 'कार', 'गर', 'घर', 'चार', 'डर', 'तर', 'तार', 'तारा', 'थर', 'दर', 'धार', 'धारा', 'नारा', 'पर', 'पार', 'पारा', 'फर', 'बार', 'बारा', 'भर', 'भार', 'मार', 'रारा', 'सर', 'सार', 'सारा', 'हक', 'हद', 'हब', 'हराए', 'हरि', 'हरु', 'हरू', 'हल', 'हात', 'हाल', 'हावा', 'हेर']
Entered depth,  1
time passed fc:  1.19380521774292
['दिनुहोस्', 'धुनुहोस्']
time passed fc:  0.154191255569458
['र', 'अ', 'आ', 'आर', 'क', 'ग', 'ज', 'त', 'द', 'न', 'प', 'म', 'रु', 'रे', 'र्', 'ल', 'व', 'स']
time passed fc:  0.5156233310699463
['स्वास्थ्य', 'स्वस्थ', 'स्वार्थ', 'अस्वस्थ', 'स्वास्थ']
time passed fc:  0.3506340980529785
['जिवन', 'जीवन', 'आजीवन', 'जवान', 'जीवन्त', 'जडान', 'जान', 'जीव', 'जीवनका', 'जीवनको', 'जीवनमा', 'जीवित', 'पवन', 'सेवन']
time passed fc:  0.3211212158203125
['जिवन', 'जीवन', 'आजीवन', 'जवान', 'जीवन्त', 'जडान', 'जान', 'जीव', 'जीवनका', 'जीवनको', 'जीवनमा', 'जीवित', 'पवन', 'सेवन']
Entered depth,  1
time passed fc:  1.29453444480896
['दिनुहोस्', 'जिउन

In [10]:
print(return_choices(sample_sentences[4],kn_lm2))

In [None]:
candidate

In [211]:
WORDS_full.most_common()[:-10000-1:-1]

[('विभिन्नस्थानहरूलाई', 1),
 ('टेरिटरीको', 1),
 ('सम्झौताहिन', 1),
 ('कार्यालयमाा', 1),
 ('स्क्र्याब', 1),
 ('कापीलगायत', 1),
 ('धोक्रोभरि', 1),
 ('फोहरधनी', 1),
 ('उद्धमशिलता', 1),
 ('मोटिवेशनल', 1),
 ('श्रृस्टी', 1),
 ('मेंखु', 1),
 ('तान्डुकार', 1),
 ('रेनि', 1),
 ('सिमृति', 1),
 ('आरटिबीमा', 1),
 ('आरसिवीबाट', 1),
 ('आरटिबी', 1),
 ('ईन्जिनियरहरुलाई', 1),
 ('पुर्नसर्भे', 1),
 ('परम्परादेखी', 1),
 ('लाभग्राहिको', 1),
 ('लाभग्राहिका', 1),
 ('जनशक्तिलाईलाई', 1),
 ('आवश्वकता', 1),
 ('बीत', 1),
 ('विद्यर्थीहरुको', 1),
 ('खार्दुमा', 1),
 ('रामाकोट', 1),
 ('विद्यथालय', 1),
 ('विवादमाथिको', 1),
 ('मोटरेल', 1),
 ('सरलकर्जाको', 1),
 ('बस्नथाल्यो', 1),
 ('आसामुखीमात्रै', 1),
 ('पार्दशितालाई', 1),
 ('कार्यान्वनलाई', 1),
 ('अनुशिक्षणपछि', 1),
 ('आफुहरुपनि', 1),
 ('पुर्वगृहमन्त्री', 1),
 ('गेडीखोलामा', 1),
 ('शेराबेसी', 1),
 ('आरसिआइपीले', 1),
 ('आरसिआइपी', 1),
 ('लुखुं', 1),
 ('मानवअधिकाको', 1),
 ('चियरबाट', 1),
 ('मुलुङ्', 1),
 ('समरमाथाको', 1),
 ('फर्किहाल्छ।', 1),
 ('।उत्तरपट्टीको', 1),
 ('नग

In [206]:
'पुस्तकलयबाटे' in WORDS_full

False

In [321]:
from collections import Counter

# (word, count) pairs for every word in `lis` whose count in WORDS_full is at
# most 34, ordered from the highest count down (most_common() with no argument
# returns all entries).
# NOTE(review): `lis` is built in a cell that appears further down the
# notebook, and 34 is an unexplained threshold — confirm both.
m = Counter({k: WORDS_full[k] for k in lis if WORDS_full[k] <=34}).most_common()

In [322]:
len(m),len(lis),len(WORDS_full)

(1080716, 1171236, 1229736)

In [323]:
m

[('यहाँहरू', 34),
 ('खिचें', 34),
 ('उड्सलाई', 34),
 ('राष्ट्रप्रेमी', 34),
 ('दिमागका', 34),
 ('रुद्रमणि', 34),
 ('पक्षहरुले', 34),
 ('ठाउँठाउँ', 34),
 ('फिर्दै', 34),
 ('एफएनसीसीआई', 34),
 ('प्रतितपत्र', 34),
 ('कटाउनु', 34),
 ('विजुलीका', 34),
 ('एमटीओडब्लू', 34),
 ('राईसँगै', 34),
 ('भूखण्डमा', 34),
 ('गैरनाफामुखी', 34),
 ('मेरोतर्फबाट', 34),
 ('अविकास', 34),
 ('मंगलबारमात्रै', 34),
 ('गुज्रन', 34),
 ('गरिहाल्छु', 34),
 ('ननिकाल्ने', 34),
 ('आनन्ददेव', 34),
 ('रावणलाई', 34),
 ('हामीभित्रको', 34),
 ('निहु', 34),
 ('जुद्धोदय', 34),
 ('मालिकहरूको', 34),
 ('कमिसनर', 34),
 ('कोच्ने', 34),
 ('बनाएजस्तै', 34),
 ('बुतामा', 34),
 ('केटीभी', 34),
 ('आह्लादित', 34),
 ('वाक्यांशले', 34),
 ('लोडसेडिङमा', 34),
 ('कम्बो', 34),
 ('पूर्वाग्राही', 34),
 ('नसच्याई', 34),
 ('खबरै', 34),
 ('माननीयहरुले', 34),
 ('सम्प्रदायबीचको', 34),
 ('गोरखपुरबाट', 34),
 ('घोक्ने', 34),
 ('अक्सिजनका', 34),
 ('कथाभन्दा', 34),
 ('निकैनै', 34),
 ('दिऔँ', 34),
 ('विमर्शले', 34),
 ('कम्युनिस्टसँग', 34),
 ('एनसीसीले', 34),


In [309]:
WORDS_full['निशान्तले']

4

In [20]:
# Scratch version of the merging loop inside return_choices; it relies on
# `other_choices` left over from an earlier cell.
# One empty set per whitespace-separated word of the sentence, plus one spare.
choices_list=[set() for i in range(len(sample_sentences[1].split())+1)]
# Offset of the current window inside the whole sentence.
const=0
for _ in other_choices:
    for sens in _:
        for i,w in enumerate(sens):
            index = i + const
            choices_list[index].add(w)
    # Advance by the length of the first window's first candidate minus one.
    # NOTE(review): presumably because consecutive windows share one token — confirm.
    const += len(other_choices[0][0])-1

In [22]:
choices_list

[{'।'},
 {'पुस्तकलयबाट'},
 {'कुल', 'खुला', 'ठूलो'},
 {'किताब', 'हिसाब'},
 {'गर्न', 'थप्न', 'पढ्न', 'पढ्ने', 'पार्न', 'भन्न', 'मर्न'},
 {'चाहन्छु'},
 {'।'}]

In [25]:


candidates_all('थुलो')

{'कालो',
 'किलो',
 'कुरो',
 'कुल',
 'कुलो',
 'खुला',
 'जालो',
 'जुडो',
 'झुटो',
 'ठुला',
 'ठुलो',
 'ठूलो',
 'ढिलो',
 'थलो',
 'थाले',
 'थियो',
 'थुलो',
 'धुलो',
 'धूलो',
 'निलो',
 'नीलो',
 'नौलो',
 'पालो',
 'पुल',
 'पुलको',
 'फलो',
 'फुल',
 'मुल',
 'युरो',
 'सुलभ',
 'हलो',
 'हिलो'}

In [26]:
WORDS

Counter({'<s>': 6884825,
         '।': 4667400,
         'र': 1697969,
         'छ': 1321657,
         'पनि': 1021030,
         'भएको': 537320,
         'लागि': 532843,
         'भने': 512636,
         'छन्': 482185,
         'गर्न': 479006,
         'गरेको': 473950,
         'हो': 421229,
         'यो': 409301,
         'गर्ने': 399109,
         'उनले': 347540,
         'तथा': 332883,
         'छ।': 313896,
         'थियो': 300393,
         'नै': 299612,
         'तर': 273086,
         'हुने': 269133,
         'एक': 245702,
         'नेपाल': 242579,
         'कुनै': 239024,
         'गरेका': 225245,
         'काम': 220636,
         'रहेको': 219705,
         'को': 213629,
         'छैन': 212175,
         'भएका': 196702,
         'थिए': 183621,
         'बताए': 182977,
         'गरी': 179047,
         'भन्ने': 176396,
         'गरिएको': 174957,
         'अनुसार': 174601,
         '१': 173013,
         'प्रतिक्रिया': 172118,
         'नयाँ': 165712,
         'मा': 165051,
         'वा': 

In [9]:
!pip install textdistance

import textdistance

def find_words_v3(word, dictionary):
    """Return the set of entries in `dictionary` whose Levenshtein distance
    to `word` is at most 2."""
    return {
        entry
        for entry in dictionary
        if textdistance.levenshtein.distance(word, entry) <= 2
    }

# Quick check of find_words_v3 on a toy English vocabulary. The literal repeats
# "rat" and "pat"; being a set, the duplicates collapse.
dictionary = {"bat", "cat", "rat", "sat", "pat", "that", "this", "fat", "rat", "mat", "pat"}
print(find_words_v3("cat", dictionary))

{'bat', 'fat', 'cat', 'sat', 'mat', 'rat', 'pat', 'that'}


In [8]:
%pip install textdistance

Collecting textdistance
  Downloading textdistance-4.5.0-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.5.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'F:\SpellChecker\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [262]:
import textdistance

In [23]:
textdistance.levenshtein.distance('नपाल','पागल')

2

In [111]:
# Code points 2406-2415, i.e. the ten characters excluded from char_vocab.
num = list(map(chr, range(2406, 2416)))
def numerics(w):
    """Filter predicate: True when `w` should be kept.

    A word is rejected (False) when it is longer than one character and ends
    with '।', or when it contains any character listed in `num`.
    """
    ends_with_danda = len(w) > 1 and w.endswith('।')
    if ends_with_danda:
        return False
    return not any(digit in w for digit in num)




# Drop the vocabulary entries that `numerics` rejects: words containing one of
# the characters in `num`, and multi-character words that end in a danda.
WORDS_filtered = filter(numerics,WORDS_full)

# `filter` returns a one-shot iterator, so materialise it once.
lis = list(WORDS_filtered)
l = len(lis)

# Search pools for check_distance, keyed by depth. Depths 0, 1, 2, 3 and 5 are
# prefixes of `lis` covering its first 1 %, 2 %, 5 %, 10 % and 100 %.
# NOTE(review): depth 4 is the 10 %-50 % slice rather than a prefix, so it
# leaves out everything searched at depths 0-3 — confirm this is intended.
# NOTE(review): `lis` keeps WORDS_full's iteration order; confirm that order is
# the ranking these percentages are meant to cut on.
depth_dict = {0:lis[0:int(0.01*l)],1:lis[0:int(0.02*l)],2:lis[0:int(0.05*l)]  , 3:lis[0:int(0.1*l)] , 4:lis[int(0.1*l):int(0.5*l)],
5:lis[0:l]}


def check_distance(w, depth = 1,edit_distance = 2):
    """Find the entries of one search pool that are close to `w`.

    `depth` selects the pool from `depth_dict`; an entry matches when its
    Levenshtein distance to `w` is at most `edit_distance`.

    Returns a `(candidates, count)` tuple, where `candidates` is the list of
    matches in pool order and `count` is its length.
    """
    pool = depth_dict[depth]
    candidates = [
        entry
        for entry in pool
        if textdistance.levenshtein.distance(w, entry) <= edit_distance
    ]
    return (candidates, len(candidates))

In [112]:
l

1216082

In [113]:
1216082*0.05

60804.100000000006

In [127]:
# Key into depth_dict at which the search starts. candidate_words, defined
# later in this cell, reads this module-level name as well.
depth = 0


# For each whitespace-separated word of the third sample sentence, print the
# number of matches and the candidate list.
for word in sample_sentences[2].split():
    if len(word)<3:
        # Words shorter than 3 characters: allow only one edit.
        c,c_ = check_distance(word,depth,edit_distance = 1)
        # While nothing has been found, retry at each deeper pool in turn.
        for i in range(len(depth_dict)-1):
            if c_ < 1:
                c,c_ = check_distance(word, depth = depth+i+1,edit_distance = 1)
    else:
        # Longer words use check_distance's default edit distance.
        c,c_ = check_distance(word, depth = depth)
        for i in range(len(depth_dict)-1):
            if c_ < 1:
                c,c_ = check_distance(word, depth = depth+i+1)
    # Always keep the original word among its candidates (c_ is not updated).
    if word not in c:
        c.append(word)
    print(c_,c)

def candidate_words(word,minimum = 1):
    """Return a list of candidate replacements for `word`.

    The search starts at the module-level `depth` and moves to the next
    `depth_dict` key each time fewer than `minimum` matches were found, for at
    most `len(depth_dict) - 1` extra attempts. Words shorter than three
    characters are searched with `edit_distance = 1`; longer words use
    `check_distance`'s default. `word` itself is appended when it is not
    already among the matches.
    """
    # Only short words override the edit distance; otherwise the default applies.
    overrides = {'edit_distance': 1} if len(word) < 3 else {}

    found, n_found = check_distance(word, depth=depth, **overrides)
    for level in range(depth + 1, depth + len(depth_dict)):
        if n_found >= minimum:
            break
        found, n_found = check_distance(word, depth=level, **overrides)

    if word not in found:
        found.append(word)
    return found

    
    
#     c,c_ = check_distance(word, depth = depth)
#     if c_ < 2:
#         print("Entered Inside")
#         c,c_ = check_distance(word, depth = depth+1)
#         if c_ < 2:
#             print("Entered Inside, inside")
#             c,c_ = check_distance(word, depth = depth+2)
#     print((c_,c))
    

24 ['र', 'तर', 'त', 'घर', 'तै', 'कर', 'तल', 'सर', 'ती', 'तय', 'पर', 'तिर', 'थर', 'दर', 'फर', 'तरङ', 'डर', 'भर', 'तीर', 'तब', 'तह', 'तरल', 'तो', 'हर']
11 ['बस', 'उ', 'उड', 'स', 'यस', 'एस', 'टस', 'दस', 'जस', 'उप', 'उसो', 'उस']
30 ['समाजमा', 'मनमा', 'समय', 'समयमा', 'लयमा', 'सम्मान', 'सालमा', 'सबैमा', 'सदनमा', 'जम्मा', 'सडकमा', 'सभामा', 'ममा', 'संघमा', 'समाना', 'सोचमा', 'सम्म', 'सीमा', 'सहरमा', 'समयमै', 'सेपमा', 'समयका', 'सुनमा', 'सयमा', 'समिम', 'साथमा', 'सीमामा', 'समूहमा', 'सतहमा', 'सेलमा']
94 ['गति', 'भने', 'पनि', 'अघि', 'मन', 'भन्', 'न', 'सपना', 'अनि', 'पति', 'पो', 'छन्', 'नै', 'पाना', 'पछि', 'नि', 'कि', 'पटक', 'पर्', 'परे', 'कति', 'आएपनि', 'भनी', 'जति', 'भएपनि', 'धनी', 'मनन', 'झन्', 'उनी', 'आदि', 'परी', 'जना', 'नं', 'यति', 'पहल', 'ने', 'पर', 'पानी', 'उनै', 'पु', 'छवि', 'बने', 'यिनि', 'झनै', 'रन', 'पेट', 'ओपनर', 'अनिल', 'कवि', 'पाए', 'पुनः', 'पता', 'छनक', 'नौ', 'पेस', 'सन्', 'पुल', 'गरि', 'एने', 'ऐन', 'भनिए', 'पेश', 'पुस', 'पाने', 'पिस', 'पार', 'भनि', 'पुन', 'बि', 'बडि', 'पुष', 'तापनि',

In [20]:
len(WORDS)

12297

In [27]:
candidates_all('नेपाल') ,len(candidates_all('नेपाल'))

({'कपाल',
  'गोपाल',
  'चेपाङ',
  'नपाई',
  'नपाए',
  'नेकपा',
  'नेकपाले',
  'नेता',
  'नेताले',
  'नेपाल',
  'नेपालका',
  'नेपालकी',
  'नेपालकै',
  'नेपालको',
  'नेपालमा',
  'नेपालमै',
  'नेपालले',
  'नेपाली',
  'नेवार',
  'नेशनल',
  'नेसनल',
  'पाल',
  'बनेपा'},
 23)

In [55]:
ord('९')

2415

In [69]:
help(filter)

Help on class filter in module builtins:

class filter(object)
 |  filter(function or None, iterable) --> filter object
 |  
 |  Return an iterator yielding those items of iterable for which function(item)
 |  is true. If function is None, return the items that are true.
 |  
 |  Methods defined here:
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __next__(self, /)
 |      Implement next(self).
 |  
 |  __reduce__(...)
 |      Return state information for pickling.
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.



In [56]:
hex(2415)

'0x96f'

In [65]:
# Every code point in 2304-2431 (U+0900-U+097F) except 2406-2415
# (U+0966-U+096F). This one comprehension replaces the earlier loop and the
# unfinished second definition, whose condition ended in a dangling `and` and
# made the whole cell a SyntaxError.
# TODO(review): the dangling `and` suggests a further exclusion was planned —
# add it here if one is still wanted.
char_vocab = [chr(cp) for cp in range(2304, 2432) if cp not in range(2406, 2416)]

In [67]:
def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement by a character from `char_vocab`, or an
    insertion of a character from `char_vocab`.
    """
    letters = char_vocab
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertions are possible at every split point, including the end.
        for c in letters:
            variants.add(head + c + tail)

        if not tail:
            continue

        # Deletion of the character right after the split point.
        variants.add(head + tail[1:])

        # Transposition needs at least two characters after the split point.
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])

        # Replacement of the character right after the split point.
        for c in letters:
            variants.add(head + c + tail[1:])
    return variants


def edits2(word):
    """Return the set of strings reachable from `word` by applying `edits1` twice."""
    two_away = set()
    for one_away in edits1(word):
        two_away.update(edits1(one_away))
    return two_away

In [66]:
char_vocab

['ऀ',
 'ँ',
 'ं',
 'ः',
 'ऄ',
 'अ',
 'आ',
 'इ',
 'ई',
 'उ',
 'ऊ',
 'ऋ',
 'ऌ',
 'ऍ',
 'ऎ',
 'ए',
 'ऐ',
 'ऑ',
 'ऒ',
 'ओ',
 'औ',
 'क',
 'ख',
 'ग',
 'घ',
 'ङ',
 'च',
 'छ',
 'ज',
 'झ',
 'ञ',
 'ट',
 'ठ',
 'ड',
 'ढ',
 'ण',
 'त',
 'थ',
 'द',
 'ध',
 'न',
 'ऩ',
 'प',
 'फ',
 'ब',
 'भ',
 'म',
 'य',
 'र',
 'ऱ',
 'ल',
 'ळ',
 'ऴ',
 'व',
 'श',
 'ष',
 'स',
 'ह',
 'ऺ',
 'ऻ',
 '़',
 'ऽ',
 'ा',
 'ि',
 'ी',
 'ु',
 'ू',
 'ृ',
 'ॄ',
 'ॅ',
 'ॆ',
 'े',
 'ै',
 'ॉ',
 'ॊ',
 'ो',
 'ौ',
 '्',
 'ॎ',
 'ॏ',
 'ॐ',
 '॑',
 '॒',
 '॓',
 '॔',
 'ॕ',
 'ॖ',
 'ॗ',
 'क़',
 'ख़',
 'ग़',
 'ज़',
 'ड़',
 'ढ़',
 'फ़',
 'य़',
 'ॠ',
 'ॡ',
 'ॢ',
 'ॣ',
 '।',
 '॥',
 '॰',
 'ॱ',
 'ॲ',
 'ॳ',
 'ॴ',
 'ॵ',
 'ॶ',
 'ॷ',
 'ॸ',
 'ॹ',
 'ॺ',
 'ॻ',
 'ॼ',
 'ॽ',
 'ॾ',
 'ॿ']

In [60]:
ord('ॽ')

2429

In [68]:
len('यो')

2

In [188]:
# Print the first 20 keys of WORDS_full in its own iteration order.
count = 0
for i in WORDS_full:
    print(i)
    count+=1
    
    # Stop once 20 keys have been printed.
    if count == 20:
        break
    

<s>
साँवा
अक्षर
कखरा
मा
बाह्रखरी
मात्रा
मिल्नाले
भाषा
समृद्ध
र
अर्थपूर्ण
बनेजस्तै
देशको
समाजिक
राजनीतिक
आर्थिक
अन्य
क्षेत्रको
स्थूल


In [261]:
WORDS_full.most_common()

[('<s>', 6884825),
 ('।', 4667400),
 ('र', 1697969),
 ('छ', 1321657),
 ('पनि', 1021030),
 ('भएको', 537320),
 ('लागि', 532843),
 ('भने', 512636),
 ('छन्', 482185),
 ('गर्न', 479006),
 ('गरेको', 473950),
 ('हो', 421229),
 ('यो', 409301),
 ('गर्ने', 399109),
 ('उनले', 347540),
 ('तथा', 332883),
 ('छ।', 313896),
 ('थियो', 300393),
 ('नै', 299612),
 ('तर', 273086),
 ('हुने', 269133),
 ('एक', 245702),
 ('नेपाल', 242579),
 ('कुनै', 239024),
 ('गरेका', 225245),
 ('काम', 220636),
 ('रहेको', 219705),
 ('को', 213629),
 ('छैन', 212175),
 ('भएका', 196702),
 ('थिए', 183621),
 ('बताए', 182977),
 ('गरी', 179047),
 ('भन्ने', 176396),
 ('गरिएको', 174957),
 ('अनुसार', 174601),
 ('१', 173013),
 ('प्रतिक्रिया', 172118),
 ('नयाँ', 165712),
 ('मा', 165051),
 ('वा', 163343),
 ('केही', 163185),
 ('२', 160067),
 ('हजार', 159135),
 ('गर्दै', 158163),
 ('सय', 152931),
 ('नेपाली', 152540),
 ('हुन्छ', 150135),
 ('त', 148197),
 ('सरकारले', 146865),
 ('स्थानीय', 145138),
 ('दुई', 144531),
 ('हुन', 141285),
 ('मात्र',