In [1]:
import re
import pandas as pd
from tqdm import tqdm

## Finding Unique words from the corpus

In [4]:
with open ('big.txt','r') as fd:
    lines=fd.readlines()
    words=[]
    for line in lines:
        words+=re.findall('\w+',line.lower())

print(len(words))
vocab=list(set(words))# unique words 
print(len(vocab))
    
    

  words+=re.findall('\w+',line.lower())


1115585
32198


## Finding probability distribution of each word

In [5]:
# storing it in dictionary form , key-value pairs 
word_probability={}

for word in tqdm(vocab):
    word_probability[word]=float(words.count(word)/len(words))

word_probability


100%|████████████████████████████████████████████████████████████████████████████| 32198/32198 [08:14<00:00, 65.17it/s]


{'loafing': 8.963906829152417e-07,
 'petrusha': 1.7927813658304835e-06,
 'complications': 2.6891720487457253e-05,
 'eloquence': 5.378344097491451e-06,
 'tens': 1.4342250926643868e-05,
 'persisting': 1.7927813658304835e-06,
 'dispatched': 1.6135032292474352e-05,
 'venus': 8.963906829152417e-07,
 'droning': 8.963906829152417e-07,
 '1914': 1.7927813658304835e-05,
 'militant': 1.7927813658304835e-06,
 'suppurate': 3.585562731660967e-06,
 'cardinal': 3.585562731660967e-06,
 'wolves': 9.860297512067659e-06,
 'hereby': 1.7927813658304835e-06,
 'amounts': 5.378344097491451e-06,
 '_hodgkin': 8.963906829152417e-07,
 'entitlement': 8.963906829152417e-07,
 'warmth': 1.1653078877898144e-05,
 'kudrino': 8.963906829152417e-07,
 'afford': 1.7927813658304835e-05,
 'shuddered': 8.963906829152418e-06,
 '221': 7.171125463321934e-06,
 'apparelled': 8.963906829152417e-07,
 'strenuously': 1.7927813658304835e-06,
 'cowshed': 2.6891720487457255e-06,
 'expressionless': 3.585562731660967e-06,
 'fatally': 4.48195

In [7]:
word_probability['the']

0.07154004401278254

## Text preprocaessing

### Splitting

In [8]:
def split(word):
    lst=[]
    for i in range(len(word)+1):
        lst+=[(word[ : i],word[ i : ])]

    return lst
    

### Delete

In [14]:
def delete(word):
    words=[]
    for l,r in split(word):
        words.append(l+r[1 : ])

    return words
        

In [15]:
delete('loave')

['oave', 'lave', 'love', 'loae', 'loav', 'loave']

### Swap

In [22]:
def swap(word):
    words=[]
    for l,r in split(word):
         if len(r)>1:
             
            words.append(l+r[1]+r[0]+r[2:])

    return words

print(swap('lvoe'))

['vloe', 'love', 'lveo']


### Replace

In [23]:
def replace(word):
    
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []    

    for l,r in split(word):
        for char in characters:
            output.append(l + char +  r[1:])
    return output

len(replace('lave'))

130

### Insert

In [24]:
def insert(word):
    
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []    

    for l,r in split(word):
        for char in characters:
            output.append(l + char +  r)
    return output

len(insert('lve'))

104

# Finding the prediction

### Combining Possible Words

In [68]:
def edit(word):
    return list(set(delete(word)+replace(word)+insert(word)+swap(word)))
    
    

### Spell Checker

In [69]:
def spell_check(word,count=5):
    suggested_words=edit(word)
    output=[]
    for word in suggested_words:
        if word in word_probability:
            output.append([word,word_probability[word]])

    return list(pd.DataFrame(output,columns=['word','prob']).sort_values(by='prob',ascending=False).head(count)['word'])

In [36]:
spell_check('lve') # gives top words with higher probability

['love', 've', 'live', 'lie', 'le']

In [37]:
spell_check('famely')

['family', 'namely', 'lamely']

### Finding the Prediction (Level - 2)

In [70]:
def spell_check2(word,count=5):
     suggested_words=edit(word)
     output=[]
     # we will find edit of all the wrod which come from edit 
     for w in edit(word):
         suggested_words+=(edit(w))

     # convert to set to remove duplicate words
     suggestion=list(set(suggested_words))

     for word in suggestion:
        if word in word_probability:
            output.append([word,word_probability[word]])

     return list(pd.DataFrame(output,columns=['word','prob']).sort_values(by='prob',ascending=False).head(count)['word'])
     

         

    

In [55]:
spell_check('famely',count=7)

['family', 'namely', 'lamely']

In [71]:
spell_check2('famely',count=7)

['family', 'rarely', 'freely', 'namely', 'fairly', 'lately', 'fame']