In [6]:
import re
import pandas as pd
from tqdm import tqdm

## 1. Finding the Unique Words

In [8]:
with open('big.txt','r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words += re.findall('\w+',line.lower())
        
print(len(words))
vocab = list(set(words))
print(len(vocab))

1115585
32198


## 2. Finding the Probability Distribution

In [9]:
word_probability = {}

for word in tqdm(vocab):
    word_probability[word] = float(words.count(word)/len(words))

100%|████████████████████████████████| 32198/32198 [07:20<00:00, 73.11it/s]


In [14]:
len(word_probability)

32198

## 3. Text Preprocessing
### Splitting

In [2]:
def split(word):  
    parts = []
    for i in range(len(word) + 1):
        parts += [(word[ : i], word[i : ])]
    return parts

### 3.1) Delete

'loave' -> 'love'

In [3]:
def delete(word):
    
    output = []
    for l,r in split(word):
        output.append(l + r[1:])
    return output

delete('loave')

['oave', 'lave', 'love', 'loae', 'loav', 'loave']

### 3.2) Swap

'lvoe' -> 'love'

In [17]:
def whap(word):
    res = []
    
    for i in range(len(word)):
        if i < len(word) -1:
            print(word[:i] + word[i+1] +  word[i] + word[i+2:])
            
whap('lave')

alve
lvae
laev


In [18]:
def swap(word):
        
    output = []    
    for l,r in split(word):
        if (len(r) > 1):
            output.append(l + r[1] + r[0] + r[2:])
    return output
            
swap('lvoe')

['vloe', 'love', 'lveo']

### 3.3) Replace

'lave' -> 'love'

In [19]:
def replace(word):
    
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []    

    for l,r in split(word):
        for char in characters:
            output.append(l + char +  r[1:])
    return output

replace('lave')

['aave',
 'bave',
 'cave',
 'dave',
 'eave',
 'fave',
 'gave',
 'have',
 'iave',
 'jave',
 'kave',
 'lave',
 'mave',
 'nave',
 'oave',
 'pave',
 'qave',
 'rave',
 'save',
 'tave',
 'uave',
 'vave',
 'wave',
 'xave',
 'yave',
 'zave',
 'lave',
 'lbve',
 'lcve',
 'ldve',
 'leve',
 'lfve',
 'lgve',
 'lhve',
 'live',
 'ljve',
 'lkve',
 'llve',
 'lmve',
 'lnve',
 'love',
 'lpve',
 'lqve',
 'lrve',
 'lsve',
 'ltve',
 'luve',
 'lvve',
 'lwve',
 'lxve',
 'lyve',
 'lzve',
 'laae',
 'labe',
 'lace',
 'lade',
 'laee',
 'lafe',
 'lage',
 'lahe',
 'laie',
 'laje',
 'lake',
 'lale',
 'lame',
 'lane',
 'laoe',
 'lape',
 'laqe',
 'lare',
 'lase',
 'late',
 'laue',
 'lave',
 'lawe',
 'laxe',
 'laye',
 'laze',
 'lava',
 'lavb',
 'lavc',
 'lavd',
 'lave',
 'lavf',
 'lavg',
 'lavh',
 'lavi',
 'lavj',
 'lavk',
 'lavl',
 'lavm',
 'lavn',
 'lavo',
 'lavp',
 'lavq',
 'lavr',
 'lavs',
 'lavt',
 'lavu',
 'lavv',
 'lavw',
 'lavx',
 'lavy',
 'lavz',
 'lavea',
 'laveb',
 'lavec',
 'laved',
 'lavee',
 'lavef',
 'la

### 3.4) Insert

'lve' -> 'love'

In [245]:
def insert(word):

    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []

    for l,r in split(word):
        for char in characters:
            output.append(l + char + r)

    return output

len(insert('lve'))

104

## 4. Finding the Prediction (Level - 1)
### 4.1) Combining Possible Words

In [246]:
def edit(word):   
    return list(set(insert(word) + delete(word) + swap(word) + replace(word)))

### 4.2) Predicting the Word

In [247]:
def spell_check_edit_1(word, count = 5):
    
    output = []
    suggested_words = edit(word)
    
    for wrd in suggested_words:        
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
            
    return list(pd.DataFrame(output, columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)

In [249]:
spell_check_edit_1('famili')

['family']

## 5. Finding the Prediction (Level - 2)
### 5.1) Combining Possible Words

In [240]:
def spell_check_edit_2(word, count = 5):
    
    output = []
    suggested_words = edit(word)       # Level one Edit
    
    for e1 in edit(word):
        suggested_words += edit(e1)    # Second Level Edit 
    
    suggested_words = list(set(suggested_words))
    
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
    return list(pd.DataFrame(output, columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)
        
spell_check_edit_2('fameli')

['family', 'namely', 'fame', 'camel', 'camelia']