#### 1. Find probability of occurrence of all unique words
1.  Find all unique words
2.  Find probability of that word appear

In [91]:
import re
from tqdm import tqdm
import pandas as pd

#### 1. Finding unique words

In [92]:
with open('big.txt','r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words += line.split(' ')
print(len(words))

1164968


In [93]:
with open('big.txt','r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words += re.findall('\w+',line.lower())
print(len(words))
vocab = list(set(words))
print(len(vocab))

1115585
32198


* 1164968 - 1115585 = 49383
* There are 49383 words which are not beginning with a word but rather a number or a special character

#### 2. Finding the probability distribution

In [94]:
word_probability = {}
for word in tqdm(vocab):
    word_probability[word] = float(words.count(word) / len(words))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32198/32198 [06:55<00:00, 77.47it/s]


In [95]:
len(word_probability)

32198

### 3. Text Preprocessing
#### 3.0 Splitting

'loave' -l-> 'oave'
'loave' -o-> 'lave'
'loave' -a-> 'love'
'loave' -v-> 'loae'
'loave' -e-> 'loav'

In [96]:
def split(word):
    parts = []
    for i in range(len(word)+1):
        parts+= [(word[:i],word[i:])]
    return parts
split('Ritika')

[('', 'Ritika'),
 ('R', 'itika'),
 ('Ri', 'tika'),
 ('Rit', 'ika'),
 ('Riti', 'ka'),
 ('Ritik', 'a'),
 ('Ritika', '')]

#### 3.1. Delete
* 'loave' -> 'love'

In [97]:
def delete(word):
    output = []
    
    for l,r in split(word):
        output.append(l+r[1:])
    return output
delete('Ritika')

['itika', 'Rtika', 'Riika', 'Ritka', 'Ritia', 'Ritik', 'Ritika']

#### 3.2 Swap
* 'lvoe' -> 'love'

In [98]:
def swap(word):
    output = []
    for l,r in split(word):
        if(len(r)>1):
            output.append(l+r[1]+r[0]+r[2:])
    return output
swap('Ritika')

['iRtika', 'Rtiika', 'Riitka', 'Ritkia', 'Ritiak']

#### 3.3 Replace
* 'lave' -> 'love'

'lave' -> 'aave','bave','cave',...'zave'

'lave' -> 'lave','lbve','lcve',...'lzve'

..

'lave' -> 'lava','lavb','lavc',...'lavz'

Total combinations = 26* 4 = 104

In [99]:
def replace(word):
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []
    for l,r in split(word):
        for char in characters:
            output.append(l+char+r[1:])
    return output
len(replace('Ritika'))

182

 #### 3.4 Insert
* 'lve' -> 'love'

In [100]:
def insert(word):
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []
    for l,r in split(word):
        for char in characters:
            output.append(l+char+r)
    return output
len(insert('Ritika'))

182

### 4. Finding the Prediction

#### 4.1) Combining Possible Words

In [101]:
def edit(word):
    return set(insert(word)+ delete(word)+ swap(word)+ replace(word))
len(edit('Ritika'))

340

#### 4.2) Predicting the word

In [102]:
def spell_check_edit_1(word,count = 5):
    output = []
    suggested_words = edit(word)
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
            
    return list(pd.DataFrame(output,columns=['word','prob']).sort_values(by ='prob',ascending=False).head(count)['word'].values)

In [103]:
spell_check_edit_1('tha',2)

['the', 'that']

### 5. Finding the Prediction (Level-2)


#### 5.1) Combining Possible words

In [111]:
def spell_check_edit_2(word,count = 5):
    output = []
    suggested_words = edit(word)  # Include 1st level edit
    for e1 in edit(word):
        suggested_words.update(edit(e1)) # Include second level edit
    
    suggested_words = set(suggested_words)
    
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd,word_probability[wrd]])
    return list(pd.DataFrame(output,columns=['word','prob']).sort_values(by ='prob',ascending=False).head(count)['word'].values)
spell_check_edit_2('famli')

['family', 'fall', 'fault', 'fail', 'falls']