# 1. Finding all unique words

In [9]:
import pandas as pd
import re
from tqdm import tqdm

In [10]:
with open("big.txt", "r") as fd:
    lines=fd.readlines()
    words=[]
    for i in lines:
        words+=re.findall("\w+", i.lower())
print(len(words))

vocab=list(set(words))
print(len(vocab))

1115585
32198


# 2. Finding all Probability Distribution (Percentage)

In [13]:
word_probability={}
for i in tqdm(vocab):
    word_probability[i]= float(words.count(i) / len(words))   # Val/Total *100

100%|██████████| 32198/32198 [17:42<00:00, 30.29it/s]


In [14]:
import json

with open('word_probability.json', 'w') as f:
    json.dump(word_probability, f, indent=4) # indent makes the file readable

print("Dictionary saved to word_probability.json")

# --- To load it back later ---
with open('word_probability.json', 'r') as f:
    word_probability = json.load(f)

Dictionary saved to word_probability.json


In [15]:
word_probability

{'newness': 8.963906829152417e-07,
 'dweller': 8.963906829152417e-07,
 'chaises': 8.963906829152417e-07,
 'abused': 5.378344097491451e-06,
 'indicate': 2.7788111170372494e-05,
 'vogel': 1.7927813658304835e-06,
 'belongs': 1.3445860243728626e-05,
 'stacked': 1.7927813658304835e-06,
 '_collateral': 8.963906829152417e-07,
 'doute': 8.963906829152417e-07,
 'unhindered': 1.7927813658304835e-06,
 'lap': 9.860297512067659e-06,
 'sooty': 8.963906829152417e-07,
 'newfoundland': 8.963906829152417e-07,
 'recommends': 8.963906829152418e-06,
 'grape': 3.585562731660967e-06,
 'uncertainties': 1.7927813658304835e-06,
 'vieux': 8.963906829152417e-07,
 'war': 0.000782549066185006,
 'yuri': 8.963906829152417e-07,
 'exertions': 2.6891720487457255e-06,
 'congressional': 8.963906829152418e-06,
 'lawbreaking': 8.963906829152417e-07,
 'diversions': 8.963906829152417e-07,
 'chasseurs': 6.2747347804066925e-06,
 'pontoon': 8.963906829152417e-07,
 'neverovski': 8.963906829152417e-07,
 'has': 0.001436914264713132

# 3. Text Preprocessing

### Splitting

In [16]:
def split(word):
    parts=[]
    for i in range(len(word)+1):
        parts+=[(word[:i], word[i:])]
    return parts

In [17]:
split("Rishita")

[('', 'Rishita'),
 ('R', 'ishita'),
 ('Ri', 'shita'),
 ('Ris', 'hita'),
 ('Rish', 'ita'),
 ('Rishi', 'ta'),
 ('Rishit', 'a'),
 ('Rishita', '')]

#### 3.1) Delete
"loave" ->"love"

In [18]:
def delete(word):
    op=[]
    for l, r in split(word):
        op.append(l+ r[1:])
    return op
delete("loave")

['oave', 'lave', 'love', 'loae', 'loav', 'loave']

#### 3.2) Swap
"lvoe" ->"love"

In [19]:
def swap(word):
    op=[]
    for l, r in split(word):
        if (len(r)>1):
            op.append(l + r[1] + r[0] + r[2:])
    return op

swap("lvoe")

['vloe', 'love', 'lveo']

#### 3.3) Replace
"lave" ->"love"

In [20]:
def replace(word):
    op=[]
    characters="abcdefghijklmnopqrstuvwxyz"

    for l, r in split(word):
        for char in characters:
            op.append(l + char + r[1:])
    return op
replace("lave")


['aave',
 'bave',
 'cave',
 'dave',
 'eave',
 'fave',
 'gave',
 'have',
 'iave',
 'jave',
 'kave',
 'lave',
 'mave',
 'nave',
 'oave',
 'pave',
 'qave',
 'rave',
 'save',
 'tave',
 'uave',
 'vave',
 'wave',
 'xave',
 'yave',
 'zave',
 'lave',
 'lbve',
 'lcve',
 'ldve',
 'leve',
 'lfve',
 'lgve',
 'lhve',
 'live',
 'ljve',
 'lkve',
 'llve',
 'lmve',
 'lnve',
 'love',
 'lpve',
 'lqve',
 'lrve',
 'lsve',
 'ltve',
 'luve',
 'lvve',
 'lwve',
 'lxve',
 'lyve',
 'lzve',
 'laae',
 'labe',
 'lace',
 'lade',
 'laee',
 'lafe',
 'lage',
 'lahe',
 'laie',
 'laje',
 'lake',
 'lale',
 'lame',
 'lane',
 'laoe',
 'lape',
 'laqe',
 'lare',
 'lase',
 'late',
 'laue',
 'lave',
 'lawe',
 'laxe',
 'laye',
 'laze',
 'lava',
 'lavb',
 'lavc',
 'lavd',
 'lave',
 'lavf',
 'lavg',
 'lavh',
 'lavi',
 'lavj',
 'lavk',
 'lavl',
 'lavm',
 'lavn',
 'lavo',
 'lavp',
 'lavq',
 'lavr',
 'lavs',
 'lavt',
 'lavu',
 'lavv',
 'lavw',
 'lavx',
 'lavy',
 'lavz',
 'lavea',
 'laveb',
 'lavec',
 'laved',
 'lavee',
 'lavef',
 'la

#### 3.4) Insert
"lve" ->"love"

In [21]:
def insert(word):
    op=[]
    characters="abcdefghijklmnopqrstuvwxyz"
    for l, r in split(word):
            for char in characters:
                op.append(l+char+ r)
    return op
insert("lve")

['alve',
 'blve',
 'clve',
 'dlve',
 'elve',
 'flve',
 'glve',
 'hlve',
 'ilve',
 'jlve',
 'klve',
 'llve',
 'mlve',
 'nlve',
 'olve',
 'plve',
 'qlve',
 'rlve',
 'slve',
 'tlve',
 'ulve',
 'vlve',
 'wlve',
 'xlve',
 'ylve',
 'zlve',
 'lave',
 'lbve',
 'lcve',
 'ldve',
 'leve',
 'lfve',
 'lgve',
 'lhve',
 'live',
 'ljve',
 'lkve',
 'llve',
 'lmve',
 'lnve',
 'love',
 'lpve',
 'lqve',
 'lrve',
 'lsve',
 'ltve',
 'luve',
 'lvve',
 'lwve',
 'lxve',
 'lyve',
 'lzve',
 'lvae',
 'lvbe',
 'lvce',
 'lvde',
 'lvee',
 'lvfe',
 'lvge',
 'lvhe',
 'lvie',
 'lvje',
 'lvke',
 'lvle',
 'lvme',
 'lvne',
 'lvoe',
 'lvpe',
 'lvqe',
 'lvre',
 'lvse',
 'lvte',
 'lvue',
 'lvve',
 'lvwe',
 'lvxe',
 'lvye',
 'lvze',
 'lvea',
 'lveb',
 'lvec',
 'lved',
 'lvee',
 'lvef',
 'lveg',
 'lveh',
 'lvei',
 'lvej',
 'lvek',
 'lvel',
 'lvem',
 'lven',
 'lveo',
 'lvep',
 'lveq',
 'lver',
 'lves',
 'lvet',
 'lveu',
 'lvev',
 'lvew',
 'lvex',
 'lvey',
 'lvez']

# 4. Finding Prediction (Level 1)

## 4.1) Combining Possible Works

In [22]:
def edit(word):
    return list(set(insert(word)+delete(word)+swap(word)+replace(word)))


## 4.2) Predicting the word 

In [23]:
def spell_check(word,count=5):
    op=[]
    suggested_words=edit(word)
    for i in suggested_words:
        if i in word_probability.keys():
            op.append(([i, word_probability[i]]))
    return list(pd.DataFrame(op, columns=["word", "prob"]).sort_values(by="prob", ascending= False).head(count)["word"].values)


In [24]:
spell_check("tha")

['the', 'that', 'than', 'tea', 'ha']

# 5. Finding the Prediction (Level 2)

## 5.1) Combining Possible Words

In [26]:
def spell_check_edit_2(word, count = 5):
    
    output = []
    suggested_words = edit(word)       # Level one Edit
    
    for e1 in edit(word):
        suggested_words += edit(e1)    # Second Level Edit 
    
    suggested_words = list(set(suggested_words))
    
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
    return list(pd.DataFrame(output, columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)
        
spell_check_edit_2('fameli')

['family', 'namely', 'fame', 'camelia', 'amelie']