In [1]:
import re
import pandas as pd
from tqdm import tqdm

# Finding the unique words

In [2]:
with open('bigdata.txt','r') as fd:
    lines=fd.readlines()
    words=[]
    for line in lines:
        words+=re.findall('\w+',line)
    
    
print("Total Words:",len(words))
uniq=set(words)
print("Unique Words:",len(uniq))

Total Words: 106076
Unique Words: 8496


# Finding Probability distribution

In [3]:
words.count('The')

344

In [4]:
words.count('the')

5270

In [5]:
word_prob={}

for word in tqdm(uniq):
    word_prob[word]=float(words.count(word)/len(words))

100%|█████████████████████████████████████████████████████████████████████████████| 8496/8496 [00:13<00:00, 634.56it/s]


In [6]:
len(word_prob)

8496

In [7]:
word_prob['the']

0.04968136053395678

In [8]:
 word_prob['Sherlock']

0.0009050115011878276

# Text Preprocessing

## 1.Splitting

In [9]:
def split(word):
    parts=[]
    for i in range(len(word)+1):
        parts+=[(word[:i],word[i:])]
    return parts

split('rizwal')

[('', 'rizwal'),
 ('r', 'izwal'),
 ('ri', 'zwal'),
 ('riz', 'wal'),
 ('rizw', 'al'),
 ('rizwa', 'l'),
 ('rizwal', '')]

## 2.Deleting

In [10]:
def delete(word):
    newwords=[]
    for i in range(len(word)):
        newwords+=[(word[:i]+word[i+1:])]
    return newwords

delete('rizewal')

['izewal', 'rzewal', 'riewal', 'rizwal', 'rizeal', 'rizewl', 'rizewa']

## 3.Swapping

In [11]:
def swap(word):
    
    output=[]
    for l,r in split(word):
        if(len(r)>1):
            output.append(l+r[1]+r[0]+r[2:])
    return output
swap('rizwal')

['irzwal', 'rziwal', 'riwzal', 'rizawl', 'rizwla']

## 4.Replacing

In [12]:

def replace(word):
    ab='abcdefghijklmnopqrstuvwxyz'
    replaced=[]
    for i in range(len(word)):
        for j in range(len(ab)):
            w=word
            w=w.replace(w[i],ab[j])
            replaced+=[w]
    return replaced

replace('lave')

['aave',
 'bave',
 'cave',
 'dave',
 'eave',
 'fave',
 'gave',
 'have',
 'iave',
 'jave',
 'kave',
 'lave',
 'mave',
 'nave',
 'oave',
 'pave',
 'qave',
 'rave',
 'save',
 'tave',
 'uave',
 'vave',
 'wave',
 'xave',
 'yave',
 'zave',
 'lave',
 'lbve',
 'lcve',
 'ldve',
 'leve',
 'lfve',
 'lgve',
 'lhve',
 'live',
 'ljve',
 'lkve',
 'llve',
 'lmve',
 'lnve',
 'love',
 'lpve',
 'lqve',
 'lrve',
 'lsve',
 'ltve',
 'luve',
 'lvve',
 'lwve',
 'lxve',
 'lyve',
 'lzve',
 'laae',
 'labe',
 'lace',
 'lade',
 'laee',
 'lafe',
 'lage',
 'lahe',
 'laie',
 'laje',
 'lake',
 'lale',
 'lame',
 'lane',
 'laoe',
 'lape',
 'laqe',
 'lare',
 'lase',
 'late',
 'laue',
 'lave',
 'lawe',
 'laxe',
 'laye',
 'laze',
 'lava',
 'lavb',
 'lavc',
 'lavd',
 'lave',
 'lavf',
 'lavg',
 'lavh',
 'lavi',
 'lavj',
 'lavk',
 'lavl',
 'lavm',
 'lavn',
 'lavo',
 'lavp',
 'lavq',
 'lavr',
 'lavs',
 'lavt',
 'lavu',
 'lavv',
 'lavw',
 'lavx',
 'lavy',
 'lavz']

## 5.Inserting

In [13]:
def insert(word):
    ab='abcdefghijklmnopqrstuvwxyz'
    inserted=[]
    for l,r in split(word):
        for char in ab:
            inserted.append(l+char+r)
    return inserted
insert('ove')

['aove',
 'bove',
 'cove',
 'dove',
 'eove',
 'fove',
 'gove',
 'hove',
 'iove',
 'jove',
 'kove',
 'love',
 'move',
 'nove',
 'oove',
 'pove',
 'qove',
 'rove',
 'sove',
 'tove',
 'uove',
 'vove',
 'wove',
 'xove',
 'yove',
 'zove',
 'oave',
 'obve',
 'ocve',
 'odve',
 'oeve',
 'ofve',
 'ogve',
 'ohve',
 'oive',
 'ojve',
 'okve',
 'olve',
 'omve',
 'onve',
 'oove',
 'opve',
 'oqve',
 'orve',
 'osve',
 'otve',
 'ouve',
 'ovve',
 'owve',
 'oxve',
 'oyve',
 'ozve',
 'ovae',
 'ovbe',
 'ovce',
 'ovde',
 'ovee',
 'ovfe',
 'ovge',
 'ovhe',
 'ovie',
 'ovje',
 'ovke',
 'ovle',
 'ovme',
 'ovne',
 'ovoe',
 'ovpe',
 'ovqe',
 'ovre',
 'ovse',
 'ovte',
 'ovue',
 'ovve',
 'ovwe',
 'ovxe',
 'ovye',
 'ovze',
 'ovea',
 'oveb',
 'ovec',
 'oved',
 'ovee',
 'ovef',
 'oveg',
 'oveh',
 'ovei',
 'ovej',
 'ovek',
 'ovel',
 'ovem',
 'oven',
 'oveo',
 'ovep',
 'oveq',
 'over',
 'oves',
 'ovet',
 'oveu',
 'ovev',
 'ovew',
 'ovex',
 'ovey',
 'ovez']

# Finding the prediction

## Getting Possible combinations of a word

In [14]:
def edit(word):
    return (delete(word)+swap(word)+replace(word)+insert(word))

suggested_word=edit('cresh')

In [15]:
output=[]
for wrd in suggested_word:
    if(wrd in word_prob.keys()):
        output.append([wrd,word_prob[wrd]])
        
output

[['fresh', 0.00017911685961009087],
 ['crash', 1.885440627474641e-05],
 ['crest', 9.427203137373204e-06]]

In [16]:
pd.DataFrame(output,columns=['Word','Confidence']).sort_values(by='Confidence',ascending=False)

Unnamed: 0,Word,Confidence
0,fresh,0.000179
1,crash,1.9e-05
2,crest,9e-06


# Spell Checker LEVEL-1

In [17]:
def spell_checker(word):
    suggested_word=edit(word)
    output=[]
    for wrd in suggested_word:
        if(wrd in word_prob.keys()):
            output.append([wrd,word_prob[wrd]])
    return pd.DataFrame(output,columns=['Word','Prob']).sort_values(by='Prob',ascending=False)
    

In [18]:
spell_checker('opan')

Unnamed: 0,Word,Prob
1,open,0.000603
0,span,9e-06
2,opal,9e-06


# Spell Checker LEVEL-2

In [26]:
def edit2(word):
    output=[]
    
    suggested_words=edit(word)
    for i in edit(word):
        suggested_words += edit(i)
    suggested_words=list(set(suggested_words))
    
    for wrd in suggested_words:
        if(wrd in word_prob.keys()):
            output.append([wrd,word_prob[wrd]])
    return pd.DataFrame(output,columns=['Word','Prob']).sort_values(by='Prob',ascending=False)

In [27]:
edit2('tesa')

Unnamed: 0,Word,Prob
7,tell,0.000867
17,these,0.00066
19,less,0.000358
0,best,0.000292
3,ten,0.000264
11,yes,0.000226
12,rest,0.000179
27,west,9.4e-05
18,test,5.7e-05
20,task,5.7e-05
