<a href="https://colab.research.google.com/github/SuhasKatkuri/AI-ML/blob/main/Spell_Checker_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
from collections import Counter

import pandas as pd
from tqdm import tqdm

**1. Finding the Unique Words**

In [None]:
# Tokenize the corpus: lowercase each line and collect every run of word
# characters (as matched by the regex class \w) as one token.
with open('/content/big.txt','r') as fd:
  words = []
  for line in fd:
    # Raw string so that `\w` reaches the regex engine untouched instead of
    # being parsed as an (invalid) string escape, which newer Python versions
    # warn about.
    words += re.findall(r'\w+', line.lower())

# Total number of tokens in the corpus.
print(len(words))

# Vocabulary = distinct tokens. A set has no defined order, so neither does
# this list.
vocab = list(set(words))

# Number of distinct tokens.
print(len(vocab))

1115585
32198


**2. Finding the Probability Distribution**

In [None]:
# Relative frequency of every vocabulary word in the corpus:
#   word_probability[word] = occurrences of word / total number of tokens.
# Counter tallies all tokens in a single pass; calling `words.count(word)` for
# each vocabulary word would rescan the whole token list once per word.
word_counts = Counter(words)
total_words = len(words)

word_probability = {word: word_counts[word] / total_words for word in vocab}

100%|██████████| 32198/32198 [13:59<00:00, 38.35it/s]


In [None]:
len(word_probability)

32198

**3. Text Preprocessing**


Splitting

In [None]:
def split(word):
  """Return every (left, right) cut of `word` whose right part is non-empty.

  The cut position runs from 0 to len(word) - 1, so the first pair is
  ('', word) and the pair (word, '') is never produced. An empty `word`
  yields an empty list.
  """
  return [(word[:cut], word[cut:]) for cut in range(len(word))]

split('suhas')


[('', 'suhas'), ('s', 'uhas'), ('su', 'has'), ('suh', 'as'), ('suha', 's')]

3.1. Delete

'loave' -> 'love'

In [None]:
def delete(word):
  """Return the strings obtained by dropping one character from `word`.

  One result is produced per pair returned by split(word), in the same
  order. Duplicates are kept: removing either of two adjacent equal letters
  gives the same string twice.
  """
  # `tail[1:]` skips the first character of the right-hand part of each split.
  return [head + tail[1:] for head, tail in split(word)]

delete('heallo')

['eallo', 'hallo', 'hello', 'healo', 'healo', 'heall']

3.2. Swap

'lvoe' -> 'love'

In [None]:
def swap(word):
  """Return the strings obtained by exchanging two adjacent characters of `word`.

  For each pair returned by split(word) whose right part has at least two
  characters, the first two characters of that right part are exchanged.
  """
  swapped = []
  for head, tail in split(word):
    # A right part of length 0 or 1 has no neighbour to exchange with.
    if len(tail) <= 1:
      continue
    first, second, rest = tail[0], tail[1], tail[2:]
    swapped.append(head + second + first + rest)
  return swapped

swap('lvoe')


['vloe', 'love', 'lveo']

3.3. Replace

'lave' -> 'love'

In [None]:
def replace(word):
  """Return the strings obtained by substituting one character of `word`.

  For each pair returned by split(word), the first character of the right
  part is replaced by each of the 26 lowercase ASCII letters in turn. The
  substituted letter may equal the original character, so `word` itself can
  appear in the result; duplicates are not removed.
  """
  alphabet = 'abcdefghijklmnopqrstuvwxyz'
  return [
    head + letter + tail[1:]
    for head, tail in split(word)
    for letter in alphabet
  ]

len(replace('thes'))

104

3.4. Insert

'lve' -> 'love'

In [None]:
def insert(word):
  """Return the strings obtained by adding one lowercase letter to `word`.

  A letter is inserted at every position from 0 to len(word) inclusive, i.e.
  before each character and also after the last one. The result therefore
  has 26 * (len(word) + 1) entries; duplicates are not removed.
  """
  characters='abcdefghijklmnopqrstuvwxyz'
  output = []

  # Go one past the last index so that a letter can also be appended at the
  # end of the word (needed for e.g. 'lov' -> 'love'). Visiting only positions
  # that have a character to their right would miss that case.
  for i in range(len(word) + 1):
    for char in characters:
      output.append(word[:i] + char + word[i:])
  return output


len(insert('lve'))

78

**4. Finding the Prediction**

4.1. Combining Possible Words

In [None]:
def edit(word):
  """Return the distinct strings produced from `word` by one edit operation.

  Combines the results of swap, delete, replace and insert and removes
  duplicates. The result comes from a set, so its order is unspecified.
  """
  candidates = set()
  for operation in (swap, delete, replace, insert):
    candidates.update(operation(word))
  return list(candidates)

len(edit('loave'))

261

4.2. Predicting the Word

In [None]:
def spell_check(word, count = 5):
  """Suggest up to `count` known words that are one edit away from `word`.

  Candidates come from edit(word); only those present in `word_probability`
  are kept, ranked by probability in descending order.

  Returns a list of NumPy object arrays, each holding [word, probability].
  """
  known = [
    [candidate, word_probability[candidate]]
    for candidate in edit(word)
    if candidate in word_probability
  ]

  ranked = pd.DataFrame(known, columns = ['word','prob']).sort_values(by = 'prob',ascending = False)
  return list(ranked.head(count).values)

In [None]:
spell_check('live', count=2)

[array(['like', 0.0009681019375484611], dtype=object),
 array(['life', 0.0007780671127704298], dtype=object)]

**5. Finding the Prediction 2.0**

5.1. Combining Possible Words (First- and Second-Level Edits)

In [None]:
def spell_check_2(word, count=5):
  """Suggest up to `count` known words within two edits of `word`.

  Candidates are edit(word) plus edit(e) for every e in edit(word); only
  those present in `word_probability` are kept, ranked by probability in
  descending order.

  Returns a list of NumPy object arrays, each holding [word, probability].
  """
  first_level = edit(word)

  # Start from the one-edit candidates, then add everything one further edit
  # away from each of them; the set discards the many duplicates.
  candidates = set(first_level)
  for near_miss in first_level:
    candidates.update(edit(near_miss))

  known = [
    [candidate, word_probability[candidate]]
    for candidate in list(candidates)
    if candidate in word_probability
  ]

  ranked = pd.DataFrame(known, columns = ['word','prob']).sort_values(by = 'prob',ascending = False)
  return list(ranked.head(count).values)

spell_check_2('loave')

[array(['have', 0.0031310926554229397], dtype=object),
 array(['love', 0.000433853090530977], dtype=object),
 array(['gave', 0.0003962046818485369], dtype=object),
 array(['leave', 0.0002689172048745725], dtype=object),
 array(['late', 0.00014700807199809966], dtype=object)]

In [None]:
print(spell_check('loave'),'\n',spell_check_2('loave'))

[array(['love', 0.000433853090530977], dtype=object), array(['leave', 0.0002689172048745725], dtype=object)] 
 [array(['have', 0.0031310926554229397], dtype=object), array(['love', 0.000433853090530977], dtype=object), array(['gave', 0.0003962046818485369], dtype=object), array(['leave', 0.0002689172048745725], dtype=object), array(['late', 0.00014700807199809966], dtype=object)]


In [None]:
spell_check_2('reght')

[array(['right', 0.0006364373848698217], dtype=object),
 array(['might', 0.0004804654060425696], dtype=object),
 array(['night', 0.00034421402223945287], dtype=object),
 array(['light', 0.000248300219167522], dtype=object),
 array(['rest', 0.0001846564806805398], dtype=object)]