In [1]:
import numpy as np
from collections import Counter
import re
import string

In [2]:
#we will find 
#probability of word in data
#split word
#delete one char in word
#swap two adjacent char
#replace char in word from a-z
#insert char in word form a-z

In [3]:
class SpellChecker(object):
    def __init__(self, corpus_file_path):
        with open(corpus_file_path, "r") as file:
            lines = file.readlines()
            self.total_words = []
            for line in lines:
                self.total_words += re.findall(r'\w+', line.lower())
        self.total_words=[word.strip('_') for word in self.total_words]
        self.unique_words=set(self.total_words)
        self.word_counts_dic = Counter(self.total_words)
        self.prob={word: self.word_counts_dic[word] / len(self.total_words) for word in self.unique_words}
        
    def edit_distance_1(self,word):
        str=string.ascii_lowercase
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [l + r[1:] for l,r in splits if r]
        swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r)>1]
        replaces = [l + c + r[1:] for l, r in splits if r for c in str]
        inserts = [l + c + r for l, r in splits for c in str] 
        return set(deletes + swaps + replaces + inserts)

    def edit_distance_2(self,word):
        return set(e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_2(e1))
    
    def spell_check(self,word):
        suggestions = self.edit_distance_1(word) or self.edit_distance_2(word) or [word]
        valid_suggestions=[word for word in suggestions if word in self.unique_words] 
        return sorted([(c,self.prob[c]) for c in valid_suggestions],key=lambda tup:tup[1],reverse=True) #sort in decreasing order on base of probability 

In [4]:
S=SpellChecker("data.txt")

choice = None
while choice != "3":
    print("\n************** Spell Checker **************")
    print("1. Check valid Spell or not")
    print("2. Display Dictionary")
    print("3. Exit")
    choice = input("Enter your choice: ")
    if choice == "1":
        word = input("Enter word: ")
        print(S.spell_check(word))
    elif choice == "2":
        print(f"There are Total : {len(S.unique_words)} word in dictionary\n\n")
        print(S.unique_words)
    elif choice == "3":
        print("Exiting Spell Checker System. Goodbye!\n")
    else:
        print("Invalid choice. Please try again.")


************** Spell Checker **************
1. Check valid Spell or not
2. Display Dictionary
3. Exit
Enter your choice: 3
Exiting Spell Checker System. Goodbye!



In [5]:
# S.spell_check("detective")

In [6]:
len(S.total_words)

1115313

In [7]:
S.unique_words

{'',
 'eternally',
 'constrained',
 'manchuria',
 'analysis',
 'trustworthy',
 'ronde',
 'ointments',
 'champagne',
 'eton',
 'vegetarian',
 'cleverness',
 'lipping',
 'whatever',
 'gifts',
 'coerce',
 'ostensibly',
 'urban',
 'enticingly',
 'debtor',
 'assemblymen',
 'frontier',
 'subjection',
 'trigeminus',
 'sprawling',
 'compensated',
 'ingestion',
 'dozen',
 'nankeen',
 'curtains',
 '429',
 'consists',
 'explaining',
 'phagocytic',
 'borrowed',
 'reined',
 'pitting',
 'modeled',
 'fairies',
 'disintegration',
 'granges',
 '49',
 'display',
 'optional',
 'leeches',
 'reply',
 'aerial',
 'heavenly',
 '577ff',
 'laser',
 'clutches',
 'perpetrated',
 'piteously',
 'reinstated',
 'piteous',
 'flowering',
 'tr',
 'jokingly',
 'experimented',
 'sprained',
 'overseer',
 'bolkhovitinov',
 'repents',
 'firewood',
 'verruca',
 '172',
 'rapacious',
 'devices',
 'vacated',
 'nonreceipt',
 'sorrows',
 'trepoff',
 'gloves',
 'dustcoat',
 '1903',
 'puget',
 'georgia',
 'prefix',
 'impudently',
 '

# -------------------------------------------------------------------------------------------------------------

In [8]:
def extract_word(file_name):
    with open(file_name,"r") as file:
        lines=file.readlines() #list of line in data
        total_words=[]
        for line in lines:
            total_words+=re.findall(r'\w+', line.lower())
        total_words=[word.strip('_') for word in total_words]
    return total_words

In [9]:
total_words=extract_word("data.txt")
total_words=[word.strip('_') for word in total_words]
len(total_words)

1115313

In [10]:
unique_words=set(total_words) #unique words
len(unique_words)
unique_words

{'',
 'eternally',
 'constrained',
 'manchuria',
 'analysis',
 'trustworthy',
 'ronde',
 'ointments',
 'champagne',
 'eton',
 'vegetarian',
 'cleverness',
 'lipping',
 'whatever',
 'gifts',
 'coerce',
 'ostensibly',
 'urban',
 'enticingly',
 'debtor',
 'assemblymen',
 'frontier',
 'subjection',
 'trigeminus',
 'sprawling',
 'compensated',
 'ingestion',
 'dozen',
 'nankeen',
 'curtains',
 '429',
 'consists',
 'explaining',
 'phagocytic',
 'borrowed',
 'reined',
 'pitting',
 'modeled',
 'fairies',
 'disintegration',
 'granges',
 '49',
 'display',
 'optional',
 'leeches',
 'reply',
 'aerial',
 'heavenly',
 '577ff',
 'laser',
 'clutches',
 'perpetrated',
 'piteously',
 'reinstated',
 'piteous',
 'flowering',
 'tr',
 'jokingly',
 'experimented',
 'sprained',
 'overseer',
 'bolkhovitinov',
 'repents',
 'firewood',
 'verruca',
 '172',
 'rapacious',
 'devices',
 'vacated',
 'nonreceipt',
 'sorrows',
 'trepoff',
 'gloves',
 'dustcoat',
 '1903',
 'puget',
 'georgia',
 'prefix',
 'impudently',
 '

In [11]:
word_counts_dic=Counter(total_words)#dictionary which contain words to count map
word_counts_dic["india"]

26

# Probability of word in data

In [12]:
prob={word: word_counts_dic[word]/len(total_words) for word in word_counts_dic.keys()}
prob['india'] #prob is dictionray contain word to prob mapping

2.33118416085888e-05

# Split word

In [13]:
def split(word):#return list of tuples
    return [(word[:i],word[i:]) for i in range(len(word)+1)]

In [14]:
split("vatsal")

[('', 'vatsal'),
 ('v', 'atsal'),
 ('va', 'tsal'),
 ('vat', 'sal'),
 ('vats', 'al'),
 ('vatsa', 'l'),
 ('vatsal', '')]

# Delete one char in word

In [15]:
def delete(word):
    return [(l+r[1:]) for l,r in split(word) if len(r)>=1]

In [16]:
delete("vatsal")

['atsal', 'vtsal', 'vasal', 'vatal', 'vatsl', 'vatsa']

# Swap two adjacent char

In [17]:
def swap(word):
    return [l+r[1]+r[0]+r[2:] for l,r in split(word) if len(r)>1]

In [18]:
swap("vatsal")

['avtsal', 'vtasal', 'vastal', 'vatasl', 'vatsla']

# Replace char in 

In [19]:
str=string.ascii_lowercase

In [20]:
def replace(word):
    return [l+s+r[1:] for l,r in split(word) if len(r)>=1 for s in str]

In [21]:
replace("vatsal")

['aatsal',
 'batsal',
 'catsal',
 'datsal',
 'eatsal',
 'fatsal',
 'gatsal',
 'hatsal',
 'iatsal',
 'jatsal',
 'katsal',
 'latsal',
 'matsal',
 'natsal',
 'oatsal',
 'patsal',
 'qatsal',
 'ratsal',
 'satsal',
 'tatsal',
 'uatsal',
 'vatsal',
 'watsal',
 'xatsal',
 'yatsal',
 'zatsal',
 'vatsal',
 'vbtsal',
 'vctsal',
 'vdtsal',
 'vetsal',
 'vftsal',
 'vgtsal',
 'vhtsal',
 'vitsal',
 'vjtsal',
 'vktsal',
 'vltsal',
 'vmtsal',
 'vntsal',
 'votsal',
 'vptsal',
 'vqtsal',
 'vrtsal',
 'vstsal',
 'vttsal',
 'vutsal',
 'vvtsal',
 'vwtsal',
 'vxtsal',
 'vytsal',
 'vztsal',
 'vaasal',
 'vabsal',
 'vacsal',
 'vadsal',
 'vaesal',
 'vafsal',
 'vagsal',
 'vahsal',
 'vaisal',
 'vajsal',
 'vaksal',
 'valsal',
 'vamsal',
 'vansal',
 'vaosal',
 'vapsal',
 'vaqsal',
 'varsal',
 'vassal',
 'vatsal',
 'vausal',
 'vavsal',
 'vawsal',
 'vaxsal',
 'vaysal',
 'vazsal',
 'vataal',
 'vatbal',
 'vatcal',
 'vatdal',
 'vateal',
 'vatfal',
 'vatgal',
 'vathal',
 'vatial',
 'vatjal',
 'vatkal',
 'vatlal',
 'vatmal',

# Insert char in word 

In [22]:
def insert(word):
    return [l+s+r for l,r in split(word) if len(r)>=1 for s in str]

In [23]:
insert("vatsal")

['avatsal',
 'bvatsal',
 'cvatsal',
 'dvatsal',
 'evatsal',
 'fvatsal',
 'gvatsal',
 'hvatsal',
 'ivatsal',
 'jvatsal',
 'kvatsal',
 'lvatsal',
 'mvatsal',
 'nvatsal',
 'ovatsal',
 'pvatsal',
 'qvatsal',
 'rvatsal',
 'svatsal',
 'tvatsal',
 'uvatsal',
 'vvatsal',
 'wvatsal',
 'xvatsal',
 'yvatsal',
 'zvatsal',
 'vaatsal',
 'vbatsal',
 'vcatsal',
 'vdatsal',
 'veatsal',
 'vfatsal',
 'vgatsal',
 'vhatsal',
 'viatsal',
 'vjatsal',
 'vkatsal',
 'vlatsal',
 'vmatsal',
 'vnatsal',
 'voatsal',
 'vpatsal',
 'vqatsal',
 'vratsal',
 'vsatsal',
 'vtatsal',
 'vuatsal',
 'vvatsal',
 'vwatsal',
 'vxatsal',
 'vyatsal',
 'vzatsal',
 'vaatsal',
 'vabtsal',
 'vactsal',
 'vadtsal',
 'vaetsal',
 'vaftsal',
 'vagtsal',
 'vahtsal',
 'vaitsal',
 'vajtsal',
 'vaktsal',
 'valtsal',
 'vamtsal',
 'vantsal',
 'vaotsal',
 'vaptsal',
 'vaqtsal',
 'vartsal',
 'vastsal',
 'vattsal',
 'vautsal',
 'vavtsal',
 'vawtsal',
 'vaxtsal',
 'vaytsal',
 'vaztsal',
 'vatasal',
 'vatbsal',
 'vatcsal',
 'vatdsal',
 'vatesal',
 'va

# word after doing edit distance = 1 and 2

In [24]:
def edit_distance_1(word):
    return set(delete(word) + swap(word) + replace(word) + insert(word))

In [25]:
edit_distance_1("vatsal")

{'aatsal',
 'atsal',
 'avatsal',
 'avtsal',
 'batsal',
 'bvatsal',
 'catsal',
 'cvatsal',
 'datsal',
 'dvatsal',
 'eatsal',
 'evatsal',
 'fatsal',
 'fvatsal',
 'gatsal',
 'gvatsal',
 'hatsal',
 'hvatsal',
 'iatsal',
 'ivatsal',
 'jatsal',
 'jvatsal',
 'katsal',
 'kvatsal',
 'latsal',
 'lvatsal',
 'matsal',
 'mvatsal',
 'natsal',
 'nvatsal',
 'oatsal',
 'ovatsal',
 'patsal',
 'pvatsal',
 'qatsal',
 'qvatsal',
 'ratsal',
 'rvatsal',
 'satsal',
 'svatsal',
 'tatsal',
 'tvatsal',
 'uatsal',
 'uvatsal',
 'vaasal',
 'vaatsal',
 'vabsal',
 'vabtsal',
 'vacsal',
 'vactsal',
 'vadsal',
 'vadtsal',
 'vaesal',
 'vaetsal',
 'vafsal',
 'vaftsal',
 'vagsal',
 'vagtsal',
 'vahsal',
 'vahtsal',
 'vaisal',
 'vaitsal',
 'vajsal',
 'vajtsal',
 'vaksal',
 'vaktsal',
 'valsal',
 'valtsal',
 'vamsal',
 'vamtsal',
 'vansal',
 'vantsal',
 'vaosal',
 'vaotsal',
 'vapsal',
 'vaptsal',
 'vaqsal',
 'vaqtsal',
 'varsal',
 'vartsal',
 'vasal',
 'vassal',
 'vastal',
 'vastsal',
 'vataal',
 'vatal',
 'vatasal',
 'vat

In [26]:
def edit_distance_2(word):
    return set(e2 for e1 in edit_distance_1(word) for e2 in edit_distance_1(e1))

In [27]:
edit_distance_2("vatsal")

{'vatsjvl',
 'vattsval',
 'vatesasl',
 'lantsal',
 'vakatsal',
 'vawsanl',
 'vaigal',
 'vuhsal',
 'gatsral',
 'gazsal',
 'vsyatsal',
 'jatual',
 'hfatsal',
 'fvaksal',
 'vatpjl',
 'vxatual',
 'vatsvaz',
 'vatbsaul',
 'vagtoal',
 'vatdsacl',
 'vaqstsal',
 'vajsvl',
 'qvstsal',
 'vatusbal',
 'vaxsael',
 'xvatsail',
 'fatsol',
 'vwaisal',
 'vaptsaj',
 'vaistsal',
 'vamscl',
 'vatnil',
 'dvatsai',
 'oaztsal',
 'avatsual',
 'tatsai',
 'vxtsaq',
 'vatisab',
 'latsol',
 'vatfsax',
 'vqmtsal',
 'pvatsaq',
 'vbtsol',
 'vatmol',
 'vatyswal',
 'vastrl',
 'vsatsjal',
 'vatsbaxl',
 'vacsaul',
 'wvatsae',
 'vatdao',
 'vantusal',
 'vgatsahl',
 'vbazsal',
 'vltgsal',
 'vythal',
 'whtsal',
 'cvartsal',
 'vataso',
 'vaztsml',
 'vatasaql',
 'oatsral',
 'iapsal',
 'vatqsay',
 'xvatsgal',
 'zvatsbal',
 'tatssl',
 'vaesan',
 'vaytjsal',
 'vasgtsal',
 'vafspal',
 'vatsjb',
 'waosal',
 'xagtsal',
 'vatlval',
 'vrtslal',
 'vatzpl',
 'vabsaal',
 'vadwal',
 'vetsau',
 'vkftsal',
 'patbsal',
 'bvartsal',
 'valsai

In [28]:
def spell_checker(word,unique_words,prob):
    if word in unique_words:
        print(f"{word} is already correctly spelt")
        return
    suggestions=edit_distance_1(word) or edit_distance_2 or [word]
    valid_suggestions=[word for word in suggestions if word in total_words]
    return [(word,prob[word]) for word in valid_suggestions]

In [32]:
word="simpllicity"
corrections=spell_checker(word,unique_words,prob)

if corrections:
    probs=np.array([c[1] for c in corrections])
    best_ix=np.argmax(probs)
    correct=corrections[best_ix][0]
    print(f"correct word for {word} is \"{correct}\"")

correct word for simpllicity is "simplicity"
