In [1]:
import re
import string
from collections import Counter
import numpy as np

In [2]:
def read_corpus(filename):
    with open(filename,"r") as file:
        lines = file.readlines()
        words = []
        
        for line in lines:
            words += re.findall(r'\w+', line.lower())
            
    return words

In [3]:
words = read_corpus(".\hamlet.txt")

In [4]:
print(len(words))

32884


In [5]:
vocabs = set(words)
print(len(vocabs))

4606


In [6]:
words_count = Counter(words)
print(words_count["fire"])

12


In [7]:
total_word_count = float(sum(words_count.values()))
word_prob = {word: words_count[word] / total_word_count for word in words_count.keys()}

In [8]:
print(word_prob["fire"])

0.0003649191095973726


In [9]:
def split(word):
    return[(word[:i], word[i:]) for i in range(len(word)+1)]

In [10]:
print(split("grammar"))

[('', 'grammar'), ('g', 'rammar'), ('gr', 'ammar'), ('gra', 'mmar'), ('gram', 'mar'), ('gramm', 'ar'), ('gramma', 'r'), ('grammar', '')]


In [11]:
def delete(word):
    return [l + r[1:] for l,r in split(word) if r]

In [12]:
print(delete("grammar"))

['rammar', 'gammar', 'grmmar', 'gramar', 'gramar', 'grammr', 'gramma']


In [13]:
def swap(word):
    return [l + r[1] + r[0] + r[2:] for l,r in split(word) if len(r)>1]

In [14]:
print(swap("grammar"))

['rgammar', 'garmmar', 'grmamar', 'grammar', 'gramamr', 'grammra']


In [15]:
def replace(word):
    letters = string.ascii_lowercase
    return [l + c + r[1:] for l,r in split(word) if r for c in letters]

In [16]:
print(replace("grammar"))

['arammar', 'brammar', 'crammar', 'drammar', 'erammar', 'frammar', 'grammar', 'hrammar', 'irammar', 'jrammar', 'krammar', 'lrammar', 'mrammar', 'nrammar', 'orammar', 'prammar', 'qrammar', 'rrammar', 'srammar', 'trammar', 'urammar', 'vrammar', 'wrammar', 'xrammar', 'yrammar', 'zrammar', 'gaammar', 'gbammar', 'gcammar', 'gdammar', 'geammar', 'gfammar', 'ggammar', 'ghammar', 'giammar', 'gjammar', 'gkammar', 'glammar', 'gmammar', 'gnammar', 'goammar', 'gpammar', 'gqammar', 'grammar', 'gsammar', 'gtammar', 'guammar', 'gvammar', 'gwammar', 'gxammar', 'gyammar', 'gzammar', 'grammar', 'grbmmar', 'grcmmar', 'grdmmar', 'gremmar', 'grfmmar', 'grgmmar', 'grhmmar', 'grimmar', 'grjmmar', 'grkmmar', 'grlmmar', 'grmmmar', 'grnmmar', 'grommar', 'grpmmar', 'grqmmar', 'grrmmar', 'grsmmar', 'grtmmar', 'grummar', 'grvmmar', 'grwmmar', 'grxmmar', 'grymmar', 'grzmmar', 'graamar', 'grabmar', 'gracmar', 'gradmar', 'graemar', 'grafmar', 'gragmar', 'grahmar', 'graimar', 'grajmar', 'grakmar', 'gralmar', 'grammar'

In [17]:
def insert(word):
    letters = string.ascii_lowercase
    return [l + c + r for l,r in split(word) for c in letters]

In [18]:
print(insert("grammar"))

['agrammar', 'bgrammar', 'cgrammar', 'dgrammar', 'egrammar', 'fgrammar', 'ggrammar', 'hgrammar', 'igrammar', 'jgrammar', 'kgrammar', 'lgrammar', 'mgrammar', 'ngrammar', 'ogrammar', 'pgrammar', 'qgrammar', 'rgrammar', 'sgrammar', 'tgrammar', 'ugrammar', 'vgrammar', 'wgrammar', 'xgrammar', 'ygrammar', 'zgrammar', 'garammar', 'gbrammar', 'gcrammar', 'gdrammar', 'gerammar', 'gframmar', 'ggrammar', 'ghrammar', 'girammar', 'gjrammar', 'gkrammar', 'glrammar', 'gmrammar', 'gnrammar', 'gorammar', 'gprammar', 'gqrammar', 'grrammar', 'gsrammar', 'gtrammar', 'gurammar', 'gvrammar', 'gwrammar', 'gxrammar', 'gyrammar', 'gzrammar', 'graammar', 'grbammar', 'grcammar', 'grdammar', 'greammar', 'grfammar', 'grgammar', 'grhammar', 'griammar', 'grjammar', 'grkammar', 'grlammar', 'grmammar', 'grnammar', 'groammar', 'grpammar', 'grqammar', 'grrammar', 'grsammar', 'grtammar', 'gruammar', 'grvammar', 'grwammar', 'grxammar', 'gryammar', 'grzammar', 'graammar', 'grabmmar', 'gracmmar', 'gradmmar', 'graemmar', 'gr

In [19]:
def level_one_edits(word):
    return set(delete(word) + swap(word) + replace(word) + insert(word))

In [20]:
print(level_one_edits("grammar"))

{'grammarp', 'grlmmar', 'grammagr', 'grawmmar', 'grahmmar', 'gramqar', 'graimar', 'gramyar', 'gramgmar', 'grammpar', 'grammabr', 'grammmar', 'grcmmar', 'grammrar', 'ggammar', 'grammlar', 'gtrammar', 'grammavr', 'grammark', 'grxammar', 'grvmmar', 'gramuar', 'grajmmar', 'gramzar', 'gravmmar', 'gnammar', 'grasmar', 'gpammar', 'erammar', 'grammasr', 'wrammar', 'grmamar', 'gramcar', 'grammahr', 'grammay', 'grammdar', 'gramamr', 'grwmmar', 'xrammar', 'grammfar', 'grammary', 'gramwar', 'gbrammar', 'grqmmar', 'grdmmar', 'grammari', 'zgrammar', 'grammuar', 'grammpr', 'grammkr', 'grammad', 'grsmmar', 'grapmmar', 'grmmar', 'glrammar', 'ghrammar', 'gramoar', 'gramhar', 'grammacr', 'grammarc', 'grammoar', 'gramxar', 'guammar', 'gramzmar', 'grammanr', 'kgrammar', 'grimmar', 'xgrammar', 'gramrmar', 'grammhr', 'grawmar', 'crammar', 'graamar', 'fgrammar', 'ngrammar', 'grammarm', 'grammafr', 'grabmar', 'garammar', 'srammar', 'grammarj', 'gramjar', 'gramomar', 'grammakr', 'grammgr', 'grammnar', 'grampar'

In [21]:
def level_two_edits(word):
    return set(e2 for e1 in level_one_edits(word) for e2 in level_one_edits(e1))

In [23]:
print(level_two_edits("grammar"))

{'grampmcar', 'xgrqammar', 'zgragmmar', 'kgrazmmar', 'griaummar', 'ygurammar', 'gramgmah', 'grzmmoar', 'gkriammar', 'greammpar', 'grajmmabr', 'orammafr', 'grabmas', 'grgdammar', 'grlaommar', 'grafqmmar', 'ggrammap', 'gzymmar', 'gsrammal', 'grarmart', 'gramymgar', 'gyrammad', 'gvammab', 'wfgrammar', 'gvravmar', 'gramkaer', 'gfammtar', 'gravfmmar', 'grbammat', 'gramfmwar', 'grsmmpar', 'agrammyar', 'grammatar', 'grammmacr', 'grlmmaa', 'girammkar', 'grmmcar', 'grammwamr', 'egrammkar', 'graemtmar', 'gramemcar', 'dgralmar', 'graqmmvar', 'hgrammuar', 'grazmmkr', 'grvmmac', 'gjrymmar', 'giamzar', 'grpmhar', 'grammrru', 'gramjkar', 'gratmakr', 'grpqmar', 'grkaxmmar', 'grakmajr', 'gxaxmar', 'gvammlar', 'gwammau', 'grammuard', 'gjrammai', 'gxramamar', 'grhasmar', 'gzammvr', 'gyawmmar', 'grcammir', 'grammwjr', 'rgrammiar', 'dramsmar', 'tgrammlar', 'grommiar', 'grammkarq', 'gmammadr', 'fjrammar', 'gprammzr', 'gractar', 'gramiaor', 'gfamyar', 'gramrak', 'grhmmbar', 'gkrammarp', 'gcammah', 'rgammgr',

In [25]:
def correct_spelling(word, vocabulary, word_probabilities):
    if word in vocabulary:
        print("correctly spelled")
        return
    else:
        suggestion = level_one_edits(word) or level_two_edits(word) or [word]
        best_guess = [w for w in suggestion if w in vocabulary]
        return [(w, word_probabilities[w]) for w in best_guess]

In [26]:
word = "ford"
guess = correct_spelling(word, vocabs, word_prob)

print(guess)

[('for', 0.007663301301544824), ('forg', 3.040992579978105e-05), ('fort', 0.0001824595547986863), ('lord', 0.006872643230750517), ('forc', 6.08198515995621e-05), ('word', 0.0003649191095973726), ('form', 0.0003953290353971536), ('fore', 3.040992579978105e-05), ('food', 6.08198515995621e-05), ('fordo', 3.040992579978105e-05), ('fond', 3.040992579978105e-05)]


In [27]:
class SpellChecker(object):
    
    def __init__(self, corpus_file_path):
        with open(corpus_file_path, "r") as file:
            lines = file.readlines()
            words=[]
            
            for line in lines:
                words += re.findall(r'\w+', line.lower())
                
            self.vocabs= set(words)
            self.word_counts = Counter(words)
            total_word_count = float(sum(words_count.values()))
            self.word_prob = {word: self.word_counts[word] / total_word_count for word in self.vocabs}
            
     
    def level_one_edits(self, word):
        letters = string.ascii_lowercase
        splits = [(word[:i], word[i:]) for i in range(len(word)+1)]
        deletes = [l + r[1:] for l,r in split(word) if r]
        swaps = [l + r[1] + r[0] + r[2:] for l,r in split(word) if len(r)>1]
        replaces = [l + c + r[1:] for l,r in split(word) if r for c in letters]
        inserts = [l + c + r for l,r in split(word) for c in letters]
        
        return set(deletes + swaps + replaces + inserts)
    
    
    def level_two_edits(self, word):
        return set((e2 for e1 in level_one_edits(word) for e2 in level_one_edits(e1)))
    
    def check(self, word):
        if word in self.vocabs:
            print("Correctly Spelled")
            return
        print("Incorrectly Spelled. Suggestions:")
        candidates = self.level_one_edits(word) or self.level_two_edits(word) or [word]
        valid_candidates = [w for w in candidates if w in self.vocabs]
        return sorted([(c) for c in valid_candidates], key=lambda tup:tup[1], reverse=True)

In [28]:
checker= SpellChecker(".\hamlet.txt")

In [31]:
m_word= "grammar"
print(checker.check(m_word))

Incorrectly Spelled. Suggestions:
[]
