# Import Necessary packages

In [70]:
import string
import re
import numpy as np
from collections import Counter

# 1: Reading all words and for this we need vocabulary dictionary

In [71]:
def read_corpus(filename):
    with open(filename,'r',encoding='utf-8') as file:
        lines = file.readlines()
        
        words = []
        for word in lines:
            words += re.findall(r'\w+',word.lower())
    return words

# invoke this function
corpus = read_corpus(r'big.txt')

In [72]:
len(corpus)

222663

# create our vocabulary for unique words

In [73]:
vocab = set(corpus)
len(vocab)

17647

# To see how many times words appear in our corpus (text)  (test)

In [74]:
words_count = Counter(corpus)
words_count

Counter({'the': 14703,
         'project': 91,
         'gutenberg': 94,
         'ebook': 10,
         'of': 6742,
         'moby': 90,
         'dick': 90,
         'or': 797,
         'whale': 1230,
         'by': 1222,
         'herman': 4,
         'melville': 4,
         'this': 1439,
         'is': 1751,
         'for': 1644,
         'use': 49,
         'anyone': 6,
         'anywhere': 16,
         'at': 1335,
         'no': 594,
         'cost': 4,
         'and': 6517,
         'with': 1769,
         'almost': 197,
         'restrictions': 2,
         'whatsoever': 7,
         'you': 958,
         'may': 255,
         'copy': 19,
         'it': 2534,
         'give': 90,
         'away': 186,
         're': 18,
         'under': 126,
         'terms': 33,
         'license': 18,
         'included': 14,
         'online': 4,
         'www': 6,
         'org': 13,
         'title': 8,
         'author': 9,
         'release': 1,
         'date': 4,
         'december': 5,
   

# Interesting Part
# Calculate Word probability
 first we have to find out t.no of words once again
 
 1: P(word) = count(word) / N

In [75]:
total_words_count = float(sum(words_count.values()))

In [76]:
word_probabs = {word:words_count[word] / total_words_count for word in words_count.keys()}

In [77]:
word_probabs['the']

0.06603252448767868

# Autocorrect operations

# 1 Split Operation:
   first split word into two components                                 
   [('', 'hello'), ('h', 'ello'), ('he', 'llo'), ('hel', 'lo'), ('hell', 'o'), ('hello', '')]

In [78]:
def split(word): # why
    return [ (word[:i], word[i:])  for i in range(len(word) + 1)]

In [79]:
print(split('why'))

[('', 'why'), ('w', 'hy'), ('wh', 'y'), ('why', '')]


# 2 Delete Operation:

In [80]:
def delete(word):
    return [left + right[1:] for left,right in split(word) if right]

In [81]:
print(delete('why'))

['hy', 'wy', 'wh']


# 3 Swap Operation:

In [82]:
def swap(word):
    return [left + right[1] + right[0] + right[2:] for left,right in split(word) if len(right) > 1 ]

In [83]:
print(swap('why')) #hy == yh

['hwy', 'wyh']


# 4 replace Operation:
The string.ascii_lowercase is equivalent to the string "abcdefghijklmnopqrstuvwxyz"

In [84]:
def replace(word): # abcdef...z
    return [left + center + right[1:] for left, right in split(word) if right for center in string.ascii_lowercase]

In [85]:
print(replace('why'))

['ahy', 'bhy', 'chy', 'dhy', 'ehy', 'fhy', 'ghy', 'hhy', 'ihy', 'jhy', 'khy', 'lhy', 'mhy', 'nhy', 'ohy', 'phy', 'qhy', 'rhy', 'shy', 'thy', 'uhy', 'vhy', 'why', 'xhy', 'yhy', 'zhy', 'way', 'wby', 'wcy', 'wdy', 'wey', 'wfy', 'wgy', 'why', 'wiy', 'wjy', 'wky', 'wly', 'wmy', 'wny', 'woy', 'wpy', 'wqy', 'wry', 'wsy', 'wty', 'wuy', 'wvy', 'wwy', 'wxy', 'wyy', 'wzy', 'wha', 'whb', 'whc', 'whd', 'whe', 'whf', 'whg', 'whh', 'whi', 'whj', 'whk', 'whl', 'whm', 'whn', 'who', 'whp', 'whq', 'whr', 'whs', 'wht', 'whu', 'whv', 'whw', 'whx', 'why', 'whz']


# 5 Insert Opearations

In [86]:
def insert(word): # abcdef...z
    return [left + center + right[1:] for left, right in split(word) for center in string.ascii_lowercase]

In [87]:
print(replace('love'))

['aove', 'bove', 'cove', 'dove', 'eove', 'fove', 'gove', 'hove', 'iove', 'jove', 'kove', 'love', 'move', 'nove', 'oove', 'pove', 'qove', 'rove', 'sove', 'tove', 'uove', 'vove', 'wove', 'xove', 'yove', 'zove', 'lave', 'lbve', 'lcve', 'ldve', 'leve', 'lfve', 'lgve', 'lhve', 'live', 'ljve', 'lkve', 'llve', 'lmve', 'lnve', 'love', 'lpve', 'lqve', 'lrve', 'lsve', 'ltve', 'luve', 'lvve', 'lwve', 'lxve', 'lyve', 'lzve', 'loae', 'lobe', 'loce', 'lode', 'loee', 'lofe', 'loge', 'lohe', 'loie', 'loje', 'loke', 'lole', 'lome', 'lone', 'looe', 'lope', 'loqe', 'lore', 'lose', 'lote', 'loue', 'love', 'lowe', 'loxe', 'loye', 'loze', 'lova', 'lovb', 'lovc', 'lovd', 'love', 'lovf', 'lovg', 'lovh', 'lovi', 'lovj', 'lovk', 'lovl', 'lovm', 'lovn', 'lovo', 'lovp', 'lovq', 'lovr', 'lovs', 'lovt', 'lovu', 'lovv', 'lovw', 'lovx', 'lovy', 'lovz']


# find minimum distance
five edit for misspelled word or candidates

The "level_one_edits" function in this code snippet is a Python function that takes a single argument "word". The purpose of this function is to generate a set of all possible "candidate" words that can be obtained by applying four types of "edit" operations to the input "word". These operations are:

Delete: Remove one character from the input word
Swap: Swap adjacent pairs of characters in the input word
Replace: Replace one character in the input word with a different lowercase letter
Insert: Insert one lowercase letter at any position in the input word

In [97]:
def level_one_edits(word):
    return set((delete(word) + swap(word) + replace(word) + insert(word)))

In [99]:
print(level_one_edits('load'))

{'noad', 'lyad', 'loak', 'load', 'loadr', 'lvad', 'loadz', 'olad', 'koad', 'lowd', 'loaf', 'lbad', 'liad', 'lofd', 'loadl', 'aoad', 'loag', 'loadg', 'toad', 'loadq', 'lord', 'lozd', 'loaw', 'goad', 'laad', 'loada', 'loat', 'loaa', 'loadd', 'loid', 'lhad', 'loadk', 'loae', 'loado', 'loac', 'loav', 'loap', 'qoad', 'logd', 'lopd', 'llad', 'loab', 'boad', 'lobd', 'zoad', 'lod', 'loadf', 'loal', 'loadp', 'laod', 'locd', 'lold', 'joad', 'ltad', 'lood', 'loadv', 'lokd', 'loadc', 'loda', 'eoad', 'lohd', 'hoad', 'lond', 'loao', 'woad', 'uoad', 'loas', 'loxd', 'loadu', 'loay', 'lnad', 'lomd', 'loqd', 'loaj', 'loadm', 'lfad', 'road', 'lad', 'soad', 'lsad', 'lovd', 'loai', 'ioad', 'yoad', 'loadh', 'loadn', 'loaz', 'loade', 'loads', 'ooad', 'loadx', 'loax', 'loah', 'oad', 'loadj', 'loady', 'foad', 'coad', 'doad', 'loud', 'loadw', 'ldad', 'loadt', 'loar', 'lpad', 'lodd', 'losd', 'loaq', 'lqad', 'luad', 'loan', 'lmad', 'lwad', 'lxad', 'lcad', 'loyd', 'lgad', 'moad', 'xoad', 'lkad', 'ljad', 'poad', 'l

# Gets all cands that we want to find out

In [101]:
def level_two_edits(word):
    return set(e2  for e1 in level_one_edits(word) for e2 in level_one_edits(e1))

In [104]:
print(level_two_edits('cut'))

{'czh', 'cutww', 'sutg', 'czl', 'cztk', 'cutqe', 'cuiz', 'citq', 'ucm', 'csi', 'hun', 'cutqt', 'ctte', 'pu', 'cis', 'zutg', 'cupo', 'quto', 'wun', 'auu', 'cvtx', 'cutlu', 'cutov', 'cdt', 'cdq', 'cdtf', 'catn', 'cxp', 'yun', 'curw', 'ptt', 'nuz', 'mue', 'catz', 'cuap', 'ceti', 'tmt', 'muth', 'ykt', 'cutfn', 'cktn', 'cutue', 'cus', 'pua', 'cutwi', 'puz', 'euu', 'cukz', 'cujz', 'yutm', 'rtt', 'axt', 'cutir', 'cupt', 'out', 'gutc', 'cft', 'cdtl', 'cuah', 'wutl', 'cucd', 'cbtx', 'cwq', 'fbt', 'vuta', 'vmt', 'crtu', 'cuqc', 'eit', 'cdo', 'cutem', 'cgi', 'cvta', 'vutf', 'qug', 'cutlm', 'cog', 'wutd', 'cutgr', 'crx', 'fqt', 'lutw', 'cutmf', 'tun', 'hutt', 'tjt', 'cuie', 'cnd', 'cuwc', 'cuag', 'wqt', 'cutmr', 'ium', 'cttx', 'wutm', 'iutc', 'kutd', 'cucn', 'cutyd', 'cip', 'cutew', 'xuw', 'aun', 'cuiu', 'zute', 'cutge', 'dutr', 'cuhu', 'ert', 'cutft', 'zutn', 'cftv', 'cctr', 'cutrn', 'uur', 'cutzu', 'bdt', 'emt', 'cpx', 'ceb', 'wkt', 'cctf', 'cvth', 'cutui', 'cmh', 'suf', 'quz', 'coq', 'eui', 'hw

# Autocorrect Search Bar

In [111]:
def correct_spelling(word,vocab,word_probabs):
    if word in vocab:
        print(f"{word} is already correctly spelled")
        return 
    #getting all suggesions
    suggestions = level_one_edits(word) or level_two_edits(word) or [word]
    best_guesses = [w for w in suggestions if w in vocab]
    return [(w, word_probabs[w]) for w in best_guesses]

In [119]:
search_word = "laed"
guess = correct_spelling(search_word,vocab,word_probabs)
print(guess)

[('lard', 1.347327575753493e-05), ('laid', 9.880402222192281e-05), ('led', 4.4910919191783095e-05), ('lad', 0.00010778620606027944), ('land', 0.00035928735353426476), ('lead', 8.083965454520958e-05)]
