#### Read the corpus file and count the occurrences of each word

In [22]:
import re
from collections import Counter


# Read the whole corpus. The context manager closes the file handle as soon as
# the text has been read instead of leaving that to the garbage collector.
with open("corpus.txt") as corpus_file:
    corpus_text = corpus_file.read()

# WORDS maps every lowercase alphanumeric token to the number of times it
# occurs in the corpus.
WORDS = Counter(re.findall('[a-zA-Z0-9]+', corpus_text.lower()))

### How it works
- Deletion : Removes one letter
- Transposition : Swaps two adjacent letters
- Replacement : Changes one letter to another
- Insertion : Adds a letter

In [26]:
def edits_at_one_distance(word):
    """Return the set of every string that is one simple edit away from ``word``.

    A simple edit is a deletion of one character, a transposition of two
    adjacent characters, a replacement of one character by a lowercase
    letter, or an insertion of a lowercase letter at any position.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    results = set()

    # Walk over every split point, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # INSERTION: possible at every split point, even at the very end.
        for ch in letters:
            results.add(head + ch + tail)

        # The remaining edits need at least one character after the split.
        if not tail:
            continue

        # DELETION: drop the first character of the tail.
        results.add(head + tail[1:])

        # REPLACEMENT: swap the first character of the tail for each letter.
        for ch in letters:
            results.add(head + ch + tail[1:])

        # TRANSPOSITION: needs two characters after the split.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])

    return results

def edits_at_two_distance(word):
    """Lazily yield every string reachable from ``word`` by two simple edits.

    The result is a generator and may contain duplicates; each first-level
    edit is expanded only when the generator is consumed.
    """
    first_level = edits_at_one_distance(word)
    return (
        second_edit
        for first_edit in first_level
        for second_edit in edits_at_one_distance(first_edit)
    )

### Cleaning up the word
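- Length Reduction: Runs of three or more identical letters are shortened to two (e.g. amazingggg -> amazingg); the edit-distance search then finishes the correction (amazingg -> amazing).
- Probability Method: Since the correct spelling cannot be determined with 100% certainty, even with a huge dataset, probabilities are used. Hence the candidate word with the maximum probability is returned.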


In [24]:
#LENGTH REDUCTION
def reduce_length(word):
    """Shorten every run of three or more identical characters to two.

    For example ``"amazinggg"`` becomes ``"amazingg"``; runs of one or two
    characters are left untouched.
    """
    repeated_run = re.compile(r'(.)\1{2,}')
    return repeated_run.sub(r'\1\1', word)

#PROBABILITY METHOD
def probability_of_word(word, N=sum(WORDS.values())):
    """Return the relative frequency of ``word`` in the corpus.

    ``N`` is the total number of counted tokens. Its default is computed once,
    when this function is defined, from the ``WORDS`` counter.
    """
    occurrences = WORDS[word]
    return occurrences / N

#RETURNS CORRECTED WORD
def correction(word):
    """Return the most probable spelling correction for ``word``.

    The word is lowercased, runs of repeated letters are shortened, and the
    candidate with the highest corpus probability is returned. If no known
    word is within two edits, the normalised word itself is returned.
    """
    # The corpus is lowercased before counting, so WORDS has only lowercase
    # keys. Normalise the query the same way, otherwise input containing
    # capital letters can never match a corpus word directly.
    word = reduce_length(word.lower())
    return max(candidates(word), key=probability_of_word)

def candidates(word):
    """Return the possible corrections for ``word``.

    Preference order: the word itself if it is known, then known words one
    edit away, then known words two edits away. If none of these exist, the
    word is returned unchanged inside a one-element list.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits_at_one_distance(word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits_at_two_distance(word))
    if two_edits_away:
        return two_edits_away

    return [word]

def known(words):
    """Return the subset of ``words`` that occur in the ``WORDS`` counter, as a set."""
    return {candidate for candidate in words if candidate in WORDS}

In [25]:
# Try the corrector on a few sample inputs, printing one result per line.
for sample in ("pattern", "amaziiingggggggg", "speling", "currected", "currect"):
    print(correction(sample))

pattern
amazing
spelling
corrected
current
