In [1]:
import re
from pprint import pprint
from collections import Counter

# Alphabet from which candidate replacement/insertion letters are drawn.
letters = 'abcdefghijklmnopqrstuvwxyz'
N = 1024908267229  # Size of Google Web 1T Dataset

# Each line of count_1w.txt is split on TAB into a (word, count) pair.
# Read it inside a context manager so the file handle is closed as soon as
# the list is built instead of lingering until garbage collection.
with open('count_1w.txt', 'r') as count_file:
    word_count = [line.split('\t') for line in count_file]

# Unigram probability of every listed word: its count divided by N.
# float() tolerates the trailing newline left on each count field.
Pdist = {word: float(count) / N for word, count in word_count}

In [2]:
def Pw(word):
    """Return the unigram probability of `word` according to `Pdist`.

    Words missing from `Pdist` do not score zero: they receive the fallback
    estimate ``10 / 10**len(word) / N``, which shrinks by a factor of ten for
    every additional character.
    """
    if word not in Pdist:
        # Unseen word: length-penalised floor instead of a hard zero.
        return 10. / 10**len(word) / N
    return Pdist[word]

In [3]:
def words(text):
    """Split `text` into a list of lower-cased word tokens.

    A token is any maximal run of ``\\w`` characters; everything else
    (spaces, punctuation) acts as a separator and is discarded.
    """
    lowered = text.lower()
    tokens = re.findall(r'\w+', lowered)
    return tokens

# Token frequencies of the big.txt corpus. The file is read inside a context
# manager so its handle is closed immediately after the text is loaded.
with open('big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word` in the `WORDS` counter.

    `N` defaults to the total number of tokens counted in `WORDS`, computed
    once when this function is defined. Words absent from `WORDS` have a
    count of zero and therefore a probability of zero.
    """
    occurrences = WORDS[word]
    return occurrences / N

In [260]:
# Using Pw(word) or P(word) may result in different answers
def next_states(state):
    """Expand one search state by consuming the next unread character.

    A state is a tuple ``(L, R, edit, prob)``:
      * ``L``    - text produced so far,
      * ``R``    - input characters not yet consumed,
      * ``edit`` - number of edits applied so far,
      * ``prob`` - probability of the candidate word ``L + R``.

    Returns the list of successor states. While fewer than two edits have
    been used, the successors are: keep the next character, delete it,
    replace it with each letter in `letters`, or insert each letter right
    after it. Every edited successor costs one more edit and is rescored
    with ``P`` on the full candidate. Once ``edit == 2`` only the "keep"
    move is produced. A state whose ``R`` is empty is returned unchanged.

    Note: insertions are always placed *after* the consumed character, so a
    letter can never be inserted before the first character of the word.
    """
    L, R, edit, prob = state
    if not R:
        # Nothing left to consume: the state is terminal. Pass it through
        # rather than raising IndexError on R[0].
        return [state]

    R0, R1 = R[0], R[1:]
    keep = (L + R0, R1, edit, prob)
    if edit == 2:
        # Edit budget exhausted: the remaining characters are copied as-is.
        return [keep]

    cost = edit + 1
    successors = [keep, (L, R1, cost, P(L + R1))]
    # Replacing R0 with itself repeats the "keep" word at a higher edit
    # count; it is emitted here exactly as before and left for the caller.
    successors += [(L + c, R1, cost, P(L + c + R1)) for c in letters]
    successors += [(L + R0 + c, R1, cost, P(L + R0 + c + R1)) for c in letters]
    return successors

In [281]:
# Using Pw(word) or P(word) may result in different answers
def correction(word, max_beam=550, top_n=3):
    """Return the most probable spelling candidates for `word`.

    Runs a beam search over edit states produced by `next_states`, consuming
    one character of `word` per step.

    Parameters:
        word:     the (possibly misspelled) input string.
        max_beam: maximum number of states kept after each step.
        top_n:    maximum number of candidates returned.

    Returns:
        A list of at most `top_n` states ``(candidate, '', edit, prob)``
        ordered by decreasing probability. The unedited input (``edit == 0``)
        is always eligible; edited candidates must score above the
        unseen-word threshold.
    """
    # Pw() gives unseen words a small non-zero probability, so "prob > 0"
    # cannot separate known from unknown words; this floor is used instead.
    # With P() unseen words score exactly 0 and fail the same test.
    # NOTE: the floor uses the length of the *input* word, whereas Pw() uses
    # the candidate's length, so a shorter unseen candidate scored by Pw()
    # would exceed it.
    threshold = 10. / 10**len(word) / N

    beam = [('', word, 0, P(word))]
    for _ in range(len(word)):
        # Flatten the successors of every state in the beam, in beam order.
        expanded = [nxt for state in beam for nxt in next_states(state)]

        # Keep one state per candidate word: the first one seen with the
        # lowest edit count.
        best_by_word = {}
        for state in expanded:
            L, R, edit, _prob = state
            candidate = L + R
            if candidate not in best_by_word or edit < best_by_word[candidate][2]:
                best_by_word[candidate] = state

        # Fewest edits first, then highest probability. A single stable sort
        # on (edit, -prob) gives the same order as sorting by probability
        # and then re-sorting by edit count.
        beam = sorted(best_by_word.values(), key=lambda s: (s[2], -s[3]))[:max_beam]

    survivors = [s for s in beam if s[2] == 0 or s[3] > threshold]
    return sorted(survivors, key=lambda s: s[3], reverse=True)[:top_n]

In [282]:
correction("appearant")

[('appearance', '', 2, 0.00012101274219355764),
 ('apparent', '', 2, 3.764840868244015e-05),
 ('appearing', '', 2, 2.061698570705056e-05)]

In [283]:
correction("runing")

[('during', '', 2, 0.00045088451350636663),
 ('turning', '', 2, 0.0001864492620463703),
 ('running', '', 1, 0.00012549469560813384)]

In [284]:
correction("particpate")

[('participate', '', 1, 3.585562731660967e-06),
 ('participated', '', 2, 2.6891720487457255e-06),
 ('participates', '', 2, 8.963906829152417e-07)]

In [285]:
correction("beleive")

[('believe', '', 2, 0.00016403949497348924),
 ('receive', '', 2, 8.515711487694797e-05),
 ('deceive', '', 2, 1.1653078877898144e-05)]