### 贝叶斯拼写检查器

In [16]:
import re
import collections


def words(text):
    """Return the lowercase words found in *text*.

    The text is lowercased first, then every maximal run of the letters
    a-z is collected in order of appearance.  Anything else (digits,
    punctuation, whitespace, non-ASCII letters) acts as a separator.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer("[a-z]+", lowered)]


def train(features):
    """Count how often each item of *features* occurs, starting from 1.

    Returns a ``collections.defaultdict`` whose default value is 1, so an
    item seen k times maps to k + 1, and subscripting an item that was
    never seen yields 1.
    """
    counts = collections.defaultdict(lambda: 1)
    for token in features:
        # Reading a missing key creates it with the default of 1 first.
        counts[token] = counts[token] + 1
    return counts


# Word-frequency model built from the training corpus.  The file is read
# inside a `with` block so the handle is closed as soon as the text has been
# loaded instead of being left open until garbage collection.
with open("train.tsv") as _corpus_file:
    NWORDS = train(words(_corpus_file.read()))

# Letters used when generating replacement and insertion edits.
alphabet = "abcdefghijklmnopqrstuvwxyz"


def edits1(word):
    """Return the set of all strings one simple edit away from *word*.

    A simple edit is one of:
      * deleting one character,
      * swapping two adjacent characters,
      * replacing one character with a letter from ``alphabet``,
      * inserting a letter from ``alphabet`` at any position, including
        after the last character.

    The result may contain *word* itself (for example, replacing a letter
    with the same letter).
    """
    # Every way to cut the word in two, including the cut after the final
    # character.  That last cut is what makes appending a letter possible
    # (e.g. "cat" -> "cats"), and it lets the empty string produce the
    # single-letter insertions.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

    deletes = [head + tail[1:] for head, tail in splits if tail]
    transposes = [
        head + tail[1] + tail[0] + tail[2:]
        for head, tail in splits
        if len(tail) > 1
    ]
    replaces = [
        head + c + tail[1:] for head, tail in splits if tail for c in alphabet
    ]
    inserts = [head + c + tail for head, tail in splits for c in alphabet]

    return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    """Return the strings two edits away from *word* that are keys of NWORDS.

    Each string produced by ``edits1(word)`` is expanded again with
    ``edits1``; only results present in ``NWORDS`` are kept.
    """
    matches = set()
    for first_pass in edits1(word):
        for second_pass in edits1(first_pass):
            if second_pass in NWORDS:
                matches.add(second_pass)
    return matches

def known(words):
    """Return the set of items from *words* that are keys of NWORDS."""
    recognised = set()
    for candidate in words:
        # A membership test does not add the key to NWORDS.
        if candidate in NWORDS:
            recognised.add(candidate)
    return recognised

def correct(word):
    """Return the most frequent known word close to *word*.

    Candidates are tried in order of increasing edit distance: the word
    itself, then known words one edit away, then known words two edits
    away.  The first non-empty group is used, and the member with the
    highest count in ``NWORDS`` is returned.  If no group contains a known
    word, *word* is returned unchanged.
    """
    candidates = known([word]) or known(edits1(word)) or known_edits2(word)
    if not candidates:
        # Return early rather than subscripting NWORDS with the unknown word:
        # NWORDS is a defaultdict, so NWORDS[word] would add the misspelling
        # to the model and later calls would treat it as a known word.
        return word
    # Every candidate is a key of NWORDS here; .get never inserts.
    return max(candidates, key=NWORDS.get)

In [17]:
# Show the correction for an exact word, a one-edit typo and a two-edit typo.
for sample in ('apple', 'dpple', 'dpplea'):
    print(correct(sample))

apple
apple
apple


### 求解 argmax_c P(c|w) -> argmax_c P(w|c) * P(c) / P(w)

- P(c)，文章中出现一个正确拼写词c的概率，也就是说，在英语文章中，c出现的概率有多大
- P(w|c)，在用户想键入c的情况下，敲成w的概率，即用户会以多大概率把c错敲成w
- argmax_c，用来枚举所有可能的c并选取概率最大的；由于P(w)对所有候选c都相同，比较时可以忽略，只需最大化 P(w|c) * P(c)