In [3]:
from SpellChecker import SpellChecker
from collections import Counter
import re
import pandas as pd
import numpy as np

In [4]:
# Construct the imported SpellChecker with no arguments; its bound `correction`
# method is the first correction function evaluated in this notebook.
checker = SpellChecker()

In [5]:
def read_text(filename, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in `encoding`.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, "r", encoding=encoding) as file:
        return file.read()

In [8]:
def evaluate_spell_checker(correction_function):
    """Score a word-level correction function on a pair of text files.

    Reads "text_correct.txt" and "text_misspel.txt", lower-cases and tokenizes
    both with the regex ``\\w+``, applies `correction_function` to every
    misspelled token, and compares the results with the reference tokens
    position by position.

    Args:
        correction_function: Callable taking one word (str). Its result is
            used either through an `.edit` attribute or, if reading that
            attribute fails, as the corrected word itself.

    Returns:
        A tuple ``(df, correct_fraction, almost_correct_fraction)`` where `df`
        is a DataFrame with columns "original", "missplelled", "corrected",
        "correct" and "almost_correct", and the two fractions are the sums of
        the "correct" / "almost_correct" columns divided by the row count.
    """
    # Echo which correction function is being evaluated.
    print(correction_function)
    text_correct = read_text("text_correct.txt").lower()
    text_misspel = read_text("text_misspel.txt").lower()
    
    # The texts are already lower-cased above, so .lower() here is redundant.
    words_original = re.findall(r'\w+', text_correct.lower())
    words_misspelled = re.findall(r'\w+', text_misspel.lower())
    edits = [correction_function(word) for word in words_misspelled]
    
    # Prefer the `.edit` attribute of each result; fall back to the raw results.
    # NOTE(review): the bare `except:` catches every exception, not only the
    # AttributeError raised when results (e.g. plain str) have no `.edit`.
    try:
        words_correct = [edit.edit for edit in edits]
    except:
        words_correct = edits
    
    # Rows are aligned by position; pandas requires all three lists to have the
    # same length, so both files must tokenize to the same number of words.
    df = pd.DataFrame({
        "original" : words_original,
        "missplelled" : words_misspelled, 
        "corrected" : words_correct}
    )
    # Exact match between the reference word and the correction.
    df["correct"] = df.original == df.corrected
    # First operand: the reference word minus its last character equals the correction.
    # NOTE(review): this line is cut off after `df.cor` in the available source, so
    # the second operand of `|` is unknown here - restore it from the notebook.
    df["almost_correct"] = (df.original.apply(lambda x: x[:-1]) == df.corrected) | df.cor
    
    # Both fractions divide by the total number of rows.
    correct_fraction = sum(df.correct) / len(df.correct)
    almost_correct_fraction = sum(df.almost_correct) / len(df.correct)
    
    return df, correct_fraction, almost_correct_fraction

In [9]:
# Evaluate the SpellChecker instance's `correction` method and report both
# fractions, one per line (exact-match fraction first).
df, correct_fraction, almost_correct_fraction = evaluate_spell_checker(
    checker.correction
)
print(correct_fraction, almost_correct_fraction, sep="\n")

<bound method SpellChecker.correction of <SpellChecker.SpellChecker object at 0x11f73bca0>>
0.7894736842105263
0.039473684210526314


In [10]:
  import re
from collections import Counter

def words(text):
    "Lower-case `text` and return its runs of word characters, in order."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table built from the corpus file 'big_en.txt': maps each
# lower-cased token to its number of occurrences. The file is opened in a
# `with` block so the handle is closed as soon as the text has been read.
with open('big_en.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word`: its count in WORDS divided by `N`.

    The default `N` is the sum of all counts in WORDS, evaluated once when
    this `def` statement runs.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    "Return the candidate for `word` that scores highest under `P`."
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the first non-empty group of possible corrections for `word`.

    Groups are tried in order: the word itself if known, known words one edit
    away, known words two edits away, and finally the word unchanged (as a
    one-element list). Later groups are only computed when needed.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(words):
    "Return, as a set, the entries of `words` that are keys of WORDS."
    return {entry for entry in words if entry in WORDS}

def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent characters,
    a replacement of one character by a lowercase ASCII letter, or an insertion
    of one lowercase ASCII letter. Replacing a letter by itself is included, so
    `word` can appear in the result.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    # Visit every split point, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every split point.
        for ch in alphabet:
            variants.add(head + ch + tail)

        if not tail:
            continue

        # Deletion and replacement act on the first character of the tail.
        rest = tail[1:]
        variants.add(head + rest)
        for ch in alphabet:
            variants.add(head + ch + rest)

        # Transposition needs at least two characters in the tail.
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants

def edits2(word):
    """Return a generator over strings reached by applying two edits to `word`.

    The result is lazy and may yield the same string more than once.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

In [11]:
# Evaluate the standalone `correction` function on the same two text files.
# As the cell's last expression, the returned
# (df, correct_fraction, almost_correct_fraction) tuple is shown as the output.
evaluate_spell_checker(correction)

<function correction at 0x14fc5f0d0>


(     original missplelled  corrected  correct  almost_correct
 0          in          in         in     True           False
 1    nowadays    noaadays   nowadays     True           False
 2     rapidly     paridly    rapidly     True           False
 3   advancing   avdacning  advancing     True           False
 4     digital      diital     distal    False           False
 ..        ...         ...        ...      ...             ...
 71    various    variaous    various     True           False
 72    aspects      sapect     aspect    False           False
 73         of          of         of     True           False
 74        our         our        our     True           False
 75      lives        livs       lips    False           False
 
 [76 rows x 5 columns],
 0.8552631578947368,
 0.013157894736842105)