In [1]:
import sys
import os
# Put the parent directory on the import path so that `core.SpellChecker`
# (imported in the next cell) can be resolved.
# NOTE(review): '..' is relative to the current working directory — this
# assumes the notebook is run from its own folder; confirm.
sys.path.append('..')

In [2]:
from core.SpellChecker import SpellChecker
from collections import Counter
import re
import pandas as pd
import numpy as np
import itertools

In [3]:
# Single SpellChecker instance, constructed without arguments; it is reused by
# the evaluation call and by the grid search further down.
checker = SpellChecker()

In [4]:
def read_text(filename):
    """
    Read a whole text file into one string.

    Parameters:
    filename: path of the file to read

    Returns:
    The complete contents of the file, decoded as UTF-8.
    """
    # Pin the encoding: without it open() falls back to the platform's locale
    # encoding, so the same file could decode differently on another machine.
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()

In [5]:
def evaluate_spell_checker(correction_function, **kwargs):
    """
    Score a word-level correction function on the reference text pair.

    Reads ../texts/text_correct.txt and ../texts/text_misspel.txt, lowercases
    both, splits them into words and corrects every misspelled word on its
    own. The two texts are compared position by position, so they must
    contain the same number of words.

    Parameters:
    correction_function: callable taking a word (plus **kwargs) and returning
                         either the corrected string or an object that exposes
                         the corrected string as an `.edit` attribute
    **kwargs: forwarded unchanged to every correction_function call

    Returns:
    (df, correct_fraction, almost_correct_fraction) where df has one row per
    word with the columns original, misspllled, corrected, correct and
    almost_correct. A row is "almost correct" when the correction equals the
    original word without its last character (or is fully correct).

    Raises:
    ValueError: if the two texts do not contain the same number of words.
    """
    print(correction_function)
    # Lowercase once, before tokenising.
    text_correct = read_text("../texts/text_correct.txt").lower()
    text_misspel = read_text("../texts/text_misspel.txt").lower()

    words_original = re.findall(r'\w+', text_correct)
    words_misspelled = re.findall(r'\w+', text_misspel)

    # Fail early with a readable message instead of a pandas length error
    # after every word has already been corrected.
    if len(words_original) != len(words_misspelled):
        raise ValueError(
            "Reference and misspelled texts must contain the same number of words "
            f"({len(words_original)} != {len(words_misspelled)})"
        )

    edits = [correction_function(word, **kwargs) for word in words_misspelled]

    # Correction functions may return plain strings or objects carrying the
    # result in `.edit`. Only a missing attribute selects the plain-string
    # path; any other error is a real failure and must propagate.
    try:
        words_correct = [edit.edit for edit in edits]
    except AttributeError:
        words_correct = edits

    # The "misspllled" key is kept as spelled: other cells select this column by name.
    df = pd.DataFrame({
        "original" : words_original,
        "misspllled" : words_misspelled,
        "corrected" : words_correct}
    )
    df["correct"] = df.original == df.corrected
    # Dropping the last character of the original catches a missed plural "s".
    df["almost_correct"] = (df.original.apply(lambda x: x[:-1]) == df.corrected) | df.correct

    correct_fraction = sum(df.correct) / len(df.correct)
    almost_correct_fraction = sum(df.almost_correct) / len(df.correct)

    return df, correct_fraction, almost_correct_fraction

In [6]:
# Baseline run: score `checker.correction` with no extra keyword arguments,
# keeping the per-word table in `df` for the cells below.
df, correct_fraction, almost_correct_fraction = evaluate_spell_checker(checker.correction)
print(correct_fraction)
print(almost_correct_fraction)

<bound method SpellChecker.correction of <core.SpellChecker.SpellChecker object at 0x1200b6dc0>>
0.7763157894736842
0.8157894736842105


In [17]:
# Display the per-word evaluation table returned by evaluate_spell_checker.
df

Unnamed: 0,original,misspllled,corrected,correct,almost_correct
0,in,in,in,True,True
1,nowadays,noaadays,nowadays,True,True
2,rapidly,paridly,partly,False,False
3,advancing,avdacning,advancing,True,True
4,digital,diital,digital,True,True
...,...,...,...,...,...
71,various,variaous,various,True,True
72,aspects,sapect,aspect,False,True
73,of,of,of,True,True
74,our,our,our,True,True


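"Almost correct" means the corrected word equals the original word without its last character — typically the algorithm recovered the singular form but missed the plural "s" (e.g. `aspects` → `aspect`). Fully correct words also count as almost correct.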

In [7]:
# List of (misspelled, original) tuples taken from the evaluation table.
# Despite the variable name, the misspelled word comes FIRST — that is the
# order in which grid_search unpacks each pair.
correct_wrong = list(df[["misspllled", "original"]].itertuples(index=False, name=None))
# Values to try for each keyword argument of probability(); grid_search
# evaluates every combination (3 * 2 * 3 = 18 here).
params = {
    'keyboard_weight': [1.0, 5.0, 10.0],
    'edit_type_weight': [0.1, 1],
    'word_probability_weight': [10.0, 20.0, 50.0]
}

In [11]:
def probability(edit, keyboard_weight = 1.0, edit_type_weight = 1.0, word_probability_weight = 5.0):
    """
    Combined score of a candidate edit; larger means more plausible.

    Parameters:
    edit: candidate exposing keyboard_distance, levenshtein_distance,
          edit_type.probability() and edit_word_probability
    keyboard_weight: exponent applied to the distance term
    edit_type_weight: exponent applied to the edit-type probability
    word_probability_weight: exponent applied to the word probability

    Returns:
    The product of the three weighted factors.
    """
    # Distance term: shrinks as the summed keyboard and Levenshtein distance
    # grows; the leading 1 keeps the denominator non-zero when both are 0.
    total_distance = 1 + edit.keyboard_distance + edit.levenshtein_distance
    distance_factor = 1 / total_distance**keyboard_weight

    type_factor = edit.edit_type.probability() ** edit_type_weight
    frequency_factor = edit.edit_word_probability ** word_probability_weight

    # A product is only large when every individual factor is large.
    return distance_factor * type_factor * frequency_factor


In [12]:
def grid_search(spellchecker, misspelled_correct_pairs, params):
    """
    Try every combination of scoring weights and keep the most accurate one.

    For each combination, every candidate of every misspelled word is scored
    with probability(candidate, **combination); the highest-scoring candidate
    counts as a hit when its `.edit` equals the correct spelling. The
    combination and its accuracy are printed as the search progresses.

    Parameters:
    spellchecker: an instance of your spellchecker
    misspelled_correct_pairs: an iterable of tuples, where the first element is a
                              misspelled word and the second is the correct spelling
    params: a dictionary where keys are parameter names and values are lists of
            parameter values to try

    Returns:
    (best_params, best_accuracy). On ties the earliest combination wins.
    best_params is None only when `params` produces no combination at all
    (i.e. one of its value lists is empty).

    Raises:
    ValueError: if misspelled_correct_pairs is empty.
    """
    pairs = list(misspelled_correct_pairs)
    if not pairs:
        raise ValueError("misspelled_correct_pairs must not be empty")

    # Candidate generation does not take the weights as input, so run it once
    # per word rather than once per word per parameter combination.
    # (Assumes spellchecker.candidates() returns the same result on every call.)
    candidates_per_word = [
        (list(spellchecker.candidates(misspelled)), correct_word)
        for misspelled, correct_word in pairs
    ]

    best_accuracy = 0
    best_params = None

    # Cartesian product of the value lists, in the dictionary's key order.
    for param_values in itertools.product(*params.values()):
        param_dict = dict(zip(params.keys(), param_values))
        print("NEW PARAMS", param_dict)
        correct = 0
        for candidates, correct_word in candidates_per_word:
            if not candidates:
                # np.argmax raises on an empty sequence; a word without any
                # candidate simply counts as a miss.
                continue
            probabilities = [probability(candidate, **param_dict) for candidate in candidates]

            if candidates[np.argmax(probabilities)].edit == correct_word:
                correct += 1

        accuracy = correct / len(pairs)

        # Always record the first combination so that a search in which every
        # combination scores 0 still reports which parameters were tried.
        if best_params is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = param_dict

        print(accuracy)
        print()

    return best_params, best_accuracy


In [13]:
# Run the weight search with the shared checker, the (misspelled, original)
# pairs and the parameter grid defined above; the cell displays the returned
# (best_params, best_accuracy) tuple.
grid_search(checker, correct_wrong, params)

NEW PARAMS {'keyboard_weight': 1.0, 'edit_type_weight': 0.1, 'word_probability_weight': 10.0}
0.8552631578947368

NEW PARAMS {'keyboard_weight': 1.0, 'edit_type_weight': 0.1, 'word_probability_weight': 20.0}
0.8552631578947368

NEW PARAMS {'keyboard_weight': 1.0, 'edit_type_weight': 0.1, 'word_probability_weight': 50.0}
0.8552631578947368

NEW PARAMS {'keyboard_weight': 1.0, 'edit_type_weight': 1, 'word_probability_weight': 10.0}
0.8552631578947368

NEW PARAMS {'keyboard_weight': 1.0, 'edit_type_weight': 1, 'word_probability_weight': 20.0}
0.8552631578947368

NEW PARAMS {'keyboard_weight': 1.0, 'edit_type_weight': 1, 'word_probability_weight': 50.0}
0.8552631578947368

NEW PARAMS {'keyboard_weight': 5.0, 'edit_type_weight': 0.1, 'word_probability_weight': 10.0}
0.8421052631578947

NEW PARAMS {'keyboard_weight': 5.0, 'edit_type_weight': 0.1, 'word_probability_weight': 20.0}
0.8421052631578947

NEW PARAMS {'keyboard_weight': 5.0, 'edit_type_weight': 0.1, 'word_probability_weight': 50.0}


({'keyboard_weight': 1.0,
  'edit_type_weight': 0.1,
  'word_probability_weight': 10.0},
 0.8552631578947368)

In [9]:
  import re
from collections import Counter

def words(text):
    "Lowercase `text` and return its runs of word characters as a list."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word -> occurrence count for the corpus in big_en.txt. The context manager
# closes the file as soon as it has been read (a bare open() leaves the handle
# open until garbage collection), and the explicit encoding keeps decoding
# independent of the platform locale.
with open('big_en.txt', encoding='utf-8') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    "Relative frequency of `word`: its count in WORDS divided by N (the total count when P was defined)."
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    "Return the candidate for `word` that has the highest probability P."
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    "Known words closest to `word`: the word itself, else one edit away, else two edits away, else `[word]`."
    # Each tier is only computed when every earlier tier came back empty.
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]

def known(words):
    "Return, as a set, the members of `words` that are present in WORDS."
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    "Set of all strings reachable from `word` by one delete, transpose, replace or insert."
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # An insertion is possible at every cut point, including the very end.
        variants.update(head + ch + tail for ch in alphabet)
        if not tail:
            continue
        rest = tail[1:]
        variants.add(head + rest)                                 # delete tail[0]
        variants.update(head + ch + rest for ch in alphabet)      # replace tail[0]
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])     # swap tail[0] and tail[1]
    return variants

def edits2(word):
    "Lazily yield every string that is two edits away from `word` (duplicates included)."
    # The first round is computed right away; the second is expanded on demand.
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

In [39]:
# Same evaluation as before, now scoring the plain `correction` function
# defined in the previous cell, which ranks candidates by word frequency only.
evaluate_spell_checker(correction)

<function correction at 0x107f14310>


(     original misspllled  corrected  correct  almost_correct
 0          in         in         in     True            True
 1    nowadays   noaadays   nowadays     True            True
 2     rapidly    paridly    rapidly     True            True
 3   advancing  avdacning  advancing     True            True
 4     digital     diital     distal    False           False
 ..        ...        ...        ...      ...             ...
 71    various   variaous    various     True            True
 72    aspects     sapect     aspect    False            True
 73         of         of         of     True            True
 74        our        our        our     True            True
 75      lives       livs       lips    False           False
 
 [76 rows x 5 columns],
 0.8552631578947368,
 0.881578947368421)