ideas

if edit1 word is less common that edit2 word, then choose edit2 


In [1]:
from enum import Enum

class EditType(Enum):
    none = 0
    insert = 1
    replace = 2
    delete = 3
    transpose = 4
    
    def __lt__(self, other):
        self.value < other.value
    

class Edit:
    
    def __init__(self, edit, edit_type: EditType = EditType.none, prev_edit = None, keyboard_distance = -1):
        self.edit = edit
        self.edit_type = edit_type
        self.prev_edit = prev_edit
        self.keyboard_distance = keyboard_distance
        
    def __hash__(self):
        return hash((self.edit, self.edit_type))

    def __eq__(self,other):
        return self.edit == other.edit and self.edit_type == other.edit_type
    
    def __repr__(self):
        return f"""
        Edit
            content = {self.edit}
            type = {self.edit_type.name}
            previous edit = 
                {self.prev_edit}
            keyboard_distance = {self.keyboard_distance}
        """
    

In [2]:
import re
from collections import Counter
from constants import keyboard_letters, neighbour_letters, keyboard_layouts

class SpellChecker:
    
    def __init__(self, language = "en"):
        
        file_to_open = f'big_{language}.txt'
        
        with open(file_to_open, "r") as file:
            raw_text = file.read()
            raw_words = re.findall(r'\w+', raw_text.lower())
            self.WORDS = Counter(raw_words)
        
        self.N = sum(self.WORDS.values())
        self.letters = keyboard_letters[language]
        self.neighbour_letters = neighbour_letters[language]
        self.keyboard_layout = keyboard_layouts[language]
        
    def keyboard_distance(self, key1, key2):

        key_coordinates = {}

        for i, row in enumerate(self.keyboard_layout):
            for j, key in enumerate(row):
                key_coordinates[key] = (i, j)

        if key1 not in key_coordinates or key2 not in key_coordinates:
            raise ValueError("Both keys must be on the keyboard")

        x1, y1 = key_coordinates[key1]
        x2, y2 = key_coordinates[key2]

        return abs(x1 - x2) + abs(y1 - y2)
        
    def P(self, edit: Edit): 
        "Probability of `edit`."
        
        # first sort by probability, then sort by edit_type, then sort by keyboard distance, if any
        return self.WORDS[edit.edit] / self.N

    def correction(self, word): 
        "Most probable spelling correction for word."
        
        # correction is defined as following:
        candidates = list(self.candidates(word))
        
        candidates.sort(key = lambda x: checker.P(x), reverse = True)
        candidates.sort(key = lambda x: x.keyboard_distance)
        candidates.sort(key = lambda x: x.edit_type.value)
        
        return candidates[0]

    def candidates(self, word): 
        "Generate possible spelling corrections for word."
        return (self.known([Edit(word)]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [Edit(word)])

    def known(self, edits): 
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in edits if w.edit in self.WORDS)

    def edits1(self, word, prev_edit: Edit = None):
        "All edits that are one edit away from `word`."
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        
        deletes    = set(self.__edits_deletes(splits, prev_edit))
        transposes = set(self.__edits_transposes(splits, prev_edit))
        replaces   = set(self.__edits_replaces(splits, prev_edit))
        inserts    = set(self.__edits_inserts(splits, prev_edit))
        all_set = deletes.union(transposes).union(replaces).union(inserts)
        return all_set

    def edits2(self, word): 
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1.edit, prev_edit = e1))
    
    def __edits_transposes(self, splits, prev_edit: Edit):
        return {Edit(L + R[1] + R[0] + R[2:], EditType.transpose, prev_edit, 0) for L, R in splits if len(R)>1}
    
    def __edits_inserts(self, splits, prev_edit: Edit):
        for L, R in splits:
            for c in self.letters:
                
                # Left's distance
                if len(L) > 0:
                    left = L[-1]
                    L_distance = self.keyboard_distance(left, c)
                else:
                    L_distance = -1
                
                yield Edit(L + c + R, EditType.insert, prev_edit, L_distance)
    
    def __edits_deletes(self, splits, prev_edit: Edit):
        for L, R in splits:
            if R:
                omitted = R[0]
                
                # Left's distance
                if len(L) > 0:
                    left = L[-1]
                    L_distance = self.keyboard_distance(left, omitted)
                else:
                    L_distance = -1
                    
                # Left's distance
                if len(R) > 0:
                    right = R[-1]
                    R_distance = self.keyboard_distance(right, omitted)
                else:
                    R_distance = -1
                
                key_distance = min(L_distance, R_distance)

                yield Edit(L + R[1:], EditType.delete, prev_edit, key_distance)
    
    def __edits_replaces(self, splits, prev_edit: Edit):
        for L, R in splits:
            if R:
                for c in self.letters:
                    
                    omitted = R[0]
                    key_distance = self.keyboard_distance(c, omitted)
                    
                    yield Edit(L + c + R[1:], EditType.replace, prev_edit, key_distance)
    

In [3]:
checker = SpellChecker()

In [4]:
checker.correction("receit")


        Edit
            content = receipt
            type = insert
            previous edit = 
                None
            keyboard_distance = 2
        

In [5]:
candidates = list(checker.candidates("receit")) # receipt
candidates.sort(key = lambda x: checker.P(x), reverse = True)
candidates.sort(key = lambda x: x.keyboard_distance)
candidates.sort(key = lambda x: x.edit_type.value)
candidates

[
         Edit
             content = receipt
             type = insert
             previous edit = 
                 None
             keyboard_distance = 2
         ,
 
         Edit
             content = deceit
             type = replace
             previous edit = 
                 None
             keyboard_distance = 2
         ,
 
         Edit
             content = recent
             type = replace
             previous edit = 
                 None
             keyboard_distance = 4
         ]

In [85]:
checker.P(Edit("recent"))

4.750870619450781e-05

In [86]:
checker.correction("mre")


        Edit
            content = more
            type = insert
            previous edit = 
                None
            keyboard_distance = 4
        

In [54]:
candidates = checker.candidates("mre") # more
candidates

{
         Edit
             content = are
             type = replace
             previous edit = 
                 None
             keyboard_distance = 7
         ,
 
         Edit
             content = ere
             type = replace
             previous edit = 
                 None
             keyboard_distance = 6
         ,
 
         Edit
             content = ire
             type = replace
             previous edit = 
                 None
             keyboard_distance = 3
         ,
 
         Edit
             content = mare
             type = insert
             previous edit = 
                 None
             keyboard_distance = 7
         ,
 
         Edit
             content = me
             type = delete
             previous edit = 
                 None
             keyboard_distance = 1
         ,
 
         Edit
             content = mere
             type = insert
             previous edit = 
                 None
             keyboard_distance = 6

In [87]:
checker.correction("evar")


        Edit
            content = ever
            type = replace
            previous edit = 
                None
            keyboard_distance = 3
        

In [35]:
candidates = checker.candidates("evar") # ever
candidates

{
         Edit
             content = ear
             type = delete
             previous edit = 
                 None
             keyboard_distance = 2
         ,
 
         Edit
             content = eva
             type = delete
             previous edit = 
                 None
             keyboard_distance = 0
         ,
 
         Edit
             content = ever
             type = replace
             previous edit = 
                 None
             keyboard_distance = 3
         }