ideas

if edit1 word is less common that edit2 word, then choose edit2 


In [1]:
from enum import Enum

class EditType(Enum):
    none = 0
    delete = 1
    transpose = 2
    replace = 3
    insert = 4

class Edit:
    
    def __init__(self, edit, edit_type: EditType = EditType.none, prev_edit = None, is_neighbour = False):
        self.edit = edit
        self.edit_type = edit_type
        self.prev_edit = prev_edit
        self.is_neighbour = is_neighbour
        
    def __hash__(self):
        return hash((self.edit, self.edit_type))

    def __eq__(self,other):
        return self.edit == other.edit and self.edit_type == other.edit_type
    
    def __repr__(self):
        return f"""
        Edit
            content = {self.edit}
            type = {self.edit_type.name}
            previous edit = 
                {self.prev_edit}
            is neighbour = {self.is_neighbour}
        """
    

In [5]:
import re
from collections import Counter
from constants import letters, neighbour_letters

class SpellChecker:
    
    def __init__(self, language = "en"):
        
        file_to_open = f'big_{language}.txt'
        
        with open(file_to_open, "r") as file:
            raw_text = file.read()
            raw_words = re.findall(r'\w+', raw_text.lower())
            self.WORDS = Counter(raw_words)
        
        self.N = sum(self.WORDS.values())
        self.letters = letters[language]
        self.neighbour_letters = neighbour_letters[language]
        
        
    def P(self, edit: Edit): 
        "Probability of `edit`."
        
        return self.WORDS[edit.edit] / self.N

    def correction(self, word): 
        "Most probable spelling correction for word."
        return max(self.candidates(word), key=self.P)

    def candidates(self, word): 
        "Generate possible spelling corrections for word."
        return (self.known([Edit(word)]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [Edit(word)])

    def known(self, edits): 
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in edits if w.edit in self.WORDS)

    def edits1(self, word, prev_edit: Edit = None):
        "All edits that are one edit away from `word`."
        letters    = self.letters
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        
        deletes    = set(self.__edits_deletes(splits, prev_edit))
        transposes = set(self.__edits_transposes(splits, prev_edit))
        replaces   = set(self.__edits_replaces(splits, prev_edit))
        inserts    = set(self.__edits_inserts(splits, prev_edit))
        all_set = deletes.union(transposes).union(replaces).union(inserts)
        print(all_set)
        return all_set

    def edits2(self, word): 
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1.edit, prev_edit = e1))
    
    def __edits_transposes(self, splits, prev_edit: Edit):
        return {Edit(L + R[1] + R[0] + R[2:], EditType.transpose, prev_edit, False) for L, R in splits if len(R)>1}
    
    def __edits_inserts(self, splits, prev_edit: Edit):
        return {Edit(L + c + R, EditType.insert, prev_edit, False) for L, R in splits for c in letters}
    
    def __edits_deletes(self, splits, prev_edit: Edit):
        for L, R in splits:
            if R:
                omitted = R[0]
                next_to_omitted = self.neighbour_letters[omitted]
                
                # Right is neighbour
                if len(R) > 1:
                    right = R[1]
                    right_is_neighbour = right in next_to_omitted
                else:
                    right_is_neighbour = False
                
                # Left is neighbour
                if len(L) > 0:
                    left = L[-1]
                    left_is_neighbour = left in next_to_omitted
                else:
                    left_is_neighbour = False
                
                is_neighbour = left_is_neighbour or right_is_neighbour

                yield Edit(L + R[1:], EditType.delete, prev_edit, is_neighbour)
    
    def __edits_replaces(self, splits, prev_edit: Edit):
        for L, R in splits:
            if R:
                for c in letters:
                    
                    omitted = R[0]
                    next_to_omitted = self.neighbour_letters[omitted]
                    is_neighbour = c in next_to_omitted
                    
                    yield Edit(L + c + R[1:], EditType.replace, prev_edit, is_neighbour)
    

In [6]:
checker = SpellChecker()

In [7]:
candidates = checker.candidates("receit")
candidates

{
        Edit
            content = receitam
            type = insert
            previous edit = 
                None
            is neighbour = False
        , 
        Edit
            content = erceit
            type = transpose
            previous edit = 
                None
            is neighbour = False
        , 
        Edit
            content = receiam
            type = replace
            previous edit = 
                None
            is neighbour = False
        , 
        Edit
            content = recit
            type = delete
            previous edit = 
                None
            is neighbour = False
        , 
        Edit
            content = eneceit
            type = replace
            previous edit = 
                None
            is neighbour = False
        , 
        Edit
            content = receamit
            type = insert
            previous edit = 
                None
            is neighbour = False
        , 
        Edit
   

{
         Edit
             content = recent
             type = delete
             previous edit = 
                 
         Edit
             content = recenit
             type = replace
             previous edit = 
                 None
             is neighbour = False
         
             is neighbour = False
         ,
 
         Edit
             content = recent
             type = replace
             previous edit = 
                 
         Edit
             content = recit
             type = delete
             previous edit = 
                 None
             is neighbour = False
         
             is neighbour = False
         ,
 
         Edit
             content = recite
             type = transpose
             previous edit = 
                 
         Edit
             content = reciet
             type = transpose
             previous edit = 
                 None
             is neighbour = False
         
             is neighbour = False
    