In [2]:
import sys
import os
sys.path.append('..')

from core.SpellChecker import SpellChecker
from collections import Counter

from core.NGramModel import NGramModel

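Unfortunately, I couldn't improve upon Norvig's solution, even though numerous simple and complex probability models were explored and tested. One way of advancing the work may be to devise a robust language model that would rank candidate corrections by the context surrounding the misspelled word, along the lines of the 3-gram model demonstrated below.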

## Original solution by Norvig

In [3]:
import re
from collections import Counter

def words(text):
    """Lower-case `text` and return its word tokens (maximal runs of word characters), in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table for the English corpus: token -> number of occurrences.
# The corpus is read inside a `with` block so the file handle is closed as soon
# as the text has been loaded, instead of being left open until garbage collection.
with open('../texts/big_en.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Return the relative frequency of `word`: its count in WORDS divided by `N`.

    `N` defaults to the total of all counts in WORDS. The default is evaluated
    once, when this function is defined, so it is not recomputed on every call.
    A word absent from the Counter has count 0 and therefore probability 0.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    """Return the candidate spelling of `word` that scores highest under `P`."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible corrections for `word`, preferring the closest matches.

    The first non-empty result wins, tried in this order:
    the word itself if it is known, known words one edit away, known words
    two edits away, and finally the unchanged word wrapped in a list.
    Later (more expensive) tiers are only computed when earlier ones are empty.
    """
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits1(word))
    if one_away:
        return one_away

    two_away = known(edits2(word))
    if two_away:
        return two_away

    # Nothing in the dictionary is close enough: fall back to the input itself.
    return [word]

def known(words):
    """Return, as a set, the items of `words` that are keys of the WORDS table."""
    return {token for token in words if token in WORDS}

def edits1(word, letters='abcdefghijklmnopqrstuvwxyz'):
    """Return the set of all strings exactly one simple edit away from `word`.

    A simple edit is one of:
      * delete    - remove one character,
      * transpose - swap two adjacent characters,
      * replace   - change one character to a symbol from `letters`,
      * insert    - add one symbol from `letters` at any position.

    Parameters:
        word:    the string to edit.
        letters: the alphabet used for replaces and inserts (a string, or any
                 iterable of strings). Defaults to lowercase English a-z, which
                 matches the previous hard-coded behaviour; pass another
                 alphabet to generate edits for other languages.

    Returns:
        A set of strings. It may contain `word` itself, because replacing a
        character with the same character counts as a replace.
    """
    # Every way of cutting the word in two, including the empty head and empty tail.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [head + tail[1:] for head, tail in splits if tail]
    transposes = [head + tail[1] + tail[0] + tail[2:]
                  for head, tail in splits if len(tail) > 1]
    replaces = [head + c + tail[1:] for head, tail in splits if tail for c in letters]
    inserts = [head + c + tail for head, tail in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """Lazily yield every string reachable from `word` by two successive single edits.

    The result is a generator and may produce the same string more than once.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

## Suggested solution

# English

In [4]:
# Build a SpellChecker from the English corpus file (no `language` argument is
# passed, so the class default applies) and ask for its best fix for "receit".
# The recorded output below shows the chosen edit: "receipt", reached by one insert.
checker = SpellChecker(override_file = "../texts/big_en.txt")
checker.correction("receit")


        Edit of
            content = receit
            edit = receipt
            type = insert
            edit word probaility = 1.1653078877898144e-05
            keyboard_distance = 2
            levenshtein distance = 1
            previous edit = 
                None
        

In [5]:
# Show every candidate edit the checker considers for the typo "mre".
# (The recorded output below is cut off partway through the listing.)
checker.candidates("mre")

{
         Edit of
             content = mre
             edit = are
             type = replace
             edit word probaility = 0.0032538981789823275
             keyboard_distance = 7
             levenshtein distance = 1
             previous edit = 
                 None
         ,
 
         Edit of
             content = mre
             edit = ere
             type = replace
             edit word probaility = 8.963906829152417e-07
             keyboard_distance = 6
             levenshtein distance = 1
             previous edit = 
                 None
         ,
 
         Edit of
             content = mre
             edit = ire
             type = replace
             edit word probaility = 8.963906829152417e-07
             keyboard_distance = 3
             levenshtein distance = 1
             previous edit = 
                 None
         ,
 
         Edit of
             content = mre
             edit = mare
             type = insert
             edit word pro

In [6]:
# Best single correction for "mre"; the recorded output picks "more" (an insert).
# NOTE(review): the candidate "are" listed in the previous cell's output has a
# higher word probability than "more", so the ranking presumably also weighs
# keyboard distance — confirm in core.SpellChecker.
checker.correction("mre")


        Edit of
            content = mre
            edit = more
            type = insert
            edit word probaility = 0.001790092193781738
            keyboard_distance = 4
            levenshtein distance = 1
            previous edit = 
                None
        

In [11]:
# Substitution typo (final "e" instead of "r"); the recorded output shows
# "whatever", a replace with keyboard distance 1.
checker.correction("whatevee")


        Edit of
            content = whatevee
            edit = whatever
            type = replace
            edit word probaility = 0.00010218853785233757
            keyboard_distance = 1
            levenshtein distance = 1
            previous edit = 
                None
        

In [12]:
# Ambiguous typo: the recorded candidates are "apple" and "apply", each one replace away.
checker.candidates("appll")

{
         Edit of
             content = appll
             edit = apple
             type = replace
             edit word probaility = 9.860297512067659e-06
             keyboard_distance = 7
             levenshtein distance = 1
             previous edit = 
                 None
         ,
 
         Edit of
             content = appll
             edit = apply
             type = replace
             edit word probaility = 3.85447993653554e-05
             keyboard_distance = 4
             levenshtein distance = 1
             previous edit = 
                 None
         }

# Armenian

In [15]:
# Rebinds `checker` to an instance built from the Armenian corpus file; any
# English cell re-run after this one will use this checker instead.
# NOTE(review): "am" is the ISO 639-1 code for Amharic (Armenian is "hy"); here it
# is presumably the project's own key for Armenian — confirm in core.SpellChecker.
# The recorded output corrects the final letter: "քննություլ" -> "քննություն".
checker = SpellChecker(language = "am", override_file = "../texts/big_am.txt")
checker.correction("քննություլ")


        Edit of
            content = քննություլ
            edit = քննություն
            type = replace
            edit word probaility = 3.924292548160881e-06
            keyboard_distance = 5
            levenshtein distance = 1
            previous edit = 
                None
        

# 3Gram model

In [8]:
# Build the n-gram model from the same English corpus file used for the spell checker
# ("en" is presumably the language code — confirm in core.NGramModel).
# NOTE(review): this cell's execution count (In [8]) is lower than those of the
# cells above it (In [11]-[15]), so the notebook was run out of order; restart
# the kernel and run all cells to confirm it still works top to bottom.
model = NGramModel("en", override_file = "../texts/big_en.txt")

In [10]:
# Score each candidate word against the context "I am going".
# The recorded output maps every candidate to a number; only "to" is non-zero (0.8).
# Presumably this is the probability of the word following the context —
# confirm in core.NGramModel.
model.get_probabilities("I am going", ["anaconda", "more", "to", "him", "today", "tomorrow"])

{'anaconda': 0.0,
 'more': 0.0,
 'to': 0.8,
 'him': 0.0,
 'today': 0.0,
 'tomorrow': 0.0}