In [1]:
#libraries
import re
import string
from collections import Counter
import numpy as np

In [2]:
class auto_correct(object):
    """Edit-distance spelling suggester backed by corpus word frequencies.

    The corpus is lower-cased and tokenised with ``\\w+``. Each distinct token
    becomes a vocabulary entry whose probability is its relative frequency in
    the corpus.

    Attributes:
        vocabs: set of distinct lower-cased corpus tokens.
        word_counts: ``Counter`` mapping each token to its occurrence count.
        word_probas: dict mapping each token to ``count / total token count``.
    """

    def __init__(self, corpus_file_path):
        """Build the vocabulary and probability table from a text file.

        Args:
            corpus_file_path: path of the text corpus to read.

        Raises:
            OSError: if the corpus file cannot be opened.
        """
        words = []
        with open(corpus_file_path, "r") as file:
            # Stream the file line by line; every line is lower-cased before
            # tokenising so the vocabulary is case-insensitive.
            for line in file:
                words += re.findall(r'\w+', line.lower())
        self.vocabs = set(words)  # unique words only
        self.word_counts = Counter(words)
        total_words = float(sum(self.word_counts.values()))
        # An empty corpus yields an empty table (no division is performed).
        self.word_probas = {
            word: count / total_words for word, count in self.word_counts.items()
        }

    def _level_one_edits(self, word):
        """Return the set of strings exactly one edit away from ``word``.

        An edit is a single-character delete, an adjacent-character swap, a
        replacement by an ASCII lowercase letter, or an insertion of an ASCII
        lowercase letter. Because a letter may be replaced by itself, ``word``
        is part of the result whenever it contains an ASCII lowercase letter.
        """
        # Only English lowercase letters are considered for replace/insert.
        letters = string.ascii_lowercase
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [l + r[1:] for l, r in splits if r]
        swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1]
        replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
        inserts = [l + c + r for l, r in splits for c in letters]
        return set(deletes + swaps + replaces + inserts)

    def _level_two_edits(self, word):
        """Return the set of strings reachable from ``word`` by two edits."""
        return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

    def check(self, word):
        """Suggest vocabulary words close to ``word``, most probable first.

        Vocabulary words one edit away are preferred. Only when there are
        none does the search widen to two edits. The raw edit sets are never
        empty, so the fallback has to be decided on the vocabulary-filtered
        candidates, not on the raw sets.

        Args:
            word: the (possibly misspelled) word; it is lower-cased first
                because the vocabulary only contains lower-cased tokens.

        Returns:
            A list of ``(candidate, probability)`` tuples sorted by descending
            probability, ties broken alphabetically so the order is
            reproducible across runs. The list is empty when no vocabulary
            word lies within two edits.
        """
        word = word.lower()
        first_level = self._level_one_edits(word)
        candidates = {w for w in first_level if w in self.vocabs}
        if not candidates:
            # Filter while generating so the full level-two set (which grows
            # quadratically with word length) is never materialised.
            candidates = {
                e2
                for e1 in first_level
                for e2 in self._level_one_edits(e1)
                if e2 in self.vocabs
            }
        return sorted(
            ((c, self.word_probas[c]) for c in candidates),
            key=lambda pair: (-pair[1], pair[0]),
        )

In [3]:
# Build the checker once: the constructor reads the corpus file and computes
# the vocabulary and per-word probabilities used by every check() call below.
checker = auto_correct("./sonnets.txt")

In [4]:
# Misspelling demo: list in-vocabulary words near "hime", most probable first.
checker.check("hime")

[('time', 0.003894080996884735),
 ('him', 0.002113929684023142),
 ('hide', 0.00027814864263462394),
 ('home', 0.00016688918558077436)]

In [5]:
# Suggestions may be shorter ("as"), same length ("can") or longer ("case")
# than the input, since deletes, replaces and inserts are all tried.
checker.check("cas")

[('as', 0.006731197151757899),
 ('can', 0.002447708055184691),
 ('was', 0.0016132621272808188),
 ('case', 0.00016688918558077436),
 ('cast', 0.00011125945705384957),
 ('cars', 5.562972852692479e-05),
 ('car', 5.562972852692479e-05)]

In [6]:
# Short input: a two-letter word still yields candidates via replace/delete.
checker.check("mi")

[('my', 0.02186248331108144),
 ('i', 0.019526034712950602),
 ('me', 0.009123275478415665)]

In [7]:
# "he" is itself a corpus word; it shows up among its own candidates because
# replacing a letter with the same letter reproduces the input.
checker.check("he")

[('the', 0.02403204272363151),
 ('me', 0.009123275478415665),
 ('be', 0.00789942145082332),
 ('her', 0.0028371161548731644),
 ('he', 0.002447708055184691),
 ('she', 0.001835781041388518),
 ('we', 0.0008344459279038719),
 ('hue', 0.00027814864263462394),
 ('ne', 0.00022251891410769915),
 ('ye', 0.00011125945705384957),
 ('re', 5.562972852692479e-05)]

In [8]:
# Another misspelling: candidates are ranked purely by corpus frequency.
checker.check("boe")

[('be', 0.00789942145082332),
 ('woe', 0.0006675567423230974),
 ('boy', 0.00016688918558077436),
 ('bore', 0.00011125945705384957),
 ('bow', 0.00011125945705384957),
 ('foe', 5.562972852692479e-05)]