# Autocorrect

In [40]:
import collections
import itertools
import re

import numpy
import pandas

## Data Loading & Processing

In [41]:
def process_data(file_path, encoding=None):
    """Lazily yield every word found in a text file, lower-cased.

    A "word" is a maximal run of ``\\w`` characters, so punctuation and
    whitespace act as separators (an apostrophe, for instance, splits a
    contraction into two tokens).

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the platform's default encoding; pass e.g. ``"utf-8"``
            to make the result independent of the machine's locale.

    Yields:
        Each matched word, converted to lower case, in file order.

    Note:
        This is a generator: the file is opened on first iteration and is
        closed only once the generator is exhausted or closed.
    """
    # Compile once up front; the pattern does not depend on the file.
    pattern = re.compile(r"\w+")
    with open(file_path, "r", encoding=encoding) as file:
        for text_line in file:
            for match in pattern.finditer(text_line):
                yield match[0].lower()

In [42]:
# Drain the token generator once so later cells can reuse the full word list.
words = [*process_data("shakespeare.txt")]
# Preview the first few tokens only.
words[:16]

['o',
 'for',
 'a',
 'muse',
 'of',
 'fire',
 'that',
 'would',
 'ascend',
 'the',
 'brightest',
 'heaven',
 'of',
 'invention',
 'a',
 'kingdom']

## Getting Word Probabilities

In [46]:
def get_probabilities(words):
    """Return the relative frequency of every distinct word.

    Args:
        words: Iterable of hashable tokens (it is consumed once).

    Returns:
        A ``collections.Counter`` mapping each distinct word to
        ``occurrences / total number of words``. Because the result is a
        ``Counter``, looking up a word that never occurred returns ``0``
        instead of raising ``KeyError``. An empty input gives an empty
        ``Counter``.
    """
    word_counter = collections.Counter(words)
    # Sum the counts rather than calling len(words): `words` may be a
    # one-shot iterator that the Counter has already consumed.
    total_count = sum(word_counter.values())
    return collections.Counter(
        {word: word_count / total_count for word, word_count in word_counter.items()}
    )


# Backward-compatible alias: the function was first defined under this
# misspelled name and existing callers still use it.
get_probabiliites = get_probabilities

In [51]:
# Relative frequency of every distinct word in the corpus.
probabilities = get_probabiliites(words)
# Preview the first few (word, probability) pairs only.
[*itertools.islice(probabilities.items(), 16)]

[('o', 0.0029283396127877045),
 ('for', 0.008840974372365426),
 ('a', 0.01411944641325027),
 ('muse', 0.000335733204013877),
 ('of', 0.020405118066176745),
 ('fire', 0.0004103405826836274),
 ('that', 0.014641698063938523),
 ('would', 0.0025739545641063903),
 ('ascend', 1.865184466743761e-05),
 ('the', 0.028444063117842356),
 ('brightest', 3.730368933487522e-05),
 ('heaven', 0.0008952885440370053),
 ('invention', 0.0001678666020069385),
 ('kingdom', 0.00011191106800462566),
 ('stage', 5.595553400231283e-05),
 ('princes', 7.460737866975044e-05)]

## Getting Possible Candidates

In [70]:
def get_candidates(word, f):
    """Apply an edit function at every split point of ``word``.

    ``word`` is cut at each index from 0 to ``len(word)`` inclusive, and
    ``f(prefix, suffix)`` is called for each cut. ``f`` must return a set;
    the union of all those sets is returned.
    """
    split_points = range(len(word) + 1)
    per_split = (f(word[:cut], word[cut:]) for cut in split_points)

    merged = set()
    for edits in per_split:
        merged |= edits
    return merged


def by_addition(a, b, /):
    """Return the 26 strings formed by inserting one lowercase letter between ``a`` and ``b``."""
    inserted = set()
    for letter in "abcdefghijklmnopqrstuvwxyz":
        inserted.add(f"{a}{letter}{b}")
    return inserted


def by_deletion(a, b, /):
    """Return a one-element set holding ``a`` plus ``b`` without its first character.

    When ``b`` is empty there is nothing to delete and the empty set is returned.
    """
    if len(b) < 1:
        return set()
    return {f"{a}{b[1:]}"}


def by_substitution(a, b, /):
    """Return the strings formed by replacing the first character of ``b`` with each lowercase letter.

    Every letter of the alphabet is tried, including the one already there,
    so the unchanged ``a + b`` is part of the result whenever ``b`` starts
    with a lowercase ASCII letter. When ``b`` is empty there is nothing to
    replace and the empty set is returned.
    """
    if len(b) < 1:
        return set()

    tail = b[1:]
    replaced = set()
    for letter in "abcdefghijklmnopqrstuvwxyz":
        replaced.add(f"{a}{letter}{tail}")
    return replaced


def get_primary_candidates(word):
    """Return every string one edit away from ``word``.

    The edits are a single-letter insertion, deletion or substitution,
    applied at every position via ``get_candidates``.
    """
    one_edit_away = set()
    for edit in (by_addition, by_deletion, by_substitution):
        one_edit_away |= get_candidates(word, edit)
    return one_edit_away


def get_secondary_candidates(word):
    """Return every string reachable by applying ``get_primary_candidates`` twice to ``word``."""
    return {
        second_edit
        for first_edit in get_primary_candidates(word)
        for second_edit in get_primary_candidates(first_edit)
    }


def get_corrections(word, word_probabilities, n=5):
    """Suggest the most probable corrections for ``word``.

    Candidates are taken from the first non-empty tier below:

    1. ``word`` itself, if it is in the vocabulary;
    2. vocabulary words among ``get_primary_candidates(word)``;
    3. vocabulary words among ``get_secondary_candidates(word)``;
    4. ``word`` itself, even though it is unknown.

    Later tiers are only computed when the earlier ones are empty.

    Args:
        word: The word to correct.
        word_probabilities: Mapping from vocabulary word to probability.
        n: Maximum number of suggestions, passed to ``Counter.most_common``.

    Returns:
        A list of ``(candidate, probability)`` pairs, most probable first.
        In the last-resort tier the unknown word is reported with
        probability ``0``.
    """

    def known(candidates):
        # Test membership on the mapping directly, so the whole vocabulary
        # is not copied into a new set on every call.
        return {candidate for candidate in candidates if candidate in word_probabilities}

    candidates = (
        known({word})
        or known(get_primary_candidates(word))
        or known(get_secondary_candidates(word))
        or {word}
    )
    # .get(..., 0) keeps the last-resort tier from raising KeyError when the
    # mapping is a plain dict; a Counter already returns 0 for missing keys.
    return collections.Counter(
        {candidate: word_probabilities.get(candidate, 0) for candidate in candidates}
    ).most_common(n)

In [71]:
# Demo: "dys" is not in the corpus, so the suggestions come from nearby known words.
get_corrections("dys", probabilities)

[('days', 0.0004103405826836274), ('dye', 1.865184466743761e-05)]