In [2]:
import pandas as pd
import numpy as np
import textdistance
import re
from collections import Counter

# File Opening And Cleaning (change formate to utf-8)

In [3]:


word = []

with open('autocorrect book.txt', 'r', encoding='utf-8') as f:
    data = f.read()
    data = data.lower()
    words = re.findall('\w+', data)
    words += words

print(words[0:10])


['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale']


# make vocabulary

In [4]:

len(words)

445710

In [5]:
V = set(words)
V

{'empty',
 'favourable',
 'glimmering',
 'hooped',
 'oozy',
 'apron',
 'thistle',
 'ungraspable',
 'contortions',
 'greyhounds',
 'bumped',
 'pinny',
 'whipped',
 'solve',
 'hav',
 'bruised',
 'gore',
 'sustain',
 'distrusted',
 'fawned',
 'noiseless',
 'owners',
 'turtles',
 'harbors',
 'commonest',
 'counter',
 'scrimps',
 'resign',
 'exhausting',
 'practical',
 'aroma',
 'amounts',
 'imperial',
 'tying',
 'matted',
 'tanned',
 'owned',
 'error',
 'banks',
 'relaxed',
 'foggy',
 'garters',
 'wrapper',
 'mightest',
 'hurl',
 'angularly',
 'speedy',
 'chewed',
 'secondly',
 'yawning',
 'lancet',
 'date',
 'pouring',
 'mimicking',
 'circumpolar',
 'indebted',
 'broiling',
 'success',
 'ween',
 'glen',
 'shading',
 'unheeded',
 'defined',
 'entreated',
 'entered',
 '47',
 'anew',
 '_that',
 'geometry',
 'et',
 'eloquent',
 'fang',
 'hemlock',
 'gnaw',
 'rakes',
 'bargain',
 'legatees',
 'gripping',
 'tempestuous',
 'sociably',
 'convenient',
 'rich',
 'wery',
 'idleness',
 'monks',
 'ald

In [6]:
len(V)

17789

# build the frequency of those words

In [7]:
words_freq_dict = {}
words_freq_dict = Counter(words)

In [8]:
words_freq_dict.most_common()[0:10]

[('the', 29410),
 ('of', 13486),
 ('and', 13038),
 ('a', 9610),
 ('to', 9416),
 ('in', 8476),
 ('that', 6166),
 ('it', 5068),
 ('his', 5060),
 ('i', 4240)]

# Relative Frequency of words
Now we want to get the probability of occurrence of each word, this equals the relative frequencies of the words:

The formula used to calculate the probability of a word in the provided code is:

Probability(word) = Frequency(word) / Total count of all words

In [9]:
Total = sum(words_freq_dict.values())
probs = {}

for k in words_freq_dict.keys():
    probs[k] = words_freq_dict[k] / Total

In [10]:
probs

{'the': 0.06598460882636692,
 'project': 0.0004083372596531377,
 'gutenberg': 0.0004217989275537906,
 'ebook': 4.4872226335509633e-05,
 'of': 0.03025734221803415,
 'moby': 0.0004038500370195867,
 'dick': 0.0004038500370195867,
 'or': 0.003576316438940118,
 'whale': 0.0055192838392676856,
 'by': 0.005483386058199278,
 'herman': 1.7948890534203855e-05,
 'melville': 1.7948890534203855e-05,
 'this': 0.006457113369679837,
 'is': 0.007857126831347738,
 'for': 0.007376994009557784,
 'use': 0.0002198739090439972,
 'anyone': 2.692333580130578e-05,
 'anywhere': 7.179556213681542e-05,
 'at': 0.005994929438424087,
 'no': 0.0026654102443292725,
 'cost': 1.7948890534203855e-05,
 'and': 0.029252204348118732,
 'with': 0.007946871284018756,
 'almost': 0.0008839828588095398,
 'restrictions': 8.974445267101928e-06,
 'whatsoever': 3.1410558434856746e-05,
 'you': 0.004298759282941823,
 'may': 0.0011442417715554958,
 'copy': 8.525723003746831e-05,
 'it': 0.011370622153418141,
 'give': 0.0004038500370195867,

# Finding Similar Words
Now we will sort similar words according to the Jaccard distance by calculating the 2 grams Q of the words. Next, we will return the 5 most similar words ordered by similarity and probability:


The Jaccard distance measures the dissimilarity between two sets by comparing their intersection and union

In [11]:
def autocorrect(word):
    word = word.lower()
    if word in V:
        return('Your word seems to be correct', word)
    else:
        similarities = [1-(textdistance.Jaccard(qval=2).distance(v,word))  for v in words_freq_dict.keys()]
        df = pd.DataFrame.from_dict(probs, orient='index').reset_index()
        df = df.rename(columns={'index':'Word',0:'Prob'})
        df['Similarity'] = similarities
        output = df.sort_values(['Similarity','Prob'],ascending=False).head(3)
        return(output)
autocorrect("parace")

Unnamed: 0,Word,Prob,Similarity
4792,race,7.6e-05,0.6
7996,pace,3.1e-05,0.6
15073,paracelsus,4e-06,0.555556
