In [1]:
import numpy as np
import pandas as pd
import textdistance 
from collections import Counter
import re

# File Opening And Cleaning (change formate to utf-8

In [2]:
words = []
with open('autocorrect book.txt','r',encoding='utf-8') as f:
    data = f.read()
    data = data.lower()
    word = re.findall('\w+', data)
    words +=word

In [3]:
print(words[0:10])

['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale']


# make vocabulary

# build the frequency of those words

In [4]:
len(words)

222906

In [5]:
V = set(words)

In [6]:
len(V)

17874

In [7]:
word_freq_dict = Counter(words)

In [8]:
word_freq_dict.most_common(10)

[('the', 14703),
 ('of', 6742),
 ('and', 6518),
 ('a', 4799),
 ('to', 4707),
 ('in', 4238),
 ('that', 3081),
 ('it', 2534),
 ('his', 2530),
 ('i', 2120)]

# Relative Frequency of words
Now we want to get the probability of occurrence of each word, this equals the relative frequencies of the words:

The formula used to calculate the probability of a word in the provided code is:

Probability(word) = Frequency(word) / Total count of all words

In [9]:
Total_words_freq = sum(word_freq_dict.values())


probs = {}
for k in word_freq_dict.keys():
    probs[k] = word_freq_dict[k] / Total_words_freq

In [10]:
probs

{'the': 0.06596053942020404,
 'project': 0.00040824383372363237,
 'gutenberg': 0.0004217024216485873,
 'ebook': 4.486195974984971e-05,
 'of': 0.030245933263348675,
 'moby': 0.00040375763774864743,
 'dick': 0.00040375763774864743,
 'or': 0.003575498192063022,
 'whale': 0.005518021049231514,
 'by': 0.0054821314814316345,
 'herman': 1.7944783899939884e-05,
 'melville': 1.7944783899939884e-05,
 'this': 0.006455636008003374,
 'is': 0.007855329152198685,
 'for': 0.007375306182875293,
 'use': 0.0002198236027742636,
 'anyone': 2.691717584990983e-05,
 'anywhere': 7.177913559975953e-05,
 'at': 0.005989071626604937,
 'no': 0.0026648004091410727,
 'cost': 1.7944783899939884e-05,
 'and': 0.02924102536495204,
 'with': 0.007936080679748414,
 'almost': 0.0008837806070720394,
 'restrictions': 8.972391949969942e-06,
 'whatsoever': 3.14033718248948e-05,
 'you': 0.004297775744035602,
 'may': 0.0011439799736211677,
 'copy': 8.523772352471445e-05,
 'it': 0.011368020600611917,
 'give': 0.00040375763774864743

# Finding Similar Words¶
Now we will sort similar words according to the Jaccard distance by calculating the 2 grams Q of the words. Next, we will return the 5 most similar words ordered by similarity and probability:

The Jaccard distance measures the dissimilarity between two sets by comparing their intersection and union

In [20]:
def autocorrect(word): # Hel is
    word = word.lower() 
    if word in probs:
        print('the word is already there', word)
    else:
        similarities = [1-(textdistance.Jaccard(qval=len(word))).distance(w,word) for w in word_freq_dict.keys()]
        df = pd.DataFrame.from_dict(probs,orient='index').reset_index()
        df = df.rename(columns={'index':'Word',0:'Prob'})
        df['Similarity'] = similarities
        output = df.sort_values(['Similarity','Prob'],ascending=False).head(15)
        return(output)
autocorrect('parac')

Unnamed: 0,Word,Prob,Similarity
15073,paracelsus,4e-06,0.166667
15227,paracelsan,4e-06,0.166667
17717,paracetamol,4e-06,0.142857
0,the,0.065961,0.0
4,of,0.030246,0.0
21,and,0.029241,0.0
70,a,0.021529,0.0
137,to,0.021117,0.0
214,in,0.019012,0.0
504,that,0.013822,0.0
