# Loading Necessary Libraries

In [2]:
!pip install textdistance

Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.6.3


In [4]:
import numpy as np
import pandas as pd
import textdistance
import re



# File opening and cleaning

In [11]:
word = []

with open('/content/autocorrect book.txt', 'r', encoding = 'utf-8') as f:
  data = f.read()
  data = data.lower()
  words = re.findall('\w+',data)
  words += words


In [12]:
print(words[0:10])

['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale']


# Make Vocabulary

In [13]:
len(words)

445326

In [40]:
#Convert set to list
Vocabulary = list(set(words))
Vocabulary

['definition',
 'reigns',
 'boldly',
 'inventive',
 'two_',
 'pricked',
 'sinker',
 'cambrics',
 'flesh',
 'smiting',
 'props',
 'rescuing',
 'mugs',
 'somebody',
 'sea',
 'jimmini',
 'shingled',
 'anew',
 'scorches',
 'marvelled',
 'machines',
 'creeds',
 'captors',
 'opined',
 'secretary',
 'carried',
 'disorderly',
 'treasure',
 'stroke',
 'spluttering',
 'description',
 'spectrally',
 'downtown',
 'ruminated',
 'torture',
 'assailable',
 'muezzin',
 'eleven',
 'pea',
 'sweatings',
 'peru',
 'die',
 'weazel',
 'unconquering',
 'damned',
 'prevalent',
 'smoking',
 'sworn',
 'spanned',
 'dressing',
 'brutes',
 'anomaly',
 'con',
 'bacon',
 'bowie',
 'rooms',
 'pirouetting',
 'decipher',
 'wit',
 'inactive',
 'raimond',
 'antlered',
 'rubbing',
 'gouge',
 'courageous',
 'angels',
 'echo',
 'miscreants',
 'anxious',
 'architect',
 'fitting',
 'abeam',
 'isle',
 'headmost',
 'obeys',
 'thunderings',
 'antony',
 'heat',
 'distrusted',
 'demonism',
 'trodden',
 'distributed',
 'steak',
 'f

In [41]:
len(Vocabulary )

17647

# Build the Frequency of those words

Counter from the collections module is used to count the frequency of elements in an iterable.
It returns a dictionary-like object with elements as keys and their counts as values.

In [16]:
from collections import Counter

In [42]:
words_freq_dict = {}
words_freq_dict = Counter(words)

In [43]:
words_freq_dict.most_common()[0:10]

[('the', 29406),
 ('of', 13484),
 ('and', 13034),
 ('a', 9598),
 ('to', 9414),
 ('in', 8476),
 ('that', 6162),
 ('it', 5068),
 ('his', 5060),
 ('i', 4240)]

# Relative Frequency of Words

Now we want to get the probability of occurace of each word, this equals the relative frequencies of the words.

The formula use to calculate the probability of a word in the provided code is:

Probability(word) = Frequency(word)/Total Count of all words

In [50]:
Total_word_frequency = sum(words_freq_dict.values())
probs = {k: v / Total_word_frequency for k, v in words_freq_dict.items()}

# Finiding similar words

now we will sort similar words according to the jaccard distance by calculating the 2 grams Q of the words. Next, we will return the 5 most similar words oredered by similarity and probability.

The Jaccard disatance measures the dissimilarity between two sets by comparing their intersection and union

In [51]:
def autocorrect(word):
    word = word.lower()  # Convert input word to lowercase

    # Check if the word is present in the vocabulary
    if word in Vocabulary:
        return f"Your word seems to be correct: {word}"

     # Calculate Jaccard similarity for all words in the vocabulary
    similarities = [1 - textdistance.Jaccard(qval=2).distance(v, word) for v in words_freq_dict.keys()]

    # Create a DataFrame to combine probabilities and similarities
    df = pd.DataFrame({
        'Word': list(words_freq_dict.keys()),
        'Prob': list(probs.values()),
        'Similarity': similarities
    })

    # Sort by similarity and probability
    output = df.sort_values(['Similarity', 'Prob'], ascending=False).head(3)

    return output


In [52]:
autocorrect('hello')


Unnamed: 0,Word,Prob,Similarity
1653,hell,7.6e-05,0.75
9257,shell,1.8e-05,0.6
1439,fellow,0.000243,0.5


In [58]:
def get_suggestions(input_word, top_n=5):
    input_word = input_word.lower()

    # Calculate similarities
    similarities = [1 - Jaccard(qval=2).distance(word, input_word) for word in Vocabulary]

    # Combine words, their similarities, and probabilities
    suggestions_df = pd.DataFrame({
        'Word': Vocabulary,
        'Similarity': similarities,
        'Frequency': [words_freq_dict.get(word, 0) for word in Vocabulary]
    })

    # Sort by similarity and frequency
    suggestions_df['Weighted_Score'] = suggestions_df['Similarity'] * 0.7 + (suggestions_df['Frequency'] / Total_word_frequency) * 0.3
    suggestions_df = suggestions_df.sort_values(['Weighted_Score', 'Similarity'], ascending=False).head(top_n)

    return suggestions_df[['Word', 'Similarity', 'Frequency', 'Weighted_Score']]


In [59]:
#Example usage
input_text = "pink"
suggestions = get_suggestions(input_text)
print(suggestions)

       Word  Similarity  Frequency  Weighted_Score
16643   pin    0.666667         12        0.466675
9509    ink    0.666667          2        0.466668
12532  sink    0.500000         44        0.350030
14656  pine    0.500000         22        0.350015
221    spin    0.500000          6        0.350004
