# Text Preprocessing
## Tokenization, lowercasing, and punctuation removal

In [6]:
import re
import nltk
from collections import Counter
from nltk.corpus import words
from nltk.metrics.distance import edit_distance

In [2]:
def preprocess_text(text):
    """Normalize raw text into a list of lowercase, letters-only tokens.

    The text is lowercased, every character that is not ``a``-``z`` or
    whitespace is deleted (so digits and punctuation vanish and "don't"
    becomes "dont"), and the remainder is split on runs of whitespace.

    Args:
        text: The input string to tokenize.

    Returns:
        A list of token strings; empty when no letters remain.
    """
    return re.sub(r'[^a-z\s]', '', text.lower()).split()

# Word Checking
## To check whether a word is misspelled, we’ll compare it against a dictionary of correctly spelled words. We can use NLTK's `words` corpus for this.

In [8]:
# Fetch the 'words' corpus only when it is not already installed locally, so
# re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [9]:
# Reference dictionary used for spell checking. Entries are lowercased because
# preprocess_text() lowercases every token before lookup; any capitalized
# corpus entry would otherwise be unreachable by the membership test.
correct_words = {entry.lower() for entry in words.words()}

In [10]:
def is_correct_word(word):
    """Report whether ``word`` is present in the ``correct_words`` set.

    The check is an exact (case-sensitive) membership test.

    Args:
        word: The token to look up.

    Returns:
        True if ``word`` is in ``correct_words``, otherwise False.
    """
    found = word in correct_words
    return found

# Generate Suggestions
## Using Edit Distance (Levenshtein Distance): calculate the number of edits needed to transform the misspelled word into each dictionary word, and suggest the words with the smallest distance.

In [11]:
from nltk.metrics.distance import edit_distance

In [12]:
def get_suggestions(word, n=3):
    """Return up to ``n`` dictionary words closest to ``word`` by edit distance.

    Every entry of ``correct_words`` is scored with ``edit_distance`` against
    ``word``, so one call scans the whole dictionary.

    Args:
        word: The (presumably misspelled) token to find replacements for.
        n: Maximum number of suggestions to return. ``None`` returns the
            full ranking; a negative value returns an empty list.

    Returns:
        A list of candidate words ordered by increasing edit distance, with
        equal distances ordered alphabetically.
    """
    if n is not None and n < 0:
        return []
    # The candidate itself is the secondary sort key: without it, ties are
    # resolved by the iteration order of the set, which depends on string
    # hashing and can differ from one kernel session to the next.
    ranked = sorted(correct_words, key=lambda w: (edit_distance(word, w), w))
    return ranked[:n]

# Correction
## Replace the misspelled words with their best suggestions. We will take the top suggestion.

In [13]:
def autocorrect(text):
    """Replace each misspelled word in ``text`` with its top suggestion.

    The text is tokenized with ``preprocess_text`` (lowercased, non-letters
    removed), so the returned string is lowercase and has no punctuation.
    Tokens accepted by ``is_correct_word`` are kept; every other token is
    replaced by the first result of ``get_suggestions``, or kept unchanged
    when no suggestion is available.

    Args:
        text: The input string to correct.

    Returns:
        The corrected tokens joined by single spaces.
    """
    tokens = preprocess_text(text)
    # Cache corrections for this call: each lookup scans the whole dictionary,
    # so a misspelling that occurs several times should be resolved only once.
    replacements = {}
    corrected = []

    for token in tokens:
        if is_correct_word(token):
            corrected.append(token)
            continue
        if token not in replacements:
            # Only the best candidate is used, so request a single suggestion.
            candidates = get_suggestions(token, n=1)
            replacements[token] = candidates[0] if candidates else token
        corrected.append(replacements[token])

    return ' '.join(corrected)

In [16]:
# Demo: run the corrector on a sentence with three deliberate misspellings
# and show the input next to the corrected result.
input_text = "This is an exmple of misspelld sentnce."
corrected_text = autocorrect(input_text)
for label, value in (("Original Text:", input_text),
                     ("Corrected Text:", corrected_text)):
    print(label, value)

Original Text: This is an exmple of misspelld sentnce.
Corrected Text: this is an example of misspell sentence
