<a href="https://colab.research.google.com/github/Roshini1369/-Autocomplete-and-Autocorrect/blob/main/oasis_task_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load dataset (e.g., a corpus of text messages, books, tweets, etc.)
# Sample dataset: Gutenberg Corpus from NLTK
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

# Read Jane Austen's "Emma" from the Gutenberg corpus.
raw_text = gutenberg.raw('austen-emma.txt')

# Preview: the first 500 characters are enough to confirm the load worked.
print(raw_text[0:500])


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died t


In [9]:
# Basic whitespace tokenizer: split() with no arguments splits on any run of
# whitespace (spaces, tabs, newlines), not only on single spaces.
# NOTE(review): `clean_text` is not defined in any cell visible here (the
# execution counter jumps from In [1] to In [9]) — confirm the cleaning cell
# exists so this cell survives Restart & Run All.
tokens = clean_text.split()
print(tokens[:20])


['emma', 'by', 'jane', 'austen', 'volume', 'i', 'chapter', 'i', 'emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home', 'and', 'happy']


In [11]:
from collections import defaultdict

def build_ngram_model(tokens, n=3):
    """Build an n-gram table mapping each context to the words that followed it.

    Args:
        tokens: Sequence of tokens; it must support ``len`` and slicing.
        n: Order of the model. Every key is a tuple of ``n - 1`` consecutive
            tokens; with ``n=1`` the only key is the empty tuple.

    Returns:
        A ``defaultdict(list)`` whose values list, in corpus order, every token
        seen right after the key. Repeats are kept, so counting a value list
        gives the observed frequencies.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # With n <= 0 the slice and index below go negative and wrap around,
        # silently pairing unrelated tokens instead of failing.
        raise ValueError(f"n must be at least 1, got {n}")

    context_size = n - 1
    model = defaultdict(list)
    # The range is empty when there are fewer than n tokens, giving an empty model.
    for start in range(len(tokens) - context_size):
        end = start + context_size
        model[tuple(tokens[start:end])].append(tokens[end])
    return model

# Trigram model: every key is a pair of consecutive tokens.
trigram_model = build_ngram_model(tokens, 3)


In [12]:
def autocomplete(prompt, model, n=3):
    """Return every word the model has seen after the end of ``prompt``.

    Args:
        prompt: Text typed so far; it is lower-cased and split on whitespace.
        model: Mapping from ``n - 1``-token tuples to lists of next words.
        n: Order of the n-gram model that ``model`` was built with.

    Returns:
        A new list of candidate next words (repeats included). The list is
        empty when the prompt has fewer than ``n - 1`` words or when the
        context is not a key of ``model``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    context_size = n - 1
    prompt_tokens = prompt.lower().split()
    if len(prompt_tokens) < context_size:
        return []

    # Slice from an explicit start index: for n == 1 the shorthand
    # ``prompt_tokens[-0:]`` is the whole list, not the empty context.
    key = tuple(prompt_tokens[len(prompt_tokens) - context_size:])

    # Return a copy so callers cannot mutate the list stored inside the model.
    return list(model.get(key, []))


In [13]:
# Demo: show the first few continuations recorded for "she was".
predictions = autocomplete(prompt="she was", model=trigram_model)
print("Predicted next words:", predictions[:5])


Predicted next words: ['the', 'now', 'more', 'a', 'a']


In [14]:
from collections import Counter

def autocomplete_best(prompt, model, n=3):
    """Return the single most frequent next word for ``prompt``.

    Args:
        prompt: Text typed so far; it is lower-cased and split on whitespace.
        model: Mapping from ``n - 1``-token tuples to lists of next words.
        n: Order of the n-gram model that ``model`` was built with.

    Returns:
        The most common word in the model's list for the prompt's last
        ``n - 1`` words, or ``None`` when the prompt is too short or the
        context has no recorded continuation.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    context_size = n - 1
    prompt_tokens = prompt.lower().split()
    if len(prompt_tokens) < context_size:
        return None

    # Slice from an explicit start index: for n == 1 the shorthand
    # ``prompt_tokens[-0:]`` is the whole list, not the empty context.
    key = tuple(prompt_tokens[len(prompt_tokens) - context_size:])
    possible_next = model.get(key, [])
    if not possible_next:
        return None

    # most_common(1) yields a one-element list of (word, count) pairs.
    best_word, _count = Counter(possible_next).most_common(1)[0]
    return best_word

# Demo: the single most frequent continuation of "she was".
best_word = autocomplete_best(prompt="she was", model=trigram_model)
print("Best predicted next word:", best_word)


Best predicted next word: not


In [15]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# the version is pinned to the release this notebook was last executed with.
%pip install textdistance==4.6.3


Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.6.3


In [16]:
import textdistance

# The vocabulary is the set of distinct tokens seen in the corpus.
vocabulary = {token for token in tokens}

def autocorrect(word, vocab, max_distance=2, max_suggestions=5):
    """Suggest vocabulary words within a small edit distance of ``word``.

    Args:
        word: The (possibly misspelled) word to correct.
        vocab: Iterable of known words.
        max_distance: Largest Levenshtein distance a suggestion may have.
        max_suggestions: Maximum number of suggestions to return.

    Returns:
        Up to ``max_suggestions`` words ordered by increasing distance. Words
        at the same distance are ordered alphabetically, so the result does
        not depend on the iteration order of ``vocab``.
    """
    scored = []
    for candidate in vocab:
        # The edit distance can never be smaller than the difference in
        # length, so skip hopeless candidates without computing it.
        if abs(len(candidate) - len(word)) > max_distance:
            continue
        distance = textdistance.levenshtein(word, candidate)
        if distance <= max_distance:
            scored.append((distance, candidate))

    # Tuples sort by distance first, then by the word itself as a tie-breaker.
    scored.sort()
    return [candidate for _distance, candidate in scored[:max_suggestions]]

# Demo: ask for corrections of a common misspelling.
misspelled = "definately"
corrections = autocorrect(word=misspelled, vocab=vocabulary)
print("Suggestions:", corrections)


Suggestions: ['delicately']


In [17]:
def smart_typing_assist(prompt, model, vocab, n=3):
    """Autocorrect the last word of ``prompt``, then predict the next word.

    Args:
        prompt: Text typed so far; it is lower-cased and split on whitespace.
        model: N-gram model passed through to ``autocomplete_best``.
        vocab: Known words passed through to ``autocorrect``.
        n: Order of the n-gram model, passed through to ``autocomplete_best``.

    Returns:
        A ``(corrections, next_word)`` tuple. ``corrections`` holds up to
        three suggestions for the last word and is empty when the prompt
        contains no words or ``autocorrect`` finds nothing. ``next_word`` is
        whatever ``autocomplete_best`` returns for the corrected prompt.
    """
    words = prompt.lower().split()

    # Start from an empty list so an empty or whitespace-only prompt returns
    # cleanly instead of reaching the return with ``corrections`` unbound.
    corrections = []
    if words:
        corrections = autocorrect(words[-1], vocab)
        if corrections:
            words[-1] = corrections[0]  # best suggestion replaces the typed word

    # Predict from the prompt with its last word corrected.
    next_word = autocomplete_best(' '.join(words), model, n)
    return corrections[:3], next_word


In [18]:
# End-to-end demo: the final word of the prompt is misspelled on purpose.
prompt = "she was definately"
corrections, prediction = smart_typing_assist(
    prompt=prompt, model=trigram_model, vocab=vocabulary
)
print("Autocorrect suggestions:", corrections)
print("Predicted next word:", prediction)


Autocorrect suggestions: ['delicately']
Predicted next word: None
