In [None]:
!pip install textdistance
!pip install metaphone

Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.6.3
Collecting metaphone
  Downloading Metaphone-0.6.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: metaphone
  Building wheel for metaphone (setup.py) ... [?25l[?25hdone
  Created wheel for metaphone: filename=Metaphone-0.6-py3-none-any.whl size=13901 sha256=dc4d0d7f95484e909415fd2d04d64df3df4722f0110d619898f683dfb77ca712
  Stored in directory: /root/.cache/pip/wheels/08/cb/f9/3ce2de290cd1b6f10dd8ed4795f3dec4a835b02d2514f9b9d3
Successfully built metaphone
Installing collected packages: metaphone
Successfully installed metaphone-0.6


In [None]:
import subprocess
import sys
from pathlib import Path
import re
from collections import Counter, defaultdict
import nltk
from nltk.corpus import stopwords
import textdistance
import pandas as pd
from metaphone import doublemetaphone
import difflib

def install_packages(packages=None):
    """Install any missing third-party dependencies with pip.

    Each name is first imported; only when that raises ImportError is
    ``pip install <name> -q`` run with the current interpreter. The same
    string is used as both the import name and the pip distribution name.

    Args:
        packages: Optional iterable of package names to check. When omitted,
            the notebook's default dependency list is used.

    Returns:
        list: Names of the packages that had to be installed (empty when
        everything was already importable).

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    import importlib

    if packages is None:
        packages = ['metaphone', 'textdistance', 'nltk', 'matplotlib', 'pandas']

    newly_installed = []
    for package in packages:
        try:
            importlib.import_module(package)
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])
            # Make the freshly installed package visible to the running
            # interpreter's import system.
            importlib.invalidate_caches()
            newly_installed.append(package)
    return newly_installed

# NOTE(review): the third-party imports at the top of this cell run before this
# call, so a missing package raises ImportError there and never reaches the
# installer. Consider running the installer before those imports.
install_packages()

def load_corpus(file_path="/content/corpus.txt"):
    """Load the corpus text, falling back to a built-in sample when needed.

    Lookup order: ``file_path`` itself, then the alternates ``corpus.txt``,
    ``./data/corpus.txt`` and ``text_data.txt`` (relative to the current
    working directory). The first existing path is read as UTF-8.

    Args:
        file_path: Path (str or Path) of the preferred corpus file.

    Returns:
        str: The file contents; the long built-in sample when no candidate
        file exists; or the short built-in sample when locating/reading the
        file raises. Both fallbacks emit a ``UserWarning`` so that a silently
        tiny vocabulary does not go unnoticed.
    """
    import warnings

    try:
        file_path = Path(file_path)
        if not file_path.exists():
            alt_paths = ["corpus.txt", "./data/corpus.txt", "text_data.txt"]
            for alt_path in alt_paths:
                if Path(alt_path).exists():
                    file_path = Path(alt_path)
                    break
            else:
                warnings.warn(
                    f"Corpus file '{file_path}' not found (alternates tried: "
                    f"{', '.join(alt_paths)}); using the built-in sample corpus.",
                    stacklevel=2,
                )
                return """
                Natural language processing is a field of artificial intelligence that focuses on
                interaction between computers and humans through natural language. Machine learning
                algorithms are used to analyze and understand human language. Text processing involves
                cleaning and preparing text data for analysis. Word frequency analysis helps identify
                common words in text. Autocorrect systems use various techniques including edit distance,
                phonetic matching, and statistical models to suggest corrections for misspelled words.
                Spell checking is important feature in many applications. Context aware suggestions can
                improve accuracy of autocorrect systems. Bigram models help predict next word based on
                previous word. Natural language understanding requires sophisticated algorithms and large
                amounts of training data. Machine learning models can be trained on large text corpora
                to learn patterns in language use. Programming languages like Python provide excellent
                tools for natural language processing tasks. Computer science students learn about
                artificial intelligence and machine learning concepts. Data science involves analyzing
                large datasets to extract meaningful insights. Software engineering requires good
                programming skills and system design knowledge.
                """

        return file_path.read_text(encoding="utf-8")

    except Exception as exc:
        # Deliberately best-effort: any failure (bad path type, permission
        # problem, decoding error, ...) still yields a usable corpus, but the
        # cause is reported instead of being swallowed.
        warnings.warn(
            f"Could not load corpus ({exc!r}); using the short built-in sample corpus.",
            stacklevel=2,
        )
        return """
        Natural language processing is a field of artificial intelligence that focuses on
        interaction between computers and humans through natural language. Machine learning
        algorithms are used to analyze and understand human language.
        """

# Bootstrap: read the corpus and split it into lower-cased \w+ tokens.
file_content = load_corpus()
words = re.findall(r'\w+', file_content.lower())

# Fetch the NLTK stop-word list only when it is not already available locally.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

stop_words = set(stopwords.words('english'))

# Keep purely alphabetic tokens longer than two characters that are not stop words.
filtered_words = [
    token for token in words
    if len(token) > 2 and token.isalpha() and token not in stop_words
]

# Frequency table, vocabulary (its key set) and relative frequencies.
word_freq = Counter(filtered_words)
vocabulary = set(word_freq)
total_count = len(filtered_words)
probs = {token: count / total_count for token, count in word_freq.items()}

def build_bigram_model(tokens):
    """Count, for every token, which tokens directly follow it.

    Args:
        tokens: Sliceable sequence of tokens.

    Returns:
        defaultdict(Counter): ``model[first][second]`` is the number of times
        ``second`` appears immediately after ``first`` in ``tokens``.
    """
    model = defaultdict(Counter)
    # Pair each token with its successor; zip stops at the shorter sequence.
    for current, following in zip(tokens, tokens[1:]):
        model[current][following] += 1
    return model

# Built from filtered_words (stop words, non-alphabetic tokens and tokens of
# length <= 2 already removed), so "next word" pairs are neighbours in the
# filtered stream, not necessarily in the raw text.
bigram_counts = build_bigram_model(filtered_words)

def phonetic_candidates(word, vocab, encoder=None):
    """Return the vocabulary words that share a phonetic code with ``word``.

    A vocabulary word matches when its primary code equals the primary code
    of ``word``, or its secondary code equals the secondary code of ``word``.
    Empty codes never match: an empty code carries no phonetic information,
    and the secondary code was already guarded this way.

    Args:
        word: The (possibly misspelled) word to encode.
        vocab: Iterable of candidate words.
        encoder: Optional callable mapping a word to a ``(primary, secondary)``
            pair. Defaults to ``doublemetaphone``.

    Returns:
        list: Matching words in the iteration order of ``vocab``; an empty
        list when ``word`` has no usable code or the encoder raises.
    """
    if encoder is None:
        encoder = doublemetaphone

    try:
        word_code = encoder(word)
        primary, secondary = word_code[0], word_code[1]
        if not primary and not secondary:
            return []

        candidates = []
        for w in vocab:
            code = encoder(w)
            if (primary and code[0] == primary) or (secondary and code[1] == secondary):
                candidates.append(w)
        return candidates
    except Exception:
        # Best-effort lookup: encoder failures yield no candidates, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return []

# Number of most-frequent vocabulary words scanned by the sentence-mode
# Levenshtein fallback (kept small for performance).
_SENTENCE_FALLBACK_POOL_SIZE = 1000


def _suggest_for_single_word(word):
    """Return the best replacement for one out-of-vocabulary word, or the word itself."""
    # difflib's best match wins whenever it exists, so try it first and skip
    # the full-vocabulary scoring below in that case.
    close_matches = difflib.get_close_matches(word, list(vocabulary), n=3, cutoff=0.6)
    if close_matches:
        return close_matches[0]

    similarity_scores = {
        w: textdistance.levenshtein.normalized_similarity(word, w)
        for w in word_freq.keys()
    }
    # Phonetic matches are lifted to a similarity of at least 0.9.
    for pm in phonetic_candidates(word, vocabulary):
        similarity_scores[pm] = max(similarity_scores.get(pm, 0), 0.9)

    if not similarity_scores:
        return word

    # Highest similarity first; corpus frequency breaks ties.
    best_word, _ = max(
        similarity_scores.items(),
        key=lambda x: (x[1], word_freq.get(x[0], 0)),
    )
    return best_word


def _suggest_for_sentence_word(word, candidates, fallback_pool):
    """Return a replacement for one unknown word of a sentence, or the word itself."""
    close_matches = difflib.get_close_matches(word, candidates, n=1, cutoff=0.6)
    if close_matches:
        return close_matches[0]

    if not fallback_pool:
        return word

    best_word, best_score = max(
        ((w, textdistance.levenshtein.normalized_similarity(word, w)) for w in fallback_pool),
        key=lambda pair: pair[1],
    )
    # Only accept the fallback when it is more similar than not.
    return best_word if best_score > 0.5 else word


def check_spelling(input_text):
    """Check a word or sentence against the vocabulary and suggest corrections.

    The input is lower-cased and split into ``\\w+`` tokens.

    Args:
        input_text: A single word or a sentence.

    Returns:
        str: ``"Correct"`` when every token is in the vocabulary (including
        when the input contains no word characters at all). For a single
        unknown word, the best suggestion (or the word itself when there is
        nothing to suggest). For a sentence, the tokens joined by spaces with
        unknown words replaced where a good enough match exists.
    """
    input_words = re.findall(r'\w+', input_text.lower())

    if not input_words:
        # Nothing to check.
        return "Correct"

    if len(input_words) == 1:
        word = input_words[0]
        if word in vocabulary:
            return "Correct"
        return _suggest_for_single_word(word)

    corrected_words = []
    all_correct = True
    candidates = None
    fallback_pool = None

    for word in input_words:
        if word in vocabulary:
            corrected_words.append(word)
            continue

        all_correct = False
        if candidates is None:
            # Built once, on the first unknown word, instead of once per word.
            candidates = list(vocabulary)
            # Deterministic pool: the most frequent words rather than an
            # arbitrary set-order slice of the vocabulary.
            fallback_pool = [
                w for w, _ in word_freq.most_common(_SENTENCE_FALLBACK_POOL_SIZE)
            ]
        corrected_words.append(_suggest_for_sentence_word(word, candidates, fallback_pool))

    if all_correct:
        return "Correct"
    return ' '.join(corrected_words)

def predict_next_word(input_text):
    """Suggest up to three words likely to follow the last word of the input.

    Returns the string "No input provided" when the input has no word
    characters. Otherwise returns a list: the most common bigram successors
    of the last word, or the three most frequent corpus words when the last
    word has no recorded successors.
    """
    tokens = re.findall(r'\w+', input_text.lower())
    if not tokens:
        return "No input provided"

    # .get() does not trigger the defaultdict factory, so unknown words are
    # not inserted into the model.
    followers = bigram_counts.get(tokens[-1])
    if followers:
        ranked = [candidate for candidate, _count in followers.most_common(3)]
        if ranked:
            return ranked

    # Unknown last word, or no successors recorded: fall back to global frequency.
    return [common for common, _count in word_freq.most_common(3)]

def complete_word(input_text):
    """Suggest completions for the last whitespace-separated word of the input.

    Returns:
        * "No input provided" when the input is empty or whitespace only;
        * the lower-cased word itself (a str) when it is already among the candidates;
        * otherwise a list of up to three candidates, most frequent first;
        * "No completions found" when there are no candidates.
    """
    pieces = input_text.strip().split()
    if not pieces:
        return "No input provided"

    prefix = pieces[-1].lower()

    # First choice: vocabulary words that extend the typed prefix.
    matches = [candidate for candidate in vocabulary if candidate.startswith(prefix)]

    if not matches:
        # Second choice: words sharing the first two characters, ranked by
        # Jaccard score against the typed text; keep the five best.
        head = prefix[:2]
        scored = {
            candidate: textdistance.jaccard(prefix, candidate)
            for candidate in vocabulary
            if candidate.startswith(head)
        }
        if scored:
            ranked = sorted(scored.items(), key=lambda item: item[1], reverse=True)
            matches = [candidate for candidate, _score in ranked[:5]]

    # Most frequent candidates first.
    matches.sort(key=lambda candidate: word_freq.get(candidate, 0), reverse=True)

    if not matches:
        return "No completions found"
    if prefix in matches:
        # The typed word is already a complete vocabulary word.
        return prefix
    return matches[:3]

def run_autocorrect_system():
    """Run the interactive console loop.

    Each round reads a word or sentence, shows a three-option menu (spell
    check, next-word prediction, word completion), prints the result and asks
    whether to continue. Typing 'quit', 'exit' or 'q' as the text, or
    answering 'n'/'no' to the continue prompt, ends the loop.
    """
    heavy_rule = "=" * 50
    light_rule = "-" * 50
    farewell = "\nThank you for using Autocorrect System!"

    print(heavy_rule)
    print("        AUTOCORRECT SYSTEM")
    print(heavy_rule)
    print("System initialized successfully!")
    print(f"Vocabulary size: {len(vocabulary)} words")
    print(light_rule)

    while True:
        print("\n" + heavy_rule)

        # Text to analyse
        user_input = input("Enter a word or sentence (or 'quit' to exit): ").strip()

        if user_input.lower() in ('quit', 'exit', 'q'):
            print(farewell)
            break

        if user_input == "":
            print("Please enter some text.")
            continue

        print(light_rule)
        for menu_line in (
            "Choose an option:",
            "1. Check correct or not",
            "2. Next word",
            "3. Complete word",
        ):
            print(menu_line)
        print(light_rule)

        choice = input("Enter your choice (1/2/3): ").strip()

        print(light_rule)

        if choice == '1':
            verdict = check_spelling(user_input)
            print(f"Input: '{user_input}'")
            print("✓ Correct" if verdict == "Correct" else f"✗ Suggestion: '{verdict}'")

        elif choice == '2':
            outcome = predict_next_word(user_input)
            print(f"Input: '{user_input}'")
            if not isinstance(outcome, list):
                print(f"Next word prediction: {outcome}")
            else:
                print(f"Next word predictions: {', '.join(outcome)}")

        elif choice == '3':
            outcome = complete_word(user_input)
            print(f"Input: '{user_input}'")
            if not isinstance(outcome, list):
                print(f"Word completion: {outcome}")
            else:
                print(f"Word completions: {', '.join(outcome)}")

        else:
            print("Invalid choice. Please enter 1, 2, or 3.")

        print(light_rule)
        answer = input("Continue? (y/n): ").strip().lower()
        if answer in ('n', 'no'):
            print(farewell)
            break



In [None]:
# Main execution: start the interactive prompt loop only when this code runs
# as the main module (not when it is imported).
if __name__ == "__main__":
    run_autocorrect_system()

        AUTOCORRECT SYSTEM
System initialized successfully!
Vocabulary size: 132694 words
--------------------------------------------------

Enter a word or sentence (or 'quit' to exit): Fuck
--------------------------------------------------
Choose an option:
1. Check correct or not
2. Next word
3. Complete word
--------------------------------------------------
Enter your choice (1/2/3): 2
--------------------------------------------------
Input: 'Fuck'
Next word predictions: program, tha, goddamn
--------------------------------------------------
Continue? (y/n): y

Enter a word or sentence (or 'quit' to exit): facuk
--------------------------------------------------
Choose an option:
1. Check correct or not
2. Next word
3. Complete word
--------------------------------------------------
Enter your choice (1/2/3): 1
--------------------------------------------------
Input: 'facuk'
✗ Suggestion: 'fack'
--------------------------------------------------
Continue? (y/n): y

Enter a wo