In [None]:
!pip install textdistance
!pip install metaphone
!pip install nltk



In [None]:
from pathlib import Path
import re
from collections import Counter, defaultdict
import difflib
import nltk
from nltk.corpus import stopwords
import textdistance
from metaphone import doublemetaphone

In [None]:
# -----------------------------
# Corpus loading and preprocessing
# -----------------------------
def load_corpus(file_path="/content/corpus.txt"):
    """
    Read the whole corpus file and return its contents as one string.

    Parameters
    ----------
    file_path : str or path-like
        Location of the corpus text file.

    Returns
    -------
    str
        The file contents decoded as UTF-8.
    """
    with Path(file_path).open("r", encoding="utf-8") as corpus_file:
        return corpus_file.read()


# Read the corpus, lower-case it, then split it into runs of word characters (\w+)
raw_text = load_corpus()
tokens = re.compile(r"\w+").findall(raw_text.lower())


In the cell above I define a function that loads the corpus used in this project. It uses `Path` (imported from `pathlib`) to read the file as text with UTF-8 encoding. The raw text is then converted to lower case and tokenized into words with `re.findall` using the `\w+` pattern.

In [None]:
# Fetch the NLTK 'stopwords' package so that stopwords.words(...) can be used afterwards
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# English stopwords as a set for fast membership tests
stop_words = set(stopwords.words("english"))

# Keep only tokens that are longer than two characters, purely alphabetic,
# and not stopwords
filtered = [
    token
    for token in tokens
    if len(token) > 2 and token.isalpha() and token not in stop_words
]

# Frequency table first; the vocabulary is exactly its set of keys
word_freq = Counter(filtered)
vocabulary = set(word_freq)
total_tokens = len(filtered)  # same value as the sum of all counts

# Relative frequency of each word (if needed later)
probs = {word: count / total_tokens for word, count in word_freq.items()}

The tokens are filtered to remove stopwords, non-alphabetic tokens, and words of two characters or fewer. `vocabulary` holds all unique filtered words, and `Counter` builds a dictionary (`word_freq`) in which each key is a word and each value is the number of times that word occurs.

In [None]:
# -----------------------------
# Bigram model builder
# -----------------------------
def build_bigram_counts(token_list):
    """
    Count how often each token is immediately followed by each other token.

    Parameters
    ----------
    token_list : sequence of str
        Tokens in their original order.

    Returns
    -------
    defaultdict(Counter)
        Mapping word -> Counter(next_word -> number of occurrences).
    """
    followers = defaultdict(Counter)
    # Walk every adjacent pair (token_list[i - 1], token_list[i])
    for position in range(1, len(token_list)):
        previous_word = token_list[position - 1]
        next_word = token_list[position]
        followers[previous_word][next_word] += 1
    return followers

bigram_counts = build_bigram_counts(filtered)   # built from the filtered tokens (stopwords removed), not the raw `tokens`

Build a bigram model: for each word, record which words follow it and how many times each of those next words appears after it.

In [None]:
# -----------------------------
# Phonetic candidate finder
# -----------------------------
def phonetic_candidates(word, vocab):
    """
    Collect the entries of ``vocab`` that share a Double Metaphone code with ``word``.

    An entry matches when its primary code equals the word's primary code (and is
    not empty), or when the word has a non-empty secondary code that equals the
    entry's secondary code.

    Parameters
    ----------
    word : str
        The word to look up.
    vocab : iterable of str
        Words to compare against.

    Returns
    -------
    list of str
        Matching entries in the iteration order of ``vocab``; an empty list when
        the word itself cannot be encoded.
    """
    try:
        wanted = doublemetaphone(word)
    except Exception:
        return []

    hits = []
    for entry in vocab:
        try:
            codes = doublemetaphone(entry)
        except Exception:
            # entries that cannot be encoded are skipped
            continue
        same_primary = codes[0] == wanted[0] and codes[0] != ""
        if same_primary or (wanted[1] and codes[1] == wanted[1]):
            hits.append(entry)
    return hits

Using the `doublemetaphone` function, find all vocabulary words that are phonetic matches for the given word.

In [None]:
def check_spelling(input_text):
    """
    Spell-check a word or sentence and suggest corrections from the vocabulary.

    Every unknown word is compared with a pool of candidate words and each
    candidate gets a weighted score:
      - difflib.SequenceMatcher ratio          (weight 0.3)
      - phonetic match indicator (1.0 / 0.0)   (weight 0.3)
      - textdistance.jaro_winkler similarity   (weight 0.4)

    Parameters
    ----------
    input_text : str
        A single word or a sentence. It is lower-cased and split on ``\\w+``.

    Returns
    -------
    str or list
        - "No input provided" if the text contains no word characters.
        - "Correct" if the text is one word found in the vocabulary or stopwords.
        - A list of up to 3 suggestions if the text is one unknown word.
        - For several words, a list with one entry per word: the word itself
          when it is known, otherwise a list of up to 3 suggestions.

    Raises
    ------
    RuntimeError
        If the globals ``vocabulary`` or ``word_freq`` have not been defined.
    """

    global vocabulary, word_freq, stop_words

    # sanity checks: the corpus cells must have been run first
    if 'vocabulary' not in globals():
        raise RuntimeError("Global 'vocabulary' not found")
    if 'word_freq' not in globals():
        raise RuntimeError("Global 'word_freq' not found")
    if 'stop_words' not in globals():
        stop_words = set()

    words = re.findall(r"\w+", input_text.lower())
    if not words:
        return "No input provided"

    jaro_fn = textdistance.jaro_winkler.normalized_similarity

    def combined_topk(w, k=3, difflib_seed_n=10, freq_seed_n=50):
        """
        Return the top-k candidates for ``w`` ranked by
          score = 0.3 * difflib + 0.3 * phonetic + 0.4 * jaro_winkler
        with ties broken by corpus frequency (higher first).
        """
        # The phonetic lookup scans the whole vocabulary, so do it ONCE per word
        # and reuse the result both for seeding and for scoring. It stays
        # best-effort: a failure simply means "no phonetic matches".
        try:
            phonetic_set = set(phonetic_candidates(w, vocabulary))
        except Exception:
            phonetic_set = set()

        # Candidate pool: closest difflib matches, phonetic matches,
        # and the most frequent corpus words.
        cand_set = set(
            difflib.get_close_matches(w, list(vocabulary), n=difflib_seed_n, cutoff=0.2)
        )
        cand_set.update(phonetic_set)
        cand_set.update(cand for cand, _ in word_freq.most_common(freq_seed_n))

        if not cand_set:
            cand_set = set(word_freq.keys())

        scored = []
        for cand in cand_set:
            s_d = difflib.SequenceMatcher(None, w, cand).ratio()
            s_p = 1.0 if cand in phonetic_set else 0.0
            # third-party similarity is best-effort; a failure scores 0
            try:
                s_j = jaro_fn(w, cand)
            except Exception:
                s_j = 0.0

            score = 0.3 * s_d + 0.3 * s_p + 0.4 * s_j
            scored.append((cand, score, word_freq.get(cand, 0)))

        # sort by score desc, then frequency desc
        scored.sort(key=lambda x: (x[1], x[2]), reverse=True)
        return [cand for cand, s, f in scored[:k] if s > 0.0]

    # single word case -> "Correct" or the top 3 combined matches
    if len(words) == 1:
        w = words[0]
        if w in vocabulary or w in stop_words:
            return "Correct"
        return combined_topk(w, k=3)

    # sentence case -> keep known words, replace unknown words by their suggestions
    output = []
    for w in words:
        if w in stop_words or w in vocabulary:
            output.append(w)
        else:
            output.append(combined_topk(w, k=3))

    return output


In [None]:
def _record_bigram(last, chosen, message):
    """
    Register one more observation of the bigram (last -> chosen) in the global
    model and return the summary dict handed back to the caller.

    Increments ``bigram_counts[last][chosen]`` and ``word_freq[chosen]``, adds
    ``chosen`` to ``vocabulary`` and rebuilds ``probs`` from ``word_freq``.
    """
    global probs

    bigram_counts[last][chosen] += 1
    word_freq[chosen] += 1
    vocabulary.add(chosen)

    # Keep the unigram probabilities consistent with the updated counts
    total_tokens = sum(word_freq.values())
    if total_tokens > 0:
        probs = {w: f / total_tokens for w, f in word_freq.items()}

    return {
        "message": message,
        "top_predictions_for_last": [w for w, _ in bigram_counts[last].most_common(3)],
        "updated_count_for_pred": word_freq[chosen],
        "vocabulary_size": len(vocabulary)
    }


def predict_next_word(input_text):
    """
    Predict the word that follows the last word of ``input_text`` using the
    bigram model, interactively letting the user confirm or teach a prediction.

    Interaction (via ``input``):
      - Last word has known followers: the top-3 followers are printed. The user
        presses Enter to accept the first one, types 1/2/3 to pick one, or types
        a word of their own. The selected word's counts are incremented by 1.
        A number outside the listed range is treated as invalid input and
        changes nothing.
      - Last word has no known followers: the user may type a next word to add,
        or press Enter to skip.

    Parameters
    ----------
    input_text : str
        Text whose last ``\\w+`` token is used as the context word.

    Returns
    -------
    str, list or dict
        - "No input provided" if the text contains no word characters.
        - A message string if the last word is unseen and the user skipped.
        - The list of current predictions if the user's answer was invalid.
        - Otherwise a dict with keys "message", "top_predictions_for_last",
          "updated_count_for_pred" and "vocabulary_size".
    """
    global bigram_counts, word_freq, vocabulary, probs

    words = re.findall(r"\w+", input_text.lower())
    if not words:
        return "No input provided"

    last = words[-1]

    # Ensure data structures exist (safety)
    if 'bigram_counts' not in globals():
        bigram_counts = defaultdict(Counter)
    if 'word_freq' not in globals():
        word_freq = Counter()
    if 'vocabulary' not in globals():
        vocabulary = set()
    if 'probs' not in globals():
        probs = {}

    # --- Unseen word branch: offer to add a follower ---
    # The membership test comes first so the lookup never inserts an empty entry.
    if not (last in bigram_counts and bigram_counts[last]):
        prompt = (
            f"Unable to predict the next word for '{last}'.\n"
            "If you want to add a predicted next word now, type it and press Enter.\n"
            "Otherwise just press Enter to skip: "
        )
        user_input = input(prompt).strip()
        cand = re.findall(r"\w+", user_input.lower())
        if not cand:
            return f"Unable to predict the next word for '{last}'. No prediction added."

        pred = cand[0]
        return _record_bigram(last, pred, f"Added bigram: ('{last}' -> '{pred}')")

    # --- Known word branch: show predictions and allow confirm/add ---
    preds = [w for w, _ in bigram_counts[last].most_common(3)]
    print(f"Top predictions for '{last}':")
    for i, p in enumerate(preds, start=1):
        print(f"  {i}. {p}")
    prompt = (
        "If correct, press Enter to accept the top prediction,\n"
        "or type 1/2/3 to confirm a specific prediction,\n"
        "or type your own word to add it as the predicted next word:\n> "
    )
    user_choice = input(prompt).strip()

    if user_choice == "":
        # Enter accepts the top prediction
        chosen = preds[0]
    elif user_choice.isdigit():
        # A number is a menu selection, never a custom word: the vocabulary holds
        # alphabetic words only, so "9" must not be added to it.
        try:
            idx = int(user_choice)
        except ValueError:
            # isdigit() also accepts characters such as superscripts that int() rejects
            return preds
        if not 1 <= idx <= len(preds):
            return preds
        chosen = preds[idx - 1]
    else:
        cand = re.findall(r"\w+", user_choice.lower())
        if not cand:
            # nothing usable was typed: leave the model untouched
            return preds
        chosen = cand[0]  # the custom word the user wants

    return _record_bigram(
        last, chosen, f"Confirmed/added prediction: ('{last}' -> '{chosen}')"
    )


In [None]:
def run_autocorrect_system():
    """
    Run the interactive console loop of the autocorrect system.

    Each round asks for a word or sentence, then for an action:
    "1" runs check_spelling on the text, "2" runs predict_next_word.
    The loop ends when the user types quit/exit/q as the text or answers
    n/no to the "Continue?" question.
    """
    banner = "=" * 60
    for header_line in (
        banner,
        "          SIMPLE AUTOCORRECT SYSTEM",
        banner,
        f"Vocabulary size: {len(vocabulary)} tokens",
        "-" * 60,
    ):
        print(header_line)

    def report_spelling(text):
        # Action 1: show the spell-check verdict or the suggestions
        outcome = check_spelling(text)
        print(f"\nInput: {text}")
        print("✓ Correct" if outcome == "Correct" else f"✗ Suggestion: {outcome}")

    def report_prediction(text):
        # Action 2: the predictor may itself prompt the user; its return type
        # (list / dict / str / other) decides how the result is shown
        outcome = predict_next_word(text)
        print(f"\nInput: {text}")

        if isinstance(outcome, list):
            # plain list of suggestions
            if outcome:
                print("Next word predictions:", ", ".join(outcome))
            else:
                print("No predictions available.")
        elif isinstance(outcome, dict):
            # confirmation / addition info with statistics
            print(outcome.get("message", "Prediction updated."))
            top = outcome.get("top_predictions_for_last")
            if top:
                print("Top predictions for the last word now:", ", ".join(top))
            count = outcome.get("updated_count_for_pred")
            if count is not None:
                print("Updated count for that prediction:", count)
            vocab_size = outcome.get("vocabulary_size")
            if vocab_size is not None:
                print("Updated vocabulary size:", vocab_size)
        elif isinstance(outcome, str):
            # informative message
            print(outcome)
        else:
            # fallback for any unexpected return type
            print("Next word prediction:", outcome)

    actions = {"1": report_spelling, "2": report_prediction}

    while True:
        text = input("\nEnter a word or sentence (or type 'quit' to exit): ").strip()
        if not text:
            print("Please enter some text.")
            continue

        if text.lower() in ("quit", "exit", "q"):
            print("Goodbye!")
            return

        print("\nChoose an action:")
        print("1 - Check spelling / suggestions")
        print("2 - Predict next word")
        action = actions.get(input("Enter 1, or 2: ").strip())

        if action is None:
            print("Invalid choice. Try again.")
        else:
            action(text)

        if input("\nContinue? (y/n): ").strip().lower() in ("n", "no"):
            print("Goodbye!")
            return


In [None]:
# Entry point: start the interactive session only when this code is executed as
# the main program (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    run_autocorrect_system()

          SIMPLE AUTOCORRECT SYSTEM
Vocabulary size: 132694 tokens
------------------------------------------------------------

Enter a word or sentence (or type 'quit' to exit): dd

Choose an action:
1 - Check spelling / suggestions
2 - Predict next word
Enter 1, or 2: 8
Invalid choice. Try again.

Continue? (y/n): n
Goodbye!
