In [1]:
pip install torch torchvision torchaudio



In [2]:
pip install transformers




In [3]:
pip install pandas



In [5]:
pip install torch torchvision transformers



In [1]:
import difflib
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
print("Loading GPT-2 model and tokenizer...")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


def predict_next_words(context, max_length=10, num_suggestions=3):
    """
    Predict the next possible words using the GPT-2 model.

    Parameters:
    - context: The input text to provide context.
    - max_length: The maximum number of tokens to generate.
    - num_suggestions: The number of next-word suggestions to return.

    Returns:
    - List of next-word suggestions.
    """
    inputs = tokenizer.encode(context, return_tensors="pt")

    # Use sampling for diverse next-word suggestions
    outputs = model.generate(
        inputs,
        max_length=len(inputs[0]) + max_length,
        num_return_sequences=num_suggestions,
        do_sample=True,  # Enables sampling for diverse suggestions
        top_k=50,  # Limits sampling to top 50 tokens
        temperature=0.7,  # Controls randomness
    )

    # Decode and clean up suggestions
    suggestions = []
    for output in outputs:
        decoded = tokenizer.decode(output, skip_special_tokens=True)
        # Extract only the next word(s) generated
        suggestion = decoded[len(context):].strip().split(" ")[0]
        suggestions.append(suggestion)

    return suggestions


def suggest_similar_words(word, vocabulary, max_suggestions=3):
    """
    Suggest similar words for a given input word using difflib.

    Parameters:
    - word: The word for which suggestions are needed.
    - vocabulary: The list of valid words to check against.
    - max_suggestions: Maximum number of suggestions to return.

    Returns:
    - List of similar word suggestions.
    """
    return difflib.get_close_matches(word, vocabulary, n=max_suggestions, cutoff=0.7)


def autocorrect(input_text, vocabulary):
    """
    Autocorrect the input text by finding the closest match in the vocabulary.

    Parameters:
    - input_text: The text input by the user.
    - vocabulary: The list of valid words to check against.

    Returns:
    - Autocorrected text as a string.
    """
    words = input_text.split()
    corrected_words = []
    for word in words:
        matches = suggest_similar_words(word, vocabulary, max_suggestions=1)
        corrected_words.append(matches[0] if matches else word)
    return " ".join(corrected_words)


def autocorrect_keyboard(input_text, vocabulary, max_suggestions=3):
    """
    Combine autocorrect, per-word suggestions, and next-word prediction.

    Parameters:
    - input_text: The user's input text.
    - vocabulary: List of valid words for autocorrect.
    - max_suggestions: Number of suggestions to generate for each word.

    Returns:
    - A tuple containing:
        - Autocorrected text
        - Per-word suggestions
        - List of next-word suggestions
    """
    # Autocorrect the input text
    corrected_text = autocorrect(input_text, vocabulary)

    # Generate per-word suggestions
    words = input_text.split()
    word_suggestions = {word: suggest_similar_words(word, vocabulary, max_suggestions) for word in words}

    # Generate next-word suggestions using GPT-2
    next_word_suggestions = predict_next_words(corrected_text, num_suggestions=max_suggestions)

    return corrected_text, word_suggestions, next_word_suggestions


if __name__ == "__main__":
    # Sample vocabulary for autocorrect and suggestions
    vocabulary = ["hello", "world", "python", "keyboard", "suggestion", "autocorrect", "system", "text", "predict", "love", "programming", "in"]

    # User input
    print("Type a sentence (e.g., 'i lov programming in'): ")
    user_input = input("Your Input: ")

    # Get autocorrected text, word suggestions, and next-word suggestions
    corrected_text, word_suggestions, next_word_suggestions = autocorrect_keyboard(user_input, vocabulary)

    # Display the results
    print("\nCorrected Text:", corrected_text)
    print("\nSuggestions for Each Word:")
    for word, suggestions in word_suggestions.items():
        print(f"  '{word}': {suggestions}")
    print("\nNext-Word Suggestions:", next_word_suggestions)


Loading GPT-2 model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Type a sentence (e.g., 'i lov programming in'): 
Your Input: i lov my mother


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Corrected Text: i love my mother

Suggestions for Each Word:
  'i': []
  'lov': ['love']
  'my': []
  'mother': []

Next-Word Suggestions: ['.', "'s", '.']
