**Approach 1**

The Python script demonstrates token healing, which is a technique used to correct misspelt or mistyped words in text. It utilises a dictionary of misspelt words and their corresponding correct spellings to perform the token healing process.

In [21]:
import csv

def load_dictionary(file_path):
    """
    Loads the CSV file which has the data for both the misspelled and correctly spelled word
    """
    dictionary = {}
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            misspelled_word = row['Misspelled Word'].strip()
            correct_word = row['Correct Word'].strip()
            dictionary[misspelled_word] = correct_word
    return dictionary

def perform_token_healing(text, dictionary):
    """
    Perform token healing on the given input text using the provided dictionary and provides the details of corrected tokens.
    """
    tokens = text.split()
    corrected_tokens = []
    for token in tokens:
        if token in dictionary:
            corrected_tokens.append(dictionary[token])
        else:
            corrected_tokens.append(token)
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text


def analyze_token_healing(original_text, corrected_text):
    """
    Perform analysis of the token healing process.
    Prints the original text, corrected text, and provides various statistics.
    """
    original_tokens = original_text.split()
    corrected_tokens = corrected_text.split()
    num_tokens = len(original_tokens)
    num_corrected_tokens = sum(1 for original, corrected in zip(original_tokens, corrected_tokens) if original != corrected)
    correction_rate = (num_corrected_tokens / num_tokens) * 100

    print("Original Text:")
    print(original_text)
    print()
    print("Corrected Text:")
    print(corrected_text)
    print()
    print("Number of Tokens:", num_tokens)
    print("Number of Tokens Corrected:", num_corrected_tokens)
    print("Correction Rate: {:.2f}%".format(correction_rate))

    # Additional analysis done
    incorrect_token_indices = [i for i, (original, corrected) in enumerate(zip(original_tokens, corrected_tokens)) if original != corrected]
    incorrect_tokens = [original_tokens[i] for i in incorrect_token_indices]
    corrected_tokens = [corrected_tokens[i] for i in incorrect_token_indices]

    print()
    print("Incorrect Tokens:")
    for original, corrected in zip(incorrect_tokens, corrected_tokens):
        print("Original: {:<15} Corrected: {:<15}".format(original, corrected))

    unique_incorrect_tokens = set(incorrect_tokens)
    print()
    print("Unique Incorrect Tokens:", len(unique_incorrect_tokens))
    print("Unique Incorrect Tokens:", unique_incorrect_tokens)


# the csv file path that has the dataset of the correct and misspelled words. 
csv_file_path = '/content/Book1 (1).csv'

# Input
input_text = input()
dictionary = load_dictionary(csv_file_path)
corrected_text = perform_token_healing(input_text, dictionary)
analyze_token_healing(input_text, corrected_text)


They don't know abot thier wrods
Original Text:
They don't know abot thier wrods

Corrected Text:
They don't know about their words

Number of Tokens: 6
Number of Tokens Corrected: 3
Correction Rate: 50.00%

Incorrect Tokens:
Original: abot            Corrected: about          
Original: thier           Corrected: their          
Original: wrods           Corrected: words          

Unique Incorrect Tokens: 3
Unique Incorrect Tokens: {'thier', 'abot', 'wrods'}


**Approach 2**

This Python script demonstrates the concept of token healing using Spacy, a popular natural language processing library. Token healing is a technique used to correct misspelled words or tokens in text by replacing them with their correct spellings.

In [22]:
pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [23]:
import csv
import spacy

def load_dictionary(file_path):
    """
    Load the misspelled words and their correct spellings from a CSV file.
    Returns a dictionary where the keys are misspelled words and the values are their correct spellings.
    """
    dictionary = {}
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            misspelled_word = row['Misspelled Word'].strip()
            correct_word = row['Correct Word'].strip()
            dictionary[misspelled_word] = correct_word
    return dictionary

def perform_token_healing(text, dictionary):
    """
    Perform token healing on the given input text using the provided dictionary and spaCy.
    Returns the text with corrected tokens.
    """
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    corrected_tokens = []
    for token in doc:
        if token.text in dictionary:
            corrected_tokens.append(dictionary[token.text])
        else:
            corrected_tokens.append(token.text)
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text


def analyze_token_healing(original_text, corrected_text):
    """
    Perform analysis of the token healing process.
    Prints the original text, corrected text, and provides various statistics.
    """
    original_tokens = original_text.split()
    corrected_tokens = corrected_text.split()
    num_tokens = len(original_tokens)
    num_corrected_tokens = sum(1 for original, corrected in zip(original_tokens, corrected_tokens) if original != corrected)
    correction_rate = (num_corrected_tokens / num_tokens) * 100

    print("Original Text:")
    print(original_text)
    print()
    print("Corrected Text:")
    print(corrected_text)
    print()
    print("Number of Tokens:", num_tokens)
    print("Number of Tokens Corrected:", num_corrected_tokens)
    print("Correction Rate: {:.2f}%".format(correction_rate))

    # Additional analysis
    incorrect_token_indices = [i for i, (original, corrected) in enumerate(zip(original_tokens, corrected_tokens)) if original != corrected]
    incorrect_tokens = [original_tokens[i] for i in incorrect_token_indices]
    corrected_tokens = [corrected_tokens[i] for i in incorrect_token_indices]

    print()
    print("Incorrect Tokens:")
    for original, corrected in zip(incorrect_tokens, corrected_tokens):
        print("Original: {:<15} Corrected: {:<15}".format(original, corrected))

    unique_incorrect_tokens = set(incorrect_tokens)
    print()
    print("Unique Incorrect Tokens:", len(unique_incorrect_tokens))
    print("Unique Incorrect Tokens:", unique_incorrect_tokens)


# the csv file path that has the dataset of the correct and misspelled words.
csv_file_path = '/content/Book1 (1).csv'

# Example usage
input_text = input()
dictionary = load_dictionary(csv_file_path)
corrected_text = perform_token_healing(input_text, dictionary)
analyze_token_healing(input_text, corrected_text)


They don't know abot thier wrods
Original Text:
They don't know abot thier wrods

Corrected Text:
They do n't know about their words

Number of Tokens: 6
Number of Tokens Corrected: 5
Correction Rate: 83.33%

Incorrect Tokens:
Original: don't           Corrected: do             
Original: know            Corrected: n't            
Original: abot            Corrected: know           
Original: thier           Corrected: about          
Original: wrods           Corrected: their          

Unique Incorrect Tokens: 5
Unique Incorrect Tokens: {"don't", 'know', 'thier', 'abot', 'wrods'}


**Approach 3** (**Still in progress**)

This Python script demonstrates the concept of token healing using the GPT-2 model from the Hugging Face transformers library. Token healing is a technique used to correct inconsistencies between the prompt and the generated text when using language models.


In [None]:
import transformers

def perform_token_healing(prompt, generated_text):
    tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
    generated_tokens = tokenizer.encode(generated_text, add_special_tokens=False)

    # Remove the last token from the prompt tokens
    prompt_tokens = prompt_tokens[:-1]

    # Check if the first token of the generated tokens matches the last token of the prompt tokens
    if generated_tokens[0] != prompt_tokens[-1]:
        # Replace the first token of the generated tokens with the last token of the prompt tokens
        generated_tokens[0] = prompt_tokens[-1]

    # Decode the tokens back into text
    corrected_text = tokenizer.decode(generated_tokens)

    return corrected_text

# Take user input for prompt and generated text
prompt = input("Enter the prompt: ")
generated_text = input("Enter the generated text: ")

# Perform token healing
corrected_text = perform_token_healing(prompt, generated_text)

# Display the results
print("Prompt: ", prompt)
print("Generated Text: ", generated_text)
print("Corrected Text: ", corrected_text)


**Approach 4** (**Still in progress**)

The code is trying to implement token healing using the PyEnchant library. 

The code performs token healing by checking each token in the input text against a dictionary of correctly spelled words. If a token is found to be misspelled, it suggests a correction and replaces the misspelled token with the suggested correction. The code aims to improve the accuracy and readability of the text by automatically correcting common spelling mistakes.

In [25]:
pip install pyenchant


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
pip install pyenchant‑3.2.1‑cp39‑cp39‑win_amd64.whl


[0m[31mERROR: pyenchant‑3.2.1‑cp39‑cp39‑win_amd64.whl is not a valid wheel filename.[0m[31m
[0m

In [None]:
import enchant
import re

def perform_token_healing(text):
    # Initialize the spell checker
    spell_checker = enchant.Dict("en_US")

    # Tokenize the text
    tokens = re.findall(r'\w+|[^\w\s]', text)

    # Perform token healing
    corrected_tokens = []
    for token in tokens:
        # Check if the token is a word
        if token.isalpha():
            # Check if the token is misspelled
            if not spell_checker.check(token):
                # Get suggested replacements for the misspelled token
                suggestions = spell_checker.suggest(token)
                if suggestions:
                    corrected_token = suggestions[0]  # Choose the first suggestion as the correction
                else:
                    corrected_token = token  # Use the original token if no suggestions available
            else:
                corrected_token = token  # Use the original token if it is correctly spelled
        else:
            corrected_token = token  # Keep non-word tokens as is

        corrected_tokens.append(corrected_token)

    # Reconstruct the corrected text
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text


# Example usage
input_text = input("Enter the text to perform token healing: ")
corrected_text = perform_token_healing(input_text)

print("Original Text:")
print(input_text)
print()
print("Corrected Text:")
print(corrected_text)
