# Data Preparation and Augmentation

This notebook loads the questions and misspellings datasets, generates augmented variants of each question (with 0, 1, …, up to N misspellings),
and saves the results to `data/augmented_questions.csv`.
The augmentation function automatically caps the maximum number of errors at the number of eligible words in each question, i.e. the words that have an entry in the misspellings dictionary.


In [52]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.getcwd()))

import pandas as pd
# Force reload of data_io
import importlib
import src.data_io
importlib.reload(src.data_io)
from src.data_io import load_questions, load_misspellings

# Define generate_augmented_variants with support for multiple misspellings
import itertools

def adjust_case(original, new):
    """Return *new* recased to follow the casing pattern of *original*.

    A title-cased original yields a capitalized result, an all-uppercase
    original an uppercased one, and an all-lowercase original a lowercased
    one. Any other pattern (mixed case, or no cased characters at all)
    returns *new* unchanged.
    """
    # Order matters: a single uppercase letter such as "A" is both title
    # case and upper case, and the title-case rule is tried first.
    case_rules = (
        ("istitle", "capitalize"),
        ("isupper", "upper"),
        ("islower", "lower"),
    )
    for predicate_name, transform_name in case_rules:
        if getattr(original, predicate_name)():
            return getattr(new, transform_name)()
    return new

def _split_punctuation(token):
    """Split *token* into ``(prefix, core, suffix)`` around edge punctuation.

    ``prefix`` and ``suffix`` hold the leading and trailing ASCII punctuation
    characters; ``core`` is what remains in between (possibly empty).
    """
    import string  # local import keeps this block self-contained

    without_prefix = token.lstrip(string.punctuation)
    prefix = token[:len(token) - len(without_prefix)]
    core = without_prefix.rstrip(string.punctuation)
    suffix = without_prefix[len(core):]
    return prefix, core, suffix


def generate_augmented_variants(sentence, misspellings_dict, max_errors=10):
    """Generate misspelled variants of *sentence*.

    The sentence is split on whitespace. A word is eligible for misspelling
    when its lowercase form is a key of *misspellings_dict*, either as-is or
    after leading/trailing ASCII punctuation is stripped (so ``"model?"``
    matches the key ``"model"``). Stripped punctuation is re-attached around
    every replacement.

    Args:
        sentence: The text to augment.
        misspellings_dict: Mapping from a lowercase correct word to a list of
            its misspellings.
        max_errors: Upper bound on the number of words misspelled in a single
            variant. It is capped at the number of eligible words.

    Returns:
        A list of ``(variant, error_count)`` tuples. The first entry is always
        the unmodified *sentence* with ``error_count`` 0. Every other variant
        is rebuilt by joining the words with single spaces. The result holds
        every combination of eligible words crossed with every combination of
        their misspellings, so it grows quickly with the number of eligible
        words.
    """
    print(f"DEBUG - Processing sentence: {sentence}")
    words = sentence.split()

    # Map word index -> (prefix, core, suffix) for every eligible word.
    eligible = {}
    for i, word in enumerate(words):
        if word.lower() in misspellings_dict:
            # Exact match wins, so dictionary keys that themselves contain
            # punctuation keep working.
            eligible[i] = ("", word, "")
            continue
        prefix, core, suffix = _split_punctuation(word)
        if core and core.lower() in misspellings_dict:
            eligible[i] = (prefix, core, suffix)

    candidate_indices = list(eligible)
    print(f"DEBUG - Candidate indices: {candidate_indices}")
    variants = [(sentence, 0)]
    max_errors = min(max_errors, len(candidate_indices))

    # Generate variants with 1 to max_errors errors
    for error_count in range(1, max_errors + 1):
        # Choose which eligible words to misspell
        for indices in itertools.combinations(candidate_indices, error_count):
            # Collect the replacement choices for each chosen word
            replacement_options = []
            for i in indices:
                prefix, core, suffix = eligible[i]
                candidates = misspellings_dict[core.lower()]
                print(f"DEBUG - Word: {core}, Candidates: {candidates}")
                replacement_options.append(
                    [prefix + adjust_case(core, cand) + suffix for cand in candidates]
                )

            # One variant per combination of replacement choices
            for replacements in itertools.product(*replacement_options):
                new_words = words.copy()
                for i, replacement in zip(indices, replacements):
                    new_words[i] = replacement
                variants.append((" ".join(new_words), error_count))

    return variants

# Smoke-test print: confirms that cell output is displayed in the notebook
print("DEBUG - Testing output visibility in notebook")


def check_function_version():
    """Run generate_augmented_variants on a tiny probe and report the outcome.

    The probe is the one-word sentence "Test" with two known misspellings.
    Getting back more than one variant is reported as the updated version
    being loaded; otherwise an issue is reported.
    """
    probe_variants = generate_augmented_variants(
        "Test", {"test": ["tset", "tets"]}, max_errors=1
    )
    print(f"DEBUG - Function version check result: {probe_variants}")

    if len(probe_variants) <= 1:
        print("DEBUG - Function version check: Issue detected (expected multiple variants)")
        return
    print("DEBUG - Function version check: Updated version detected (multiple variants generated)")


check_function_version()

# Read the complete questions dataset from disk
questions_df = load_questions('data/questions.csv')
print(f"Loaded questions: {questions_df.shape}")

# Read the correct-word -> misspellings mapping
misspellings_dict = load_misspellings('data/misspellings.csv')
print(f"DEBUG - Misspellings dictionary: {misspellings_dict}")

# Work on a copy so the loaded frame itself is never modified
questions_full = questions_df.copy()

# Collect one record per (original question, variant) pair
augmented_variants = []
for _row_label, record in questions_full.iterrows():
    original_text = record['question']
    print(f"DEBUG - Processing question: {original_text}")
    variants = generate_augmented_variants(original_text, misspellings_dict, max_errors=10)
    print(f"DEBUG - All variants for '{original_text}': {variants}")
    augmented_variants.extend(
        [
            {
                'original_question': original_text,
                'variant_question': text,
                'error_count': n_errors,
            }
            for text, n_errors in variants
        ]
    )

augmented_df = pd.DataFrame(augmented_variants)
print("Augmented dataset shape:", augmented_df.shape)

# Write the augmented dataset to CSV, leaving out the index column
augmented_df.to_csv('data/augmented_questions.csv', index=False)
print("Augmented questions saved at data/augmented_questions.csv")

DEBUG - Testing output visibility in notebook
DEBUG - Processing sentence: Test
DEBUG - Candidate indices: [0]
DEBUG - Word: Test, Candidates: ['tset', 'tets']
DEBUG - Function version check result: [('Test', 0), ('Tset', 1), ('Tets', 1)]
DEBUG - Function version check: Updated version detected (multiple variants generated)
Loaded questions: (1, 1)
DEBUG - Entering load_misspellings with filepath: data/misspellings.csv
DEBUG - Loading CSV from: D:\Evaluating_the_ Robustness_of_Language_Models_to_Misspellings\data/misspellings.csv
DEBUG - CSV header: ['correct_word', 'misspellings']
DEBUG - Raw row: ['example', 'exampel,exampl']
DEBUG - Loaded: example -> ['exampel', 'exampl']
DEBUG - Raw row: ['research', 'reserach,reserch,rescarch']
DEBUG - Loaded: research -> ['reserach', 'reserch', 'rescarch']
DEBUG - Raw row: ['model', 'modle,mudel']
DEBUG - Loaded: model -> ['modle', 'mudel']
DEBUG - Raw row: ['what', 'wath,wtah']
DEBUG - Loaded: what -> ['wath', 'wtah']
DEBUG - Raw row: ['is', 'i