# Data Preparation and Augmentation

This notebook loads the questions and misspellings datasets, generates augmented variants of each question (with 0, 1, …, up to N misspellings),
and saves the results to `data/augmented_questions.csv`.
The augmentation function automatically adjusts the maximum number of errors based on the number of eligible words in each question.


In [32]:
import pandas as pd
from src.data_io import load_questions, load_misspellings
from src.misspelling_augmenter import generate_augmented_variants

# Build the augmented-questions dataset: load the inputs, generate misspelled
# variants of every question, and write one CSV row per (question, variant).

# Upper bound on misspellings requested per question. Per the notebook
# description, the augmenter adjusts this based on the eligible words in
# each question.
MAX_ERRORS = 10

# Column order of the output CSV. Passing it to the DataFrame constructor
# keeps the schema (and the CSV header) stable even when no variants are
# produced.
OUTPUT_COLUMNS = ['original_question', 'variant_question', 'error_count']

# Load the questions dataset.
questions_df = load_questions('data/questions.csv')
print("DEBUG - Loaded questions:", questions_df.shape)

# Load the misspellings dictionary.
misspellings_dict = load_misspellings('data/misspellings.csv')
print("DEBUG - Misspellings dictionary has", len(misspellings_dict), "entries.")

# Process each question to generate augmented variants.
# Only the 'question' column is needed, so iterate it directly instead of
# using iterrows(), which builds a Series for every row.
augmented_variants = []
for original_text in questions_df['question']:
    variants = generate_augmented_variants(
        original_text, misspellings_dict, max_errors=MAX_ERRORS
    )
    # Each variant is unpacked as a (variant_text, error_count) pair.
    augmented_variants.extend(
        {
            'original_question': original_text,
            'variant_question': variant_text,
            'error_count': error_count,
        }
        for variant_text, error_count in variants
    )

augmented_df = pd.DataFrame(augmented_variants, columns=OUTPUT_COLUMNS)
print("DEBUG - Augmented dataset shape:", augmented_df.shape)

# Save the augmented questions to a CSV file.
augmented_df.to_csv('data/augmented_questions.csv', index=False)
print("DEBUG - Augmented questions saved at data/augmented_questions.csv")

DEBUG - Loaded questions: (1, 1)
DEBUG - Misspellings dictionary has 1 entries.
DEBUG - Augmented dataset shape: (2, 3)
DEBUG - Augmented questions saved at data/augmented_questions.csv
