In [1]:
from datasets import load_dataset
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "deepmind/aqua_rat" dataset; the result holds every available split.
dataset = load_dataset("deepmind/aqua_rat")

In [3]:
# Show the loaded object's summary: split names, feature columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'options', 'rationale', 'correct'],
        num_rows: 97467
    })
    test: Dataset({
        features: ['question', 'options', 'rationale', 'correct'],
        num_rows: 254
    })
    validation: Dataset({
        features: ['question', 'options', 'rationale', 'correct'],
        num_rows: 254
    })
})


In [4]:
from datasets import load_dataset, Dataset, DatasetDict
import re
from tqdm import tqdm

def split_rationale(rationale):
    """Break a rationale string into cleaned, sentence-like steps.

    The text is cut wherever a period is followed by whitespace (the period
    and whitespace are consumed by the split). Fragments of 10 characters or
    fewer are discarded; every kept fragment that does not already end in
    '.', '!' or '?' gets a trailing period appended.

    Args:
        rationale: Free-text rationale to split.

    Returns:
        list[str]: The retained steps, in their original order.
    """
    terminators = ('.', '!', '?')
    fragments = (piece.strip() for piece in re.split(r'\.\s+', rationale.strip()))
    return [
        piece if piece.endswith(terminators) else piece + '.'
        for piece in fragments
        if len(piece) > 10  # also drops empty fragments
    ]

def expand_example(example):
    """Turn one dataset record into next-step prediction pairs.

    The record's rationale is split into steps with `split_rationale`. For
    every step after the first, one pair is produced whose input shows the
    question, the answer options and all earlier steps, and whose target is
    that step.

    Args:
        example: Mapping with the keys "question", "options" (an iterable of
            strings), "rationale" and "correct".

    Returns:
        list[dict]: One dict per predicted step with the keys "input",
        "target", "question", "correct_answer", "step_number" (1-based index
        of the target step) and "total_steps". Empty when the rationale
        yields fewer than two steps.
    """
    steps = split_rationale(example["rationale"])
    total = len(steps)

    # A single step leaves nothing to predict from.
    if total < 2:
        return []

    question = example["question"]
    answer = example["correct"]
    options_text = "\n".join(example["options"])
    numbered = [f"Step {n}: {text}" for n, text in enumerate(steps, start=1)]

    pairs = []
    for idx in range(1, total):
        history = "\n".join(numbered[:idx])
        prompt = (
            f"Question: {question}\n"
            "\n"
            "Options:\n"
            f"{options_text}\n"
            "\n"
            "Previous Steps:\n"
            f"{history}\n"
            "\n"
            "Next Step:"
        )
        pairs.append({
            "input": prompt,
            "target": steps[idx],
            "question": question,
            "correct_answer": answer,
            "step_number": idx + 1,
            "total_steps": total,
        })

    return pairs

def process_all_splits():
    """Load "deepmind/aqua_rat" and expand every split into step-prediction pairs.

    Each record of each split is passed through `expand_example`; the
    resulting pairs are collected into one `Dataset` per split. Progress and
    per-split counts are printed along the way.

    Returns:
        tuple: `(dataset, processed_dataset)` - the dataset as loaded and a
        `DatasetDict` holding the expanded pairs under the same split names.
    """
    print("Loading AQuA-RAT dataset...")
    dataset = load_dataset("deepmind/aqua_rat")
    print(f"Available splits: {list(dataset.keys())}")

    processed_splits = {}
    for split_name in dataset.keys():
        print(f"\nProcessing {split_name} split...")
        raw_data = dataset[split_name]

        # Flatten the per-record pair lists while the progress bar advances.
        all_pairs = [
            pair
            for example in tqdm(raw_data, desc=f"Processing {split_name}")
            for pair in expand_example(example)
        ]
        processed_splits[split_name] = Dataset.from_list(all_pairs)

        n_raw, n_pairs = len(raw_data), len(all_pairs)
        print(f"  Original {split_name}: {n_raw} examples")
        print(f"  Processed {split_name}: {n_pairs} step-prediction pairs")
        if n_raw > 0:
            print(f"  Average steps per example: {n_pairs / n_raw:.2f}")

    return dataset, DatasetDict(processed_splits)

def show_split_statistics(original_dataset, processed_dataset):
    """Print a table comparing original and processed sizes for each split.

    Args:
        original_dataset: Mapping from split name to a sized collection of
            the original records.
        processed_dataset: Mapping (with `.keys()`) from split name to a sized
            collection of processed pairs; its keys decide which rows appear.

    The last column is processed count divided by original count, or 0 when
    the original split is empty.
    """
    banner = "=" * 70
    header = f"{'Split':<12} {'Original':<10} {'Processed':<12} {'Avg Steps':<10}"
    print("\n" + banner, "DATASET STATISTICS", banner, header, "-" * 50, sep="\n")

    for split_name in processed_dataset.keys():
        orig_count = len(original_dataset[split_name])
        proc_count = len(processed_dataset[split_name])
        if orig_count > 0:
            avg_steps = proc_count / orig_count
        else:
            avg_steps = 0

        print(f"{split_name:<12} {orig_count:<10} {proc_count:<12} {avg_steps:<10.2f}")

def show_sample_from_each_split(processed_dataset):
    """Print the first processed pair of every non-empty split.

    Args:
        processed_dataset: Mapping (with `.keys()`) from split name to an
            indexable, sized collection of pair dicts carrying the keys
            "step_number", "total_steps", "correct_answer", "input" and
            "target".

    Only the first 200 characters of the input are shown, followed by "...".
    """
    banner = "=" * 70
    print("\n" + banner, "SAMPLE FROM EACH SPLIT", banner, sep="\n")

    for split_name in processed_dataset.keys():
        split_rows = processed_dataset[split_name]
        if not len(split_rows) > 0:
            continue  # nothing to show for an empty split

        sample = split_rows[0]
        report = [
            f"\n{split_name.upper()} SAMPLE:",
            f"Step {sample['step_number']}/{sample['total_steps']}",
            f"Correct Answer: {sample['correct_answer']}",
            "INPUT (first 200 chars):",
            sample["input"][:200] + "...",
            "TARGET:",
            sample["target"],
            "-" * 50,
        ]
        for line in report:
            print(line)

def save_all_splits(processed_dataset, base_path="aqua_rat_processed"):
    """Persist the processed dataset as a whole and as one JSON file per split.

    Args:
        processed_dataset: Object offering `save_to_disk(path)`, `.keys()` and
            item access, whose items offer `to_json(path)`.
        base_path: Directory passed to `save_to_disk`; each split's JSON file
            is written next to it as "<base_path>_<split>.json".
    """
    print(f"\nSaving complete processed dataset to: {base_path}")
    processed_dataset.save_to_disk(base_path)
    print("All splits saved successfully!")

    # Per-split JSON copies, written in the order the splits are listed.
    json_targets = {name: f"{base_path}_{name}.json" for name in processed_dataset.keys()}
    for name, json_path in json_targets.items():
        processed_dataset[name].to_json(json_path)
        print(f"Saved {name} to {json_path}")

def main(save=None):
    """Run the complete preprocessing pipeline.

    Processes every split, prints the statistics table and one sample per
    split, then optionally saves the result with `save_all_splits`.

    Args:
        save: Controls the save step without needing keyboard input.
            `None` (default) keeps the interactive "(y/n)" prompt;
            `True` saves without asking; `False` skips saving without asking.
            Passing an explicit value lets the cell run unattended
            (e.g. Restart & Run All).

    Returns:
        tuple: `(original_dataset, processed_dataset)` as returned by
        `process_all_splits`.
    """
    # Process all splits
    original_dataset, processed_dataset = process_all_splits()

    # Show statistics
    show_split_statistics(original_dataset, processed_dataset)

    # Show samples
    show_sample_from_each_split(processed_dataset)

    # Decide whether to save: only prompt when the caller did not decide.
    if save is None:
        try:
            save_choice = input("\nSave processed dataset? (y/n): ").lower().strip()
        except EOFError:
            # stdin is closed/exhausted: do not lose the processed data over a
            # prompt that cannot be answered - skip saving and carry on.
            print("\nNo input available; skipping save.")
            save_choice = 'n'
        save = save_choice == 'y'

    if save:
        save_all_splits(processed_dataset)

    return original_dataset, processed_dataset

# Alternative: Process specific splits only
def process_specific_splits(splits_to_process=None):
    """Expand only the requested splits of "deepmind/aqua_rat".

    Args:
        splits_to_process: Split names to process. `None` (default) means
            ["train", "validation", "test"]. A single name may also be given
            as a plain string.

    Returns:
        DatasetDict: One `Dataset` of step-prediction pairs per requested
        split that exists in the loaded dataset. Requested names that do not
        exist are reported with a warning line and left out.
    """
    if splits_to_process is None:
        splits_to_process = ["train", "validation", "test"]
    elif isinstance(splits_to_process, str):
        # A bare string would otherwise be iterated character by character
        # and silently match nothing.
        splits_to_process = [splits_to_process]

    dataset = load_dataset("deepmind/aqua_rat")
    processed_splits = {}

    for split_name in splits_to_process:
        if split_name not in dataset:
            # Make typos visible instead of quietly returning fewer splits.
            print(
                f"Warning: split '{split_name}' not found; "
                f"available splits: {list(dataset.keys())}"
            )
            continue

        print(f"Processing {split_name}...")
        raw_data = dataset[split_name]
        all_pairs = []

        for example in tqdm(raw_data, desc=f"Processing {split_name}"):
            all_pairs.extend(expand_example(example))

        processed_splits[split_name] = Dataset.from_list(all_pairs)
        print(f"Created {len(all_pairs)} examples for {split_name}")

    return DatasetDict(processed_splits)

if __name__ == "__main__":
    # Run complete preprocessing. The two results are bound at the top level
    # of the cell, so later cells can read `processed_dataset` directly.
    # Note: main() asks a y/n question on stdin before returning.
    original_dataset, processed_dataset = main()
    
    # Access individual splits like this:
    # train_data = processed_dataset["train"]
    # val_data = processed_dataset["validation"]
    # test_data = processed_dataset["test"]
    
    # Echo the DatasetDict summary (split names, columns, row counts).
    print(f"\nFinal processed dataset: {processed_dataset}")

# Quick usage examples. They live in a bare string literal, so none of the
# code inside is executed; being the cell's last expression, the string's
# repr is what the notebook shows as the cell's output value.
"""
# Load all splits
original, processed = process_all_splits()

# Use for training
train_dataset = processed["train"]
val_dataset = processed["validation"]
test_dataset = processed["test"]

# Or process only what you need
processed = process_specific_splits(["train", "validation"])
"""

Loading AQuA-RAT dataset...
Available splits: ['train', 'test', 'validation']

Processing train split...


Processing train: 100%|██████████| 97467/97467 [00:02<00:00, 41052.29it/s]


  Original train: 97467 examples
  Processed train: 132775 step-prediction pairs
  Average steps per example: 1.36

Processing test split...


Processing test: 100%|██████████| 254/254 [00:00<00:00, 39472.15it/s]


  Original test: 254 examples
  Processed test: 339 step-prediction pairs
  Average steps per example: 1.33

Processing validation split...


Processing validation: 100%|██████████| 254/254 [00:00<00:00, 39271.35it/s]


  Original validation: 254 examples
  Processed validation: 310 step-prediction pairs
  Average steps per example: 1.22

DATASET STATISTICS
Split        Original   Processed    Avg Steps 
--------------------------------------------------
train        97467      132775       1.36      
test         254        339          1.33      
validation   254        310          1.22      

SAMPLE FROM EACH SPLIT

TRAIN SAMPLE:
Step 2/3
Correct Answer: E
INPUT (first 200 chars):
Question: Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P hav...
TARGET:
x + 1.15x = 43
2.15x=43
x = 43/2.15 = 20
Then P will have have walked 1.15*20=23 km.
--------------------------------------------------

TEST SAMPLE:
Step 2/10
Correct Answer: A
INPUT (first 200 chars):
Question: A car is being driven, in a straight line and at a uniform speed, towards the base of a vertical towe

Saving the dataset (1/1 shards): 100%|██████████| 132775/132775 [00:00<00:00, 1439141.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 339/339 [00:00<00:00, 153136.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 310/310 [00:00<00:00, 165803.91 examples/s]


All splits saved successfully!


Creating json from Arrow format: 100%|██████████| 133/133 [00:00<00:00, 214.58ba/s]


Saved train to aqua_rat_processed_train.json


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 422.98ba/s]


Saved test to aqua_rat_processed_test.json


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 435.73ba/s]

Saved validation to aqua_rat_processed_validation.json

Final processed dataset: DatasetDict({
    train: Dataset({
        features: ['input', 'target', 'question', 'correct_answer', 'step_number', 'total_steps'],
        num_rows: 132775
    })
    test: Dataset({
        features: ['input', 'target', 'question', 'correct_answer', 'step_number', 'total_steps'],
        num_rows: 339
    })
    validation: Dataset({
        features: ['input', 'target', 'question', 'correct_answer', 'step_number', 'total_steps'],
        num_rows: 310
    })
})





'\n# Load all splits\noriginal, processed = process_all_splits()\n\n# Use for training\ntrain_dataset = processed["train"]\nval_dataset = processed["validation"]\ntest_dataset = processed["test"]\n\n# Or process only what you need\nprocessed = process_specific_splits(["train", "validation"])\n'

In [5]:
# Show the processed DatasetDict (bound at top level by the preprocessing cell).
print(processed_dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'target', 'question', 'correct_answer', 'step_number', 'total_steps'],
        num_rows: 132775
    })
    test: Dataset({
        features: ['input', 'target', 'question', 'correct_answer', 'step_number', 'total_steps'],
        num_rows: 339
    })
    validation: Dataset({
        features: ['input', 'target', 'question', 'correct_answer', 'step_number', 'total_steps'],
        num_rows: 310
    })
})


In [6]:
# Exploration of processed_dataset: split sizes, column schema and two
# fully printed training pairs.

# 1. Basic overview - one right-aligned line per split
print("Dataset Overview:", "=" * 50, sep="\n")
for split_name in processed_dataset.keys():
    split_data = processed_dataset[split_name]
    print(f"{split_name:>12}: {len(split_data):>6} examples")

print(f"\nFeatures: {processed_dataset['train'].features}")

# 2. Look at a few examples - the first two rows of the train split
print("\n" + "=" * 80, "SAMPLE EXAMPLES", "=" * 80, sep="\n")

for i in range(2):
    example = processed_dataset["train"][i]
    print(
        f"\nExample {i+1} - Step {example['step_number']}/{example['total_steps']}",
        f"Correct Answer: {example['correct_answer']}",
        "\nINPUT:",
        "-" * 40,
        example["input"],
        "\nTARGET:",
        "-" * 40,
        example["target"],
        "=" * 80,
        sep="\n",
    )

# 3. Step distribution analysis
# For every split, report (a) how the step-prediction rows are spread over the
# predicted step number and (b) how many *problems* have a given total number
# of steps.
from collections import Counter

print("\nSTEP DISTRIBUTION ANALYSIS")
print("="*50)

# Use the splits that are actually present rather than a hard-coded list, so a
# dataset built from a subset of splits does not raise KeyError here.
for split_name in processed_dataset.keys():
    split_data = processed_dataset[split_name]

    # Read the two columns directly instead of materialising every row twice.
    step_numbers = list(split_data['step_number'])
    total_steps = list(split_data['total_steps'])

    print(f"\n{split_name.upper()}:")

    # Step prediction distribution: rows per predicted step number
    step_counts = Counter(step_numbers)

    print("  Predicting step distribution:")
    for step in sorted(step_counts.keys())[:5]:  # Show first 5 steps
        count = step_counts[step]
        percentage = (count / len(split_data)) * 100
        print(f"    Step {step}: {count:>5} examples ({percentage:>5.1f}%)")

    # Problem complexity: a problem with T steps contributes T-1 rows, so
    # counting rows would over-weight long problems. expand_example emits
    # exactly one row with step_number == 2 per problem (its first pair), so
    # counting only those rows counts each problem once.
    total_counts = Counter(
        total for step, total in zip(step_numbers, total_steps) if step == 2
    )
    problem_total = sum(total_counts.values())

    print("  Problem complexity:")
    for total in sorted(total_counts.keys())[:5]:
        count = total_counts[total]
        percentage = (count / problem_total) * 100
        print(f"    {total} steps: {count:>5} problems ({percentage:>5.1f}%)")

# 4. Save the dataset
import json

section_rule = "=" * 60
print("\n" + section_rule)
print("SAVING DATASET")
print(section_rule)

# Native on-disk format first (this is the copy the reload check reads back).
hf_path = "aqua_rat_step_prediction"
print(f"Saving HuggingFace dataset to: {hf_path}/")
processed_dataset.save_to_disk(hf_path)
print("✓ HuggingFace dataset saved!")

# Then one pretty-printed JSON array per split (easy to inspect by eye).
print("\nSaving JSON files:")
for split_name in processed_dataset.keys():
    json_path = f"aqua_rat_{split_name}.json"
    records = processed_dataset[split_name].to_list()
    with open(json_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(records, f, indent=4, ensure_ascii=False)
    print(f"  ✓ {split_name}: {json_path} ({len(processed_dataset[split_name])} examples)")




# 5. Create a summary file
# Plain-text recap: per-split sizes, total size, column names and a truncated
# view of the first training pair.
summary_path = "dataset_summary.txt"
print(f"\nCreating summary file: {summary_path}")

with open(summary_path, 'w', encoding='utf-8') as f:
    summary_parts = [
        "AQuA-RAT Step Prediction Dataset Summary\n",
        "=" * 50 + "\n\n",
        "Dataset Overview:\n",
        "-" * 20 + "\n",
    ]
    for split_name in processed_dataset.keys():
        summary_parts.append(f"{split_name:>12}: {len(processed_dataset[split_name]):>6} examples\n")

    total_examples = sum(len(processed_dataset[split]) for split in processed_dataset.keys())
    summary_parts.append(f"\nTotal examples: {total_examples}\n")
    summary_parts.append(f"\nFeatures: {list(processed_dataset['train'].features.keys())}\n")

    # Question and target are cut to their first 100 characters.
    sample = processed_dataset["train"][0]
    summary_parts.extend([
        "\nSample Example:\n",
        "-" * 15 + "\n",
        f"Step: {sample['step_number']}/{sample['total_steps']}\n",
        f"Correct Answer: {sample['correct_answer']}\n",
        f"Question: {sample['question'][:100]}...\n",
        f"Target: {sample['target'][:100]}...\n",
    ])
    f.write("".join(summary_parts))

print("✓ Summary file saved!")

# 6. Test loading the saved dataset
# Round-trip check: read back the directory written to `hf_path` and print
# what was loaded.
print(f"\n{'='*60}")
print("TESTING DATASET LOADING")
print(f"{'='*60}")

try:
    from datasets import DatasetDict
    loaded_dataset = DatasetDict.load_from_disk(hf_path)
    print(f"✓ Successfully loaded dataset!")
    print(f"  Loaded: {loaded_dataset}")
    # Indexing 'train' is inside the try on purpose or not - either way a
    # missing 'train' split is reported by the handler below, not raised.
    print(f"  Train examples: {len(loaded_dataset['train'])}")
except Exception as e:
    # Any failure is only printed; the rest of the cell keeps running.
    print(f"❌ Error loading dataset: {e}")

# 7. Final summary
# List only artifacts this cell really writes: the save_to_disk directory, the
# three per-split JSON files and the text summary. No CSV export is performed
# anywhere in the cell, so no .csv files are advertised.
print(f"\n{'='*60}")
print("FILES CREATED")
print(f"{'='*60}")
print("📁 aqua_rat_step_prediction/     - HuggingFace dataset (use this for training)")
print("📄 aqua_rat_train.json          - Training data (JSON)")
print("📄 aqua_rat_test.json           - Test data (JSON)")
print("📄 aqua_rat_validation.json     - Validation data (JSON)")
print("📝 dataset_summary.txt          - Dataset summary")

# Copy-paste snippet for reloading the saved dataset in a training script.
print(f"\n{'='*60}")
print("READY FOR TRAINING!")
print(f"{'='*60}")
print("To use for training:")
print("from datasets import DatasetDict")
print("dataset = DatasetDict.load_from_disk('aqua_rat_step_prediction')")
print("train_data = dataset['train']")
print("val_data = dataset['validation']")
print("test_data = dataset['test']")

Dataset Overview:
       train: 132775 examples
        test:    339 examples
  validation:    310 examples

Features: {'input': Value(dtype='string', id=None), 'target': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'correct_answer': Value(dtype='string', id=None), 'step_number': Value(dtype='int64', id=None), 'total_steps': Value(dtype='int64', id=None)}

SAMPLE EXAMPLES

Example 1 - Step 2/3
Correct Answer: E

INPUT:
----------------------------------------
Question: Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other?

Options:
A)21
B)21.5
C)22
D)22.5
E)23

Previous Steps:
Step 1: If Q complete x kilometers, then P completes 1.15x kilometers.

Next Step:

TARGET:
----------------------------------------
x + 1.15x = 43
2.15x=43
x = 43/2.15 = 20
Then P will have have walked 1.15*20=23

Saving the dataset (1/1 shards): 100%|██████████| 132775/132775 [00:00<00:00, 1803638.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 339/339 [00:00<00:00, 153053.72 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 310/310 [00:00<00:00, 117370.85 examples/s]

✓ HuggingFace dataset saved!

Saving JSON files:





  ✓ train: aqua_rat_train.json (132775 examples)
  ✓ test: aqua_rat_test.json (339 examples)
  ✓ validation: aqua_rat_validation.json (310 examples)

Creating summary file: dataset_summary.txt
✓ Summary file saved!

TESTING DATASET LOADING
✓ Successfully loaded dataset!
  Loaded: DatasetDict({
    train: Dataset({
        features: ['input', 'target', 'question', 'correct_answer', 'step_number', 'total_steps'],
        num_rows: 132775
    })
    test: Dataset({
        features: ['input', 'target', 'question', 'correct_answer', 'step_number', 'total_steps'],
        num_rows: 339
    })
    validation: Dataset({
        features: ['input', 'target', 'question', 'correct_answer', 'step_number', 'total_steps'],
        num_rows: 310
    })
})
  Train examples: 132775

FILES CREATED
📁 aqua_rat_step_prediction/     - HuggingFace dataset (use this for training)
📄 aqua_rat_train.json          - Training data (JSON)
📄 aqua_rat_test.json           - Test data (JSON)
📄 aqua_rat_validation.json