In [5]:
from datasets import load_dataset

# Request only the columns this analysis uses, then turn the train split into
# a pandas DataFrame.
print("Loading dataset with only specific columns...")
wanted_columns = ['task_id', 'correct_train_input', 'correct_test_input', 'model']
ds = load_dataset("julien31/soar_arc_train_5M", columns=wanted_columns)
train_split = ds['train']
df = train_split.to_pandas()

# Report the columns that came back and preview the first rows.
column_names = list(df.columns)
print(f"Columns: {column_names}")
print("\nFirst few rows:")
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Loading dataset with only specific columns...
Columns: ['correct_train_input', 'correct_test_input', 'task_id', 'model']

First few rows:
Columns: ['correct_train_input', 'correct_test_input', 'task_id', 'model']

First few rows:


Unnamed: 0,correct_train_input,correct_test_input,task_id,model
0,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407
1,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407
2,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407
3,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407
4,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407


In [6]:
# Restrict the data to generations from the two models of interest:
# Mistral-Large-Instruct-2407 and Qwen2.5-72B-Instruct.
n_rows_before = len(df)
print(f"Original dataset size: {n_rows_before}")
print(f"Unique models in dataset: {df['model'].unique()}")

# Boolean mask over `df`: True where the row's model is one we keep.
models_to_keep = ['Mistral-Large-Instruct-2407', 'Qwen2.5-72B-Instruct']
keep_mask = df['model'].isin(models_to_keep)
# .copy() so later column assignments do not write into a view of `df`.
df_filtered = df.loc[keep_mask].copy()

n_rows_after = len(df_filtered)
print(f"Filtered dataset size: {n_rows_after}")
print(f"Percentage retained: {n_rows_after/n_rows_before*100:.2f}%")

# Row counts per retained model.
print("\nDistribution by model:")
per_model_counts = df_filtered['model'].value_counts()
print(per_model_counts)

# Preview of the filtered frame (last expression -> rich display).
print("\nFirst few rows of filtered data:")
df_filtered.head()

Original dataset size: 4926487
Unique models in dataset: ['Mistral-Large-Instruct-2407' 'Qwen2.5-72B-Instruct'
 'Qwen2.5-Coder-32B-Instruct' 'Qwen2.5-Coder-14B-Instruct'
 'Qwen2.5-Coder-7B-Instruct']
Filtered dataset size: 1734164
Percentage retained: 35.20%

Distribution by model:
model
Qwen2.5-72B-Instruct           1055492
Mistral-Large-Instruct-2407     678672
Name: count, dtype: int64

First few rows of filtered data:
Filtered dataset size: 1734164
Percentage retained: 35.20%

Distribution by model:
model
Qwen2.5-72B-Instruct           1055492
Mistral-Large-Instruct-2407     678672
Name: count, dtype: int64

First few rows of filtered data:


Unnamed: 0,correct_train_input,correct_test_input,task_id,model
0,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407
1,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407
2,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407
3,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407
4,"[True, True, True, True, True]",[True],007bbfb7,Mistral-Large-Instruct-2407


In [7]:
# Add score column: (correct_train + correct_test) / (total_train + total_test)
#
# The counts are computed column-by-column with Series.map and divided as whole
# Series. This avoids a row-wise DataFrame.apply(axis=1), which builds a Series
# object for every one of the ~1.7M rows and is far slower for the same result.
n_correct = (
    df_filtered['correct_train_input'].map(sum)
    + df_filtered['correct_test_input'].map(sum)
)
n_examples = (
    df_filtered['correct_train_input'].map(len)
    + df_filtered['correct_test_input'].map(len)
)
# A row with no train and no test examples has no meaningful score: mask the
# zero denominator so the score becomes NaN instead of raising ZeroDivisionError.
df_filtered['score'] = n_correct / n_examples.where(n_examples > 0)

print("Score statistics:")
print(f"Mean score: {df_filtered['score'].mean():.4f}")
print(f"Median score: {df_filtered['score'].median():.4f}")
print(f"Min score: {df_filtered['score'].min():.4f}")
print(f"Max score: {df_filtered['score'].max():.4f}")

# Show score distribution
print(f"\nScore distribution:")
print(df_filtered['score'].describe())

# Display first few rows with score
print("\nFirst few rows with score:")
df_filtered[['task_id', 'model', 'score']].head()

Score statistics:
Mean score: 0.0247
Median score: 0.0000
Min score: 0.0000
Max score: 1.0000

Score distribution:
count    1.734164e+06
mean     2.466137e-02
std      1.268591e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: score, dtype: float64

First few rows with score:


Unnamed: 0,task_id,model,score
0,007bbfb7,Mistral-Large-Instruct-2407,1.0
1,007bbfb7,Mistral-Large-Instruct-2407,1.0
2,007bbfb7,Mistral-Large-Instruct-2407,1.0
3,007bbfb7,Mistral-Large-Instruct-2407,1.0
4,007bbfb7,Mistral-Large-Instruct-2407,1.0


In [8]:
# Load all ARC-AGI-1 training tasks and select top/bottom 50 rows per task
import sys
sys.path.append('..')
from task_loader import TaskLoader
import pandas as pd

# Load all training task IDs
loader = TaskLoader(data_root="../../data")

# Every *.json file under arc-agi-1/training is one task; the file stem is its ID.
training_path = loader.data_root / "arc-agi-1" / "training"
all_training_task_ids = [f.stem for f in training_path.glob("*.json")]

print(f"Found {len(all_training_task_ids)} training tasks in ARC-AGI-1")
print(f"First 10 task IDs: {all_training_task_ids[:10]}")

# Number of rows taken from each end of a task's score ranking.
rows_per_end = 50

# Group once up front instead of boolean-scanning the whole frame for every
# task. Rows inside each group keep their original relative order.
rows_by_task = df_filtered.groupby('task_id', sort=False)

# Create final dataset with top 50 + bottom 50 for each task
final_rows = []

for task_id in all_training_task_ids:
    if task_id not in rows_by_task.groups:
        print(f"No SOAR data found for task {task_id}")
        continue

    task_rows = rows_by_task.get_group(task_id)

    # Sort by score (ascending for easy top/bottom selection)
    task_rows_sorted = task_rows.sort_values('score', ascending=True)

    # Lowest-scoring rows first ...
    bottom_rows = task_rows_sorted.head(rows_per_end)
    # ... then the highest-scoring rows, taken only from what is left after the
    # bottom slice. For tasks with fewer than 2 * rows_per_end rows a plain
    # tail() would overlap head() and the same rows would be selected twice.
    top_rows = task_rows_sorted.iloc[len(bottom_rows):].tail(rows_per_end)

    # Combine them
    selected_rows = pd.concat([bottom_rows, top_rows], ignore_index=True)

    print(f"Task {task_id}: {len(task_rows)} total rows, selected {len(selected_rows)} (bottom {len(bottom_rows)} + top {len(top_rows)})")

    final_rows.append(selected_rows)

# Combine all selected rows
if final_rows:
    final_df = pd.concat(final_rows, ignore_index=True)

    print(f"\nFinal dataset summary:")
    print(f"Total rows: {len(final_df)}")
    print(f"Unique tasks: {final_df['task_id'].nunique()}")
    print(f"Average rows per task: {len(final_df) / final_df['task_id'].nunique():.1f}")

    # Show distribution by model
    print(f"\nModel distribution in final dataset:")
    print(final_df['model'].value_counts())

    # Show score statistics
    print(f"\nScore statistics in final dataset:")
    print(final_df['score'].describe())

    print(f"\nFirst few rows of final dataset:")
    print(final_df[['task_id', 'model', 'score']].head(10))
else:
    print("No data found for any tasks!")
    # Keep the column schema so later column lookups (e.g. final_df['task_id'])
    # do not fail with KeyError on a column-less frame.
    final_df = df_filtered.head(0).copy()

Found 400 training tasks in ARC-AGI-1
First 10 task IDs: ['49d1d64f', '890034e9', '1f0c79e5', '1e32b0e9', '22168020', '0b148d64', '3ac3eb23', '54d82841', '2dc579da', 'a9f96cdd']
Task 49d1d64f: 615 total rows, selected 100 (bottom 50 + top 50)
Task 890034e9: 8633 total rows, selected 100 (bottom 50 + top 50)
Task 1f0c79e5: 15359 total rows, selected 100 (bottom 50 + top 50)
Task 1e32b0e9: 9297 total rows, selected 100 (bottom 50 + top 50)
Task 22168020: 1098 total rows, selected 100 (bottom 50 + top 50)
Task 0b148d64: 1179 total rows, selected 100 (bottom 50 + top 50)
Task 3ac3eb23: 1197 total rows, selected 100 (bottom 50 + top 50)
Task 54d82841: 2363 total rows, selected 100 (bottom 50 + top 50)
Task 2dc579da: 2181 total rows, selected 100 (bottom 50 + top 50)
Task a9f96cdd: 2213 total rows, selected 100 (bottom 50 + top 50)
Task 543a7ed5: 3613 total rows, selected 100 (bottom 50 + top 50)
Task ce9e57f2: 1134 total rows, selected 100 (bottom 50 + top 50)
Task 496994bd: 412 total rows,

In [9]:
# Check the final dataset and save it
print("Final dataset created successfully!")
print(f"Total rows in final dataset: {len(final_df):,}")
print(f"Unique tasks represented: {final_df['task_id'].nunique()}")

# Derive the expected size from the task list actually found on disk rather
# than a hard-coded 400, so the check stays meaningful if the task set changes.
expected_task_count = len(all_training_task_ids)
expected_rows_per_task = 100  # 50 lowest-scoring + 50 highest-scoring rows
print(f"Expected rows ({expected_task_count} tasks × {expected_rows_per_task} rows): {expected_task_count * expected_rows_per_task:,}")

# Check how many rows per task we actually got
rows_per_task = final_df['task_id'].value_counts()
print(f"\nRows per task statistics:")
print(f"Min rows per task: {rows_per_task.min()}")
print(f"Max rows per task: {rows_per_task.max()}")
print(f"Mean rows per task: {rows_per_task.mean():.1f}")
print(f"Tasks with exactly {expected_rows_per_task} rows: {(rows_per_task == expected_rows_per_task).sum()}")
print(f"Tasks with less than {expected_rows_per_task} rows: {(rows_per_task < expected_rows_per_task).sum()}")

# Show model distribution
print(f"\nModel distribution in final dataset:")
model_counts = final_df['model'].value_counts()
for model, count in model_counts.items():
    percentage = count / len(final_df) * 100
    print(f"{model}: {count:,} ({percentage:.1f}%)")

# Show score distribution
print(f"\nScore distribution in final dataset:")
score_stats = final_df['score'].describe()
for stat, value in score_stats.items():
    print(f"{stat}: {value:.4f}")

# Save the final dataset
# NOTE(review): the correct_*_input columns hold per-example sequences; CSV
# stores such cells as their string representation — confirm that downstream
# readers parse them back as intended.
output_path = "soar_filtered_top_bottom_50_per_task.csv"
final_df.to_csv(output_path, index=False)
print(f"\nDataset saved to: {output_path}")
print(f"File size: {len(final_df)} rows × {len(final_df.columns)} columns")

Final dataset created successfully!
Total rows in final dataset: 40,000
Unique tasks represented: 400
Expected rows (400 tasks × 100 rows): 40,000

Rows per task statistics:
Min rows per task: 100
Max rows per task: 100
Mean rows per task: 100.0
Tasks with exactly 100 rows: 400
Tasks with less than 100 rows: 0

Model distribution in final dataset:
Qwen2.5-72B-Instruct: 25,635 (64.1%)
Mistral-Large-Instruct-2407: 14,365 (35.9%)

Score distribution in final dataset:
count: 40000.0000
mean: 0.3524
std: 0.4355
min: 0.0000
25%: 0.0000
50%: 0.0000
75%: 1.0000
max: 1.0000

Dataset saved to: soar_filtered_top_bottom_50_per_task.csv
File size: 40000 rows × 5 columns


In [10]:
# Re-run the per-task top/bottom selection while carrying each row's original
# pandas index, then use those indices to fetch every column for just the
# selected rows from the HuggingFace dataset.
print("Setting up efficient row tracking without loading full dataset...")

# df_filtered still carries the index it had in `df`; reset_index() moves that
# index into an 'index' column so it survives sorting and concatenation.
print("Recreating final selection with original index preservation...")

df_filtered_with_idx = df_filtered.reset_index()
print(f"df_filtered now has original indices: {df_filtered_with_idx.columns.tolist()}")

# Number of rows taken from each end of a task's score ranking.
rows_per_end = 50

# Group once instead of boolean-scanning the whole frame for every task.
rows_by_task = df_filtered_with_idx.groupby('task_id', sort=False)

final_rows_with_idx = []

for task_id in all_training_task_ids:
    if task_id not in rows_by_task.groups:
        continue

    # Sort by score and select top/bottom rows
    task_rows_sorted = rows_by_task.get_group(task_id).sort_values('score', ascending=True)
    bottom_rows = task_rows_sorted.head(rows_per_end)
    # Take the top slice only from rows not already in the bottom slice, so a
    # task with fewer than 2 * rows_per_end rows never contributes a row twice.
    top_rows = task_rows_sorted.iloc[len(bottom_rows):].tail(rows_per_end)

    final_rows_with_idx.append(pd.concat([bottom_rows, top_rows]))

# Combine all selected rows
final_df_with_idx = pd.concat(final_rows_with_idx, ignore_index=True)

# Extract the original indices we need
selected_indices = final_df_with_idx['index'].tolist()  # These are the original row numbers
print(f"Selected {len(selected_indices)} rows with original indices")
print(f"Index range: {min(selected_indices)} to {max(selected_indices)}")

# Load the dataset with all columns, then materialise only the selected rows
# as a pandas DataFrame.
print("\nLoading only selected rows from HuggingFace dataset...")
ds_full = load_dataset("julien31/soar_arc_train_5M")

selected_dataset = ds_full['train'].select(selected_indices)
final_full_df = selected_dataset.to_pandas()

# The indices come from a column-subset load of the same dataset. Confirm the
# rows fetched here really are the rows that were selected, instead of
# trusting that both loads share the same row order.
assert final_full_df['task_id'].tolist() == final_df_with_idx['task_id'].tolist(), \
    "task_id mismatch between selected rows and rows loaded by index"
assert final_full_df['model'].tolist() == final_df_with_idx['model'].tolist(), \
    "model mismatch between selected rows and rows loaded by index"

print(f"\nEfficiently loaded full dataset:")
print(f"Rows: {len(final_full_df):,}")
print(f"Columns: {len(final_full_df.columns)}")
print(f"All columns: {list(final_full_df.columns)}")

# Add our computed score column to the full dataset:
# (correct_train + correct_test) / (total_train + total_test), computed
# column-wise rather than with a row-wise apply.
print("\nAdding score column to full dataset...")
n_correct = (
    final_full_df['correct_train_input'].map(sum)
    + final_full_df['correct_test_input'].map(sum)
)
n_examples = (
    final_full_df['correct_train_input'].map(len)
    + final_full_df['correct_test_input'].map(len)
)
# Rows with no examples get NaN instead of raising ZeroDivisionError.
final_full_df['score'] = n_correct / n_examples.where(n_examples > 0)

# Verify we got the right data
print(f"\nVerification:")
print(f"Unique tasks: {final_full_df['task_id'].nunique()}")
print(f"Model distribution:")
print(final_full_df['model'].value_counts())

# Save the complete dataset with all original columns
# NOTE(review): several columns hold nested sequences; CSV stores such cells as
# their string representation — confirm this round-trips for downstream use.
output_path_full = "soar_filtered_top_bottom_50_per_task_FULL.csv"
final_full_df.to_csv(output_path_full, index=False)
print(f"\nFull dataset saved to: {output_path_full}")

# Show first few rows
print(f"\nFirst few rows (key columns):")
key_cols = ['task_id', 'model', 'score']
print(final_full_df[key_cols].head())

Setting up efficient row tracking without loading full dataset...
Recreating final selection with original index preservation...
df_filtered now has original indices: ['index', 'correct_train_input', 'correct_test_input', 'task_id', 'model', 'score']
Selected 40000 rows with original indices
Index range: 0 to 4926252

Loading only selected rows from HuggingFace dataset...
Selected 40000 rows with original indices
Index range: 0 to 4926252

Loading only selected rows from HuggingFace dataset...

Efficiently loaded full dataset:
Rows: 40,000
Columns: 8
All columns: ['code', 'correct_train_input', 'predicted_train_output', 'correct_test_input', 'predicted_test_output', 'task_id', 'model', 'generation']

Adding score column to full dataset...

Efficiently loaded full dataset:
Rows: 40,000
Columns: 8
All columns: ['code', 'correct_train_input', 'predicted_train_output', 'correct_test_input', 'predicted_test_output', 'task_id', 'model', 'generation']

Adding score column to full dataset...

