In [1]:
from pathlib import Path
import sys
project_root = next((parent for parent in [Path.cwd()] + list(Path.cwd().parents) if (parent / "pyproject.toml").exists()), Path.cwd())
sys.path.append(str(project_root))

In [2]:
import duckdb
con = duckdb.connect()

In [3]:
from llm_python.utils.task_loader import TaskLoader

# Initialize task loader
task_loader = TaskLoader()

arc_agi_1_training_dir = task_loader.data_root / "arc-agi-1" / "training"
training_task_ids = []

if arc_agi_1_training_dir.exists():
    for task_file in arc_agi_1_training_dir.glob("*.json"):
        training_task_ids.append(task_file.stem)

print(f"Found {len(training_task_ids)} ARC-AGI-1 training tasks")
print(f"First 10 task IDs: {training_task_ids[:10]}")

Found 400 ARC-AGI-1 training tasks
First 10 task IDs: ['49d1d64f', '890034e9', '1f0c79e5', '1e32b0e9', '22168020', '0b148d64', '3ac3eb23', '54d82841', '2dc579da', 'a9f96cdd']


In [4]:
# Create a DuckDB table with the training task IDs
import pandas as pd

# Convert to DataFrame
training_task_df = pd.DataFrame({'task_id': training_task_ids})

# Create table in DuckDB
con.execute("DROP TABLE IF EXISTS arc_agi_1_training_tasks")
con.execute("""
    CREATE TABLE arc_agi_1_training_tasks AS 
    SELECT * FROM training_task_df
""")

print(f"Created table with {len(training_task_ids)} training task IDs")
print("\nFirst few rows:")
print(con.execute("SELECT * FROM arc_agi_1_training_tasks LIMIT 5").df())

Created table with 400 training task IDs

First few rows:
    task_id
0  49d1d64f
1  890034e9
2  1f0c79e5
3  1e32b0e9
4  22168020


In [5]:
# Describe the schema and some basic stats of the parquet files
schema = con.execute("""
DESCRIBE SELECT * FROM '/tmp/king_programs_partition_*.parquet'
""").df()
print("\nSchema:")
print(schema)


Schema:
              column_name   column_type null   key default extra
0                 task_id       VARCHAR  YES  None    None  None
1                    code       VARCHAR  YES  None    None  None
2  predicted_train_output  BIGINT[][][]  YES  None    None  None
3   predicted_test_output  BIGINT[][][]  YES  None    None  None
4     correct_train_input     BOOLEAN[]  YES  None    None  None
5      correct_test_input     BOOLEAN[]  YES  None    None  None
6                   model       VARCHAR  YES  None    None  None


In [6]:
# Load only fully correct programs with deduplication
print("Loading fully correct programs with deduplication (this may take a moment)...")
con.execute("""
DROP TABLE IF EXISTS fully_correct_programs;
CREATE TABLE fully_correct_programs AS
WITH deduplicated_programs AS (
    SELECT 
        p.task_id,
        p.code,
        p.predicted_train_output,
        p.predicted_test_output,
        p.correct_train_input,
        p.correct_test_input,
        p.model,
        LENGTH(p.code) as program_length,
        -- Create normalized code for deduplication
        LOWER(REGEXP_REPLACE(p.code, '\s+', '', 'g')) as normalized_code,
        -- Keep the shortest program among duplicates, with model as tiebreaker
        ROW_NUMBER() OVER (
            PARTITION BY p.task_id, LOWER(REGEXP_REPLACE(p.code, '\s+', '', 'g'))
            ORDER BY LENGTH(p.code) ASC, p.model ASC, p.code ASC
        ) as dedup_rank
    FROM '/tmp/king_programs_partition_*.parquet' p
    INNER JOIN arc_agi_1_training_tasks t ON p.task_id = t.task_id
    WHERE p.model != 'hodel-translated'
    AND (SELECT SUM(s) FROM UNNEST(p.correct_train_input) AS t(s)) = ARRAY_LENGTH(p.correct_train_input)
    AND (SELECT SUM(s) FROM UNNEST(p.correct_test_input) AS t(s)) = ARRAY_LENGTH(p.correct_test_input)
)
SELECT 
    task_id,
    code,
    predicted_train_output,
    predicted_test_output,
    correct_train_input,
    correct_test_input,
    model,
    program_length,
    ROW_NUMBER() OVER (
        PARTITION BY task_id 
        ORDER BY program_length ASC, code ASC
    ) as rank_in_task
FROM deduplicated_programs
WHERE dedup_rank = 1
""")

row_count = con.execute("SELECT COUNT(*) as count FROM fully_correct_programs").fetchone()[0]
print(f"Loaded {row_count:,} deduplicated fully correct programs")

# Show some basic stats
task_count = con.execute("SELECT COUNT(DISTINCT task_id) FROM fully_correct_programs").fetchone()[0]
print(f"Programs available for {task_count} unique tasks")

# Show distribution of programs per task
print("\nPrograms per task distribution:")
task_dist = con.execute("""
SELECT 
    programs_per_task,
    COUNT(*) as num_tasks
FROM (
    SELECT task_id, COUNT(*) as programs_per_task
    FROM fully_correct_programs 
    GROUP BY task_id
) 
GROUP BY programs_per_task
ORDER BY programs_per_task
LIMIT 10
""").df()
print(task_dist)

Loading fully correct programs with deduplication (this may take a moment)...
Loaded 29,832 deduplicated fully correct programs
Programs available for 370 unique tasks

Programs per task distribution:
   programs_per_task  num_tasks
0                  1         10
1                  2          5
2                  3          4
3                  4          4
4                  5          3
5                  6          4
6                  7          4
7                  8         13
8                  9         13
9                 10         21
Loaded 29,832 deduplicated fully correct programs
Programs available for 370 unique tasks

Programs per task distribution:
   programs_per_task  num_tasks
0                  1         10
1                  2          5
2                  3          4
3                  4          4
4                  5          3
5                  6          4
6                  7          4
7                  8         13
8                  9         13
9   

In [10]:
# Create final training dataset with only top N fully correct programs per task
print(f"Creating final training dataset with top N programs per task...")
con.execute("""
DROP TABLE IF EXISTS training_dataset;
CREATE TABLE training_dataset AS
SELECT 
    task_id,
    code,
    predicted_train_output,
    predicted_test_output,
    correct_train_input,
    correct_test_input,
    model,
    program_length,
    rank_in_task
FROM fully_correct_programs 
WHERE rank_in_task <= 50
""")

final_count = con.execute("SELECT COUNT(*) FROM training_dataset").fetchone()[0]
print(f"Final training dataset contains {final_count:,} programs")

# Show how many tasks have different numbers of programs
print("\nTasks by number of programs in final dataset:")
task_program_counts = con.execute("""
SELECT 
    COUNT(*) as program_count,
    COUNT(DISTINCT task_id) as num_tasks
FROM training_dataset
GROUP BY task_id
ORDER BY program_count
""").df()
print(task_program_counts.groupby('program_count').size())

Creating final training dataset with top N programs per task...
Final training dataset contains 12,008 programs

Tasks by number of programs in final dataset:
program_count
1      10
2       5
3       4
4       4
5       3
6       4
7       4
8      13
9      13
10     21
11      5
12      8
13      4
14      5
15      6
16      9
17      2
18      4
19      3
20      2
21      3
22      3
23      6
24      1
25      1
26      5
27      2
28      4
29      2
30      6
32      4
33      4
34      5
35      1
36      3
37      5
38      1
39      2
40      2
41      1
42      4
43      3
44      2
45      2
47      4
48      2
49      2
50    161
dtype: int64


In [11]:
# Compute summary statistics for fully correct programs
print("Training Dataset Summary Statistics:")
summary_stats = con.execute("""
SELECT 
    COUNT(*) as total_programs,
    COUNT(DISTINCT task_id) as unique_tasks,
    MIN(program_length) as min_length,
    MAX(program_length) as max_length,
    AVG(program_length) as avg_length,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY program_length) as median_length
FROM training_dataset
""").df()
print(summary_stats)

# Show tasks with the most programs (up to 8 each)
print(f"\nTasks with most programs (top 10):")
top_tasks = con.execute("""
SELECT 
    task_id,
    COUNT(*) as program_count,
    MIN(program_length) as min_length,
    MAX(program_length) as max_length,
    AVG(program_length) as avg_length
FROM training_dataset
GROUP BY task_id
ORDER BY program_count DESC, task_id
LIMIT 10
""").df()
print(top_tasks)

Training Dataset Summary Statistics:
   total_programs  unique_tasks  min_length  max_length  avg_length  \
0           12008           370          42        6477  826.533145   

   median_length  
0          680.0  

Tasks with most programs (top 10):
    task_id  program_count  min_length  max_length  avg_length
0  007bbfb7             50         304         854      553.42
1  08ed6ac7             50         297         753      601.02
2  0b148d64             50         547        1036      862.90
3  0ca9ddb6             50         505        1031      865.86
4  0d3d703e             50         154         352      277.86
5  1190e5a7             50         455         820      697.32
6  178fcbfb             50         398         911      735.66
7  1cf80156             50         277         575      463.02
8  1e0a9b12             50         245         595      466.80
9  1f642eb9             50         544        1985     1188.90


In [12]:
# Export final training dataset with only original columns
print("Exporting final dataset...")
training_dataset = con.execute("""
SELECT 
    task_id,
    code,
    predicted_train_output,
    predicted_test_output,
    correct_train_input,
    correct_test_input,
    model
FROM training_dataset
ORDER BY task_id, rank_in_task
""").df()

print(f"Final training dataset shape: {training_dataset.shape}")
print(f"Number of unique tasks: {training_dataset['task_id'].nunique()}")

# Save to parquet
output_path = "/tmp/arc_training_dataset_fully_correct_50.parquet"
training_dataset.to_parquet(output_path)
print(f"\nDataset saved to: {output_path}")

# Show distribution of programs per task
print(f"\nPrograms per task distribution:")
task_counts = training_dataset.groupby('task_id').size()
print(f"Mean: {task_counts.mean():.2f}")
print(f"Median: {task_counts.median():.2f}")
print(f"Min: {task_counts.min()}, Max: {task_counts.max()}")
print(f"\nTasks with exactly 8 programs: {(task_counts == 8).sum()}")
print(f"Tasks with fewer than 8 programs: {(task_counts < 8).sum()}")

# Show a few sample task IDs and their program counts
print(f"\nSample task program counts:")
print(task_counts.head(10))

Exporting final dataset...
Final training dataset shape: (12008, 7)
Number of unique tasks: 370

Dataset saved to: /tmp/arc_training_dataset_fully_correct_50.parquet

Programs per task distribution:
Mean: 32.45
Median: 38.50
Min: 1, Max: 50

Tasks with exactly 8 programs: 13
Tasks with fewer than 8 programs: 34

Sample task program counts:
task_id
007bbfb7    50
00d62c1b    40
017c7c7b    29
025d127b     9
0520fde7    37
05269061    15
05f2a901    23
06df4c85     9
08ed6ac7    50
0962bcdd    16
dtype: int64
Final training dataset shape: (12008, 7)
Number of unique tasks: 370

Dataset saved to: /tmp/arc_training_dataset_fully_correct_50.parquet

Programs per task distribution:
Mean: 32.45
Median: 38.50
Min: 1, Max: 50

Tasks with exactly 8 programs: 13
Tasks with fewer than 8 programs: 34

Sample task program counts:
task_id
007bbfb7    50
00d62c1b    40
017c7c7b    29
025d127b     9
0520fde7    37
05269061    15
05f2a901    23
06df4c85     9
08ed6ac7    50
0962bcdd    16
dtype: int64
