In [1]:
from pathlib import Path
import sys
# Locate the project root: the nearest directory, starting at the current
# working directory and walking up through its parents, that contains a
# pyproject.toml. Falls back to the current working directory if none does.
_cwd = Path.cwd()
project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "pyproject.toml").exists()),
    _cwd,
)

# Make project-local packages importable. Guarded so that re-running this cell
# does not keep appending the same entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [2]:
from google.cloud import bigquery

# Fully-qualified BigQuery table IDs: the raw input table and the cleaned
# table that this notebook (re)creates.
source_table = "trelis-arc.arc.superking_ext"
table_name = "trelis-arc.arc.superking_partially_correct_clean"

# Last dotted component of the destination table ID; used as the base name
# of the local parquet file written later in the notebook.
file_name = table_name.rsplit(".", 1)[-1]

# BigQuery client bound to the project that owns both tables.
client = bigquery.Client(project="trelis-arc")

In [3]:
# Build the statement that (re)creates `table_name` from `source_table`.
# The query runs in three stages (CTEs):
#   1. programs_cleaned      - drops rows whose model is 'hodel-translated' and
#                              normalises blank lines in `code`.
#   2. programs_with_metrics - counts the true flags in correct_train_input /
#                              correct_test_input and computes the largest grid
#                              height and width among the predicted train/test outputs.
#   3. programs_filtered     - keeps rows whose four max dimensions are all <= 40
#                              and that have at least one correct train or test
#                              example, and adds `key` = hex SHA-256 of
#                              "<task_id>|<cleaned code>".
#
# Escaping note: this is a regular (non-raw) f-string, so each `\\n` / `\\s`
# below is sent to BigQuery as `\n` / `\s`. The pattern is therefore
# r'\n(\s*\n)+' and the replacement is the SQL literal '\n\n'.
#
# NOTE(review): the size and correctness filters compare aggregate results
# (MAX / SUM over UNNEST). For a row whose list is empty these aggregates are
# presumably NULL, which would make the comparison fail and drop the row -
# confirm this is the intended handling of rows without predictions.
create_final_table_query = f"""
CREATE OR REPLACE TABLE `{table_name}` AS

-- Clean programs by collapsing multiple empty lines into single empty lines
WITH programs_cleaned AS (
    SELECT 
        k.task_id,
        -- Clean code by collapsing multiple consecutive newlines into at most one empty line
        -- Pattern matches multiple consecutive newlines with optional whitespace
        REGEXP_REPLACE(k.code, r'\\n(\\s*\\n)+', '\\n\\n') as code,
        k.model,
        k.predicted_train_output,
        k.predicted_test_output,
        k.correct_train_input,
        k.correct_test_input
    FROM `{source_table}` k
    WHERE k.model != 'hodel-translated'
),
-- Calculate metrics and filter by grid size and correctness
programs_with_metrics AS (
    SELECT 
        task_id,
        code,
        model,
        predicted_train_output,
        predicted_test_output,
        correct_train_input,
        correct_test_input,
        -- Count correct train examples
        (SELECT SUM(IF(element, 1, 0)) 
         FROM UNNEST(correct_train_input.list)) as correct_train_count,
        -- Count correct test examples
        (SELECT SUM(IF(element, 1, 0)) 
         FROM UNNEST(correct_test_input.list)) as correct_test_count,
        -- Check grid sizes for train output
        (SELECT MAX(ARRAY_LENGTH(grid_2d.element.list)) 
         FROM UNNEST(predicted_train_output.list) AS grid_2d) as max_train_grid_height,
        (SELECT MAX(ARRAY_LENGTH(row_1d.element.list)) 
         FROM UNNEST(predicted_train_output.list) AS grid_2d,
              UNNEST(grid_2d.element.list) AS row_1d) as max_train_grid_width,
        -- Check grid sizes for test output
        (SELECT MAX(ARRAY_LENGTH(grid_2d.element.list)) 
         FROM UNNEST(predicted_test_output.list) AS grid_2d) as max_test_grid_height,
        (SELECT MAX(ARRAY_LENGTH(row_1d.element.list)) 
         FROM UNNEST(predicted_test_output.list) AS grid_2d,
              UNNEST(grid_2d.element.list) AS row_1d) as max_test_grid_width
    FROM programs_cleaned
),
-- Filter by grid size and require at least one correct in train OR test
programs_filtered AS (
    SELECT *,
        -- Create a hash key from task_id and cleaned code
        TO_HEX(SHA256(CONCAT(task_id, '|', code))) as key
    FROM programs_with_metrics
    WHERE max_train_grid_height <= 40 AND max_train_grid_width <= 40
      AND max_test_grid_height <= 40 AND max_test_grid_width <= 40
      AND (correct_train_count > 0 OR correct_test_count > 0)
)
SELECT task_id, code, model, predicted_train_output, predicted_test_output,
       correct_train_input, correct_test_input, key
FROM programs_filtered
ORDER BY task_id, code
"""

# Submit the statement and wait on job.result() before reporting success, so
# the confirmation below is only printed once the query job has returned.
print("Executing BigQuery table creation...")
job = client.query(create_final_table_query)
result = job.result()
print(f"‚úì Table `{table_name}` created successfully")

Executing BigQuery table creation...
‚úì Table `trelis-arc.arc.superking_partially_correct_clean` created successfully


In [3]:
# Sanity check: count the rows in the freshly created table.
count_query = f"SELECT COUNT(*) as total_count FROM `{table_name}`"
count_result = client.query(count_query).result()

# COUNT(*) yields exactly one row; read its single `total_count` column.
count_row = next(count_result)
total_count = count_row.total_count
print(f"Total programs in table: {total_count:,}")

Total programs in table: 331,430


In [4]:
from llm_python.datasets.bigquery_export import load_bigquery_table_as_dataframe

# Pull the whole table into a local DataFrame. Sharding is requested
# explicitly via use_sharding=True.
print("Loading BigQuery table data...")
load_options = {
    "client": client,
    "table_name": table_name,
    "use_sharding": True,
}
raw_data = load_bigquery_table_as_dataframe(**load_options)

loaded_row_count = len(raw_data)
print(f"Loaded {loaded_row_count} programs from BigQuery table")

Loading BigQuery table data...
Exporting BigQuery table 'trelis-arc.arc.superking_partially_correct_clean' to GCS with sharding...
Waiting for BigQuery export to complete...
Waiting for BigQuery export to complete...
‚úì Export to GCS completed successfully
Downloading sharded files from GCS...
‚úì Export to GCS completed successfully
Downloading sharded files from GCS...
Found 10 sharded files
Found 10 sharded files
‚úì Downloaded shard 1/10
‚úì Downloaded shard 1/10
‚úì Downloaded shard 2/10
‚úì Downloaded shard 2/10
‚úì Downloaded shard 3/10
‚úì Downloaded shard 3/10
‚úì Downloaded shard 4/10
‚úì Downloaded shard 4/10
‚úì Downloaded shard 5/10
‚úì Downloaded shard 5/10
‚úì Downloaded shard 6/10
‚úì Downloaded shard 6/10
‚úì Downloaded shard 7/10
‚úì Downloaded shard 7/10
‚úì Downloaded shard 8/10
‚úì Downloaded shard 8/10
‚úì Downloaded shard 9/10
‚úì Downloaded shard 9/10
‚úì Downloaded shard 10/10
Combining sharded files...
‚úì Downloaded shard 10/10
Combining sharded files...
‚úì

In [5]:
from llm_python.datasets.bigquery_converter import convert_bigquery_to_soar, save_soar_parquet

# Peek at one row to confirm the column layout before converting. The check is
# guarded so that an empty export is reported clearly instead of failing with
# an IndexError on iloc[0].
print("Inspecting BigQuery data structure...")
if len(raw_data) > 0:
    sample_row = raw_data.iloc[0]
    print(f"Sample row columns: {sample_row.index.tolist()}")
    print(f"Train output type: {type(sample_row['predicted_train_output'])}")
    print(f"Train correct type: {type(sample_row['correct_train_input'])}")
    print(f"Key example: {sample_row['key']}")
else:
    print("BigQuery table returned no rows; nothing to inspect.")

print("\n" + "="*50)

# Convert BigQuery data to SOAR format using the project's reusable converter
print("Converting BigQuery data to SOAR format...")
final_dataset = convert_bigquery_to_soar(raw_data, show_progress=True)

# Destination of the parquet export. Defined unconditionally so the name
# exists whether or not anything is written.
output_path = f"/tmp/{file_name}.parquet"

# Only write a file when the conversion produced at least one program
if len(final_dataset) > 0:
    print(f"Saving final dataset to: {output_path}")

    save_soar_parquet(final_dataset, output_path)
else:
    print("No valid data to save!")

Inspecting BigQuery data structure...
Sample row columns: ['task_id', 'code', 'model', 'predicted_train_output', 'predicted_test_output', 'correct_train_input', 'correct_test_input', 'key']
Train output type: <class 'dict'>
Train correct type: <class 'dict'>
Key example: d0fd42df8a59468f9e14211fd974ed75823c6cd7773ad12a72d1242b3fd708c8

Converting BigQuery data to SOAR format...


Converting BQ to SOAR: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 331430/331430 [00:35<00:00, 9436.10it/s] 



Successfully converted 331430 programs from 331430 input rows
Saving final dataset to: /tmp/superking_partially_correct_clean.parquet
‚úì Saved 331430 programs to /tmp/superking_partially_correct_clean.parquet with proper PyArrow schema
‚úì Saved 331430 programs to /tmp/superking_partially_correct_clean.parquet with proper PyArrow schema


In [6]:
# Validate the final dataset using our reusable validation function
from llm_python.datasets.schema import validate_soar_dataset
import pandas as pd

# Section banner for the validation report
banner = "=" * 80
print(banner, "DATASET VALIDATION", banner, sep="\n")

# Re-read the parquet file from disk so that validation covers what was
# actually written, not the in-memory frame.
file_path = f"/tmp/{file_name}.parquet"

results = validate_soar_dataset(
    pd.read_parquet(file_path),
    max_grid_size=40,  # same 40-cell limit the table-creation query filters on
    silent=False,
)

DATASET VALIDATION
‚úì Starting dataset validation...
‚úì Dataset shape: (331430, 8)
‚úì Unique tasks: 976
‚úÖ Dataset validation passed!
üìä 331,430 programs across 976 tasks
ü§ñ Models: 62
üìè Programs per task: 1-6530 (avg: 339.6)
‚úì Starting dataset validation...
‚úì Dataset shape: (331430, 8)
‚úì Unique tasks: 976
‚úÖ Dataset validation passed!
üìä 331,430 programs across 976 tasks
ü§ñ Models: 62
üìè Programs per task: 1-6530 (avg: 339.6)


In [7]:
import duckdb

# Preview the first rows of the exported parquet file through DuckDB. The
# connection is used as a context manager so it is closed even if the query
# or the DataFrame conversion raises.
with duckdb.connect() as con:
    sample_df = con.execute(f"SELECT * FROM '{file_path}' LIMIT 10").fetchdf()
print(sample_df)

    task_id reasoning                                               code  \
0  00576224            def transform(grid):\n\n    A = grid[0][0]\n  ...   
1  00576224            def transform(grid):\n\n    A = grid[0][0]\n  ...   
2  00576224            def transform(grid):\n\n    A = grid[0][0]\n  ...   
3  00576224            def transform(grid):\n\n    B1 = grid\n\n    B...   
4  00576224            def transform(grid):\n\n    B1 = grid\n\n    B...   
5  00576224            def transform(grid):\n\n    a = grid[0][0]\n  ...   
6  00576224            def transform(grid):\n\n    a = grid[0][0]\n  ...   
7  00576224            def transform(grid):\n\n    a = grid[0][0]\n  ...   
8  00576224            def transform(grid):\n\n    a = grid[0][0]\n  ...   
9  00576224            def transform(grid):\n\n    a, b = grid[0]\n\n...   

  correct_train_input correct_test_input  \
0        [True, True]             [True]   
1        [True, True]             [True]   
2        [True, True]          