In [1]:
from pathlib import Path
import sys
project_root = next((parent for parent in [Path.cwd()] + list(Path.cwd().parents) if (parent / "pyproject.toml").exists()), Path.cwd())
sys.path.append(str(project_root))

In [2]:
from google.cloud import bigquery

client = bigquery.Client(project="trelis-arc")

create_final_table_query = """
CREATE OR REPLACE TABLE `trelis-arc.arc.programs_50_correct_200_partial` AS

WITH training_tasks AS (
    SELECT DISTINCT task_id
    FROM `trelis-arc.arc.arc_task_ids`
    WHERE subset = "arc-agi-1/training"
),
-- Common base view with all filtering and computed fields
programs_base AS (
    SELECT 
        k.task_id,
        k.code,
        k.model,
        k.predicted_train_output,
        k.predicted_test_output,
        k.correct_train_input,
        k.correct_test_input,
        LENGTH(k.code) as program_length,
        -- Check if all train inputs are correct - access .list first, then .element
        (SELECT LOGICAL_AND(correct_val.element) 
         FROM UNNEST(k.correct_train_input.list) AS correct_val) as all_train_correct,
        -- Check if all test inputs are correct - access .list first, then .element
        (SELECT LOGICAL_AND(correct_val.element) 
         FROM UNNEST(k.correct_test_input.list) AS correct_val) as all_test_correct,
        -- Count correct examples - access .list first, then .element
        (SELECT COUNTIF(correct_val.element) 
         FROM UNNEST(k.correct_train_input.list) AS correct_val) + 
        (SELECT COUNTIF(correct_val.element) 
         FROM UNNEST(k.correct_test_input.list) AS correct_val) as total_correct,
        ARRAY_LENGTH(k.correct_train_input.list) + ARRAY_LENGTH(k.correct_test_input.list) as total_possible,
        -- Check grid sizes for train output - access .list first, then .element.list
        (SELECT MAX(ARRAY_LENGTH(grid_2d.element.list)) 
         FROM UNNEST(k.predicted_train_output.list) AS grid_2d) as max_train_grid_height,
        (SELECT MAX(ARRAY_LENGTH(row_1d.element.list)) 
         FROM UNNEST(k.predicted_train_output.list) AS grid_2d,
              UNNEST(grid_2d.element.list) AS row_1d) as max_train_grid_width,
        -- Check grid sizes for test output - access .list first, then .element.list
        (SELECT MAX(ARRAY_LENGTH(grid_2d.element.list)) 
         FROM UNNEST(k.predicted_test_output.list) AS grid_2d) as max_test_grid_height,
        (SELECT MAX(ARRAY_LENGTH(row_1d.element.list)) 
         FROM UNNEST(k.predicted_test_output.list) AS grid_2d,
              UNNEST(grid_2d.element.list) AS row_1d) as max_test_grid_width,
        -- Normalize code for deduplication
        LOWER(REGEXP_REPLACE(k.code, r'\\s+', '')) as normalized_code
    FROM `trelis-arc.arc.king_programs_ext` k
    INNER JOIN training_tasks t ON k.task_id = t.task_id
    WHERE k.model != 'hodel-translated'
),
-- Filter base to only valid programs (grid size constraints and at least one correct)
programs_filtered AS (
    SELECT *
    FROM programs_base
    WHERE max_train_grid_height <= 30 AND max_train_grid_width <= 30
      AND max_test_grid_height <= 30 AND max_test_grid_width <= 30
      AND total_correct > 0  -- At least one correct (moved here after calculation)
),
-- Fully correct programs: deduplicate and rank
fully_correct_deduplicated AS (
    SELECT 
        task_id, code, model, predicted_train_output, predicted_test_output,
        correct_train_input, correct_test_input, program_length,
        ROW_NUMBER() OVER (
            PARTITION BY task_id, normalized_code
            ORDER BY program_length ASC, model ASC, code ASC
        ) as dedup_rank
    FROM programs_filtered
    WHERE all_train_correct AND all_test_correct
),
fully_correct_ranked AS (
    SELECT 
        task_id, code, model, predicted_train_output, predicted_test_output,
        correct_train_input, correct_test_input,
        'fully_correct' as program_type,
        ROW_NUMBER() OVER (
            PARTITION BY task_id 
            ORDER BY program_length ASC, model ASC, code ASC
        ) as rank_in_task
    FROM fully_correct_deduplicated
    WHERE dedup_rank = 1
),
-- Partially correct programs: deduplicate and rank
partially_correct_deduplicated AS (
    SELECT 
        task_id, code, model, predicted_train_output, predicted_test_output,
        correct_train_input, correct_test_input, program_length, total_correct,
        ROW_NUMBER() OVER (
            PARTITION BY task_id, normalized_code
            ORDER BY program_length ASC, model ASC, code ASC
        ) as dedup_rank
    FROM programs_filtered
    WHERE NOT (all_train_correct AND all_test_correct)  -- Exclude fully correct
),
partially_correct_ranked AS (
    SELECT 
        task_id, code, model, predicted_train_output, predicted_test_output,
        correct_train_input, correct_test_input,
        'partially_correct' as program_type,
        ROW_NUMBER() OVER (
            PARTITION BY task_id 
            ORDER BY total_correct DESC, program_length ASC, model ASC, code ASC
        ) as rank_in_task
    FROM partially_correct_deduplicated
    WHERE dedup_rank = 1
),
-- Combine both types with limit of 50 fully correct and 200 partially correct
combined_programs AS (
    SELECT task_id, code, model, predicted_train_output, predicted_test_output,
           correct_train_input, correct_test_input, program_type
    FROM fully_correct_ranked WHERE rank_in_task <= 50
    
    UNION ALL
    
    SELECT task_id, code, model, predicted_train_output, predicted_test_output,
           correct_train_input, correct_test_input, program_type
    FROM partially_correct_ranked WHERE rank_in_task <= 200
)
SELECT task_id, code, model, predicted_train_output, predicted_test_output,
       correct_train_input, correct_test_input, program_type
FROM combined_programs
ORDER BY task_id, program_type, code
"""

print("Executing BigQuery table creation...")
job = client.query(create_final_table_query)
result = job.result()
print(f"‚úì Table `trelis-arc.arc.programs_50_correct_200_partial` created successfully")

# Get statistics about the created table
stats_query = """
SELECT 
    program_type,
    COUNT(*) as num_programs,
    COUNT(DISTINCT task_id) as unique_tasks
FROM `trelis-arc.arc.programs_50_correct_200_partial`
GROUP BY program_type
ORDER BY program_type
"""

stats = client.query(stats_query).to_dataframe()
print(f"\nTable statistics:")
print(stats)

# Check programs per task distribution
distribution_query = """
SELECT 
    task_id,
    program_type,
    COUNT(*) as num_programs
FROM `trelis-arc.arc.programs_50_correct_200_partial`
GROUP BY task_id, program_type
ORDER BY task_id, program_type
"""

distribution = client.query(distribution_query).to_dataframe()
print(f"\nPrograms per task distribution:")
print(f"Total task-type combinations: {len(distribution)}")
print(f"Distribution summary:")
print(distribution.groupby('program_type')['num_programs'].describe())

Executing BigQuery table creation...
‚úì Table `trelis-arc.arc.programs_50_correct_200_partial` created successfully
‚úì Table `trelis-arc.arc.programs_50_correct_200_partial` created successfully





Table statistics:
        program_type  num_programs  unique_tasks
0      fully_correct         12008           370
1  partially_correct         57475           398

Programs per task distribution:
Total task-type combinations: 768
Distribution summary:
                   count        mean        std  min   25%    50%    75%  \
program_type                                                               
fully_correct      370.0   32.454054   18.38166  1.0  12.0   38.5   50.0   
partially_correct  398.0  144.409548  71.175383  1.0  73.0  200.0  200.0   

                     max  
program_type              
fully_correct       50.0  
partially_correct  200.0  

Programs per task distribution:
Total task-type combinations: 768
Distribution summary:
                   count        mean        std  min   25%    50%    75%  \
program_type                                                               
fully_correct      370.0   32.454054   18.38166  1.0  12.0   38.5   50.0   
partially_corre



In [3]:
from google.cloud import bigquery, storage
from datetime import datetime
import pandas as pd

# Fast GCS export approach for large datasets
print("Using Google Cloud Storage export for fast data transfer...")

# Use fixed filenames for this notebook
gcs_uri = f"gs://trelis-arc/tmp/mixed_partial_dataset_50_200.parquet"
local_file = f"/tmp/mixed_partial_dataset_50_200.parquet"

print(f"Exporting BigQuery table to: {gcs_uri}")

# Export to Cloud Storage (much faster for large datasets)
export_job_config = bigquery.ExtractJobConfig()
export_job_config.destination_format = bigquery.DestinationFormat.PARQUET

extract_job = client.extract_table(
    "trelis-arc.arc.programs_50_correct_200_partial",
    gcs_uri,
    job_config=export_job_config
)

print("Waiting for BigQuery export to complete...")
extract_job.result()  # Wait for export to complete
print("‚úì Export to GCS completed successfully")

# Download and read the parquet file directly
print("Downloading from GCS...")
storage_client = storage.Client()
bucket = storage_client.bucket('trelis-arc')
blob = bucket.blob('tmp/mixed_partial_dataset_50_200.parquet')
blob.download_to_filename(local_file)
print("‚úì Download completed")

# Read the parquet file
print("Reading parquet file...")
raw_data = pd.read_parquet(local_file)
print(f"Loaded {len(raw_data)} programs from parquet file")

Using Google Cloud Storage export for fast data transfer...
Exporting BigQuery table to: gs://trelis-arc/tmp/mixed_partial_dataset_50_200.parquet
Waiting for BigQuery export to complete...
Waiting for BigQuery export to complete...
‚úì Export to GCS completed successfully
Downloading from GCS...
‚úì Export to GCS completed successfully
Downloading from GCS...
‚úì Download completed
Reading parquet file...
‚úì Download completed
Reading parquet file...
Loaded 69483 programs from parquet file
Loaded 69483 programs from parquet file


In [4]:
# First, let's inspect the actual data structure
print("Inspecting BigQuery data structure...")
sample_row = raw_data.iloc[0]
print(f"Sample row columns: {sample_row.index.tolist()}")
print(f"Train output type: {type(sample_row['predicted_train_output'])}")
print(f"Train output content: {sample_row['predicted_train_output']}")
print(f"Train correct type: {type(sample_row['correct_train_input'])}")
print(f"Train correct content: {sample_row['correct_train_input']}")

if hasattr(sample_row['predicted_train_output'], 'keys'):
    print(f"Train output keys: {list(sample_row['predicted_train_output'].keys())}")
if hasattr(sample_row['correct_train_input'], 'keys'):
    print(f"Train correct keys: {list(sample_row['correct_train_input'].keys())}")

print("\n" + "="*50)

from tqdm import tqdm
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from llm_python.datasets.schema import PARQUET_SCHEMA

def convert_bq_nested_structure(bq_data):
    """Convert BigQuery nested structure to proper list format
    Handles the complex nested structure from BigQuery exports
    """
    if bq_data is None:
        return []
    
    # If it's already a simple list, return it
    if isinstance(bq_data, list):
        return bq_data
    
    # Handle BigQuery's nested structure
    if isinstance(bq_data, dict):
        if 'list' in bq_data:
            list_data = bq_data['list']
            
            # Convert numpy array to list if needed
            if hasattr(list_data, 'tolist'):
                list_data = list_data.tolist()
            
            # If it's a list of dicts with 'element' key, extract the elements
            if isinstance(list_data, list) and len(list_data) > 0:
                if isinstance(list_data[0], dict) and 'element' in list_data[0]:
                    # This is a list of {"element": actual_data} structures
                    result = []
                    for item in list_data:
                        if isinstance(item, dict) and 'element' in item:
                            element = item['element']
                            # Recursively convert nested structures
                            if isinstance(element, dict) and 'list' in element:
                                result.append(convert_bq_nested_structure(element))
                            else:
                                result.append(element)
                        else:
                            result.append(item)
                    return result
                else:
                    return list_data
            else:
                return list_data if isinstance(list_data, list) else []
        else:
            # Not a standard BigQuery list structure
            return []
    
    return []

def extract_boolean_values(bool_array):
    """Extract boolean values from the {'element': bool} format"""
    if not isinstance(bool_array, list):
        return []
    
    result = []
    for item in bool_array:
        if isinstance(item, dict) and 'element' in item:
            result.append(bool(item['element']))
        else:
            result.append(bool(item))
    return result

def validate_converted_data(data_dict):
    """Validate a single converted data dict against the expected schema"""
    try:
        # Check required fields exist
        required_fields = ['task_id', 'code', 'model', 'predicted_train_output', 
                          'predicted_test_output', 'correct_train_input', 'correct_test_input']
        for field in required_fields:
            if field not in data_dict:
                return False, f"Missing field: {field}"
        
        # Check types
        if not isinstance(data_dict['task_id'], str):
            return False, f"task_id should be str, got {type(data_dict['task_id'])}"
        if not isinstance(data_dict['code'], str):
            return False, f"code should be str, got {type(data_dict['code'])}"
        if not isinstance(data_dict['model'], str):
            return False, f"model should be str, got {type(data_dict['model'])}"
        
        # Check 3D arrays (List[List[List[int]]])
        for field in ['predicted_train_output', 'predicted_test_output']:
            arr = data_dict[field]
            if not isinstance(arr, list):
                return False, f"{field} should be list, got {type(arr)}"
            for i, grid in enumerate(arr):
                if not isinstance(grid, list):
                    return False, f"{field}[{i}] should be list (2D grid), got {type(grid)}"
                for j, row in enumerate(grid):
                    if not isinstance(row, list):
                        return False, f"{field}[{i}][{j}] should be list (row), got {type(row)}"
                    for k, cell in enumerate(row):
                        if not isinstance(cell, int):
                            return False, f"{field}[{i}][{j}][{k}] should be int, got {type(cell)}"
        
        # Check boolean arrays
        for field in ['correct_train_input', 'correct_test_input']:
            arr = data_dict[field]
            if not isinstance(arr, list):
                return False, f"{field} should be list, got {type(arr)}"
            for i, val in enumerate(arr):
                if not isinstance(val, bool):
                    return False, f"{field}[{i}] should be bool, got {type(val)}"
        
        return True, "Valid"
    except Exception as e:
        return False, f"Validation error: {e}"

print("Converting BigQuery nested structures to proper arrays...")

# Test conversion on first few rows to debug
print("Testing conversion on first row...")
test_row = raw_data.iloc[0]
print(f"Original train output: {test_row['predicted_train_output']}")
converted_train = convert_bq_nested_structure(test_row['predicted_train_output'])
print(f"Converted train output: {converted_train}")
print(f"Converted type: {type(converted_train)}")
if isinstance(converted_train, list) and len(converted_train) > 0:
    print(f"First grid type: {type(converted_train[0])}")
    print(f"First grid content: {converted_train[0]}")

# Convert data with proper handling of BigQuery structure
converted_data = []
validation_errors = []

for idx in tqdm(range(len(raw_data)), desc="Converting structures"):
    row = raw_data.iloc[idx]
    
    try:
        converted_row = {
            'task_id': row['task_id'],
            'code': row['code'], 
            'model': row['model'],
            'predicted_train_output': convert_bq_nested_structure(row['predicted_train_output']),
            'predicted_test_output': convert_bq_nested_structure(row['predicted_test_output']),
            'correct_train_input': extract_boolean_values(convert_bq_nested_structure(row['correct_train_input'])),
            'correct_test_input': extract_boolean_values(convert_bq_nested_structure(row['correct_test_input']))
        }
        
        # Validate the converted row
        is_valid, error_msg = validate_converted_data(converted_row)
        if is_valid:
            converted_data.append(converted_row)
        else:
            validation_errors.append(f"Row {idx}: {error_msg}")
            if len(validation_errors) <= 5:  # Only print first few errors
                print(f"Validation error for row {idx}: {error_msg}")
    
    except Exception as e:
        validation_errors.append(f"Row {idx}: Conversion error: {e}")
        if len(validation_errors) <= 5:
            print(f"Conversion error for row {idx}: {e}")

print(f"Successfully converted {len(converted_data)} programs")
if validation_errors:
    print(f"Had {len(validation_errors)} validation/conversion errors")

# Create DataFrame from successfully converted data
final_dataset = pd.DataFrame(converted_data)
print(f"Final dataset has {len(final_dataset)} programs")

# Save the final dataset
if len(final_dataset) > 0:
    # Add missing columns with default values for schema compliance
    schema_df = final_dataset.copy()
    schema_df['reasoning'] = ''  # Empty reasoning for now
    schema_df['train_input'] = [[] for _ in range(len(schema_df))]  # Empty for now
    schema_df['test_input'] = [[] for _ in range(len(schema_df))]   # Empty for now
    schema_df['generation'] = 0  # Default generation

    # Reorder columns to match schema
    schema_columns = ['task_id', 'reasoning', 'code', 'correct_train_input', 'correct_test_input',
                     'predicted_train_output', 'predicted_test_output', 'train_input', 'test_input',
                     'model', 'generation']
    schema_df = schema_df[schema_columns]
    
    # Save with PyArrow to ensure proper schema - use fixed filename
    output_path = "/tmp/mixed_partial_dataset_50_200_final.parquet"
    print(f"Saving final dataset to: {output_path}")
    
    try:
        # Convert to PyArrow table with explicit schema
        table = pa.Table.from_pandas(schema_df, schema=PARQUET_SCHEMA)
        pq.write_table(table, output_path)
        print("‚úì Saved with proper PyArrow schema")
    except Exception as e:
        print(f"PyArrow save failed ({e}), using pandas fallback")
        schema_df.to_parquet(output_path, index=False)

else:
    print("No valid data to save!")

Inspecting BigQuery data structure...
Sample row columns: ['task_id', 'code', 'model', 'predicted_train_output', 'predicted_test_output', 'correct_train_input', 'correct_test_input', 'program_type']
Train output type: <class 'dict'>
Train output content: {'list': array([{'element': {'list': array([{'element': {'list': array([{'element': 0}, {'element': 0}, {'element': 0}, {'element': 0},
                     {'element': 0}, {'element': 0}, {'element': 0}, {'element': 0},
                     {'element': 0}, {'element': 0}], dtype=object)}}                                    ,
              {'element': {'list': array([{'element': 0}, {'element': 1}, {'element': 1}, {'element': 1},
                     {'element': 1}, {'element': 1}, {'element': 1}, {'element': 0},
                     {'element': 0}, {'element': 0}], dtype=object)}}                                    ,
              {'element': {'list': array([{'element': 0}, {'element': 1}, {'element': 0}, {'element': 0},
             

Converting structures: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 69483/69483 [00:06<00:00, 11428.77it/s]



Successfully converted 69483 programs
Final dataset has 69483 programs
Saving final dataset to: /tmp/mixed_partial_dataset_50_200_final.parquet
‚úì Saved with proper PyArrow schema
‚úì Saved with proper PyArrow schema


In [5]:
# Comprehensive verification of the final dataset
import duckdb
import pandas as pd

print("=" * 80)
print("COMPREHENSIVE DATASET VERIFICATION")
print("=" * 80)

file_path = "/tmp/mixed_partial_dataset_50_200_final.parquet"

# 1. Basic file and structure verification
print("\n1. BASIC DATASET INFO:")
print("-" * 40)
verification_df = pd.read_parquet(file_path)
print(f"‚úì File loaded successfully")
print(f"‚úì Dataset shape: {verification_df.shape}")
print(f"‚úì Columns: {list(verification_df.columns)}")
print(f"‚úì Unique tasks: {verification_df['task_id'].nunique()}")
print(f"‚úì Programs per task (mean): {verification_df.groupby('task_id').size().mean():.2f}")

# 2. Data type verification
print("\n2. DATA TYPE VERIFICATION:")
print("-" * 40)
sample = verification_df.iloc[0]

# Check predicted outputs (should be 3D arrays: List[List[List[int]]])
train_output = sample['predicted_train_output']
test_output = sample['predicted_test_output']

print(f"‚úì Train output type: {type(train_output)} (expected: list)")
print(f"‚úì Train output length: {len(train_output)} grids")
if len(train_output) > 0:
    first_grid = train_output[0]
    print(f"‚úì First grid type: {type(first_grid)} (expected: list)")
    if isinstance(first_grid, list) and len(first_grid) > 0:
        print(f"‚úì Grid dimensions: {len(first_grid)} x {len(first_grid[0])}")
        if len(first_grid[0]) > 0:
            cell_value = first_grid[0][0]
            print(f"‚úì Cell value type: {type(cell_value)} = {cell_value} (expected: int)")

print(f"‚úì Test output type: {type(test_output)} (expected: list)")
print(f"‚úì Test output length: {len(test_output)} grids")

# Check correctness arrays (should be 1D boolean arrays: List[bool])
train_correct = sample['correct_train_input']
test_correct = sample['correct_test_input']

print(f"‚úì Train correct type: {type(train_correct)} (expected: list)")
print(f"‚úì Train correct values: {train_correct}")
if len(train_correct) > 0:
    print(f"‚úì First correct value type: {type(train_correct[0])} (expected: bool)")

print(f"‚úì Test correct type: {type(test_correct)} (expected: list)")
print(f"‚úì Test correct values: {test_correct}")

# 3. Data completeness verification
print("\n3. DATA COMPLETENESS:")
print("-" * 40)
non_empty_train = verification_df['predicted_train_output'].apply(lambda x: len(x) > 0).sum()
non_empty_test = verification_df['predicted_test_output'].apply(lambda x: len(x) > 0).sum()
non_empty_train_correct = verification_df['correct_train_input'].apply(lambda x: len(x) > 0).sum()
non_empty_test_correct = verification_df['correct_test_input'].apply(lambda x: len(x) > 0).sum()

print(f"‚úì Rows with train outputs: {non_empty_train}/{len(verification_df)} ({100*non_empty_train/len(verification_df):.1f}%)")
print(f"‚úì Rows with test outputs: {non_empty_test}/{len(verification_df)} ({100*non_empty_test/len(verification_df):.1f}%)")
print(f"‚úì Rows with train correctness: {non_empty_train_correct}/{len(verification_df)} ({100*non_empty_train_correct/len(verification_df):.1f}%)")
print(f"‚úì Rows with test correctness: {non_empty_test_correct}/{len(verification_df)} ({100*non_empty_test_correct/len(verification_df):.1f}%)")

# 4. DuckDB compatibility verification
print("\n4. DUCKDB COMPATIBILITY:")
print("-" * 40)
con = duckdb.connect()

try:
    # Schema check
    schema = con.execute(f"DESCRIBE '{file_path}'").fetchdf()
    print(f"‚úì DuckDB can read schema ({len(schema)} columns)")
    
    # Basic query check
    sample_query = f"""
    SELECT 
        task_id,
        model,
        length(predicted_train_output) as num_train_grids,
        length(predicted_test_output) as num_test_grids,
        length(correct_train_input) as num_train_examples,
        length(correct_test_input) as num_test_examples
    FROM '{file_path}' 
    LIMIT 3
    """
    sample_data = con.execute(sample_query).fetchdf()
    print(f"‚úì Basic queries work")
    print(f"  Sample data shape: {sample_data.shape}")
    
    # 3D array access check
    nested_query = f"""
    SELECT 
        task_id,
        predicted_train_output[1] as first_train_grid,
        length(predicted_train_output[1]) as grid_height,
        length(predicted_train_output[1][1]) as grid_width
    FROM '{file_path}' 
    WHERE length(predicted_train_output) > 0 
      AND length(predicted_train_output[1]) > 0
    LIMIT 2
    """
    nested_data = con.execute(nested_query).fetchdf()
    print(f"‚úì 3D array access works")
    print(f"  Grid access sample: {nested_data.shape}")
    
    print(f"‚úì All DuckDB operations successful!")
    
except Exception as e:
    print(f"‚úó DuckDB error: {e}")

finally:
    con.close()

# 5. Summary statistics
print("\n5. DATASET STATISTICS:")
print("-" * 40)
task_stats = verification_df.groupby('task_id').size()
model_stats = verification_df['model'].value_counts()

print(f"‚úì Tasks with programs: {len(task_stats)}")
print(f"‚úì Programs per task: min={task_stats.min()}, max={task_stats.max()}, mean={task_stats.mean():.1f}")
print(f"‚úì Models represented: {len(model_stats)}")
print(f"‚úì Top 3 models: {dict(model_stats.head(3))}")

print("\n" + "=" * 80)
print("‚úÖ VERIFICATION COMPLETE - Dataset is ready for use!")
print("=" * 80)
print(f"üìÅ Final dataset location: {file_path}")
print(f"üìä Total programs: {len(verification_df):,}")
print(f"üéØ Unique tasks: {verification_df['task_id'].nunique()}")
print(f"ü§ñ Models: {len(model_stats)}")
print("=" * 80)

COMPREHENSIVE DATASET VERIFICATION

1. BASIC DATASET INFO:
----------------------------------------
‚úì File loaded successfully
‚úì Dataset shape: (69483, 8)
‚úì Columns: ['task_id', 'reasoning', 'code', 'correct_train_input', 'correct_test_input', 'predicted_train_output', 'predicted_test_output', 'model']
‚úì Unique tasks: 399
‚úì Programs per task (mean): 174.14

2. DATA TYPE VERIFICATION:
----------------------------------------
‚úì Train output type: <class 'numpy.ndarray'> (expected: list)
‚úì Train output length: 4 grids
‚úì First grid type: <class 'numpy.ndarray'> (expected: list)
‚úì Test output type: <class 'numpy.ndarray'> (expected: list)
‚úì Test output length: 1 grids
‚úì Train correct type: <class 'numpy.ndarray'> (expected: list)
‚úì Train correct values: [False  True  True False]
‚úì First correct value type: <class 'numpy.bool'> (expected: bool)
‚úì Test correct type: <class 'numpy.ndarray'> (expected: list)
‚úì Test correct values: [False]

3. DATA COMPLETENESS:
---

In [6]:
import duckdb

con = duckdb.connect()
sample_df = con.execute(f"SELECT * FROM '{file_path}' LIMIT 10").fetchdf()
con.close()
print(sample_df)

    task_id reasoning                                               code  \
0  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   
1  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   
2  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   
3  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   
4  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   
5  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   
6  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   
7  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   
8  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   
9  56ff96f3            import numpy as np\n\ndef transform(grid_lst: ...   

          correct_train_input correct_test_input  \
0  [False, True, True, False]            [False]   
1   [True, True, False, True]             [True]   
2    [T