In [None]:
from pathlib import Path
import sys
# Find the project root: the first directory, starting at the current working
# directory and walking up through its parents, that holds a pyproject.toml.
# If no such directory exists, the current working directory is used instead.
for project_root in (Path.cwd(), *Path.cwd().parents):
    if (project_root / "pyproject.toml").exists():
        break
else:
    project_root = Path.cwd()

# Add the root to the import search path.
sys.path.append(str(project_root))

In [None]:
from google.cloud import bigquery

# Fully qualified destination table; its last dotted component is reused as the
# base name of the local output file.
table_name = "trelis-arc.arc.train_eval_blended_fake_ttt_50"
file_name = table_name.rpartition('.')[2]

# BigQuery client bound to the project.
client = bigquery.Client(project="trelis-arc")

In [None]:
# Build the blended table: at most 25 programs per task from the training
# source plus every row of the eval source. CREATE OR REPLACE overwrites the
# destination, so re-running this cell rebuilds the table from scratch.
create_final_table_query = f"""
CREATE OR REPLACE TABLE `{table_name}` AS

-- Rank training programs within each task. The rank follows the lexicographic
-- order of `code`; ties between identical `code` values are broken arbitrarily.
WITH training_ranked AS (
    SELECT 
        task_id, code, model, predicted_train_output, predicted_test_output,
        correct_train_input, correct_test_input,
        ROW_NUMBER() OVER (
            PARTITION BY task_id 
            ORDER BY code
        ) as rank_in_task
    FROM `trelis-arc.arc.shortest_ratio_2_5x_filtered_250`
),
-- Keep at most 25 training programs per task
training_limited AS (
    SELECT task_id, code, model, predicted_train_output, predicted_test_output,
           correct_train_input, correct_test_input
    FROM training_ranked
    WHERE rank_in_task <= 25
),
-- Take every row from the eval dataset (no per-task limit is applied here;
-- the source table is expected to be limited to 25 per task already)
eval_data AS (
    SELECT task_id, code, model, predicted_train_output, predicted_test_output,
           correct_train_input, correct_test_input
    FROM `trelis-arc.arc.shortest_ratio_2_5x_filtered_25_eval_masked_partialplus`
),
-- Union both datasets
blended_data AS (
    SELECT * FROM training_limited
    UNION ALL
    SELECT * FROM eval_data
)
SELECT task_id, code, model, predicted_train_output, predicted_test_output,
       correct_train_input, correct_test_input
FROM blended_data
ORDER BY task_id, code
"""

print("Executing BigQuery table creation...")
job = client.query(create_final_table_query)
# Block until the job finishes; result() raises if the query failed, so the
# success message below is only printed for a completed job.
result = job.result()
print(f"✓ Table `{table_name}` created successfully")

In [None]:
from llm_python.datasets.bigquery_export import load_bigquery_table_as_dataframe

# Pull the freshly created table into memory through the shared project helper.
print("Loading BigQuery table data...")
raw_data = load_bigquery_table_as_dataframe(client=client, table_name=table_name)

# Report how many rows came back.
print(f"Loaded {len(raw_data)} programs from BigQuery table")

In [None]:
from llm_python.datasets.bigquery_converter import convert_bigquery_to_soar, save_soar_parquet

# Peek at the first row so the column names and the Python types of the nested
# fields are visible before conversion. Guarded because `.iloc[0]` raises
# IndexError when the table came back empty.
print("Inspecting BigQuery data structure...")
if len(raw_data) > 0:
    sample_row = raw_data.iloc[0]
    print(f"Sample row columns: {sample_row.index.tolist()}")
    print(f"Train output type: {type(sample_row['predicted_train_output'])}")
    print(f"Train correct type: {type(sample_row['correct_train_input'])}")
else:
    print("BigQuery table returned no rows; nothing to inspect.")

print("\n" + "="*50)

# Convert BigQuery data to SOAR format using our reusable function
print("Converting BigQuery data to SOAR format...")
final_dataset = convert_bigquery_to_soar(raw_data, show_progress=True)

# Bound outside the branch so the name exists whether or not a file is written.
output_path = f"/tmp/{file_name}.parquet"

# Save the final dataset
if len(final_dataset) > 0:
    print(f"Saving final dataset to: {output_path}")

    save_soar_parquet(final_dataset, output_path)
else:
    # Nothing is written in this case; a file already present at output_path
    # would be left over from an earlier run.
    print("No valid data to save!")

In [None]:
# Validate the final dataset using our reusable validation function
from llm_python.datasets.schema import validate_soar_dataset
import pandas as pd

# Section banner: rule, title, rule, each on its own line.
print("=" * 80, "DATASET VALIDATION", "=" * 80, sep="\n")

# Read the saved parquet file back from disk and hand it to the project
# validator with silent=False.
file_path = f"/tmp/{file_name}.parquet"
results = validate_soar_dataset(
    pd.read_parquet(file_path),
    max_grid_size=40,
    silent=False,
)


In [None]:
import duckdb

# Query the first 10 rows of the saved parquet file through DuckDB. The
# connection is used as a context manager so it is closed even when the query
# raises (for example, when the file does not exist).
with duckdb.connect() as con:
    sample_df = con.execute(f"SELECT * FROM '{file_path}' LIMIT 10").fetchdf()
print(sample_df)