In [None]:
from pathlib import Path
import sys
# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upward, that contains a pyproject.toml.
# Falls back to the current working directory when no such directory exists.
project_root = next(
    (
        candidate
        for candidate in (Path.cwd(), *Path.cwd().parents)
        if (candidate / "pyproject.toml").exists()
    ),
    Path.cwd(),
)

# Make project-local packages importable. The membership check keeps this cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
# Destination file for the final dataset, placed directly under the project root.
output_path = project_root.joinpath("aa2_compounded.parquet")

In [None]:
import pandas as pd
from llm_python.datasets.io import read_soar_parquet

# Directory holding the compounded parquet files that are merged below.
compounded_dir = project_root / "llm_python" / "datasets" / "compounded"

# Path.glob yields entries in filesystem order, which can differ between
# machines; sorting pins the concatenation order so row order is reproducible.
compounded_files = sorted(compounded_dir.glob("*.parquet"))
if not compounded_files:
    # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No .parquet files found in {compounded_dir}")

dataframes = [read_soar_parquet(f) for f in compounded_files]

# ignore_index=True replaces the per-file indexes with one fresh 0..n-1 index.
superking_df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Remove the "reasoning" column. errors="ignore" keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
superking_df = superking_df.drop(columns=["reasoning"], errors="ignore")

In [None]:
from llm_python.transduction.code_classifier import CodeTransductionClassifier


transduction_classifier = CodeTransductionClassifier()


def _first_classifier_output(program_source):
    """Return element 0 of what ``is_transductive`` yields for one program."""
    verdict = transduction_classifier.is_transductive(program_source)
    return verdict[0]


# Element 0 of the classifier result becomes the per-row "is_transductive" value.
superking_df["is_transductive"] = superking_df["code"].apply(_first_classifier_output)

In [None]:
from llm_python.datasets.query import filter_soar_df

# The filter receives a copy of superking_df rather than the frame itself.
soar_filter_options = {
    "include_subset": "arc-prize-2024/training",
    "no_train_correct": True,
    "no_test_correct": True,
}
df = filter_soar_df(superking_df.copy(), **soar_filter_options)


In [None]:
# len(df) is a row count, not a task count, so report both: the number of rows
# and the number of distinct task_id values they cover.
print(f"Filtered down to {len(df)} rows across {df['task_id'].nunique()} tasks")

In [None]:
import numpy as np

from llm_python.datasets.query import sample_by_task

# Derived count columns: each one is np.sum over the matching per-row value.
count_columns = {
    "correct_train_input_count": "correct_train_input",
    "correct_test_input_count": "correct_test_input",
}
for count_column, flags_column in count_columns.items():
    df[count_column] = df[flags_column].apply(lambda flags: np.sum(flags))

# Length of each program's source text, used as the sort key below.
df["code_length"] = df["code"].str.len()

# Sample per task, ordering by ascending code length, with a task limit of 5.
sampling_options = {
    "sort_keys": ["code_length"],
    "sort_ascending": [True],
    "task_limit": 5,
}
df = sample_by_task(df, **sampling_options)

In [None]:
# Summarise the sampled frame before applying the length cutoff.
print(len(df))
print("Average code length:", df["code"].str.len().mean())

code_lengths = df["code_length"]
print("Code length distribution:")
print(code_lengths.describe())
print("90th percentile code length:", np.percentile(code_lengths, 90))

# Keep only rows whose code_length is at most 4000.
within_limit = code_lengths <= 4000
df = df[within_limit]

In [None]:
from llm_python.datasets.io import write_soar_parquet
from llm_python.datasets.schema import PARQUET_SCHEMA

# Announce the destination, then write df to output_path, passing the project's
# PARQUET_SCHEMA to the writer.
print(f"Saving final dataset to: {output_path}")
write_soar_parquet(df, output_path, schema=PARQUET_SCHEMA)

In [None]:
from llm_python.datasets.statistics import analyze_dataset_statistics

# Run the project's statistics helper on df. "refinement" is passed as its
# second argument (presumably a dataset label; confirm against the helper).
analyze_dataset_statistics(df, "refinement")

In [None]:
# Print a few rows for a manual spot check. Capping n at len(df) avoids the
# ValueError that DataFrame.sample raises when asked for more rows than exist
# (replace=False is the default); the fixed random_state keeps the selection
# reproducible across runs.
sampled_rows = df.sample(n=min(10, len(df)), random_state=42)
for _, row in sampled_rows.iterrows():
    print(f"row_id: {row['row_id']}\ntask_id: {row['task_id']}\ncode:\n{row['code']}\n{'-'*80}")