In [None]:
from pathlib import Path
import sys
# Locate the repository root: the nearest directory, starting at the current
# working directory and walking up through its parents, that contains a
# pyproject.toml. Falls back to the current working directory if none does.
project_root = next(
    (
        candidate
        for candidate in (Path.cwd(), *Path.cwd().parents)
        if (candidate / "pyproject.toml").exists()
    ),
    Path.cwd(),
)
# Make the project's packages importable from this notebook. The membership
# check keeps the cell idempotent: re-running it does not append a duplicate
# sys.path entry.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
# Destination of the final parquet file, directly under the project root.
output_path = project_root.joinpath("superking_aa2.parquet")

In [None]:
from llm_python.datasets.superking import load_superking

# Load the dataset returned by the project's load_superking helper. The later
# cells treat the result as a pandas DataFrame (they call .drop(columns=...),
# .copy() and select columns by list on it).
superking_df = load_superking()

In [None]:
# Remove the "reasoning" column before any further processing.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
superking_df = superking_df.drop(columns=["reasoning"], errors="ignore")

In [None]:

from llm_python.datasets.query import filter_soar_df

# Filter a copy so that superking_df itself stays untouched (it is used again
# as the lookup table for the refined_from_id merge), asking filter_soar_df to
# exclude the "arc-prize-2025/evaluation" subset.
# NOTE: an `any_train_correct=True` option was tried here and is left disabled.
df = filter_soar_df(
    superking_df.copy(),
    exclude_subset="arc-prize-2025/evaluation",
)


In [None]:
# Deduplicate df by (task_id, code), preferring rows whose refined_from_id is set.
# Sorting with na_position="last" places rows with a non-null refined_from_id
# first, so keep="first" retains a refined row whenever a duplicate group has one.
# kind="stable" preserves the incoming row order among rows with equal
# refined_from_id (including the all-NaN ties), so the surviving duplicate is
# the same on every run; the default sort algorithm is not stable.
initial_len = len(df)
df = (
    df.sort_values(by="refined_from_id", na_position="last", kind="stable")
      .drop_duplicates(subset=["task_id", "code"], keep="first")
      .reset_index(drop=True)
)
print(f"Deduplicated rows: {len(df)} (removed {initial_len - len(df)})")

In [None]:
# Attach the pre-refinement program and its predictions to each row of df.
# Rows are matched on df.refined_from_id == superking_df.row_id; because this
# is a left join, every row of df is kept and rows without a match get missing
# values in the joined columns. Joined columns whose names clash with df's
# receive the "_original" suffix (e.g. code -> code_original).
original_columns = ["row_id", "code", "predicted_train_output", "predicted_test_output"]
df = df.merge(
    superking_df[original_columns],
    how="left",
    left_on="refined_from_id",
    right_on="row_id",
    suffixes=("", "_original"),
)

In [None]:
import numpy as np

from llm_python.datasets.query import sample_by_task

# Derive the ranking features used for per-task sampling.
# Each correct_*_input cell is summed with np.sum (presumably a sequence of
# per-example correctness flags — confirm against the dataset schema).
for flags_col in ("correct_train_input", "correct_test_input"):
    df[f"{flags_col}_count"] = df[flags_col].apply(lambda flags: np.sum(flags))
df["code_length"] = df["code"].str.len()

# Ranking passed to sample_by_task as (column, ascending) pairs: most correct
# test inputs first, then most correct train inputs, then shortest code.
ranking = [
    ("correct_test_input_count", False),
    ("correct_train_input_count", False),
    ("code_length", True),
]
df = sample_by_task(
    df,
    sort_keys=[column for column, _ in ranking],
    sort_ascending=[ascending for _, ascending in ranking],
    task_limit=20,
)

In [None]:
# Quick sanity check: row count followed by the first few rows.
print(len(df), df.head(), sep="\n")

In [None]:
from llm_python.datasets.io import write_soar_parquet
from llm_python.datasets.schema import REFINEMENT_PARQUET_SCHEMA

# Persist df to output_path through the project's parquet writer, passing
# REFINEMENT_PARQUET_SCHEMA as the schema argument.
print(f"Saving final dataset to: {output_path}")
write_soar_parquet(df, output_path, schema=REFINEMENT_PARQUET_SCHEMA)

In [None]:
# A row counts as a refinement example when code_original is not NA.
has_original = df["code_original"].notna()

# Share of refinement examples over the whole frame
overall_refinement_fraction = has_original.mean()
print(f"Overall fraction of refinement examples: {overall_refinement_fraction:.3f}")

# The same share computed within each task_id, summarised by its quantiles
refinement_fraction_per_task = has_original.groupby(df["task_id"]).mean()
quantiles = refinement_fraction_per_task.quantile([0.1, 0.5, 0.9])
print("Quantiles of per task_id refinement fraction (10%, 50%, 90%):")
print(quantiles)

In [None]:
from llm_python.datasets.statistics import analyze_dataset_statistics

# Run the project's statistics helper on the final frame. "refinement" is
# passed as its second argument (presumably a dataset label/mode — confirm
# against the helper's signature).
analyze_dataset_statistics(df, "refinement")

In [None]:
# Print up to 10 random (original, refined) code pairs for manual inspection.
# The sample size is capped at len(df) because DataFrame.sample raises
# ValueError when n exceeds the number of rows (sampling without replacement).
# random_state=42 keeps the selection reproducible across runs.
sample = df[["code_original", "code"]].sample(n=min(10, len(df)), random_state=42)
for idx, row in sample.iterrows():
    print(f"Row {idx}:")
    print("Original code:\n", row["code_original"])
    print("Refined code:\n", row["code"])
    print("-" * 80)