In [None]:
from pathlib import Path
import sys
# Locate the project root: the first directory, starting at the current working
# directory and then walking up through its ancestors, that contains a
# pyproject.toml. Falls back to the working directory when none is found.
project_root = next(
    (
        candidate
        for candidate in [Path.cwd(), *Path.cwd().parents]
        if candidate.joinpath("pyproject.toml").exists()
    ),
    Path.cwd(),
)
# Put the root on the import path so modules located under it can be imported.
sys.path.append(str(project_root))

In [None]:
# Destination of the parquet file written later in this notebook; it sits
# directly under the project root.
output_path = project_root / "arc_2_training_hard_curriculum.parquet"

In [None]:
from llm_python.datasets.superking import load_superking

# Load the superking dataset through the project loader; later cells copy and
# filter it rather than modifying this variable.
superking_df = load_superking()

In [None]:
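# Restrict to the arc-prize-2025/training-hard subset; the flags below also
# exclude transductive rows and require any_train_correct.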
from llm_python.datasets.query import filter_soar_df

# Run the filter on a copy of the loaded frame and report how many rows remain.
df = filter_soar_df(
    superking_df.copy(),
    include_subset="arc-prize-2025/training-hard",
    exclude_transductive=True,
    any_train_correct=True,
)
print(f"Number of rows after filtering to arc-prize-2025/training-hard: {len(df)}")


In [None]:
import numpy as np


def select_top_n_by_task(
    df,
    n=1,
    max_length_ratio=2.5,
):
    """
    Select up to `n` of the best-ranked rows for every `task_id`.

    Within each task, rows are ranked by the sum of `correct_test_input`
    (descending), then the sum of `correct_train_input` (descending), then the
    length of `code` (ascending). Rows whose code is longer than
    `max_length_ratio` times the code length of the task's top-ranked row are
    discarded, and the first `n` remaining rows are kept.

    Parameters:
        df (pd.DataFrame): Input dataframe with `task_id`, `code`,
            `correct_train_input` and `correct_test_input` columns. Each
            `correct_*` cell must be something `np.sum` can total. The input
            dataframe is not modified.
        n (int): Maximum number of rows to keep per task.
        max_length_ratio (float): Longest code allowed for a task, expressed as
            a multiple of the code length of that task's top-ranked row.

    Returns:
        pd.DataFrame: The selected rows with exactly the input columns, ordered
        by `task_id` and then by rank, with a fresh 0-based index. Empty (same
        columns) when the input has no rows.
    """
    # Temporary ranking columns; all of them are removed before returning.
    helper_columns = [
        "correct_test_input_count",
        "correct_train_input_count",
        "code_length",
    ]
    ranked = (
        df.assign(
            correct_test_input_count=df["correct_test_input"].apply(
                lambda flags: np.sum(flags)
            ),
            correct_train_input_count=df["correct_train_input"].apply(
                lambda flags: np.sum(flags)
            ),
            code_length=df["code"].str.len(),
        )
        .sort_values(by=helper_columns, ascending=[False, False, True])
        # Positional labels let the per-task picks be gathered with one `iloc`.
        .reset_index(drop=True)
    )

    # Iterate the groups explicitly instead of using `groupby.apply`, so that
    # `task_id` stays a regular column no matter how the installed pandas
    # version treats grouping columns inside `apply`.
    keep_positions = []
    for _, task_rows in ranked.groupby("task_id"):
        # Rows arrive in rank order, so the first one is the task's best row.
        length_cap = max_length_ratio * task_rows["code_length"].iloc[0]
        within_cap = task_rows[task_rows["code_length"] <= length_cap]
        keep_positions.extend(within_cap.head(n).index)

    return (
        ranked.iloc[keep_positions]
        .drop(columns=helper_columns)
        .reset_index(drop=True)
    )


# Keep at most 10 rows per task, then preview the first rows of the selection.
df_top = select_top_n_by_task(df, n=10)
print(df_top.head())

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

from llm_python.datasets.schema import PARQUET_SCHEMA


print(f"Saving final dataset to: {output_path}")
# Persist the per-task selection (`df_top`), not the broader filtered frame
# `df`: `df_top` is what gets analysed as "curriculum" at the end of the
# notebook, and output_path names a curriculum file.
# Columns are converted according to PARQUET_SCHEMA.
table = pa.Table.from_pandas(df_top, schema=PARQUET_SCHEMA)
pq.write_table(table, output_path)

In [None]:
from llm_python.datasets.statistics import analyze_dataset_statistics

# Run the project's statistics helper on the per-task selection.
# NOTE(review): "curriculum" is presumably the label used in its report — confirm
# against analyze_dataset_statistics.
analyze_dataset_statistics(df_top, "curriculum")