In [None]:
from pathlib import Path
import sys

# Walk upwards from the current working directory and take the first directory
# that contains a pyproject.toml; fall back to the working directory itself when
# no such directory exists.
project_root = next(
    (
        parent
        for parent in [Path.cwd(), *Path.cwd().parents]
        if (parent / "pyproject.toml").exists()
    ),
    Path.cwd(),
)
# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
# Print every entry found directly inside ~/julien_soar.
for file in (Path.home() / "julien_soar").iterdir():
    print(file)

/home/lewis/julien_soar/arc_1_val_Qwen2.5-72B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_Qwen2.5-Coder-7B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_Qwen2.5-Coder-14B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_refinement.pkl
/home/lewis/julien_soar/arc_1_train_refinement.pkl
/home/lewis/julien_soar/arc_1_val_Qwen2.5-Coder-32B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_Mistral-Large-Instruct-2407_solution.parquet


In [None]:
import pandas as pd
from pathlib import Path
import os
from io import StringIO

try:
    import pyarrow.parquet as pq
except ImportError:
    # pyarrow is treated as optional: with pq set to None the loop below takes
    # the pandas fallback instead of failing on the import.
    pq = None

# Parquet schema inspection
for file in Path.home().joinpath("julien_soar").iterdir():
    if file.suffix == '.parquet':
        print(f"Schema for {file.name}:")
        if pq is not None:
            # Preferred path: ask pyarrow for the schema of the file directly.
            pf = pq.ParquetFile(file)
            print(pf.schema)
        else:
            # Fallback path: pandas reads the whole file just to show the dtypes.
            # NOTE(review): pandas needs some parquet engine installed for this
            # to work without pyarrow - confirm one is available.
            print("pyarrow not available, falling back to pandas (loads data).")
            df = pd.read_parquet(file)
            print(df.dtypes)
        print()

In [None]:
import pandas as pd
from pathlib import Path
from llm_python.datasets.collector import generate_unique_hex_id
from llm_python.datasets.io import write_soar_parquet

def process_refinement_pickle(pkl_path, out_dir):
    """
    Convert one refinement pickle into a parquet file.

    The pickle is iterated as a mapping of ``task_id`` to a sequence of samples.
    Each sample is copied into a flat row and tagged with its ``task_id``, a newly
    generated ``row_id`` and ``is_transductive = False``. All rows are written to
    ``<out_dir>/<pickle file stem>.parquet`` via ``write_soar_parquet``; ``out_dir``
    is created if it does not exist yet.
    """
    # NOTE(review): unpickling can execute arbitrary code - only use on trusted files.
    tasks = pd.read_pickle(pkl_path)

    def build_row(task_id, sample):
        # Work on a copy so the unpickled object itself is left untouched.
        row = dict(sample)
        row['task_id'] = task_id
        row['row_id'] = generate_unique_hex_id()
        row['is_transductive'] = False
        return row

    df = pd.DataFrame(
        [
            build_row(task_id, sample)
            for task_id, sample_list in tasks.items()
            for sample in sample_list
        ]
    )

    target = Path(out_dir) / f"{Path(pkl_path).stem}.parquet"
    target.parent.mkdir(parents=True, exist_ok=True)
    write_soar_parquet(df, target)
    print(f"Wrote {target} with {len(df)} rows.")



In [None]:
# Collect the pickle files in ~/julien_soar, then convert each one to parquet,
# writing the result back into the same directory.
pkl_files = list(
    filter(
        lambda path: path.suffix == '.pkl',
        (Path.home() / "julien_soar").iterdir(),
    )
)
for pkl_file in pkl_files:
    process_refinement_pickle(pkl_file, Path.home() / "julien_soar")

Wrote /home/lewis/julien_soar/parquet/arc_1_val_refinement.parquet with 17408 rows.
Wrote /home/lewis/julien_soar/parquet/arc_1_train_refinement.parquet with 19002 rows.
Wrote /home/lewis/julien_soar/parquet/arc_1_train_refinement.parquet with 19002 rows.


In [None]:
from llm_python.datasets.validation import (
    validate_soar_dataframe,
    validate_soar_dataframe_correctness,
    validate_soar_row,
)


def clean_soar_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filters a SOAR DataFrame down to rows that pass validation and look deterministic.

    Steps, in order:
      1. Run ``validate_soar_row`` on every row, print the set of distinct error
         messages that were reported, and drop the rows that are not valid.
      2. Drop rows whose ``code`` contains "random", "randbelow" or "rvs"
         (case-insensitive).
      3. Run ``validate_soar_dataframe_correctness`` (``correctness_samples=1000``)
         on the remaining rows and print its summary.

    Args:
        df: The DataFrame to clean; it must have a ``code`` column. The function
            works on a copy, so the caller's frame is not modified.

    Returns:
        The filtered DataFrame.

    Raises:
        ValueError: If the correctness validation result is not valid.
    """
    # We have already checked required columns in previous steps.

    print("Validating row correctness...")
    df = df.copy()

    def validate_and_log_errors(frame: pd.DataFrame) -> pd.Series:
        """Validate each row, print the distinct errors once, return per-row validity."""
        errors_set = set()

        def validate_row(row):
            result = validate_soar_row(row)
            # Tolerate result objects that carry no `errors` attribute.
            if hasattr(result, "errors") and result.errors:
                errors_set.update(result.errors)
            return result.is_valid

        is_valid_series = frame.apply(validate_row, axis=1)
        if errors_set:
            print(f"Unique validation errors encountered: {errors_set}")
        return is_valid_series

    df["is_valid"] = validate_and_log_errors(df)
    print(f"After cleaning, {df['is_valid'].sum()} out of {len(df)} rows are valid.")
    df = df[df["is_valid"]]
    df = df.drop(columns=["is_valid"])

    # Remember the size before filtering so the message reports kept/total rather
    # than printing the post-filter count twice.
    rows_before_random_filter = len(df)
    df = df[~df["code"].str.lower().str.contains("random|randbelow|rvs")]
    print(
        f"Kept {len(df)}/{rows_before_random_filter} rows after filtering for randomness."
    )

    correctness_result = validate_soar_dataframe_correctness(
        df, correctness_samples=1000
    )
    print(correctness_result.summary())
    if not correctness_result.is_valid:
        raise ValueError(
            "Validation failed: Some programs do not meet the correctness requirements."
        )

    return df


In [None]:
from llm_python.datasets.io import read_soar_parquet


parquet_files = [
    path
    for path in (Path.home() / "julien_soar").iterdir()
    if path.suffix == '.parquet'
]

# Only the first parquet file in the list is processed here.
for file in parquet_files[:1]:
    df = read_soar_parquet(file)
    # Show the first row of the loaded frame before cleaning it.
    print(df.iloc[0])
    df = clean_soar_dataframe(df)
    # Drop the reference so the frame does not linger in the kernel namespace.
    del df

Validating row correctness...
Unique validation errors encountered: {'reasoning must be a string if provided', 'predicted_test_output must be a list', 'correct_test_input must be a list', 'predicted_train_output must be a list', 'correct_train_input must be a list'}
After cleaning, 0 out of 19002 rows are valid.
Kept 0/0 rows after filtering for randomness.
Correctness validation:
    Total programs: 0
    Sample size: 0
    Correctness valid: PASS
    Errors: 0
Unique validation errors encountered: {'reasoning must be a string if provided', 'predicted_test_output must be a list', 'correct_test_input must be a list', 'predicted_train_output must be a list', 'correct_train_input must be a list'}
After cleaning, 0 out of 19002 rows are valid.
Kept 0/0 rows after filtering for randomness.
Correctness validation:
    Total programs: 0
    Sample size: 0
    Correctness valid: PASS
    Errors: 0
