In [9]:
from pathlib import Path
import sys
# Locate the project root: the nearest directory, starting at the current
# working directory and walking up through its parents, that contains a
# pyproject.toml. Falls back to the current working directory if none does.
_cwd = Path.cwd()
project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "pyproject.toml").exists()),
    _cwd,
)
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [10]:
# Print every entry found directly inside ~/julien_soar.
soar_listing = (Path.home() / "julien_soar").iterdir()
for file in soar_listing:
    print(file)

/home/lewis/julien_soar/parquet
/home/lewis/julien_soar/arc_1_train_refinement.parquet
/home/lewis/julien_soar/arc_1_val_Qwen2.5-72B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_Qwen2.5-Coder-7B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_refinement.parquet
/home/lewis/julien_soar/arc_1_val_Qwen2.5-Coder-14B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_refinement.pkl
/home/lewis/julien_soar/arc_1_train_refinement.pkl
/home/lewis/julien_soar/arc_1_val_Qwen2.5-Coder-32B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_Mistral-Large-Instruct-2407_solution.parquet


In [11]:
import pandas as pd
from pathlib import Path
import os
from io import StringIO

import pyarrow.parquet as pq

# Parquet schema inspection: print the schema of every .parquet file in
# ~/julien_soar via pq.ParquetFile instead of loading the table with pandas.
# pyarrow.parquet is imported unconditionally above, so the former
# `if pq is not None` check was always true and its pandas fallback branch
# could never run; that dead branch has been removed.
# iterdir() yields entries in arbitrary order, so the listing is sorted to keep
# the printed output stable between runs.
for file in sorted(Path.home().joinpath("julien_soar").iterdir()):
    if file.suffix != '.parquet':
        continue
    print(f"Schema for {file.name}:")
    pf = pq.ParquetFile(file)
    print(pf.schema)
    print()

Schema for arc_1_train_refinement.parquet:
<pyarrow._parquet.ParquetSchema object at 0x73ee37714bc0>
required group field_id=-1 schema {
  required binary field_id=-1 row_id (String);
  required binary field_id=-1 task_id (String);
  optional binary field_id=-1 reasoning (String);
  required binary field_id=-1 code (String);
  required group field_id=-1 correct_train_input (List) {
    repeated group field_id=-1 list {
      optional boolean field_id=-1 element;
    }
  }
  required group field_id=-1 correct_test_input (List) {
    repeated group field_id=-1 list {
      optional boolean field_id=-1 element;
    }
  }
  required group field_id=-1 predicted_train_output (List) {
    repeated group field_id=-1 list {
      optional group field_id=-1 element (List) {
        repeated group field_id=-1 list {
          optional group field_id=-1 element (List) {
            repeated group field_id=-1 list {
              optional int64 field_id=-1 element;
            }
          }
       

In [12]:
import pandas as pd
from pathlib import Path
from llm_python.datasets.collector import generate_unique_hex_id
from llm_python.datasets.io import write_soar_parquet

def process_refinement_pickle(pkl_path, out_dir):
    """
    Convert one refinement pickle into a SOAR parquet file.

    The pickle is read with ``pd.read_pickle`` and iterated as a mapping of
    task_id -> list of samples. Every sample is copied into a flat row dict and
    tagged with its ``task_id``, a freshly generated ``row_id`` and
    ``is_transductive=False``. All rows are collected into one DataFrame, which
    is handed to ``write_soar_parquet`` with the target path
    ``<out_dir>/<pickle stem>.parquet`` (``out_dir`` is created if missing).
    A one-line summary is printed afterwards.

    NOTE: unpickling can execute arbitrary code; only use this on trusted files.
    """
    task_to_samples = pd.read_pickle(pkl_path)

    # dict(sample, ...) builds a new dict, so the unpickled samples are left untouched.
    records = [
        dict(
            sample,
            task_id=task_id,
            row_id=generate_unique_hex_id(),
            is_transductive=False,
        )
        for task_id, sample_list in task_to_samples.items()
        for sample in sample_list
    ]
    df = pd.DataFrame(records)

    out_path = Path(out_dir, f"{Path(pkl_path).stem}.parquet")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    write_soar_parquet(df, out_path)
    print(f"Wrote {out_path} with {len(df)} rows.")



In [13]:
# Convert every .pkl file in ~/julien_soar to a parquet file written next to it.
soar_dir = Path.home() / "julien_soar"
pkl_files = [entry for entry in soar_dir.iterdir() if entry.suffix == '.pkl']
for pkl_file in pkl_files:
    process_refinement_pickle(pkl_file, soar_dir)

Wrote /home/lewis/julien_soar/arc_1_val_refinement.parquet with 17408 rows.
Wrote /home/lewis/julien_soar/arc_1_train_refinement.parquet with 19002 rows.


In [None]:
from llm_python.datasets.validation import (
    validate_soar_dataframe,
    validate_soar_dataframe_correctness,
    validate_soar_row,
)
from llm_python.transduction.code_classifier import CodeTransductionClassifier
from llm_python.utils.numpy import convert_numpy_types


def clean_soar_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean a SOAR DataFrame and verify the programs that survive.

    Works on a copy of ``df`` (the caller's frame is not modified) and applies,
    in order:

    1. Pass ``predicted_train_output``, ``predicted_test_output``,
       ``correct_train_input`` and ``correct_test_input`` through
       ``convert_numpy_types`` and cast ``reasoning`` to ``str``.
    2. Drop rows for which ``validate_soar_row`` reports ``is_valid`` as false,
       printing the set of distinct errors that were reported.
    3. Drop rows whose ``code`` contains ``random``, ``randbelow`` or ``rvs``
       (case-insensitive).
    4. Recompute ``is_transductive`` with ``CodeTransductionClassifier``.
    5. Run ``validate_soar_dataframe_correctness`` on the remaining rows.

    Args:
        df: SOAR DataFrame; required columns are assumed to have been checked
            by earlier steps.

    Returns:
        The cleaned DataFrame.

    Raises:
        ValueError: if the final correctness validation reports the frame as
            invalid.
    """
    # We have already checked required columns in previous steps.

    print("Validating row correctness...")
    df = df.copy()
    for column in (
        "predicted_train_output",
        "predicted_test_output",
        "correct_train_input",
        "correct_test_input",
    ):
        df[column] = df[column].apply(convert_numpy_types)
    # NOTE(review): astype(str) turns missing reasoning into a literal string
    # (e.g. "None"); confirm that this is what downstream consumers expect.
    df["reasoning"] = df["reasoning"].astype(str)

    def validate_and_log_errors(frame: pd.DataFrame) -> pd.Series:
        """Validate each row, print the distinct errors, return the validity mask."""
        errors_set = set()

        def validate_row(row):
            result = validate_soar_row(row)
            # Tolerate result objects that carry no `errors` attribute.
            errors_set.update(getattr(result, "errors", None) or ())
            return result.is_valid

        is_valid_series = frame.apply(validate_row, axis=1)
        if errors_set:
            print(f"Unique validation errors encountered: {errors_set}")
        return is_valid_series

    df["is_valid"] = validate_and_log_errors(df)
    print(f"After cleaning, {df['is_valid'].sum()} out of {len(df)} rows are valid.")
    df = df[df["is_valid"]]
    df = df.drop(columns=["is_valid"])

    # Record the size BEFORE filtering so the message reports kept/total
    # (previously both numbers were taken after the filter and always matched).
    rows_before_random_filter = len(df)
    uses_randomness = df["code"].str.lower().str.contains("random|randbelow|rvs")
    # .copy() gives an independent frame so the column assignment below does not
    # write into a slice of the previous frame (SettingWithCopyWarning).
    df = df[~uses_randomness].copy()
    print(f"Kept {len(df)}/{rows_before_random_filter} rows after filtering for randomness.")

    transductive_classifier = CodeTransductionClassifier()
    # is_transductive() returns a sequence; only its first element is stored.
    df["is_transductive"] = df["code"].apply(lambda code: transductive_classifier.is_transductive(code)[0])
    print(f"{len(df[df['is_transductive']])} out of {len(df)} rows are transductive.")

    # Frame-level check using the imported validator. The previous call used an
    # undefined function name and an undefined `row` variable (NameError).
    correctness_result = validate_soar_dataframe_correctness(df)
    print(correctness_result.summary())
    if not correctness_result.is_valid:
        raise ValueError(
            "Validation failed: Some programs do not meet the correctness requirements."
        )

    return df


In [15]:
from llm_python.datasets.io import read_soar_parquet


# Smoke-test the cleaning pipeline on a single parquet file.
# iterdir() yields entries in arbitrary order, so the listing is sorted to make
# `parquet_files[:1]` pick the same file on every run.
parquet_files = sorted(f for f in Path.home().joinpath("julien_soar").iterdir() if f.suffix == '.parquet')

for file in parquet_files[:1]:
    df = read_soar_parquet(file)
    print(df.dtypes)
    print(df.iloc[0])
    # Only the printed diagnostics (or a raised error) matter here; the cleaned
    # frame is released immediately to free memory.
    df = clean_soar_dataframe(df)
    del df

row_id                                                       string[pyarrow]
task_id                                                      string[pyarrow]
reasoning                                              large_string[pyarrow]
code                                                   large_string[pyarrow]
correct_train_input                                list<item: bool>[pyarrow]
correct_test_input                                 list<item: bool>[pyarrow]
predicted_train_output     list<item: list<item: list<item: int64>>>[pyar...
predicted_test_output      list<item: list<item: list<item: int64>>>[pyar...
model                                                        string[pyarrow]
is_transductive                                                bool[pyarrow]
refined_from_id                                              string[pyarrow]
compound_inspiration_id                                      string[pyarrow]
dtype: object
row_id                                      7873ca05d0710d5711

In [16]:
# Print 20 sampled programs (fixed seed) from the first parquet file,
# separated by blank lines.
for file in parquet_files[:1]:
    df = read_soar_parquet(file)
    sampled_programs = df['code'].sample(20, random_state=42).tolist()
    print('\n\n'.join(sampled_programs))

import numpy as np

def transform(grid_lst: list[list[int]]) -> list[list[int]]:
    grid = np.array(grid_lst)
    rows, cols = grid.shape
    new_grid = np.copy(grid)
    rows_to_transform = []
    for row in grid:
        unique_values = np.unique(row)
        if len(unique_values) == 1 and unique_values[0] == 0:
            rows_to_transform.append(True)
        elif len(unique_values) == 2 and 0 in unique_values and (4 in unique_values):
            if np.sum(row == 0) > np.sum(row == 4):
                rows_to_transform.append(True)
            else:
                rows_to_transform.append(False)
        else:
            rows_to_transform.append(False)
    cols_to_transform = []
    for col in grid.T:
        unique_values = np.unique(col)
        if len(unique_values) == 1 and unique_values[0] == 0:
            cols_to_transform.append(True)
        elif len(unique_values) == 2 and 0 in unique_values and (4 in unique_values):
            if np.sum(col == 0) > np.sum(col == 4):