In [1]:
from pathlib import Path
import sys
# Locate the project root: the nearest directory at or above the current
# working directory that contains a pyproject.toml. Falls back to the current
# working directory when no such directory exists.
project_root = next(
    (
        candidate
        for candidate in (Path.cwd(), *Path.cwd().parents)
        if (candidate / "pyproject.toml").exists()
    ),
    Path.cwd(),
)
# Make project-local packages importable. The membership check keeps a re-run
# of this cell from appending the same entry to sys.path a second time.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [2]:
# List the contents of ~/julien_soar, printing one path per line.
soar_dir = Path.home() / "julien_soar"
for file in soar_dir.iterdir():
    print(file)

/home/lewis/julien_soar/arc_1_val_Qwen2.5-72B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_Qwen2.5-Coder-7B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_Qwen2.5-Coder-14B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_refinement.pkl
/home/lewis/julien_soar/arc_1_train_refinement.pkl
/home/lewis/julien_soar/arc_1_val_Qwen2.5-Coder-32B-Instruct_solution.parquet
/home/lewis/julien_soar/arc_1_val_Mistral-Large-Instruct-2407_solution.parquet


In [3]:
import pandas as pd
from pathlib import Path
import os
from io import StringIO

# pyarrow is treated as optional: when the import fails, `pq` is set to None
# and the loop below takes the pandas fallback. (With an unconditional import
# the `pq is not None` check could never be false.)
try:
    import pyarrow.parquet as pq
except ImportError:
    pq = None

# Parquet schema inspection
for file in Path.home().joinpath("julien_soar").iterdir():
    # Only parquet files are inspected; everything else in the directory is skipped.
    if file.suffix != '.parquet':
        continue
    print(f"Schema for {file.name}:")
    if pq is not None:
        # Print the schema straight from the ParquetFile object, without
        # building a DataFrame.
        pf = pq.ParquetFile(file)
        print(pf.schema)
    else:
        print("pyarrow not available, falling back to pandas (loads data).")
        df = pd.read_parquet(file)
        print(df.dtypes)
    # Blank line between the reports of consecutive files.
    print()

Schema for arc_1_val_Qwen2.5-72B-Instruct_solution.parquet:
<pyarrow._parquet.ParquetSchema object at 0x74b549381a00>
required group field_id=-1 schema {
  optional binary field_id=-1 task_id (String);
  optional binary field_id=-1 text (String);
  optional binary field_id=-1 code (String);
  optional group field_id=-1 correct_train_input (List) {
    repeated group field_id=-1 list {
      optional boolean field_id=-1 element;
    }
  }
  optional group field_id=-1 predicted_train_output (List) {
    repeated group field_id=-1 list {
      optional group field_id=-1 element (List) {
        repeated group field_id=-1 list {
          optional group field_id=-1 element (List) {
            repeated group field_id=-1 list {
              optional int64 field_id=-1 element;
            }
          }
        }
      }
    }
  }
  optional group field_id=-1 correct_test_input (List) {
    repeated group field_id=-1 list {
      optional boolean field_id=-1 element;
    }
  }
  optional gro

In [4]:
import pandas as pd
from pathlib import Path
from llm_python.datasets.collector import generate_unique_hex_id
from llm_python.datasets.io import write_soar_parquet

def process_refinement_pickle(pkl_path, out_dir):
    """
    Convert a refinement pickle into a parquet file with one row per sample.

    The pickle is read with ``pd.read_pickle`` and iterated as a mapping from
    task id to a list of samples. Each sample is copied into a dict and tagged
    with its ``task_id``, a fresh ``row_id`` from ``generate_unique_hex_id``
    and ``is_transductive=False``. All samples are then collected into a
    single DataFrame.

    Args:
        pkl_path: Path of the pickle file to read.
        out_dir: Directory that receives the output; created if missing. The
            output file is named after the pickle's stem plus ``.parquet``.

    Returns:
        None. The DataFrame is written with ``write_soar_parquet`` and a
        one-line summary is printed.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only point this
    # at files from a trusted source.
    samples_by_task = pd.read_pickle(pkl_path)

    records = []
    for task_id, task_samples in samples_by_task.items():
        for sample in task_samples:
            # Work on a fresh dict so the unpickled object is left untouched.
            record = dict(sample)
            record.update(
                task_id=task_id,
                row_id=generate_unique_hex_id(),
                is_transductive=False,
            )
            records.append(record)

    df = pd.DataFrame(records)
    out_path = Path(out_dir) / f"{Path(pkl_path).stem}.parquet"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    write_soar_parquet(df, out_path)
    print(f"Wrote {out_path} with {len(df)} rows.")



In [5]:
# pkl_files = [f for f in Path.home().joinpath("julien_soar").iterdir() if f.suffix == '.pkl']
# for pkl_file in pkl_files:
#     process_refinement_pickle(pkl_file, Path.home().joinpath("julien_soar"))

In [None]:
from llm_python.datasets.validation import (
    CorrectnessRowValidationResult,
    validate_soar_dataframe,
    validate_soar_dataframe_correctness,
    validate_soar_row,
    validate_soar_row_correctness,
)
from llm_python.transduction.code_classifier import CodeTransductionClassifier
from llm_python.utils.arc_tester import ArcTester
from llm_python.utils.numpy import convert_numpy_types
import concurrent.futures
from tqdm.auto import tqdm

# Shared tester instance handed to validate_soar_row_correctness for every row.
arc_tester = ArcTester()

def clean_soar_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalise, validate and filter a SOAR DataFrame, then re-check row correctness.

    Steps, in order:
      1. Per row: pass the four prediction/correctness columns through
         ``convert_numpy_types``, cast ``reasoning`` to ``str``, assign a new
         ``row_id`` and compute ``is_transductive`` from ``code``.
      2. Drop rows that ``validate_soar_row`` reports as invalid; the set of
         distinct validation errors is printed.
      3. Drop rows whose ``code`` contains ``random``, ``randbelow`` or ``rvs``
         (case-insensitive).
      4. Run ``validate_soar_row_correctness`` on each remaining row. Where the
         result has ``correctness_valid`` false, the four prediction/correctness
         columns are replaced by the ``new_*`` values from the result. A warning
         is printed if the number of true ``correct_train_input`` entries changes.

    Steps 1, 2 and 4 run on a ``ThreadPoolExecutor`` with progress bars.

    Args:
        df: Input frame. The code reads the columns ``predicted_train_output``,
            ``predicted_test_output``, ``correct_train_input``,
            ``correct_test_input``, ``reasoning`` and ``code``.

    Returns:
        A new DataFrame built from the surviving rows (fresh default index).
        The input frame is not modified by this function's own assignments.
    """
    # We have already checked required columns in previous steps.

    df = df.copy()

    def process_row_initial(row_tuple):
        """Turn one ``iterrows`` item into a dict with converted and derived fields."""
        _, row = row_tuple
        row = row.to_dict()
        for column in (
            "predicted_train_output",
            "predicted_test_output",
            "correct_train_input",
            "correct_test_input",
        ):
            row[column] = convert_numpy_types(row[column])
        row["reasoning"] = str(row["reasoning"])
        row["row_id"] = generate_unique_hex_id()
        # A classifier is created per row, as before; whether a single instance
        # could safely be shared across threads is not visible from here.
        transductive_classifier = CodeTransductionClassifier()
        row["is_transductive"] = transductive_classifier.is_transductive(row["code"])[0]
        return row

    print("Computing base fields in parallel...")
    with concurrent.futures.ThreadPoolExecutor() as executor:
        processed_rows = list(tqdm(executor.map(process_row_initial, df.iterrows()), total=len(df), desc="Computing base fields"))

    df = pd.DataFrame(processed_rows)

    def validate_and_log_errors_parallel(df: pd.DataFrame) -> pd.Series:
        """Validate every row; return a boolean-per-row Series aligned to ``df.index``."""
        errors_set = set()

        def validate_row_wrapper(row_tuple):
            _, row = row_tuple
            result = validate_soar_row(row)
            return result.is_valid, result.errors

        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(tqdm(executor.map(validate_row_wrapper, df.iterrows()), total=len(df), desc="Validating rows"))

        is_valid_list = []
        for is_valid, errors in results:
            is_valid_list.append(is_valid)
            if errors:
                errors_set.update(errors)

        # Only the distinct errors are reported, not one line per failing row.
        if errors_set:
            print(f"Unique validation errors encountered: {errors_set}")

        return pd.Series(is_valid_list, index=df.index)

    print("Validating rows in parallel...")
    df["is_valid"] = validate_and_log_errors_parallel(df)
    print(f"After cleaning, {df['is_valid'].sum()} out of {len(df)} rows are valid.")
    df = df[df["is_valid"]]
    df = df.drop(columns=["is_valid"])

    # Remember the size before filtering so the message reports kept/total
    # rather than the post-filter count twice.
    rows_before_random_filter = len(df)
    df = df[~df["code"].str.lower().str.contains("random|randbelow|rvs")]
    print(f"Kept {len(df)}/{rows_before_random_filter} rows after filtering for randomness.")

    print(f"{len(df[df['is_transductive']])} out of {len(df)} rows are transductive.")

    print("Validating row correctness with ThreadPoolExecutor...")

    def validate_correctness_wrapper(row_tuple):
        """Run the correctness check for one ``iterrows`` item."""
        _, row = row_tuple
        return validate_soar_row_correctness(row, arc_tester)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        correctness_results = list(tqdm(executor.map(validate_correctness_wrapper, df.iterrows()), total=len(df), desc="Validating correctness"))

    cleaned_rows = []
    # executor.map yields results in input order, so records and results can be
    # paired positionally.
    for row, correctness_result in zip(df.to_dict('records'), correctness_results):
        if not correctness_result.correctness_valid:
            # For safety, check if counts change
            orig_true_count = sum(row["correct_train_input"]) if isinstance(row["correct_train_input"], list) else 0
            new_true_count = sum(correctness_result.new_correct_train_input) if isinstance(correctness_result.new_correct_train_input, list) else 0
            if orig_true_count != new_true_count:
                print(
                    f"Warning: correct_train_input True count changed from {orig_true_count} to {new_true_count} for row_id {row.get('row_id', '<unknown>')}"
                )

            # Overwrite the stored values with the ones from the correctness result.
            row["predicted_train_output"] = correctness_result.new_predicted_train_output
            row["predicted_test_output"] = correctness_result.new_predicted_test_output
            row["correct_train_input"] = correctness_result.new_correct_train_input
            row["correct_test_input"] = correctness_result.new_correct_test_input

        cleaned_rows.append(row)

    return pd.DataFrame(cleaned_rows)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from llm_python.datasets.io import read_soar_parquet


# Every parquet file directly inside ~/julien_soar is cleaned and re-written
# under the same file name in ~/julien_soar_cleaned.
parquet_files = [
    entry
    for entry in Path.home().joinpath("julien_soar").iterdir()
    if entry.suffix == '.parquet'
]

for file in parquet_files:
    print(f"Processing file: {file}")
    df = read_soar_parquet(file)
    # df = df.head(1000) # Testing
    # print(df.dtypes)
    # print(df.iloc[0])
    df = clean_soar_dataframe(df)
    out_dir = Path.home() / "julien_soar_cleaned"
    out_dir.mkdir(parents=True, exist_ok=True)
    write_soar_parquet(df, out_dir / file.name)
    # Drop the reference before the next file is read (presumably to limit
    # peak memory use).
    del df

Processing file: /home/lewis/julien_soar/arc_1_val_Qwen2.5-72B-Instruct_solution.parquet
Computing base fields in parallel...
Computing base fields in parallel...


Computing base fields:   0%|          | 0/1289137 [00:01<?, ?it/s]



AttributeError: Can't pickle local object 'clean_soar_dataframe.<locals>.process_row_initial'

In [None]:
# Print 20 `code` entries sampled with a fixed seed from the first parquet
# file, separated by blank lines.
for file in parquet_files[:1]:
    df = read_soar_parquet(file)
    code_samples = df['code'].sample(20, random_state=42).tolist()
    print('\n\n'.join(code_samples))

def transform(grid_lst: list[list[int]]) -> list[list[int]]:
    grid = grid_lst[0]
    n = len(grid)
    output_size = n * 2 - 1
    output_grid = [[0 for _ in range(output_size)] for _ in range(output_size)]
    for i in range(n):
        if grid[i] != 0:
            for j in range(output_size):
                if i + j < output_size:
                    output_grid[i + j][output_size - 1 - j] = grid[i]
                if i - j >= 0:
                    output_grid[i - j][j] = grid[i]
                if i + j < output_size:
                    output_grid[output_size - 1 - i - j][j] = 1
                if i - j >= 0:
                    output_grid[j][i - j] = 1
    return output_grid

import numpy as np

def transform(grid_lst: list[list[int]]) -> list[list[int]]:
    grid = np.array(grid_lst)
    non_zero_indices = np.argwhere(grid != 0)
    if non_zero_indices.size == 0:
        return grid.tolist()
    min_row, min_col = non_zero_indices.min(axis=0)
    max_row, max_col = non_zer