In [None]:
from pathlib import Path
import sys
# Locate the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a pyproject.toml.
# Falls back to the working directory itself when no such file is found.
project_root = next(
    (
        candidate
        for candidate in (Path.cwd(), *Path.cwd().parents)
        if candidate.joinpath("pyproject.toml").exists()
    ),
    Path.cwd(),
)
# Make project-local packages importable from this notebook.
sys.path.append(str(project_root))

In [None]:
# Want this to load early.
import sandbox.forkserver_executor  # noqa: F401

In [None]:
# Print every entry of the raw SOAR data directory under the user's home.
soar_dir = Path.home() / "julien_soar"
for file in soar_dir.iterdir():
    print(file)

In [None]:
import pandas as pd
from pathlib import Path
import os
from io import StringIO

# pyarrow is optional here: guard the import so the pandas fallback in the
# loop below is actually reachable (an unconditional import would raise
# ImportError before the `pq is not None` check ever ran).
try:
    import pyarrow.parquet as pq
except ImportError:
    pq = None

# Parquet schema inspection: print the schema of every .parquet file in the
# raw SOAR data directory.
for file in Path.home().joinpath("julien_soar").iterdir():
    if file.suffix != '.parquet':
        continue
    print(f"Schema for {file.name}:")
    if pq is not None:
        # Inspect the schema through ParquetFile instead of materializing a DataFrame.
        pf = pq.ParquetFile(file)
        print(pf.schema)
    else:
        print("pyarrow not available, falling back to pandas (loads data).")
        df = pd.read_parquet(file)
        print(df.dtypes)
    print()

In [None]:
import pandas as pd
from pathlib import Path
from llm_python.datasets.collector import generate_unique_hex_id
from llm_python.datasets.io import write_soar_parquet

def process_refinement_pickle(pkl_path, out_dir):
    """
    Flatten a refinement pickle into a single parquet file.

    The pickle is read as a mapping of ``task_id`` to a list of sample
    mappings. Every sample is copied and tagged with its ``task_id``, a newly
    generated ``row_id`` and ``is_transductive=False``; the resulting records
    are written to ``<out_dir>/<pickle stem>.parquet`` via
    ``write_soar_parquet``.

    Args:
        pkl_path: Location of the pickle file to convert.
        out_dir: Directory receiving the parquet file (created when missing).
    """
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    samples_by_task = pd.read_pickle(pkl_path)

    def _tagged_records():
        # Yield one independent record per sample so the loaded object is never mutated.
        for task_id, sample_list in samples_by_task.items():
            for sample in sample_list:
                record = dict(sample)
                record.update(
                    task_id=task_id,
                    row_id=generate_unique_hex_id(),
                    is_transductive=False,
                )
                yield record

    df = pd.DataFrame(list(_tagged_records()))

    out_path = Path(out_dir) / f"{Path(pkl_path).stem}.parquet"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    write_soar_parquet(df, out_path)
    print(f"Wrote {out_path} with {len(df)} rows.")



In [None]:
# pkl_files = [f for f in Path.home().joinpath("julien_soar").iterdir() if f.suffix == '.pkl']
# for pkl_file in pkl_files:
#     process_refinement_pickle(pkl_file, Path.home().joinpath("julien_soar"))

In [None]:
from llm_python.datasets.validation import (
    CorrectnessRowValidationResult,
    validate_soar_dataframe,
    validate_soar_dataframe_correctness,
    validate_soar_row,
    validate_soar_row_correctness,
)
from llm_python.transduction.code_classifier import CodeTransductionClassifier
from llm_python.utils.arc_tester import ArcTester
from llm_python.utils.numpy import convert_numpy_types
import concurrent.futures
from tqdm.auto import tqdm

# Module-level helpers used by clean_soar_dataframe:
# - arc_tester is handed to validate_soar_row_correctness for every row.
# - transductive_classifier.is_transductive(code) supplies the is_transductive flag.
# NOTE(review): both instances are called from ThreadPoolExecutor worker
# threads inside clean_soar_dataframe -- confirm they are safe to share.
arc_tester = ArcTester()
transductive_classifier = CodeTransductionClassifier()

def clean_soar_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean a SOAR DataFrame and return only the rows that pass validation.

    Steps, in order:
      1. Drop rows whose ``code`` contains ``random``, ``randbelow`` or ``rvs``
         (case-insensitive substring match).
      2. Run ``validate_soar_row_correctness`` on every row (thread pool); for
         rows reported as not ``correctness_valid`` the stored
         ``predicted_*_output`` / ``correct_*_input`` values are replaced by
         the recomputed ones.
      3. Normalise numpy types in those four columns, stringify ``reasoning``,
         assign a fresh ``row_id`` and recompute ``is_transductive``.
      4. Validate rows with ``validate_soar_row`` and drop invalid ones.

    Relies on the module-level ``arc_tester`` and ``transductive_classifier``.

    Args:
        df: Frame with at least the columns ``code``,
            ``predicted_train_output``, ``predicted_test_output``,
            ``correct_train_input``, ``correct_test_input`` and ``reasoning``.
            It is not modified.

    Returns:
        A new DataFrame holding the cleaned, valid rows. No temporary helper
        column (such as ``is_valid``) is left in the result. If no row survives
        the randomness filter, the empty filtered frame is returned directly.
    """
    # We have already checked required columns in previous steps.

    def count_true(flags):
        """Count truthy entries of a list/tuple/ndarray of flags; 0 for anything else."""
        # List columns may arrive as numpy arrays rather than Python lists
        # (the driver cell's any_true accepts both), so normalise first.
        if hasattr(flags, "tolist"):
            flags = flags.tolist()
        if not isinstance(flags, (list, tuple)):
            return 0
        return sum(bool(flag) for flag in flags)

    df = df.copy()

    # Remember the size before filtering so the log line reports kept/total.
    rows_before_filter = len(df)
    df = df[~df["code"].str.lower().str.contains("random|randbelow|rvs")]
    print(f"Kept {len(df)}/{rows_before_filter} rows after filtering for randomness.")

    if df.empty:
        # Rebuilding a frame from zero records would lose every column and
        # break the later column lookups, so stop here.
        print("No rows left after filtering for randomness; skipping validation.")
        return df

    print("Validating row correctness with ThreadPoolExecutor...")

    def validate_correctness_wrapper(row_tuple):
        _, row = row_tuple
        return validate_soar_row_correctness(row, arc_tester)

    with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
        correctness_results = list(tqdm(executor.map(validate_correctness_wrapper, df.iterrows()), total=len(df), desc="Validating correctness"))

    # executor.map preserves input order, so results line up with the records.
    cleaned_rows = []
    for row, correctness_result in zip(df.to_dict('records'), correctness_results):
        if not correctness_result.correctness_valid:
            # For safety, check if counts change
            orig_true_count = count_true(row["correct_train_input"])
            new_true_count = count_true(correctness_result.new_correct_train_input)
            if orig_true_count != new_true_count:
                print(
                    f"Warning: correct_train_input True count changed from {orig_true_count} to {new_true_count} for row_id {row.get('row_id', '<unknown>')}"
                )

            # Replace the stored results with the recomputed ones.
            row["predicted_train_output"] = correctness_result.new_predicted_train_output
            row["predicted_test_output"] = correctness_result.new_predicted_test_output
            row["correct_train_input"] = correctness_result.new_correct_train_input
            row["correct_test_input"] = correctness_result.new_correct_test_input

        cleaned_rows.append(row)

    df = pd.DataFrame(cleaned_rows)

    def process_row_initial(row_tuple):
        _, row = row_tuple
        row = row.to_dict()
        row["predicted_train_output"] = convert_numpy_types(row["predicted_train_output"])
        row["predicted_test_output"] = convert_numpy_types(row["predicted_test_output"])
        row["correct_train_input"] = convert_numpy_types(row["correct_train_input"])
        row["correct_test_input"] = convert_numpy_types(row["correct_test_input"])
        row["reasoning"] = str(row["reasoning"])
        # Every cleaned row gets a new identifier.
        row["row_id"] = generate_unique_hex_id()
        row["is_transductive"] = transductive_classifier.is_transductive(row["code"])[0]
        return row

    print("Computing base fields in parallel...")
    with concurrent.futures.ThreadPoolExecutor() as executor:
        processed_rows = list(tqdm(executor.map(process_row_initial, df.iterrows()), total=len(df), desc="Computing base fields"))

    df = pd.DataFrame(processed_rows)

    def validate_and_log_errors_parallel(frame: pd.DataFrame) -> pd.Series:
        """Validate each row of ``frame``; return a boolean mask aligned to its index."""

        def validate_row_wrapper(row_tuple):
            _, row = row_tuple
            result = validate_soar_row(row)
            return result.is_valid, result.errors

        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(tqdm(executor.map(validate_row_wrapper, frame.iterrows()), total=len(frame), desc="Validating rows"))

        errors_set = set()
        is_valid_list = []
        for is_valid, errors in results:
            is_valid_list.append(is_valid)
            if errors:
                errors_set.update(errors)

        if errors_set:
            print(f"Unique validation errors encountered: {errors_set}")

        # Explicit bool dtype keeps the mask usable even when ``frame`` is empty.
        return pd.Series(is_valid_list, index=frame.index, dtype=bool)

    # The validity flags are kept as a standalone mask rather than a column, so
    # nothing is assigned onto a filtered slice and no helper column can leak
    # into the returned frame.
    print("Validating rows in parallel...")
    valid_mask = validate_and_log_errors_parallel(df)
    print(f"After cleaning, {valid_mask.sum()} out of {len(df)} rows are valid.")
    df = df[valid_mask]

    print(f"{len(df[df['is_transductive']])} out of {len(df)} rows are transductive.")

    # NOTE(review): nothing modifies the rows between the two validation
    # passes, so this second pass looks redundant; kept to preserve the
    # original log output.
    print("Validating rows post-fix in parallel...")
    valid_mask = validate_and_log_errors_parallel(df)
    print(f"After cleaning, {valid_mask.sum()} out of {len(df)} rows are valid.")
    df = df[valid_mask]

    return df

In [None]:
from llm_python.datasets.io import read_soar_parquet
import numpy as np


def any_true(x):
    """Return True when ``x`` is a list/ndarray with at least one truthy entry, else False."""
    if isinstance(x, (list, np.ndarray)):
        return bool(np.any(x))
    return False


# Sorted so the files are processed in a reproducible order.
parquet_files = sorted(f for f in Path.home().joinpath("julien_soar").iterdir() if f.suffix == '.parquet')

# Cleaned files go to a sibling directory, so the inputs are never overwritten.
out_dir = Path.home().joinpath("julien_soar_cleaned")
out_dir.mkdir(parents=True, exist_ok=True)

for file in parquet_files:
    print(f"Processing file: {file}")
    df = read_soar_parquet(file)
    original_len = len(df)

    # Keep rows with at least one correct train or test prediction. The filter
    # is a standalone boolean mask, so no helper columns are carried through
    # cleaning into the written output; astype(bool) keeps the mask boolean
    # even for an empty frame.
    any_train_correct = df["correct_train_input"].apply(any_true).astype(bool)
    any_test_correct = df["correct_test_input"].apply(any_true).astype(bool)
    df = df[any_test_correct | any_train_correct]
    print(f"Filtered from {original_len} to {len(df)} rows based on any_train_correct or any_test_correct.")

    df = clean_soar_dataframe(df)
    write_soar_parquet(df, out_dir / file.name)