In [None]:
from pathlib import Path
import sys
# Locate the project root: the closest directory, starting at the current working
# directory and moving upward, that contains a pyproject.toml. If none of them does,
# fall back to the working directory itself.
search_dirs = [Path.cwd(), *Path.cwd().parents]
dirs_with_pyproject = [
    directory for directory in search_dirs if (directory / "pyproject.toml").exists()
]
project_root = dirs_with_pyproject[0] if dirs_with_pyproject else Path.cwd()

# Make packages under the project root importable from this notebook.
sys.path.append(str(project_root))

In [None]:
from glob import glob
import pandas as pd

from llm_python.datasets.io import read_soar_parquet

# Get all parquet file paths. glob() returns paths in arbitrary order, so sort them to
# keep the merged row order (and which file ends up as dfs[0]) stable between runs.
parquet_files = sorted(glob("/tmp/superking_snapshot/*.parquet"))

# Read each parquet file into a dataframe using read_soar_parquet().
# Files that fail to load are reported and skipped rather than aborting the whole merge.
dfs = []
for f in parquet_files:
    try:
        df = read_soar_parquet(f)
        dfs.append(df)
    except Exception as e:
        print(f"Error reading {f}: {e}")

# pd.concat() raises an opaque "No objects to concatenate" on an empty list; fail with a
# message that says what actually went wrong instead.
if not dfs:
    raise ValueError(
        f"No parquet files could be loaded from /tmp/superking_snapshot "
        f"({len(parquet_files)} file(s) found)."
    )

# Merge all dataframes into a single dataframe
merged_df = pd.concat(dfs, ignore_index=True)
# Report how many files were actually merged, not just how many were discovered.
print(f"Merged {len(dfs)} of {len(parquet_files)} parquet files with total {len(merged_df)} rows.")

In [None]:
from llm_python.transduction.code_classifier import CodeTransductionClassifier
from tqdm import tqdm

transduction_classifier = CodeTransductionClassifier()

print("Applying transduction classifier...")

# Classify each row's program and store the result in a new "is_transductive" column.
# Only element [0] of what is_transductive() returns is kept
# (presumably the boolean verdict -- TODO confirm against the classifier).
transductive_flags = []
for _, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
    verdict = transduction_classifier.is_transductive(row["code"])
    transductive_flags.append(verdict[0])

merged_df["is_transductive"] = transductive_flags

In [None]:
# Quick look at the first loaded file.
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the report.
dfs[0].info()
print(dfs[0].head())

In [None]:
from llm_python.datasets.io import validate_soar_dataframe_schema
from llm_python.utils.numpy import convert_numpy_types

# Columns whose cell values are passed through convert_numpy_types before the schema
# check (presumably to turn numpy containers/scalars into plain Python values --
# TODO confirm against llm_python.utils.numpy).
columns_to_convert = (
    "predicted_train_output",
    "correct_train_input",
    "predicted_test_output",
    "correct_test_input",
)
for column_name in columns_to_convert:
    merged_df[column_name] = merged_df[column_name].apply(convert_numpy_types)

# Check the merged frame against the SOAR schema.
validate_soar_dataframe_schema(merged_df)


In [None]:
from llm_python.datasets.validation import validate_soar_row

print("Validating rows formats...")

# Validate every row once, keeping each result next to the row's index label.
row_results = [
    (row_label, validate_soar_row(row))
    for row_label, row in tqdm(merged_df.iterrows(), total=len(merged_df))
]

# One boolean per row, plus the (index, errors) pairs for rows that failed validation.
valid_mask = [row_result.is_valid for _, row_result in row_results]
errors = [
    (row_label, row_result.errors)
    for row_label, row_result in row_results
    if not row_result.is_valid
]

# Filter out invalid rows
keep_rows = pd.Series(valid_mask, index=merged_df.index)
merged_df = merged_df[keep_rows]
print(f"Filtered dataframe: {len(merged_df)} valid rows out of {len(valid_mask)} total.")
print(f"Total invalid rows: {len(errors)}")

In [None]:
from llm_python.datasets.validation import validate_soar_dataframe_correctness


print("Validating row correctness...")

# Run the correctness validation on the merged frame (correctness_samples=1000) and
# show its summary before deciding whether to continue.
correctness_result = validate_soar_dataframe_correctness(
    merged_df,
    correctness_samples=1000,
)
summary_text = correctness_result.summary()
print(summary_text)

# Stop the notebook here if the validation did not pass.
failure_message = "Validation failed: Some programs do not meet the correctness requirements."
if not correctness_result.is_valid:
    raise ValueError(failure_message)

In [None]:
from llm_python.utils.arc_tester import ArcTester
from llm_python.utils.task_loader import get_task_loader
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

# Summarise the dataset that is about to be re-executed. This must be merged_df:
# the bare name `df` is only the loop variable left over from the loading cell,
# i.e. the last parquet file read, not the merged and filtered dataset.
print(f"Original dataset shape: {merged_df.shape}")
print(f"Original columns: {merged_df.columns.tolist()}")

# Module-level task loader, read by process_single_row.
task_loader = get_task_loader()


def process_single_row(row_data):
    """Re-run one row's program and return the row with refreshed result fields.

    Meant to be submitted to an executor, one call per dataframe row.

    Args:
        row_data: An ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.

    Returns:
        ``("success", index, corrected_row)`` when the program was executed, where
        ``corrected_row`` is a copy of the row with its predicted outputs and
        correctness fields replaced by the tester's results; otherwise
        ``("failed", index, message)`` where ``message`` is the stringified exception.
    """
    row_index, source_row = row_data
    try:
        # Build a fresh tester inside each call rather than sharing one between workers.
        tester = ArcTester()

        program = source_row["code"]
        task = task_loader.get_task(source_row["task_id"])
        outcome = tester.test_program(program, task)

        # Overwrite the stored results with what the tester actually produced.
        refreshed_fields = {
            "predicted_train_output": outcome.train_outputs,
            "predicted_test_output": outcome.test_outputs,
            "correct_train_input": outcome.correct_train_input,
            "correct_test_input": outcome.correct_test_input,
        }
        corrected_row = source_row.copy()
        for field_name, field_value in refreshed_fields.items():
            corrected_row[field_name] = field_value
    except Exception as e:
        return ("failed", row_index, str(e))
    else:
        return ("success", row_index, corrected_row)


# Determine the number of workers: leave two cores free and cap at 8 to limit memory
# use, but never go below one. ThreadPoolExecutor raises ValueError for
# max_workers <= 0, which the unclamped value hits on one- or two-core machines.
num_workers = max(1, min(mp.cpu_count() - 2, 8))
print(f"Using {num_workers} parallel workers")

total_rows = len(merged_df)

print(f"Processing {total_rows}...")

start_time = time.time()

corrected_results = []  # (row index, corrected row) pairs, in completion order
failed_indices = []
failure_messages = {}  # row index -> error text, kept so failures can be inspected

# Submit every row to the thread pool and collect results as they finish.
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = {
        executor.submit(process_single_row, row_item): row_item[0]
        for row_item in merged_df.iterrows()
    }
    corrected_count = 0
    failed_count = 0

    pbar = tqdm(
        as_completed(futures),
        total=len(futures),
        desc="Processing Rows",
        unit=" row",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
    )

    for future in pbar:
        try:
            result_type, idx, result_data = future.result()
            if result_type == "success":
                corrected_results.append((idx, result_data))
                corrected_count += 1
            else:
                # For failures, result_data is the error message from the worker.
                failed_indices.append(idx)
                failure_messages[idx] = result_data
                failed_count += 1
        except Exception as e:
            # Only reached if the worker itself raised instead of returning a status.
            failed_count += 1
            failed_idx = futures[future]
            failed_indices.append(failed_idx)
            failure_messages[failed_idx] = str(e)
            print(f"\nAn error occurred processing row {failed_idx}: {e}")

        pbar.set_postfix(completed=corrected_count, failed=failed_count, refresh=True)

# as_completed() yields in completion order, which varies from run to run. Restore the
# original row order so downstream seeded sampling and the written file are reproducible.
corrected_results.sort(key=lambda pair: pair[0])
corrected_rows = [corrected_row for _, corrected_row in corrected_results]

print("\nProcessing complete!")
print(f"Total successful rows: {len(corrected_rows)}")
print(f"Total failed rows: {len(failed_indices)}")
print(f"Total time: {(time.time() - start_time) / 60:.1f} minutes")

# Show a few failure reasons so failed rows are not dropped silently.
if failure_messages:
    print("Sample failure reasons:")
    for failed_idx, message in list(failure_messages.items())[:5]:
        print(f"  row {failed_idx}: {message}")


In [None]:
# Rebuild a dataframe from the re-executed rows.
fixed_df = pd.DataFrame(corrected_rows)

# Drop programs whose source mentions a randomness-related name (case-insensitive
# regex match on the code text).
lowercase_code = fixed_df["code"].str.lower()
uses_randomness = lowercase_code.str.contains("random|randbelow|rvs")
fixed_df = fixed_df[~uses_randomness]
print(f"Kept {len(fixed_df)}/{len(corrected_rows)} rows after filtering for randomness.")

In [None]:
from llm_python.datasets.validation import validate_soar_dataframe_correctness


print("Validating row correctness...")

# Re-run the correctness validation on the cleaned frame, this time with
# correctness_samples=10000 and a fixed seed (41).
correctness_result = validate_soar_dataframe_correctness(
    fixed_df,
    correctness_samples=10000,
    seed=41,
)
summary_text = correctness_result.summary()
print(summary_text)

# Refuse to continue to the write step if the validation did not pass.
failure_message = "Validation failed: Some programs do not meet the correctness requirements."
if not correctness_result.is_valid:
    raise ValueError(failure_message)

In [None]:
from llm_python.datasets.io import write_soar_parquet


# Persist the cleaned, re-validated dataset as a single parquet file.
output_path = "/tmp/superking_merged_and_cleaned.parquet"
write_soar_parquet(fixed_df, output_path)