In [None]:
from pathlib import Path
import sys
# Locate the project root: the nearest directory, starting at the current
# working directory and walking up through its parents, that contains a
# pyproject.toml. Falls back to the working directory when none is found.
project_root = next(
    filter(
        lambda directory: (directory / "pyproject.toml").exists(),
        [Path.cwd(), *Path.cwd().parents],
    ),
    Path.cwd(),
)
# Make the project's packages importable from this notebook.
sys.path.append(str(project_root))

In [None]:
from glob import glob
import pandas as pd

from llm_python.datasets.io import read_soar_parquet

# Get all parquet file paths. glob() returns them in arbitrary filesystem
# order, so sort to make the load order (and the merged row order, and which
# frame ends up as dfs[0]) reproducible between runs.
parquet_files = sorted(glob("/tmp/superking_snapshot/*.parquet"))

# Read each parquet file into a dataframe using read_soar_parquet().
# A file that fails to load is reported and skipped so that one bad shard
# does not abort the whole merge.
dfs = []
for f in parquet_files:
    try:
        df = read_soar_parquet(f)
        dfs.append(df)
    except Exception as e:
        print(f"Error reading {f}: {e}")

# pd.concat() raises an opaque "No objects to concatenate" on an empty list;
# fail early with a message that says what actually went wrong.
if not dfs:
    raise ValueError(
        "No parquet files could be loaded from /tmp/superking_snapshot "
        f"({len(parquet_files)} found, all failed to read or none present)."
    )

# Merge all dataframes into a single dataframe
merged_df = pd.concat(dfs, ignore_index=True)
# Report how many files were actually merged, not merely how many were found.
print(f"Merged {len(dfs)}/{len(parquet_files)} parquet files with total {len(merged_df)} rows.")

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
dfs[0].info()
print(dfs[0].head())

In [None]:
from llm_python.datasets.io import validate_soar_dataframe_schema
from llm_python.utils.numpy import convert_numpy_types

# Run convert_numpy_types over every value of the four prediction /
# correctness columns (presumably to replace NumPy containers with plain
# Python values -- confirm against llm_python.utils.numpy), then check the
# frame against the SOAR schema.
for column_name in (
    "predicted_train_output",
    "correct_train_input",
    "predicted_test_output",
    "correct_test_input",
):
    merged_df[column_name] = merged_df[column_name].apply(convert_numpy_types)

validate_soar_dataframe_schema(merged_df)


In [None]:
import secrets

def generate_unique_hex_id(n_rows, hex_length=32):
    """Generate ``n_rows`` random hexadecimal ID strings.

    Each ID is drawn independently from the OS CSPRNG via ``secrets``.
    Uniqueness is probabilistic rather than enforced: with the default of
    32 hex chars (128 bits) the collision probability is negligible even for
    10M rows, but very small ``hex_length`` values can collide.

    Args:
        n_rows: Number of IDs to generate.
        hex_length: Exact number of hex characters per ID (must be >= 1).

    Returns:
        A list of ``n_rows`` lowercase hex strings, each ``hex_length`` long.

    Raises:
        ValueError: If ``hex_length`` is smaller than 1.
    """
    if hex_length < 1:
        raise ValueError(f"hex_length must be at least 1, got {hex_length}")
    # token_hex() works in whole bytes (2 hex chars each). Round up and trim so
    # that an odd hex_length yields exactly that many characters instead of
    # silently coming back one character short.
    n_bytes = (hex_length + 1) // 2
    return [secrets.token_hex(n_bytes)[:hex_length] for _ in range(n_rows)]

# Assign the IDs on merged_df: fixed_df is only created in a later cell, so
# referencing it here raises NameError on a fresh "Restart & Run All".
# The later filtering steps select rows from merged_df, so the "id" column
# carries through to the frame that is eventually written out.
# NOTE(review): assumes the downstream validators accept an extra "id" column -- confirm.
merged_df["id"] = generate_unique_hex_id(len(merged_df))

In [None]:
from llm_python.datasets.validation import validate_soar_row
from tqdm import tqdm

print("Validating rows formats...")
valid_mask = []  # one is_valid flag per row, in merged_df row order
errors = []      # (index label, result.errors) for every row that failed
for i, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
    result = validate_soar_row(row)
    valid_mask.append(result.is_valid)
    if not result.is_valid:
        errors.append((i, result.errors))

# Filter out invalid rows. The mask is aligned on merged_df's own index and
# given an explicit bool dtype so the selection also works on an empty frame
# (an empty list would otherwise produce a non-boolean Series).
merged_df = merged_df[pd.Series(valid_mask, index=merged_df.index, dtype=bool)]
print(f"Filtered dataframe: {len(merged_df)} valid rows out of {len(valid_mask)} total.")
print(f"Total invalid rows: {len(errors)}")

In [None]:
from llm_python.datasets.validation import validate_soar_dataframe_correctness


print("Validating row correctness...")

# Run the project's correctness validation over the merged frame
# (correctness_samples=1000 is forwarded to the validator as-is) and print
# its summary before deciding whether to continue.
correctness_result = validate_soar_dataframe_correctness(
    merged_df,
    correctness_samples=1000,
)
print(correctness_result.summary())

# Abort the notebook run when validation reports a failure.
if not correctness_result.is_valid:
    raise ValueError("Validation failed: Some programs do not meet the correctness requirements.")

In [None]:
# Drop every program whose source text mentions a randomness-related name
# (case-insensitive substring match on "random", "randbelow" or "rvs").
# na=False counts a missing `code` value as "no match"; without it
# str.contains() yields a missing value for such rows and the negation /
# boolean indexing below can fail.
uses_randomness = merged_df["code"].str.lower().str.contains("random|randbelow|rvs", na=False)
# .copy() makes fixed_df an independent frame, so adding or modifying columns
# on it later does not trigger pandas' SettingWithCopyWarning.
fixed_df = merged_df[~uses_randomness].copy()
print(f"Kept {len(fixed_df)}/{len(merged_df)} rows after filtering for randomness.")

In [None]:
from llm_python.datasets.io import write_soar_parquet


# Write the filtered frame to a single parquet file using the project's
# write_soar_parquet() helper.
write_soar_parquet(fixed_df, "/tmp/superking_merged_and_cleaned.parquet")