# Clean Superking Dataset - Transduction Filtering

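This notebook loads the SOAR-format Parquet dataset, flags transductive programs with `CodeTransductionClassifier`, and saves a filtered copy. Fully correct programs from a short list of trusted models are kept even when flagged as transductive; every other flagged program is removed.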

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Add the project root to path
sys.path.append('..')
from llm_python.transduction.code_classifier import CodeTransductionClassifier

## Load the Parquet Dataset

In [2]:
# Load the dataset
input_file = "/tmp/superking_analysis.parquet"
df = pd.read_parquet(input_file)

print(f"Loaded dataset with {len(df)} programs")
print(f"Columns: {list(df.columns)}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the cell output.
df.info()
print("\nModels in dataset:")
print(df['model'].value_counts())

Loaded dataset with 292866 programs
Columns: ['task_id', 'reasoning', 'code', 'correct_train_input', 'correct_test_input', 'predicted_train_output', 'predicted_test_output', 'model', 'is_test_transductive']

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292866 entries, 0 to 292865
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   task_id                 292866 non-null  object
 1   reasoning               292866 non-null  object
 2   code                    292866 non-null  object
 3   correct_train_input     292866 non-null  object
 4   correct_test_input      292866 non-null  object
 5   predicted_train_output  292866 non-null  object
 6   predicted_test_output   292866 non-null  object
 7   model                   292866 non-null  object
 8   is_test_transductive    16206 non-null   object
dtypes: object(9)
memory usage: 20.1+ MB
None

Models in dataset:
model
Qwen2.5-Coder

## Initialize the Code Transduction Classifier

In [3]:
# Initialize the classifier.
# One instance is built with the constructor's default arguments; the
# classification cell further down calls its is_transductive() method on the
# code of every program in the dataset.
classifier = CodeTransductionClassifier()
print("CodeTransductionClassifier initialized")

CodeTransductionClassifier initialized


## Apply Transduction Classification

In [5]:
# Apply transduction classification to all programs using parallel processing
from multiprocessing import Pool, cpu_count
from functools import partial

print("Classifying programs for transductive behavior...")
print(f"Using {cpu_count()} CPU cores for parallel processing")


def classify_code(code, classifier):
    """Return the transduction flag the classifier assigns to one program.

    Args:
        code: Program source taken from the ``code`` column.
        classifier: Object exposing ``is_transductive(code)``; the call is
            expected to return a 2-tuple whose first element is the flag.

    Returns:
        The first element of the classifier's result (the second is ignored).
    """
    is_transductive, _ = classifier.is_transductive(code)
    return is_transductive


def classify_program(code_and_classifier):
    """Tuple-argument form of classify_code, kept for existing callers.

    Args:
        code_and_classifier: ``(code, classifier)`` pair.

    Returns:
        Same value as ``classify_code(code, classifier)``.
    """
    code, classifier = code_and_classifier
    return classify_code(code, classifier)


# Bind the classifier to the worker function once. Previously the classifier
# was copied into one (code, classifier) tuple per row, so it was serialized
# to the worker processes once for every program in the dataset; bound to the
# callable it only travels with each submitted batch of work.
classify_func = partial(classify_code, classifier=classifier)

# Only the code column is needed, so read it directly instead of building a
# Series per row with iterrows().
codes = df['code'].tolist()

# Use multiprocessing pool for parallel classification
with Pool(processes=cpu_count()) as pool:
    print(f"Processing {len(codes)} programs in parallel...")
    # pool.map preserves input order, so results line up with df's rows.
    transduction_results = pool.map(classify_func, codes)

# Add results to dataframe
df['code_is_transductive'] = transduction_results

print("\nClassification complete!")

Classifying programs for transductive behavior...
Using 32 CPU cores for parallel processing
Processing 292866 programs in parallel...
Processing 292866 programs in parallel...

Classification complete!

Classification complete!


In [10]:
# Overall counts of programs the classifier flagged as transductive.
total_programs = len(df)
transductive_programs = df['code_is_transductive'].sum()
non_transductive_programs = total_programs - transductive_programs

print("=" * 60)
print("TRANSDUCTION FILTERING STATISTICS")
print("=" * 60)
print(f"Total programs: {total_programs:,}")
print(f"Transductive programs: {transductive_programs:,} ({transductive_programs/total_programs*100:.1f}%)")
print(f"Non-transductive programs: {non_transductive_programs:,} ({non_transductive_programs/total_programs*100:.1f}%)")
print(f"Programs filtered out: {transductive_programs:,}")

print("\n" + "=" * 60)
print("STATISTICS BY MODEL")
print("=" * 60)

# Group on the model name truncated to 30 characters so the printed index
# stays narrow. The truncated names are passed to groupby as a separate Series,
# which avoids copying the whole DataFrame just to hold a display column.
model_display = df['model'].str[:30].rename('model_display')
model_stats = (
    df.groupby(model_display)['code_is_transductive']
    .agg(['count', 'sum', 'mean'])
    .round(3)
)

model_stats.columns = ['Total', 'Transductive', 'Pct_Transductive']
model_stats['Non_Transductive'] = model_stats['Total'] - model_stats['Transductive']
# The mean of the flag is a fraction; express it as a percentage.
model_stats['Pct_Transductive'] = model_stats['Pct_Transductive'] * 100

# Sort by total number of programs (descending)
model_stats = model_stats.sort_values('Total', ascending=False)

# Widen the display only while printing. option_context restores whatever
# values were active before (set_option/reset_option would instead fall back to
# the pandas defaults) and does so even if printing raises.
with pd.option_context(
    'display.max_rows', None,
    'display.width', 200,
    'display.max_colwidth', 30,
    'display.expand_frame_repr', False,
):
    print(model_stats)

TRANSDUCTION FILTERING STATISTICS
Total programs: 292,866
Transductive programs: 111,957 (38.2%)
Non-transductive programs: 180,909 (61.8%)
Programs filtered out: 111,957

STATISTICS BY MODEL
                                Total  Transductive  Pct_Transductive  Non_Transductive
model_display                                                                          
Qwen2.5-Coder-32B-Instruct      62543         21757              34.8             40786
Qwen2.5-72B-Instruct            43963         12464              28.4             31499
Mistral-Large-Instruct-2407     42863         17856              41.7             25007
Qwen2.5-Coder-14B-Instruct      34564         16617              48.1             17947
Trelis/Qwen3-4B_dsarc-programs  29880         18937              63.4             10943
Qwen2.5-Coder-7B-Instruct       18441          5851              31.7             12590
julien31/Soar-qwen-7b           14710          2876              19.6             11834
julien31/soar-qw

## Statistics for Fully Complete Programs Only

In [14]:
# Filter for fully complete programs (all train/test inputs and outputs correct)
def all_correct(row):
    """Tell whether both correctness columns of a row are entirely truthy.

    Looks at ``row['correct_train_input']`` and ``row['correct_test_input']``.
    A column passes only when its value is not None, has at least one element,
    and every element is truthy. Both columns are always read, so a missing
    key raises regardless of the other column's content.

    Returns:
        True when both columns pass, otherwise False.
    """
    def _passes(flags):
        """Return True for a non-None, non-empty sequence of truthy values."""
        has_entries = flags is not None and len(flags) != 0
        return all(flags) if has_entries else False

    columns = ('correct_train_input', 'correct_test_input')
    verdicts = [_passes(row[column]) for column in columns]
    return all(verdicts)

# Keep only the rows that all_correct() accepts.
complete_df = df[df.apply(all_correct, axis=1)]

total_complete_programs = len(complete_df)
transductive_complete_programs = complete_df['code_is_transductive'].sum()
non_transductive_complete_programs = total_complete_programs - transductive_complete_programs

print("=" * 60)
print("TRANSDUCTION FILTERING STATISTICS - COMPLETE PROGRAMS ONLY")
print("=" * 60)
print(f"Total complete programs: {total_complete_programs:,}")
if total_complete_programs > 0:
    print(f"Transductive complete programs: {transductive_complete_programs:,} ({transductive_complete_programs/total_complete_programs*100:.1f}%)")
    print(f"Non-transductive complete programs: {non_transductive_complete_programs:,} ({non_transductive_complete_programs/total_complete_programs*100:.1f}%)")
else:
    # Guard against dividing by zero when nothing qualifies.
    print("No complete programs found")
print(f"Complete programs filtered out: {transductive_complete_programs:,}")

print("\n" + "=" * 60)
print("STATISTICS BY MODEL - COMPLETE PROGRAMS ONLY")
print("=" * 60)

if total_complete_programs > 0:
    # Group on the model name truncated to 30 characters. Passing the
    # truncated names as a separate Series avoids copying complete_df just to
    # attach a display column.
    complete_model_display = complete_df['model'].str[:30].rename('model_display')
    complete_model_stats = (
        complete_df.groupby(complete_model_display)['code_is_transductive']
        .agg(['count', 'sum', 'mean'])
        .round(3)
    )

    complete_model_stats.columns = ['Total', 'Transductive', 'Pct_Transductive']
    complete_model_stats['Non_Transductive'] = complete_model_stats['Total'] - complete_model_stats['Transductive']
    # The mean of the flag is a fraction; express it as a percentage.
    complete_model_stats['Pct_Transductive'] = complete_model_stats['Pct_Transductive'] * 100

    # Sort by total number of programs (descending)
    complete_model_stats = complete_model_stats.sort_values('Total', ascending=False)

    # Widen the display only while printing. option_context restores the
    # previously active values (set_option/reset_option would fall back to the
    # pandas defaults) and does so even if printing raises.
    with pd.option_context(
        'display.max_rows', None,
        'display.width', 200,
        'display.max_colwidth', 30,
        'display.expand_frame_repr', False,
    ):
        print(complete_model_stats)
else:
    print("No complete programs found - cannot generate model statistics")

TRANSDUCTION FILTERING STATISTICS - COMPLETE PROGRAMS ONLY
Total complete programs: 54,567
Transductive complete programs: 10,476 (19.2%)
Non-transductive complete programs: 44,091 (80.8%)
Complete programs filtered out: 10,476

STATISTICS BY MODEL - COMPLETE PROGRAMS ONLY
                                Total  Transductive  Pct_Transductive  Non_Transductive
model_display                                                                          
Mistral-Large-Instruct-2407      9338           393               4.2              8945
Qwen2.5-Coder-32B-Instruct       7705           306               4.0              7399
Trelis/Qwen3-4B_dsarc-programs   7565          4939              65.3              2626
Qwen2.5-72B-Instruct             7170           266               3.7              6904
julien31/Soar-qwen-7b            5243           378               7.2              4865
julien31/soar-qwen-14b           2608           372              14.3              2236
Qwen2.5-Coder-14B-Inst

## Sample Transductive Programs

In [9]:
# Print the code of the first few programs flagged as transductive for a
# single model, so the classifier's verdicts can be inspected by eye.
target_model = "Mistral-Large-Instruct-2407"
sample_count = 10

# Rows that belong to the target model and carry a True transduction flag.
# The mask is built once and reused for both the sample and the total count.
from_target_model = df['model'] == target_model
flagged = df['code_is_transductive'] == True
selection = from_target_model & flagged
transductive_samples = df[selection].head(sample_count)

banner = "=" * 80
print(banner)
print(f"SAMPLE TRANSDUCTIVE PROGRAMS FROM {target_model}")
print(banner)
print(f"Showing {len(transductive_samples)} out of {int(selection.sum())} transductive programs from this model")
print()

sample_number = 0
for _, row in transductive_samples.iterrows():
    sample_number += 1
    print(f"--- SAMPLE {sample_number} ---")
    # Series.get falls back to 'N/A' if the task_id column is absent.
    print(f"Task ID: {row.get('task_id', 'N/A')}")
    print(f"Model: {row['model']}")
    print("Code:")
    print(row['code'])
    print()
    print("-" * 60)
    print()

SAMPLE TRANSDUCTIVE PROGRAMS FROM Mistral-Large-Instruct-2407
Showing 10 out of 17856 transductive programs from this model

--- SAMPLE 1 ---
Task ID: 007bbfb7
Model: Mistral-Large-Instruct-2407
Code:
def transform(grid):
    n = len(grid)
    new_size = n * 3
    new_grid = [[0 for _ in range(new_size)] for _ in range(new_size)]

    def place_block(block, row, col):
        for i in range(3):
            for j in range(3):
                new_grid[row + i][col + j] = block[i][j]
    for i in range(n):
        for j in range(n):
            if grid[i][j] != 0:
                color = grid[i][j]
                if color == 4:
                    block = [[4, 0, 4], [0, 0, 0], [0, 4, 0]]
                elif color == 6:
                    block = [[6, 6, 0], [6, 0, 0], [0, 6, 6]]
                elif color == 7:
                    block = [[0, 7, 7], [7, 7, 7], [0, 7, 7]]
                elif color == 2:
                    block = [[0, 0, 0], [0, 0, 2], [2, 0, 2]]
                els

## Save Filtered Dataset

In [None]:
# Updated filtering logic:
# 1. Keep all fully correct programs from high-quality models
# 2. Apply transduction filtering to all other programs

# Define high-quality models that we trust.
# Entries are compared for exact equality against the `model` column (via `in`
# and `Series.isin` below), so they must match the stored model names exactly.
high_quality_models = {
    "Mistral-Large-Instruct-2407",
    "Qwen2.5-Coder-32B-Instruct", 
    "Qwen2.5-72B-Instruct"
}

def should_keep_program(row):
    """Decide whether one dataset row survives the filter.

    A row is kept when either condition holds:
      * ``all_correct(row)`` is true and ``row['model']`` is listed in
        ``high_quality_models`` (the transduction flag is then ignored), or
      * ``row['code_is_transductive']`` is falsy.

    All three inputs are read up front, so a row lacking any of the columns
    raises instead of being silently kept or dropped.

    Returns:
        True to keep the row, False to drop it.
    """
    fully_correct = all_correct(row)
    from_trusted_model = row['model'] in high_quality_models
    non_transductive = not row['code_is_transductive']

    # Trusted and fully correct programs bypass the transduction check;
    # everything else must be non-transductive.
    return (fully_correct and from_trusted_model) or non_transductive

# Apply the filtering logic
keep_mask = df.apply(should_keep_program, axis=1)
filtered_df = df[keep_mask].copy()

# Analyze what we're keeping
total_programs = len(df)
kept_programs = len(filtered_df)
removed_programs = total_programs - kept_programs

print("=" * 80)
print("UPDATED FILTERING STATISTICS")
print("=" * 80)
print(f"Total programs: {total_programs:,}")
print(f"Programs kept: {kept_programs:,} ({kept_programs/total_programs*100:.1f}%)")
print(f"Programs removed: {removed_programs:,} ({removed_programs/total_programs*100:.1f}%)")

# Break down by filtering reason. all_correct is a row-wise Python function,
# so it is evaluated once here and the resulting mask is reused for the
# per-model figures below instead of being re-applied for every model.
fully_correct_mask = df.apply(all_correct, axis=1)
high_quality_mask = df['model'].isin(high_quality_models)
non_transductive_mask = ~df['code_is_transductive']

# Programs kept because they're fully correct from high-quality models
hq_correct_mask = fully_correct_mask & high_quality_mask
kept_hq_correct = int(hq_correct_mask.sum())
# Programs kept because they're non-transductive (but not in the above category)
kept_non_transductive = int((~hq_correct_mask & non_transductive_mask).sum())

# The two categories are disjoint and together describe the same rule as
# should_keep_program, so they must add up to the number of rows kept. Check
# this before anything is written to disk.
assert kept_hq_correct + kept_non_transductive == kept_programs, (
    "breakdown of kept programs does not match the filtered dataset size"
)

print("\nBreakdown of kept programs:")
print(f"  - Fully correct from high-quality models: {kept_hq_correct:,}")
print(f"  - Non-transductive (other): {kept_non_transductive:,}")
print(f"  - Total kept: {kept_hq_correct + kept_non_transductive:,}")

print("\nHigh-quality models analysis:")
# Iterate in sorted order: a set of strings has no stable iteration order
# between kernel sessions, which made the printed report order vary.
for model in sorted(high_quality_models):
    model_mask = df['model'] == model
    model_total = int(model_mask.sum())
    if model_total > 0:
        model_correct = int((fully_correct_mask & model_mask).sum())
        print(f"  - {model}: {model_correct:,}/{model_total:,} fully correct ({model_correct/model_total*100:.1f}%)")

# Save the filtered dataset
output_file = "/tmp/superking_analysis_quality_filtered.parquet"
filtered_df.to_parquet(output_file, index=False)

print(f"\nFiltered dataset saved to: {output_file}")
print(f"Reduction: {removed_programs:,} programs ({removed_programs/total_programs*100:.1f}%)")

In [None]:
# Compare old vs new filtering approaches
print("=" * 80)
print("FILTERING APPROACH COMPARISON")
print("=" * 80)

# Old approach: simple transduction filtering
old_filtered_df = df[~df['code_is_transductive']]
old_kept = len(old_filtered_df)

# New approach: keep high-quality fully correct + non-transductive others
new_kept = len(filtered_df)

print(f"Original dataset: {total_programs:,} programs")
print(f"Old filtering (transduction only): {old_kept:,} kept ({old_kept/total_programs*100:.1f}%)")
print(f"New filtering (quality + transduction): {new_kept:,} kept ({new_kept/total_programs*100:.1f}%)")
print(f"Difference: {new_kept - old_kept:,} more programs kept with new approach")

# Analyze what the new approach rescued: rows the old approach would have
# dropped (flagged transductive) that are fully correct and come from a
# high-quality model. fully_correct_mask and high_quality_mask were computed in
# the filtering cell above; reusing them avoids re-running the row-wise
# all_correct pass once for the total and once more per model.
rescued_mask = (df['code_is_transductive'] == True) & fully_correct_mask & high_quality_mask
rescued_programs = int(rescued_mask.sum())

# Rescued rows are exactly the ones the new filter keeps and the old one drops.
assert rescued_programs == new_kept - old_kept, (
    "rescued count does not match the difference between the two filters"
)

print("\nPrograms rescued by new approach:")
print(f"  - Transductive but fully correct from high-quality models: {rescued_programs:,}")

# Show rescued programs by model
print("\nRescued programs by model:")
rescued_by_model = df.loc[rescued_mask, 'model'].value_counts()
# Sorted iteration keeps the report order stable; a set of strings has no
# fixed iteration order between kernel sessions.
for model in sorted(high_quality_models):
    model_rescued = int(rescued_by_model.get(model, 0))
    if model_rescued > 0:
        print(f"  - {model}: {model_rescued:,} programs")

# Quality check: how many of the rescued programs are actually fully correct?
if rescued_programs > 0:
    print("\nQuality verification of rescued programs:")
    print(f"  - All {rescued_programs:,} rescued programs are fully correct by definition")
    print("  - These represent high-quality solutions that happened to be transductive")