In [None]:
# Import required modules
import os
import sys

import pandas as pd

# Add the src directory to the path to import novaeval modules
sys.path.append("../../../")

from novaeval.datasets.swe_agent_trajectories_dataset import (
    create_dataset,
    swe_agent_trajectories_preprocessing,
)

In [None]:
# Resolve the parquet file to test against. Set the SWE_PARQUET_PATH environment
# variable to point the notebook at a different file.
parquet_file_path = os.environ.get("SWE_PARQUET_PATH", "/mnt/drive2/train-00000-of-00012.parquet")

# Directory that should contain the parquet file. It is derived from the configured
# path (rather than hard-coded) so the diagnostics below inspect the right place when
# SWE_PARQUET_PATH is overridden; a bare filename falls back to the working directory.
dir_path = os.path.dirname(parquet_file_path) or "."

# Check if the file exists
parquet_file_found = os.path.exists(parquet_file_path)
print(f"Checking if file exists: {parquet_file_path}")
print(f"File exists: {parquet_file_found}")

# If the file is missing, list the parquet files next to it to help spot a typo
# or a differently named shard.
if not parquet_file_found:
    print("\nFile not found. Checking directory contents:")
    try:
        if os.path.isdir(dir_path):
            # Sorted so the preview is stable; os.listdir returns entries in arbitrary order.
            parquet_files = sorted(f for f in os.listdir(dir_path) if f.endswith(".parquet"))
            print(f"Parquet files in {dir_path}: {parquet_files[:10]}")  # Show first 10
        else:
            print(f"Directory {dir_path} does not exist")
    except OSError as e:
        # e.g. permission denied while listing the directory
        print(f"Error checking directory: {e}")

In [None]:
# Test 1: Sanity-check the raw parquet file with plain pandas before involving novaeval
try:
    print("Attempting to read parquet file directly with pandas...")
    df = pd.read_parquet(parquet_file_path)
    print("Successfully read parquet file!")
    print(f"Shape: {df.shape}", f"Columns: {list(df.columns)}", sep="\n")
    print("\nFirst few rows:")
    print(df.head())

    # Report which of the required columns (if any) are absent from the file
    required_cols = [
        "instance_id", "model_name", "target", "trajectory",
        "exit_status", "generated_patch", "eval_logs",
    ]
    missing = [col for col in required_cols if col not in df.columns]
    print(
        f"\nMissing required columns: {missing}"
        if missing
        else "\nAll required columns present!"
    )

    # Show the first trajectory value and its Python type
    if "trajectory" in df.columns:
        print("\nTrajectory column sample:")
        first_trajectory = df["trajectory"].iloc[0]
        print(first_trajectory)
        print(f"Type: {type(first_trajectory)}")

except Exception as exc:
    # Any failure above (read error or inspection error) is reported, not raised
    print(f"Error reading parquet file: {exc}")

In [None]:
# Test 2: Run swe_agent_trajectories_preprocessing on the single parquet file
try:
    print("Testing swe_agent_trajectories_preprocessing function...")
    output_csv = "/mnt/drive2/test_output.csv"

    # Run preprocessing; the CSV at output_csv is read back below
    swe_agent_trajectories_preprocessing(
        output_csv=output_csv,
        parquet_files=[parquet_file_path],
    )
    print(f"Preprocessing completed! Output saved to: {output_csv}")

    # Load the generated CSV and summarise it
    output_df = pd.read_csv(output_csv)
    print(
        f"\nOutput shape: {output_df.shape}",
        f"Output columns: {list(output_df.columns)}",
        sep="\n",
    )
    print("\nFirst few rows of output:")
    print(output_df.head())

except Exception as exc:
    # Report the failure with a full traceback instead of aborting the notebook run
    print(f"Error in preprocessing: {exc}")
    import traceback

    traceback.print_exc()

In [None]:
# Test 3: Build a dataset object from the CSV written in Test 2
try:
    print("Testing create_dataset function...")

    dataset = create_dataset(output_csv)

    print("Dataset created successfully!", f"Dataset type: {type(dataset)}", sep="\n")

    # Optional attributes are only reported when the object exposes them
    if hasattr(dataset, "__len__"):
        print(f"Dataset length: {len(dataset)}")

    if hasattr(dataset, "data"):
        data_attr = dataset.data
        if isinstance(data_attr, dict):
            keys_summary = list(data_attr.keys())
        else:
            keys_summary = "Not a dict"
        print(f"Dataset data keys: {keys_summary}")

except Exception as exc:
    # Report the failure with a full traceback instead of aborting the notebook run
    print(f"Error in create_dataset: {exc}")
    import traceback

    traceback.print_exc()

In [None]:
# Inspect the dataset object and the first datapoint it yields.
print(type(dataset))
print(dir(dataset))

gen = dataset.get_datapoint()
# A default of None avoids an unhandled StopIteration when nothing is yielded.
obj = next(gen, None)
if obj is None:
    print("Dataset yielded no datapoints")
else:
    # Print `obj` itself rather than calling next(gen) again, so every line below
    # describes the same datapoint and the generator is not advanced past it.
    print(obj)
    print(obj.model_dump())
    print(type(obj))

In [None]:

# Add the src directory to the path to import novaeval modules
sys.path.append("../../../")

from novaeval.datasets.swe_agent_trajectories_dataset import (
    swe_agent_trajectories_preprocessing,
)

# Stream the preprocessed CSV in chunks and count how many items were produced.
CHUNK_SIZE = 500  # rows requested per chunk

total_processed = 0
chunk_stream = dataset.stream_from_csv(
    file_path=output_csv,
    chunk_size=CHUNK_SIZE,
    # NOTE(review): these keyword values look like CSV column names mapped onto
    # datapoint fields — confirm against stream_from_csv's signature.
    turn_id="instance_id",
    agent_name="model_name",
    agent_task="target",
    tool_call_results="generated_patch",
    metadata="eval_logs",
)
for chunk_index, chunk in enumerate(chunk_stream):
    # chunk is list[AgentData] with max CHUNK_SIZE items
    chunk_len = len(chunk)
    total_processed += chunk_len

    # Print a sample from the first chunk only. The chunk index identifies it
    # directly (no dependence on the chunk size), and the emptiness check keeps
    # chunk[0] from raising IndexError on an empty chunk.
    if chunk_index == 0 and chunk:
        first_item = chunk[0]
        print("\nSample data from first chunk:")
        print(f"Agent Name: {first_item.agent_name}")
        print(f"Task: {first_item.agent_task}")
        print(f"Turn ID: {first_item.turn_id}")

    print(f"Processed chunk of {chunk_len} items. Total processed: {total_processed}")

In [None]:
# Test 4: Test with directory approach (if we have multiple files)
try:
    print("Testing with directory approach...")

    # Directory holding the parquet file. os.path.dirname returns "" for a bare
    # filename, which would be reported as a missing directory, so fall back to
    # the current working directory in that case.
    parquet_dir = os.path.dirname(parquet_file_path) or "."
    print(f"Parquet directory: {parquet_dir}")

    # Check that the directory exists and count the parquet files in it
    if os.path.isdir(parquet_dir):
        # Sorted for a stable order; os.listdir returns entries in arbitrary order.
        parquet_files = sorted(f for f in os.listdir(parquet_dir) if f.endswith(".parquet"))
        print(f"Found {len(parquet_files)} parquet files in directory")

        if len(parquet_files) > 1:
            print("Testing with multiple files using directory approach...")
            output_csv_multi = "test_output_multi.csv"

            # Pass the directory (not a file list) to the preprocessing function
            swe_agent_trajectories_preprocessing(
                parquet_dir=parquet_dir, output_csv=output_csv_multi
            )

            print(
                f"Multi-file preprocessing completed! Output saved to: {output_csv_multi}"
            )

            # Read and display the output
            output_df_multi = pd.read_csv(output_csv_multi)
            print(f"\nMulti-file output shape: {output_df_multi.shape}")
            print(f"Multi-file output columns: {list(output_df_multi.columns)}")
        else:
            print("Only one parquet file found, skipping multi-file test")
    else:
        print(f"Directory {parquet_dir} does not exist")

except Exception as e:
    # Report the failure with a full traceback instead of aborting the notebook run
    print(f"Error in directory approach test: {e}")
    import traceback

    traceback.print_exc()

In [None]:
# Cleanup: Remove temporary files

# Use the paths the earlier cells actually wrote to. The single-file test stores
# its CSV at the absolute path held in `output_csv`, which is not the relative
# "test_output.csv", so a hard-coded list would leave that file behind. Fall back
# to the default names when a cell was skipped and its variable was never defined.
files_to_cleanup = [
    globals().get("output_csv", "test_output.csv"),
    globals().get("output_csv_multi", "test_output_multi.csv"),
]
for file in files_to_cleanup:
    # isfile (not exists) so a directory with the same name is never passed to os.remove
    if os.path.isfile(file):
        os.remove(file)
        print(f"Cleaned up: {file}")

print("\nTesting completed!")

In [None]:
# Test preprocessing with a small sample first
sample_size = 100  # adjust this number as needed
df_sample = df.head(sample_size).copy()

# Analyze trajectory lengths BEFORE preprocessing.
# Values without a length (no __len__) are counted as 0.
trajectory_lengths = [
    len(traj) if hasattr(traj, "__len__") else 0 for traj in df_sample["trajectory"]
]

print(f"Total trajectory elements: {sum(trajectory_lengths)}")

sample_parquet = "sample.parquet"
output_csv_sample = "sample_output.csv"
try:
    # Save sample to parquet
    df_sample.to_parquet(sample_parquet)

    # Process the sample
    swe_agent_trajectories_preprocessing(
        parquet_files=[sample_parquet], output_csv=output_csv_sample
    )

    # Load the result into memory before the temporary files are removed
    sample_output_df = pd.read_csv(output_csv_sample)
finally:
    # Clean up sample files even when a step above fails, so a failed run does
    # not leave stray files in the working directory.
    for temp_path in (sample_parquet, output_csv_sample):
        if os.path.exists(temp_path):
            os.remove(temp_path)

# Check sizes AFTER preprocessing
print(f"Actual output rows: {len(sample_output_df)}")

print("\nSample processing results:")
print(f"Input rows: {len(df_sample)}")
print(f"Output rows: {len(sample_output_df)}")
if len(df_sample) > 0:
    expansion_factor = len(sample_output_df) / len(df_sample)
    print(f"Expansion factor: {expansion_factor:.2f}x")
else:
    # An empty input sample would otherwise raise ZeroDivisionError
    print("Expansion factor: n/a (input sample is empty)")

# Check memory usage of output (bytes converted to MB)
print("\nOutput DataFrame memory usage per column:")
print(sample_output_df.memory_usage(deep=True) / (1024 * 1024), "MB")