In [None]:
# Import required modules
import os
import sys

import pandas as pd

# Add the src directory to the path to import novaeval modules
sys.path.append("../../../")

from novaeval.datasets.swe_agent_trajectories_dataset import (
    create_dataset,
    swe_agent_trajectories_preprocessing,
)

In [2]:
# Define the parquet file path
parquet_file_path = "/mnt/drive2/train-00000-of-00012.parquet"

# Check if the file exists
print(f"Checking if file exists: {parquet_file_path}")
print(f"File exists: {os.path.exists(parquet_file_path)}")

# If file doesn't exist, let's check what's in the directory
if not os.path.exists(parquet_file_path):
    print("\nFile not found. Checking directory contents:")
    try:
        dir_path = "/mnt/drive2/"
        if os.path.exists(dir_path):
            files = os.listdir(dir_path)
            parquet_files = [f for f in files if f.endswith(".parquet")]
            print(f"Parquet files in {dir_path}: {parquet_files[:10]}")  # Show first 10
        else:
            print(f"Directory {dir_path} does not exist")
    except Exception as e:
        print(f"Error checking directory: {e}")

Checking if file exists: /mnt/drive2/train-00000-of-00012.parquet
File exists: True


In [3]:
# Test 1: Try to read the parquet file directly with pandas first
try:
    print("Attempting to read parquet file directly with pandas...")
    df = pd.read_parquet(parquet_file_path)
    print("Successfully read parquet file!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst few rows:")
    print(df.head())

    # Check if required columns exist
    required_cols = [
        "instance_id",
        "model_name",
        "target",
        "trajectory",
        "exit_status",
        "generated_patch",
        "eval_logs",
    ]
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        print(f"\nMissing required columns: {missing}")
    else:
        print("\nAll required columns present!")

    # Check trajectory column structure
    if "trajectory" in df.columns:
        print("\nTrajectory column sample:")
        print(df["trajectory"].iloc[0])
        print(f"Type: {type(df['trajectory'].iloc[0])}")

except Exception as e:
    print(f"Error reading parquet file: {e}")

Attempting to read parquet file directly with pandas...
Successfully read parquet file!
Shape: (6670, 7)
Columns: ['instance_id', 'model_name', 'target', 'trajectory', 'exit_status', 'generated_patch', 'eval_logs']

First few rows:
            instance_id           model_name  target  \
0  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
1  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
2  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
3  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
4  AnalogJ__lexicon-336  swe-agent-llama-70b   False   

                                          trajectory  \
0  [{'cutoff_date': '01.01.2023', 'mask': False, ...   
1  [{'cutoff_date': '01.01.2023', 'mask': False, ...   
2  [{'cutoff_date': '01.01.2023', 'mask': False, ...   
3  [{'cutoff_date': '01.01.2023', 'mask': False, ...   
4  [{'cutoff_date': '01.01.2023', 'mask': False, ...   

                exit_status  \
0  submitted (exit_context)   
1  submitted (exit_context)   
2

In [4]:
# Test 2: Test the preprocessing function
try:
    print("Testing swe_agent_trajectories_preprocessing function...")
    output_csv = "/mnt/drive2/test_output.csv"

    # Call the preprocessing function
    swe_agent_trajectories_preprocessing(
        parquet_files=[parquet_file_path], output_csv=output_csv
    )

    print(f"Preprocessing completed! Output saved to: {output_csv}")

    # Read and display the output
    output_df = pd.read_csv(output_csv)
    print(f"\nOutput shape: {output_df.shape}")
    print(f"Output columns: {list(output_df.columns)}")
    print("\nFirst few rows of output:")
    print(output_df.head())

except Exception as e:
    print(f"Error in preprocessing: {e}")
    import traceback

    traceback.print_exc()

Testing swe_agent_trajectories_preprocessing function...
Processing /mnt/drive2/train-00000-of-00012.parquet
Preprocessing completed! Output saved to: /mnt/drive2/test_output.csv

Output shape: (356301, 12)
Output columns: ['instance_id', 'model_name', 'target', 'exit_status', 'generated_patch', 'eval_logs', 'cutoff_date', 'mask', 'role', 'system_prompt', 'text', 'agent_exit']

First few rows of output:
            instance_id           model_name  target  \
0  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
1  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
2  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
3  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
4  AnalogJ__lexicon-336  swe-agent-llama-70b   False   

                exit_status  \
0  submitted (exit_context)   
1  submitted (exit_context)   
2  submitted (exit_context)   
3  submitted (exit_context)   
4  submitted (exit_context)   

                                     generated_patch  \
0  \ndiff --g

In [5]:
# Test 3: Test the create_dataset function
try:
    print("Testing create_dataset function...")

    # Create dataset from the preprocessed CSV
    dataset = create_dataset(output_csv)

    print("Dataset created successfully!")
    print(f"Dataset type: {type(dataset)}")

    # Try to access some dataset properties/methods
    if hasattr(dataset, "__len__"):
        print(f"Dataset length: {len(dataset)}")

    if hasattr(dataset, "data"):
        print(
            f"Dataset data keys: {list(dataset.data.keys()) if isinstance(dataset.data, dict) else 'Not a dict'}"
        )

except Exception as e:
    print(f"Error in create_dataset: {e}")
    import traceback

    traceback.print_exc()

Testing create_dataset function...
Dataset created successfully!
Dataset type: <class 'novaeval.agents.agent_dataset.AgentDataset'>
Dataset data keys: Not a dict


In [6]:
print("Hello World")
print(type(dataset))
print(dir(dataset))
gen = dataset.get_datapoint()
obj = next(gen)
print(next(gen))
print(obj.model_dump())
print(type(obj))

Hello World
<class 'novaeval.agents.agent_dataset.AgentDataset'>
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_dict_fields', '_list_fields', '_parse_field', 'data', 'export_to_csv', 'export_to_json', 'get_data', 'get_datapoint', 'ingest_from_csv', 'ingest_from_json', 'stream_from_csv', 'stream_from_json']
user_id=None task_id=None turn_id='AnalogJ__lexicon-336' ground_truth=None expected_tool_call=None agent_name='swe-agent-llama-70b' agent_role='user' agent_task='False' system_prompt='nan' agent_response='We\'re currently solving the following issue within our repository. Here\'s the issue text:\nISSUE:\nMemset provider: TypeError: string indices must be integers\nHi,\r\n\r\nWhen using the

In [None]:
import os
import sys

import pandas as pd

# Add the src directory to the path to import novaeval modules
sys.path.append("../../../")

from novaeval.datasets.swe_agent_trajectories_dataset import (
    swe_agent_trajectories_preprocessing,
)

total_processed = 0
for chunk in dataset.stream_from_csv(
    file_path=output_csv,
    chunk_size=500,  # Process 500 rows at a time
    turn_id="instance_id",
    agent_name="model_name",
    agent_task="target",
    tool_call_results="generated_patch",
    metadata="eval_logs",
):
    # chunk is list[AgentData] with max 500 items
    chunk_size = len(chunk)
    total_processed += chunk_size

    # Example processing: Print first item in chunk
    if total_processed <= 500:  # Only print from first chunk
        print("\nSample data from first chunk:")
        print(f"Agent Name: {chunk[0].agent_name}")
        print(f"Task: {chunk[0].agent_task}")
        print(f"Turn ID: {chunk[0].turn_id}")

    print(f"Processed chunk of {chunk_size} items. Total processed: {total_processed}")


Sample data from first chunk:
Agent Name: swe-agent-llama-70b
Task: False
Turn ID: AnalogJ__lexicon-336
Processed chunk of 500 items. Total processed: 500
Processed chunk of 500 items. Total processed: 1000
Processed chunk of 500 items. Total processed: 1500
Processed chunk of 500 items. Total processed: 2000
Processed chunk of 500 items. Total processed: 2500
Processed chunk of 500 items. Total processed: 3000
Processed chunk of 500 items. Total processed: 3500
Processed chunk of 500 items. Total processed: 4000
Processed chunk of 500 items. Total processed: 4500
Processed chunk of 500 items. Total processed: 5000
Processed chunk of 500 items. Total processed: 5500
Processed chunk of 500 items. Total processed: 6000
Processed chunk of 500 items. Total processed: 6500
Processed chunk of 500 items. Total processed: 7000
Processed chunk of 500 items. Total processed: 7500
Processed chunk of 500 items. Total processed: 8000
Processed chunk of 500 items. Total processed: 8500
Processed ch

In [8]:
# Test 4: Test with directory approach (if we have multiple files)
try:
    print("Testing with directory approach...")

    # Get the directory of the parquet file
    parquet_dir = os.path.dirname(parquet_file_path)
    print(f"Parquet directory: {parquet_dir}")

    # Check if directory exists and contains parquet files
    if os.path.exists(parquet_dir):
        files = os.listdir(parquet_dir)
        parquet_files = [f for f in files if f.endswith(".parquet")]
        print(f"Found {len(parquet_files)} parquet files in directory")

        if len(parquet_files) > 1:
            print("Testing with multiple files using directory approach...")
            output_csv_multi = "test_output_multi.csv"

            swe_agent_trajectories_preprocessing(
                parquet_dir=parquet_dir, output_csv=output_csv_multi
            )

            print(
                f"Multi-file preprocessing completed! Output saved to: {output_csv_multi}"
            )

            # Read and display the output
            output_df_multi = pd.read_csv(output_csv_multi)
            print(f"\nMulti-file output shape: {output_df_multi.shape}")
            print(f"Multi-file output columns: {list(output_df_multi.columns)}")
        else:
            print("Only one parquet file found, skipping multi-file test")
    else:
        print(f"Directory {parquet_dir} does not exist")

except Exception as e:
    print(f"Error in directory approach test: {e}")
    import traceback

    traceback.print_exc()

Testing with directory approach...
Parquet directory: /mnt/drive2
Found 1 parquet files in directory
Only one parquet file found, skipping multi-file test


In [9]:
# Cleanup: Remove temporary files
import os

files_to_cleanup = ["test_output.csv", "test_output_multi.csv"]
for file in files_to_cleanup:
    if os.path.exists(file):
        os.remove(file)
        print(f"Cleaned up: {file}")

print("\nTesting completed!")


Testing completed!


In [None]:
# Test preprocessing with a small sample first
sample_size = 100  # adjust this number as needed
df_sample = df.head(sample_size).copy()

# Analyze trajectory lengths BEFORE preprocessing
trajectory_lengths = []
for _idx, row in df_sample.iterrows():
    traj = row["trajectory"]
    traj_len = len(traj) if hasattr(traj, "__len__") else 0
    trajectory_lengths.append(traj_len)

print(f"Total trajectory elements: {sum(trajectory_lengths)}")

# Save sample to parquet
sample_parquet = "sample.parquet"
df_sample.to_parquet(sample_parquet)

# Process the sample
output_csv_sample = "sample_output.csv"
swe_agent_trajectories_preprocessing(
    parquet_files=[sample_parquet], output_csv=output_csv_sample
)

# Check sizes AFTER preprocessing
sample_output_df = pd.read_csv(output_csv_sample)
print(f"Actual output rows: {len(sample_output_df)}")

print("\nSample processing results:")
print(f"Input rows: {len(df_sample)}")
print(f"Output rows: {len(sample_output_df)}")
expansion_factor = len(sample_output_df) / len(df_sample)
print(f"Expansion factor: {expansion_factor:.2f}x")

# Check memory usage of output
print("\nOutput DataFrame memory usage per column:")
print(sample_output_df.memory_usage(deep=True) / (1024 * 1024), "MB")

# Clean up sample files
os.remove(sample_parquet)
os.remove(output_csv_sample)

Total trajectory elements: 4898
Processing sample.parquet
Actual output rows: 4898

Sample processing results:
Input rows: 100
Output rows: 4898
Expansion factor: 48.98x

Output DataFrame memory usage per column:
Index               0.000122
instance_id         0.389066
model_name          0.354831
target              0.004671
exit_status         0.342138
generated_patch     7.765799
eval_logs          21.691992
cutoff_date         0.152813
mask                0.004671
role                0.280552
system_prompt       0.616585
text                5.412631
dtype: float64 MB
