In [1]:
# Import required modules
import sys
import os
import pandas as pd
import json

# Add the src directory to the path to import novaeval modules
sys.path.append('../../../')

from novaeval.agents.swe_agent_trajectories import swe_agent_trajectories_preprocessing, create_dataset


In [2]:
# Location of the parquet shard exercised throughout this notebook.
parquet_file_path = "/mnt/drive2/train-00000-of-00012.parquet"

# Report whether the shard is present on disk.
print("Checking if file exists: " + parquet_file_path)
parquet_found = os.path.exists(parquet_file_path)
print(f"File exists: {parquet_found}")

# When the shard is missing, list the parquet files in its directory as a hint.
if not parquet_found:
    print("\nFile not found. Checking directory contents:")
    try:
        dir_path = "/mnt/drive2/"
        if not os.path.exists(dir_path):
            print(f"Directory {dir_path} does not exist")
        else:
            files = os.listdir(dir_path)
            parquet_files = list(filter(lambda name: name.endswith('.parquet'), files))
            print(f"Parquet files in {dir_path}: {parquet_files[:10]}")  # at most ten names
    except Exception as e:
        print(f"Error checking directory: {e}")


Checking if file exists: /mnt/drive2/train-00000-of-00012.parquet
File exists: True


In [3]:
# Test 1: load the parquet shard with plain pandas before involving novaeval
try:
    print("Attempting to read parquet file directly with pandas...")
    df = pd.read_parquet(parquet_file_path)
    print("Successfully read parquet file!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst few rows:")
    print(df.head())

    # Report every column from the expected set that the frame lacks.
    required_cols = ['instance_id', 'model_name', 'target', 'trajectory', 'exit_status', 'generated_patch', 'eval_logs']
    missing = [col for col in required_cols if col not in df.columns]
    print(f"\nMissing required columns: {missing}" if missing else "\nAll required columns present!")

    # Show the first trajectory value together with its Python type.
    if 'trajectory' in df.columns:
        print("\nTrajectory column sample:")
        first_trajectory = df['trajectory'].iloc[0]
        print(first_trajectory)
        print(f"Type: {type(first_trajectory)}")

except Exception as e:
    # Any failure is only reported; `df` stays undefined in that case.
    print(f"Error reading parquet file: {e}")


Attempting to read parquet file directly with pandas...
Successfully read parquet file!
Shape: (6670, 7)
Columns: ['instance_id', 'model_name', 'target', 'trajectory', 'exit_status', 'generated_patch', 'eval_logs']

First few rows:
            instance_id           model_name  target  \
0  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
1  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
2  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
3  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
4  AnalogJ__lexicon-336  swe-agent-llama-70b   False   

                                          trajectory  \
0  [{'cutoff_date': '01.01.2023', 'mask': False, ...   
1  [{'cutoff_date': '01.01.2023', 'mask': False, ...   
2  [{'cutoff_date': '01.01.2023', 'mask': False, ...   
3  [{'cutoff_date': '01.01.2023', 'mask': False, ...   
4  [{'cutoff_date': '01.01.2023', 'mask': False, ...   

                exit_status  \
0  submitted (exit_context)   
1  submitted (exit_context)   
2

In [4]:
# Test 2: run the preprocessing step on the single shard and inspect its CSV
try:
    print("Testing swe_agent_trajectories_preprocessing function...")
    output_csv = "/mnt/drive2/test_output.csv"

    # Convert the shard; the result is read back from output_csv below.
    swe_agent_trajectories_preprocessing(parquet_files=[parquet_file_path], output_csv=output_csv)

    print(f"Preprocessing completed! Output saved to: {output_csv}")

    # Load the CSV back and summarise its shape, columns and first rows.
    output_df = pd.read_csv(output_csv)
    for summary_line in (
        f"\nOutput shape: {output_df.shape}",
        f"Output columns: {list(output_df.columns)}",
        "\nFirst few rows of output:",
    ):
        print(summary_line)
    print(output_df.head())

except Exception as e:
    print(f"Error in preprocessing: {e}")
    import traceback
    traceback.print_exc()


Testing swe_agent_trajectories_preprocessing function...
Processing /mnt/drive2/train-00000-of-00012.parquet
Preprocessing completed! Output saved to: /mnt/drive2/test_output.csv

Output shape: (356301, 11)
Output columns: ['instance_id', 'model_name', 'target', 'exit_status', 'generated_patch', 'eval_logs', 'cutoff_date', 'mask', 'role', 'system_prompt', 'text']

First few rows of output:
            instance_id           model_name  target  \
0  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
1  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
2  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
3  AnalogJ__lexicon-336  swe-agent-llama-70b   False   
4  AnalogJ__lexicon-336  swe-agent-llama-70b   False   

                exit_status  \
0  submitted (exit_context)   
1  submitted (exit_context)   
2  submitted (exit_context)   
3  submitted (exit_context)   
4  submitted (exit_context)   

                                     generated_patch  \
0  \ndiff --git a/lexicon/p

In [9]:
# Test 3: build a dataset object from the preprocessed CSV and probe it
try:
    print("Testing create_dataset function...")

    dataset = create_dataset(output_csv)

    print("Dataset created successfully!")
    print(f"Dataset type: {type(dataset)}")

    # Length is reported only when the object supports len().
    if hasattr(dataset, '__len__'):
        print(f"Dataset length: {len(dataset)}")

    # Keys are listed only when `.data` is a dict; otherwise a marker string is shown.
    if hasattr(dataset, 'data'):
        if isinstance(dataset.data, dict):
            data_keys = list(dataset.data.keys())
        else:
            data_keys = 'Not a dict'
        print(f"Dataset data keys: {data_keys}")

except Exception as e:
    print(f"Error in create_dataset: {e}")
    import traceback
    traceback.print_exc()


Testing create_dataset function...
Dataset created successfully!
Dataset type: <class 'novaeval.agents.agent_dataset.AgentDataset'>
Dataset data keys: Not a dict


In [10]:
# Scratch inspection of the object returned by create_dataset.
print("Hello World")
for dataset_view in (type(dataset), dir(dataset)):
    print(dataset_view)

# Pull two items from the iterator: the first is kept in `obj`, the second is printed.
gen = dataset.get_datapoint()
obj = next(gen)
second_item = next(gen)
print(second_item)

# Dump the first item and show its type.
print(obj.model_dump())
print(type(obj))

Hello World
<class 'novaeval.agents.agent_dataset.AgentDataset'>
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_dict_fields', '_list_fields', '_parse_field', 'data', 'export_to_csv', 'export_to_json', 'get_data', 'get_datapoint', 'ingest_from_csv', 'ingest_from_json', 'stream_from_csv', 'stream_from_json']
user_id=None task_id=None turn_id='AnalogJ__lexicon-336' ground_truth=None expected_tool_call=None agent_name='swe-agent-llama-70b' agent_role='user' agent_task='False' system_prompt='nan' agent_response='We\'re currently solving the following issue within our repository. Here\'s the issue text:\nISSUE:\nMemset provider: TypeError: string indices must be integers\nHi,\r\n\r\nWhen using the

In [11]:
import sys
import os
import pandas as pd
import json

# Add the src directory to the path to import novaeval modules
sys.path.append('../../../')

from novaeval.agents.swe_agent_trajectories import swe_agent_trajectories_preprocessing, create_dataset
# Stream the preprocessed CSV in chunks and keep a running count of the
# records seen. CSV columns are mapped onto record fields via keyword arguments.
total_processed = 0
for chunk_num, chunk in enumerate(
    dataset.stream_from_csv(
        file_path=output_csv,
        chunk_size=500,  # Process 500 rows at a time
        turn_id='instance_id',
        agent_name='model_name',
        agent_task='target',
        tool_call_results='generated_patch',
        metadata='eval_logs'
    ),
    start=1,
):
    # chunk is a list of records with at most 500 items
    chunk_size = len(chunk)
    total_processed += chunk_size

    # Show a sample from the first chunk only. The check is keyed on the chunk
    # number (not on the running total) so it stays correct if the chunk size
    # above is changed, and it skips an empty chunk instead of indexing into it.
    if chunk_num == 1 and chunk:
        print(f"\nSample data from first chunk:")
        print(f"Agent Name: {chunk[0].agent_name}")
        print(f"Task: {chunk[0].agent_task}")
        print(f"Turn ID: {chunk[0].turn_id}")

    print(f"Processed chunk of {chunk_size} items. Total processed: {total_processed}")




Sample data from first chunk:
Agent Name: swe-agent-llama-70b
Task: False
Turn ID: AnalogJ__lexicon-336
Processed chunk of 500 items. Total processed: 500
Processed chunk of 500 items. Total processed: 1000
Processed chunk of 500 items. Total processed: 1500
Processed chunk of 500 items. Total processed: 2000
Processed chunk of 500 items. Total processed: 2500
Processed chunk of 500 items. Total processed: 3000
Processed chunk of 500 items. Total processed: 3500
Processed chunk of 500 items. Total processed: 4000
Processed chunk of 500 items. Total processed: 4500
Processed chunk of 500 items. Total processed: 5000
Processed chunk of 500 items. Total processed: 5500
Processed chunk of 500 items. Total processed: 6000
Processed chunk of 500 items. Total processed: 6500
Processed chunk of 500 items. Total processed: 7000
Processed chunk of 500 items. Total processed: 7500
Processed chunk of 500 items. Total processed: 8000
Processed chunk of 500 items. Total processed: 8500
Processed ch

KeyboardInterrupt: 

In [None]:
# Test 4: run the preprocessing over a whole directory of parquet shards
try:
    print("Testing with directory approach...")

    # The directory is derived from the single-shard path used earlier.
    parquet_dir = os.path.dirname(parquet_file_path)
    print(f"Parquet directory: {parquet_dir}")

    if not os.path.exists(parquet_dir):
        print(f"Directory {parquet_dir} does not exist")
    else:
        files = os.listdir(parquet_dir)
        parquet_files = list(filter(lambda name: name.endswith('.parquet'), files))
        print(f"Found {len(parquet_files)} parquet files in directory")

        # The multi-file run only happens when more than one shard is present.
        if len(parquet_files) > 1:
            print("Testing with multiple files using directory approach...")
            output_csv_multi = "test_output_multi.csv"

            swe_agent_trajectories_preprocessing(parquet_dir=parquet_dir, output_csv=output_csv_multi)

            print(f"Multi-file preprocessing completed! Output saved to: {output_csv_multi}")

            # Load the combined CSV back and summarise it.
            output_df_multi = pd.read_csv(output_csv_multi)
            print(f"\nMulti-file output shape: {output_df_multi.shape}")
            print(f"Multi-file output columns: {list(output_df_multi.columns)}")
        else:
            print("Only one parquet file found, skipping multi-file test")

except Exception as e:
    print(f"Error in directory approach test: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# Placeholder cell: the previous content (`..`) was not valid Python and stopped a full run here.

In [None]:
# Cleanup: delete the scratch CSVs if they exist in the working directory
import os

# NOTE(review): these are relative names, while Test 2 writes its CSV to
# "/mnt/drive2/test_output.csv"; that file is therefore not removed here.
# Later cells still read `output_csv`, so confirm the intended cleanup point
# before changing this list.
files_to_cleanup = ["test_output.csv", "test_output_multi.csv"]
for file in filter(os.path.exists, files_to_cleanup):
    os.remove(file)
    print(f"Cleaned up: {file}")

print("\nTesting completed!")


In [None]:
# Example 1: Basic usage comparison of create_dataset and stream_dataset

print("1. Using create_dataset (loads everything at once):")
try:
    # Create full dataset
    dataset = create_dataset(output_csv)
    print(f"Total records loaded: {len(dataset.data)}")

    if dataset.data:
        first_record = dataset.data[0]
        print("\nSample from create_dataset:")
        print(f"Agent Name: {first_record.agent_name}")
        print(f"Task: {first_record.agent_task}")
        print(f"Tool Call Results type: {type(first_record.tool_call_results)}")
        print(f"Tool Call Results: {first_record.tool_call_results}")
        print(f"Metadata: {first_record.metadata}")
except Exception as e:
    print(f"Error in create_dataset: {e}")

print("\n2. Using stream_dataset (processes in chunks):")
try:
    # stream_dataset is not imported anywhere in the visible notebook, so the
    # loop below could only ever report a NameError.
    # NOTE(review): assumed to be exported by the same module as create_dataset — confirm.
    from novaeval.agents.swe_agent_trajectories import stream_dataset

    total_processed = 0
    first_chunk_sample = None

    # Process in chunks of 500
    for chunk_num, chunk in enumerate(stream_dataset(output_csv, chunk_size=500), 1):
        chunk_size = len(chunk)
        total_processed += chunk_size

        # Save and show a sample from the first non-empty chunk
        if first_chunk_sample is None and chunk:
            first_chunk_sample = chunk[0]
            print("\nSample from first chunk:")
            print(f"Agent Name: {first_chunk_sample.agent_name}")
            print(f"Task: {first_chunk_sample.agent_task}")
            print(f"Tool Call Results type: {type(first_chunk_sample.tool_call_results)}")
            print(f"Tool Call Results: {first_chunk_sample.tool_call_results}")
            print(f"Metadata: {first_chunk_sample.metadata}")

        print(f"\nProcessed chunk {chunk_num}: {chunk_size} records. Running total: {total_processed}")

except Exception as e:
    print(f"Error in stream_dataset: {e}")

print("\nProcessing completed!")


In [None]:
# Example 2: Detailed data structure comparison

def compare_agent_data(data1, data2, prefix=""):
    """Print a field-by-field comparison of two record objects.

    Args:
        data1: First record; must expose ``agent_name``, ``agent_task``,
            ``agent_role``, ``tool_call_results`` and ``metadata``.
        data2: Second record with the same attributes.
        prefix: Text placed at the start of every printed line.

    Returns:
        None. All results are written to stdout.
    """
    def say(text):
        print(f"{prefix}{text}")

    say("Comparing AgentData objects:")

    # Scalar fields are compared by equality, one line each.
    say("Basic fields:")
    for label, attr in (("Agent Name", "agent_name"), ("Task", "agent_task"), ("Role", "agent_role")):
        say(f"  {label} match: {getattr(data1, attr) == getattr(data2, attr)}")

    # Tool call results: type first, then content and the keys of the first
    # entry, but only when both sides are non-empty.
    print()
    say("Tool Call Results:")
    results1, results2 = data1.tool_call_results, data2.tool_call_results
    say(f"  Type match: {type(results1) is type(results2)}")
    if results1 and results2:
        say(f"  Structure match: {results1 == results2}")
        say("  First tool call structure:")
        first_call = results1[0]
        for key in ("call_id", "result", "success"):
            say(f"    {key} present: {key in first_call}")

    # Metadata: same pattern, probing two specific keys on the first record.
    print()
    say("Metadata:")
    meta1, meta2 = data1.metadata, data2.metadata
    say(f"  Type match: {type(meta1) is type(meta2)}")
    if meta1 and meta2:
        say(f"  Content match: {meta1 == meta2}")
        say("  Structure contains:")
        for key in ("exit_status", "mask"):
            say(f"    {key} present: {key in meta1}")

print("Detailed comparison of create_dataset and stream_dataset outputs:")

# stream_dataset is not imported anywhere in the visible notebook, so this cell
# would stop with a NameError at the loop below.
# NOTE(review): assumed to be exported by the same module as create_dataset — confirm.
from novaeval.agents.swe_agent_trajectories import stream_dataset

# First record via the eager loader (None when nothing was loaded).
dataset = create_dataset(output_csv)
create_record = dataset.data[0] if dataset.data else None

# First record via the streaming loader; stop at the first non-empty chunk.
stream_record = None
for chunk in stream_dataset(output_csv, chunk_size=500):
    if chunk:
        stream_record = chunk[0]
        break

# None is the "no record" sentinel, so test for it explicitly instead of
# relying on the truthiness of the record objects.
if create_record is not None and stream_record is not None:
    print("\nComparing first record from both methods:")
    compare_agent_data(create_record, stream_record, "  ")
else:
    print("No records available for comparison")


In [None]:
# Example 3: Processing chunks with a specific task

print("Example of processing chunks with a specific task:")

def process_chunk(chunk: "list[AgentData]", task_name: str) -> int:
    """Count the records in a chunk whose ``agent_task`` equals ``task_name``.

    The ``chunk`` annotation is a string so that defining this function does
    not require ``AgentData`` to be imported; the notebook never imports it,
    and an unquoted annotation is evaluated when the ``def`` runs.

    Args:
        chunk: Iterable of records exposing an ``agent_task`` attribute.
        task_name: Task value to match by equality.

    Returns:
        Number of matching records (0 for an empty chunk).
    """
    return sum(1 for record in chunk if record.agent_task == task_name)

try:
    # stream_dataset is not imported anywhere in the visible notebook, so the
    # loop below could only ever report a NameError.
    # NOTE(review): assumed to be exported by the same module as create_dataset — confirm.
    from novaeval.agents.swe_agent_trajectories import stream_dataset

    # Process chunks and count records with specific task
    target_task = "fix bug"  # example task name
    total_matching = 0
    total_processed = 0

    print(f"\nCounting records with task: '{target_task}'")

    for chunk_num, chunk in enumerate(stream_dataset(output_csv, chunk_size=500), 1):
        # Count the matches in this chunk
        matching_in_chunk = process_chunk(chunk, target_task)

        # Update totals
        total_matching += matching_in_chunk
        total_processed += len(chunk)

        # Progress report
        print(f"Chunk {chunk_num}: Found {matching_in_chunk} matching records out of {len(chunk)}")

    # Final summary; the percentage is skipped when nothing was processed
    print(f"\nProcessing complete!")
    print(f"Total records processed: {total_processed}")
    print(f"Total records matching '{target_task}': {total_matching}")
    if total_processed > 0:
        print(f"Percentage matching: {(total_matching/total_processed)*100:.2f}%")

except Exception as e:
    print(f"Error processing chunks: {e}")


In [None]:
# Example of using stream_dataset vs create_dataset

print("Comparing stream_dataset with create_dataset:")

# 1. Using create_dataset (loads everything at once)
print("\n1. Using create_dataset:")
try:
    dataset = create_dataset(output_csv)
    print(f"Total records loaded: {len(dataset.data)}")
    print("\nFirst record sample:")
    first_record = dataset.data[0]
    print(f"Agent Name: {first_record.agent_name}")
    print(f"Task: {first_record.agent_task}")
    print(f"Tool Call Results: {first_record.tool_call_results[:200]}...")  # Show first 200 elements
except Exception as e:
    print(f"Error in create_dataset: {e}")

# 2. Using stream_dataset (processes in chunks)
print("\n2. Using stream_dataset:")
try:
    # stream_dataset is not imported anywhere in the visible notebook, so the
    # loop below could only ever report a NameError.
    # NOTE(review): assumed to be exported by the same module as create_dataset — confirm.
    from novaeval.agents.swe_agent_trajectories import stream_dataset

    total_processed = 0
    records_sample = None
    chunk_count = 0

    # Process chunks of 500 records
    for chunk in stream_dataset(output_csv, chunk_size=500):
        chunk_count += 1
        chunk_size = len(chunk)
        total_processed += chunk_size

        # Keep the first record of the first non-empty chunk for comparison
        if records_sample is None and chunk:
            records_sample = chunk[0]

        print(f"Processed chunk {chunk_count}: {chunk_size} records. Total processed: {total_processed}")

    # Show sample from streaming for comparison
    if records_sample:
        print("\nFirst record sample from streaming:")
        print(f"Agent Name: {records_sample.agent_name}")
        print(f"Task: {records_sample.agent_task}")
        print(f"Tool Call Results: {records_sample.tool_call_results[:200]}...")  # Show first 200 elements

except Exception as e:
    print(f"Error in stream_dataset: {e}")

print("\nComparison completed!")


In [None]:
# Detailed comparison of create_dataset and stream_dataset outputs

def compare_agent_data(data1, data2, prefix=""):
    """Print a compact comparison of two record objects.

    NOTE(review): an earlier cell defines a function with this same name and a
    more verbose report; running this cell replaces it.

    Args:
        data1: First record; must expose ``agent_name``, ``agent_task``,
            ``tool_call_results`` and ``metadata``.
        data2: Second record with the same attributes.
        prefix: Text placed at the start of every printed line.

    Returns:
        None. All results are written to stdout.
    """
    def report(label, outcome):
        print(f"{prefix}  {label}: {outcome}")

    print(f"{prefix}Comparing AgentData objects:")
    report("Agent Name", data1.agent_name == data2.agent_name)
    report("Task", data1.agent_task == data2.agent_task)

    # For each container field: compare types, then contents when both are non-empty.
    for title, attr in (("Tool Call Results", "tool_call_results"), ("Metadata", "metadata")):
        left, right = getattr(data1, attr), getattr(data2, attr)
        report(f"{title} structure matches", type(left) is type(right))
        if left and right:
            report(f"{title} content matches", left == right)

print("Comparing outputs of create_dataset and stream_dataset:")

# stream_dataset is not imported anywhere in the visible notebook, so this cell
# would stop with a NameError at the loop below.
# NOTE(review): assumed to be exported by the same module as create_dataset — confirm.
from novaeval.agents.swe_agent_trajectories import stream_dataset

# Get first record from create_dataset (None when nothing was loaded)
dataset = create_dataset(output_csv)
create_record = dataset.data[0] if dataset.data else None

# Get first record from stream_dataset; stop at the first non-empty chunk
stream_record = None
for chunk in stream_dataset(output_csv, chunk_size=500):
    if chunk:
        stream_record = chunk[0]
        break

# None is the "no record" sentinel, so test for it explicitly instead of
# relying on the truthiness of the record objects.
if create_record is not None and stream_record is not None:
    print("\nDetailed comparison of first record:")
    compare_agent_data(create_record, stream_record, "  ")

    print("\nSample values from create_dataset:")
    print(f"  Tool Call Results: {create_record.tool_call_results}")
    print(f"  Metadata: {create_record.metadata}")

    print("\nSample values from stream_dataset:")
    print(f"  Tool Call Results: {stream_record.tool_call_results}")
    print(f"  Metadata: {stream_record.metadata}")
else:
    print("No records available for comparison")


In [None]:
# Example usage of streaming methods

# 1. Process huge CSV in chunks
# 2. Save chunk to JSON and test JSON streaming
print("\nExample 2: Testing JSON streaming")
# NOTE: this cell relies on `total_processed`, `chunk` and `dataset` left behind
# by the earlier streaming cells; run one of them first.
if total_processed > 0:
    json_path = 'test_chunk.json'
    try:
        # Save up to the first 100 items of the last chunk to JSON
        with open(json_path, 'w') as f:
            json.dump(
                [agent.model_dump() for agent in chunk[:100]],
                f,
                indent=2
            )

        # Now read it back using stream_from_json
        print("\nReading back from JSON in chunks:")
        json_total = 0
        for json_chunk in dataset.stream_from_json(
            file_path=json_path,
            chunk_size=10,  # Small chunks for demonstration
            turn_id='turn_id',
            agent_name='agent_name',
            agent_task='agent_task',
            tool_call_results='tool_call_results',
            metadata='metadata'
        ):
            json_total += len(json_chunk)
            print(f"Read JSON chunk of {len(json_chunk)} items. Total: {json_total}")
    finally:
        # Cleanup runs even when dumping or streaming fails, so the scratch
        # file is not left behind; it may not exist if open() itself failed.
        if os.path.exists(json_path):
            os.remove(json_path)

print("\nStreaming examples completed!")


In [None]:
# Check data sizes at each step
import os

# Size of the parquet shard on disk, converted from bytes to MB
BYTES_PER_MB = 1024 * 1024
parquet_size = os.path.getsize(parquet_file_path) / BYTES_PER_MB
print(f"Original parquet file size: {parquet_size:.2f} MB")

# Re-read the shard and report the frame's dimensions and per-column memory
df = pd.read_parquet(parquet_file_path)
print("\nParquet DataFrame info:")
row_count, column_count = len(df), len(df.columns)
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")
print("\nMemory usage per column:")
print(df.memory_usage(deep=True) / BYTES_PER_MB, "MB")

# Inspect the first trajectory value: type, length (when it has one), and a
# 500-character preview of its string form
print("\nTrajectory column sample:")
if 'trajectory' in df.columns:
    sample_traj = df['trajectory'].iloc[0]
    print(f"Type of trajectory: {type(sample_traj)}")
    if hasattr(sample_traj, '__len__'):
        print(f"Length of first trajectory: {len(sample_traj)}")
    print("\nFirst trajectory content (truncated):")
    preview = str(sample_traj)[:500]
    print(preview, "...")


In [None]:
# Test preprocessing with a small sample first
# NOTE: relies on `df` loaded by an earlier cell.
sample_size = 100  # adjust this number as needed
df_sample = df.head(sample_size).copy()

sample_parquet = "sample.parquet"
output_csv_sample = "sample_output.csv"

try:
    # Save sample to parquet
    df_sample.to_parquet(sample_parquet)

    # Process the sample
    swe_agent_trajectories_preprocessing(
        parquet_files=[sample_parquet],
        output_csv=output_csv_sample
    )

    # Check sizes
    sample_output_df = pd.read_csv(output_csv_sample)
    print(f"\nSample processing results:")
    print(f"Input rows: {len(df_sample)}")
    print(f"Output rows: {len(sample_output_df)}")
    # Guard the ratio: an empty sample would otherwise divide by zero.
    if len(df_sample) > 0:
        print(f"Expansion factor: {len(sample_output_df) / len(df_sample):.2f}x")
    else:
        print("Expansion factor: n/a (empty sample)")

    # Check memory usage of output
    print("\nOutput DataFrame memory usage per column:")
    print(sample_output_df.memory_usage(deep=True) / (1024 * 1024), "MB")
finally:
    # Clean up sample files even when a step above fails; either file may be
    # missing if the failure happened before it was written.
    for scratch_file in (sample_parquet, output_csv_sample):
        if os.path.exists(scratch_file):
            os.remove(scratch_file)
