In [None]:
import os

# Configuration. Each value can be overridden through an environment variable,
# so the notebook can run on another machine without editing this cell.
JSON_FILE_PATH = os.getenv("NOVEUM_JSON_PATH", "/mnt/drive2/trace_details.json")
OUTPUT_CSV = os.getenv("NOVEUM_OUTPUT_CSV", "/mnt/drive2/noveum_spans_output.csv")

# Alternative: use relative defaults if the files are in the current directory
# JSON_FILE_PATH = os.getenv("NOVEUM_JSON_PATH", "./trace_details.json")
# OUTPUT_CSV = os.getenv("NOVEUM_OUTPUT_CSV", "./noveum_spans_output.csv")

# Number of records per streamed chunk. The default of 2 is deliberately tiny
# so that the streaming test produces several chunks even for small inputs.
# Validate here so a bad override fails immediately with a clear message
# instead of surfacing later inside the streaming cell.
_raw_chunk_size = os.getenv("NOVEUM_CHUNK_SIZE", "2")
try:
    CHUNK_SIZE = int(_raw_chunk_size)
except ValueError:
    raise ValueError(
        f"NOVEUM_CHUNK_SIZE must be an integer, got {_raw_chunk_size!r}"
    ) from None
if CHUNK_SIZE < 1:
    raise ValueError(f"NOVEUM_CHUNK_SIZE must be at least 1, got {CHUNK_SIZE}")


In [None]:
import json
import os
import sys

import pandas as pd

# Make the parent of the current working directory importable so the
# `novaeval` import below can resolve when the notebook is run from a
# sub-directory (presumably the parent is the repository root; confirm).
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from novaeval.datasets.noveum_spans_dataset import (
    create_dataset,
    noveum_spans_preprocessing,
    stream_dataset,
)


In [None]:
# Confirm the trace file exists, then print a short summary of its structure.
if os.path.exists(JSON_FILE_PATH):
    print(f"✓ JSON file found: {JSON_FILE_PATH}")

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale default encoding.
    with open(JSON_FILE_PATH, encoding="utf-8") as f:
        data = json.load(f)

    # `or []` also covers a "spans" key that is present but null.
    spans = data.get("spans") or []

    print(f"\nTrace ID: {data.get('trace_id')}")
    print(f"Number of spans: {len(spans)}")
    print(f"Trace name: {data.get('name')}")

    # Preview first span
    if spans:
        first_span = spans[0]
        print("\nFirst span:")
        print(f"  - Span ID: {first_span.get('span_id')}")
        print(f"  - Name: {first_span.get('name')}")
        print(f"  - Status: {first_span.get('status')}")
        print(f"  - Duration: {first_span.get('duration_ms')} ms")

        # Attributes may be absent or null; treat both as an empty mapping.
        attributes = first_span.get("attributes") or {}
        print(f"  - Function name: {attributes.get('function.name')}")

        # Collect attribute keys that carry agent or tool inputs.
        input_fields = [
            key for key in attributes
            if key.startswith(("agent.input.", "tool.input."))
        ]
        print(f"  - Input fields: {input_fields}")

else:
    print(f"❌ JSON file not found: {JSON_FILE_PATH}")
    print("Please update the JSON_FILE_PATH variable above.")


In [None]:
# Run the preprocessing step on the trace file and inspect the CSV it writes.
print("Testing noveum_spans_preprocessing...")

try:
    noveum_spans_preprocessing(
        json_files=[JSON_FILE_PATH],
        output_csv=OUTPUT_CSV
    )
    print("✓ Preprocessing completed successfully!")

    # Check the output CSV
    if os.path.exists(OUTPUT_CSV):
        df = pd.read_csv(OUTPUT_CSV)
        print(f"\nOutput CSV created with {len(df)} rows")
        print(f"Columns: {list(df.columns)}")

        # Report missing preview columns explicitly; selecting them blindly
        # would raise a KeyError that gets mislabelled as a preprocessing error.
        preview_columns = ["turn_id", "agent_name", "agent_task", "status"]
        missing_columns = [col for col in preview_columns if col not in df.columns]
        if missing_columns:
            print(f"❌ Expected columns missing from output CSV: {missing_columns}")

        # Show first few rows
        print("\nFirst few rows:")
        print(df[[col for col in preview_columns if col in df.columns]].head())
    else:
        # The call returned without raising but produced no file; say so
        # instead of ending the cell silently after the success message.
        print(f"❌ Output CSV was not created: {OUTPUT_CSV}")

except Exception as e:
    print(f"❌ Error during preprocessing: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# Build a dataset from the preprocessed CSV and print a summary of its first record.
print("Testing create_dataset...")

try:
    dataset = create_dataset(OUTPUT_CSV)
    print("✓ Dataset created successfully!")

    # Report how many records were loaded.
    print("\nDataset info:")
    print(f"Number of records: {len(dataset.data)}")

    if dataset.data:
        first_record = dataset.data[0]
        print("\nFirst record:")
        print(f"  - Turn ID: {first_record.turn_id}")
        print(f"  - Agent name: {first_record.agent_name}")
        print(f"  - Agent role: {first_record.agent_role}")

        # Long task/response texts are cut to 100 characters and marked with "...".
        print(
            f"  - Task: {first_record.agent_task[:100]}..."
            if len(first_record.agent_task) > 100
            else f"  - Task: {first_record.agent_task}"
        )
        print(
            f"  - Response: {first_record.agent_response[:100]}..."
            if len(first_record.agent_response) > 100
            else f"  - Response: {first_record.agent_response}"
        )

        # Metadata is decoded with json.loads and only its keys are listed.
        if first_record.metadata:
            metadata = json.loads(first_record.metadata)
            print(f"  - Metadata keys: {list(metadata.keys())}")

except Exception as e:
    print(f"❌ Error creating dataset: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# Stream the CSV in chunks of CHUNK_SIZE records: print the first few chunks
# and count all of them.
print(f"Testing stream_dataset with chunk_size={CHUNK_SIZE}...")

# Only this many chunks are printed; later chunks are still counted.
MAX_CHUNKS_TO_PRINT = 5

try:
    chunk_count = 0
    total_records = 0

    # Single pass over the stream. Chunks past the print limit are counted
    # silently in the same loop, so the file is never re-read from the start.
    for chunk in stream_dataset(OUTPUT_CSV, chunk_size=CHUNK_SIZE):
        chunk_count += 1
        chunk_size_actual = len(chunk)
        total_records += chunk_size_actual

        if chunk_count > MAX_CHUNKS_TO_PRINT:
            # Announce the truncation once, on the first chunk that is not
            # printed, so the notice only appears when output was really cut.
            if chunk_count == MAX_CHUNKS_TO_PRINT + 1:
                print(f"\n... (showing first {MAX_CHUNKS_TO_PRINT} chunks only)")
            continue

        print(f"\nChunk {chunk_count}: {chunk_size_actual} records")

        # Show per-record details for the first chunk only.
        if chunk_count == 1:
            for i, record in enumerate(chunk):
                print(f"  Record {i+1}:")
                print(f"    - Turn ID: {record.turn_id}")
                print(f"    - Agent: {record.agent_name}")
                print(f"    - Role: {record.agent_role}")
                task_preview = record.agent_task[:50] + "..." if len(record.agent_task) > 50 else record.agent_task
                print(f"    - Task: {task_preview}")

    print("\n✓ Streaming completed!")
    print(f"Total chunks: {chunk_count}")
    print(f"Total records: {total_records}")

except Exception as e:
    print(f"❌ Error during streaming: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# Summarise the processed CSV: value distributions, empty fields, and one
# sample row for each of the first few agent names.
print("Data Analysis:")

if os.path.exists(OUTPUT_CSV):
    df = pd.read_csv(OUTPUT_CSV)

    print(f"\nUnique agent names: {df['agent_name'].nunique()}")
    print("Agent name distribution:")
    print(df["agent_name"].value_counts())

    print("\nStatus distribution:")
    print(df["status"].value_counts())

    # Count a field as empty when it is either missing (NaN) or an empty string.
    print(f"\nEmpty agent_task fields: {df['agent_task'].isna().sum() + (df['agent_task'] == '').sum()}")
    print(f"Empty agent_response fields: {df['agent_response'].isna().sum() + (df['agent_response'] == '').sum()}")

    # Show a sample of different span types
    print("\nSample of different span types:")
    # Take the first row of each of the first five distinct agent names.
    # Rows with a missing agent_name are dropped first: NaN never compares
    # equal to itself, so looking such a row up by equality finds nothing.
    samples = (
        df.dropna(subset=["agent_name"])
        .drop_duplicates(subset="agent_name")
        .head(5)
    )
    for _, sample in samples.iterrows():
        # Convert once so the length check and the slice use the same text,
        # even when the cell holds a non-string value.
        task_text = str(sample["agent_task"])
        print(f"\n{sample['agent_name']}:")
        print(f"  Span name: {sample['span_name']}")
        if len(task_text) > 100:
            print(f"  Task: {task_text[:100]}...")
        else:
            print(f"  Task: {task_text}")
        print(f"  Response length: {len(str(sample['agent_response']))} chars")

else:
    print("❌ Output CSV not found")


In [None]:
# Optional cleanup: uncomment the two lines below to delete the generated
# output CSV file and print a confirmation.
# os.remove(OUTPUT_CSV)
# print(f"Cleaned up {OUTPUT_CSV}")

# Final marker printed when this last cell runs.
print("Testing completed!")
