In [1]:
# --- Notebook configuration -------------------------------------------------
# Number of records per chunk used by the streaming test further down.
CHUNK_SIZE = 2

# Destination of the CSV written by the preprocessing step.
OUTPUT_CSV = "/mnt/drive2/noveum_spans_output.csv"

# Location of the trace JSON to process; edit this to point at your own file.
JSON_FILE_PATH = "/mnt/drive2/trace_details.json"


In [2]:
import sys
import os
import pandas as pd
import json

# Add the parent directory to the path to import noveum_spans_dataset
sys.path.append(os.path.dirname(os.getcwd()))

from noveum_spans_dataset import noveum_spans_preprocessing, create_dataset, stream_dataset


In [3]:
# Check that the trace file exists and preview its structure.
# Prints the trace-level fields, then details of the first span (if any).
if os.path.exists(JSON_FILE_PATH):
    print(f"✓ JSON file found: {JSON_FILE_PATH}")

    # JSON text is UTF-8 by specification, so decode explicitly instead of
    # relying on the platform's default locale encoding.
    with open(JSON_FILE_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # `or []` also covers an explicit `"spans": null`, which would otherwise
    # make len() raise a TypeError.
    spans = data.get('spans') or []

    print(f"\nTrace ID: {data.get('trace_id')}")
    print(f"Number of spans: {len(spans)}")
    print(f"Trace name: {data.get('name')}")

    # Preview first span
    if spans:
        first_span = spans[0]
        print("\nFirst span:")
        print(f"  - Span ID: {first_span.get('span_id')}")
        print(f"  - Name: {first_span.get('name')}")
        print(f"  - Status: {first_span.get('status')}")
        print(f"  - Duration: {first_span.get('duration_ms')} ms")

        # Check attributes (tolerate a missing or null attributes object)
        attributes = first_span.get('attributes') or {}
        print(f"  - Function name: {attributes.get('function.name')}")

        # Collect attribute keys that carry agent or tool inputs
        input_fields = [
            key for key in attributes
            if key.startswith(('agent.input.', 'tool.input.'))
        ]
        print(f"  - Input fields: {input_fields}")

else:
    print(f"❌ JSON file not found: {JSON_FILE_PATH}")
    print("Please update the JSON_FILE_PATH variable above.")


✓ JSON file found: /mnt/drive2/trace_details.json

Trace ID: 2d842385-2baa-40f3-8e12-9a7da4240d17
Number of spans: 8
Trace name: test_research_workflow_openai_gpt-3.5-turbo

First span:
  - Span ID: f2d9563d-6a0b-4ff6-b46f-330c1cdf9b07
  - Name: agent:research_coordinator:research_coordinator
  - Status: ok
  - Duration: 1801.633 ms
  - Function name: research_coordinator
  - Input fields: ['agent.input.research_topic']


In [4]:
# Exercise noveum_spans_preprocessing on the configured trace file and
# inspect the CSV it writes.
print("Testing noveum_spans_preprocessing...")

try:
    noveum_spans_preprocessing(json_files=[JSON_FILE_PATH], output_csv=OUTPUT_CSV)
    print("✓ Preprocessing completed successfully!")

    # Only inspect the CSV when it is actually present on disk
    if os.path.exists(OUTPUT_CSV):
        df = pd.read_csv(OUTPUT_CSV)
        row_count = len(df)
        column_names = list(df.columns)
        print(f"\nOutput CSV created with {row_count} rows")
        print(f"Columns: {column_names}")

        # Preview the leading rows, restricted to the key columns
        preview_columns = ['turn_id', 'agent_name', 'agent_task', 'status']
        print("\nFirst few rows:")
        print(df[preview_columns].head())

except Exception as e:
    # Report the failure and the full stack trace without stopping the notebook
    print(f"❌ Error during preprocessing: {e}")
    import traceback
    traceback.print_exc()


Testing noveum_spans_preprocessing...
Processing /mnt/drive2/trace_details.json
Processed 8 spans and saved to /mnt/drive2/noveum_spans_output.csv
✓ Preprocessing completed successfully!

Output CSV created with 8 rows
Columns: ['turn_id', 'agent_name', 'agent_task', 'agent_response', 'metadata', 'trace_id', 'span_name', 'status', 'start_time', 'end_time', 'attributes']

First few rows:
                                turn_id            agent_name  \
0  f2d9563d-6a0b-4ff6-b46f-330c1cdf9b07  research_coordinator   
1  d33dba4a-dc67-46ae-9836-ac49ed938c6f          search_agent   
2  f1e4204b-9cbb-4322-9ba0-cfada1607da7       web_search_tool   
3  8387dbbd-9923-4299-aef9-62a60b0e4c70  academic_search_tool   
4  7826fa8a-3534-4206-a336-d15fe41ae35a        analysis_agent   

                                          agent_task status  
0              artificial intelligence in healthcare     ok  
1                                                NaN     ok  
2              artificial intelli

In [5]:
# Build a dataset from the generated CSV and print a summary of its first record.
print("Testing create_dataset...")

try:
    dataset = create_dataset(OUTPUT_CSV)
    print("✓ Dataset created successfully!")

    # Dataset-level properties
    print("\nDataset info:")
    print(f"Number of records: {len(dataset.data)}")

    if dataset.data:
        first_record = dataset.data[0]
        print("\nFirst record:")
        print(f"  - Turn ID: {first_record.turn_id}")
        print(f"  - Agent name: {first_record.agent_name}")
        print(f"  - Agent role: {first_record.agent_role}")

        # Long task/response texts are cut to 100 characters plus an ellipsis
        task_text = first_record.agent_task
        if len(task_text) > 100:
            print(f"  - Task: {task_text[:100]}...")
        else:
            print(f"  - Task: {task_text}")

        response_text = first_record.agent_response
        if len(response_text) > 100:
            print(f"  - Response: {response_text[:100]}...")
        else:
            print(f"  - Response: {response_text}")

        # Metadata is parsed with json.loads and only its keys are listed
        if first_record.metadata:
            metadata = json.loads(first_record.metadata)
            print(f"  - Metadata keys: {list(metadata.keys())}")

except Exception as e:
    # Report the failure and the full stack trace without stopping the notebook
    print(f"❌ Error creating dataset: {e}")
    import traceback
    traceback.print_exc()


Testing create_dataset...
✓ Dataset created successfully!

Dataset info:
Number of records: 8

First record:
  - Turn ID: f2d9563d-6a0b-4ff6-b46f-330c1cdf9b07
  - Agent name: research_coordinator
  - Agent role: coordinator
  - Task: artificial intelligence in healthcare
  - Response: {'topic': 'artificial intelligence in healthcare', 'search_results': [{'title': 'Understanding artif...
  - Metadata keys: ['trace_id', 'duration_ms', 'parent_span_id', 'status_message', 'agent_type', 'span_name', 'status', 'start_time', 'end_time']


In [6]:
# Test streaming the dataset in chunks of CHUNK_SIZE records.
# Every chunk is counted, but only the first MAX_CHUNKS_SHOWN are printed.
MAX_CHUNKS_SHOWN = 5  # Chunks beyond this are counted silently

print(f"Testing stream_dataset with chunk_size={CHUNK_SIZE}...")

try:
    chunk_count = 0
    total_records = 0

    # Single pass over the stream: keep consuming the same iterator after the
    # display limit instead of restarting the stream and collecting every
    # chunk into a list just to count the remainder.
    for chunk in stream_dataset(OUTPUT_CSV, chunk_size=CHUNK_SIZE):
        chunk_count += 1
        chunk_size_actual = len(chunk)
        total_records += chunk_size_actual

        # Past the display limit: keep counting, print nothing
        if chunk_count > MAX_CHUNKS_SHOWN:
            continue

        print(f"\nChunk {chunk_count}: {chunk_size_actual} records")

        # Show per-record details for the first chunk only
        if chunk_count == 1:
            for i, record in enumerate(chunk, start=1):
                print(f"  Record {i}:")
                print(f"    - Turn ID: {record.turn_id}")
                print(f"    - Agent: {record.agent_name}")
                print(f"    - Role: {record.agent_role}")
                task_preview = record.agent_task[:50] + "..." if len(record.agent_task) > 50 else record.agent_task
                print(f"    - Task: {task_preview}")

        # Announce the cut-off once, right after the last printed chunk
        if chunk_count == MAX_CHUNKS_SHOWN:
            print(f"\n... (showing first {MAX_CHUNKS_SHOWN} chunks only)")

    print("\n✓ Streaming completed!")
    print(f"Total chunks: {chunk_count}")
    print(f"Total records: {total_records}")

except Exception as e:
    # Report the failure and the full stack trace without stopping the notebook
    print(f"❌ Error during streaming: {e}")
    import traceback
    traceback.print_exc()


Testing stream_dataset with chunk_size=2...

Chunk 1: 2 records
  Record 1:
    - Turn ID: f2d9563d-6a0b-4ff6-b46f-330c1cdf9b07
    - Agent: research_coordinator
    - Role: coordinator
    - Task: artificial intelligence in healthcare
  Record 2:
    - Turn ID: d33dba4a-dc67-46ae-9836-ac49ed938c6f
    - Agent: search_agent
    - Role: researcher
    - Task: nan

Chunk 2: 2 records

Chunk 3: 2 records

Chunk 4: 2 records

✓ Streaming completed!
Total chunks: 4
Total records: 8


In [7]:
# Summarize the processed CSV: agent/status distributions, empty fields,
# and one sample row for each of the first five agent names.
print("Data Analysis:")

if not os.path.exists(OUTPUT_CSV):
    print("❌ Output CSV not found")
else:
    df = pd.read_csv(OUTPUT_CSV)

    agent_names = df['agent_name']
    print(f"\nUnique agent names: {agent_names.nunique()}")
    print("Agent name distribution:")
    print(agent_names.value_counts())

    print("\nStatus distribution:")
    print(df['status'].value_counts())

    # A field counts as empty when it is NaN or an empty string
    empty_tasks = df['agent_task'].isna().sum() + (df['agent_task'] == '').sum()
    print(f"\nEmpty agent_task fields: {empty_tasks}")
    empty_responses = df['agent_response'].isna().sum() + (df['agent_response'] == '').sum()
    print(f"Empty agent_response fields: {empty_responses}")

    # One example row per agent name, limited to the first five distinct names
    print("\nSample of different span types:")
    for name in df['agent_name'].unique()[:5]:
        sample = df[df['agent_name'] == name].iloc[0]
        print(f"\n{name}:")
        print(f"  Span name: {sample['span_name']}")

        # Tasks whose string form exceeds 100 characters are truncated
        task_value = sample['agent_task']
        if len(str(task_value)) > 100:
            print(f"  Task: {task_value[:100]}...")
        else:
            print(f"  Task: {task_value}")

        print(f"  Response length: {len(str(sample['agent_response']))} chars")


Data Analysis:

Unique agent names: 8
Agent name distribution:
agent_name
research_coordinator    1
search_agent            1
web_search_tool         1
academic_search_tool    1
analysis_agent          1
llm_analysis_call       1
summary_agent           1
llm_summary_call        1
Name: count, dtype: int64

Status distribution:
status
ok    8
Name: count, dtype: int64

Empty agent_task fields: 1
Empty agent_response fields: 0

Sample of different span types:

research_coordinator:
  Span name: agent:research_coordinator:research_coordinator
  Task: artificial intelligence in healthcare
  Response length: 1000 chars

search_agent:
  Span name: agent:search_agent:search_agent
  Task: nan
  Response length: 1000 chars

web_search_tool:
  Span name: tool:web_search:web_search_tool
  Task: artificial intelligence in healthcare
  Response length: 632 chars

academic_search_tool:
  Span name: tool:academic_search:academic_search_tool
  Task: artificial intelligence in healthcare
  Response le

In [8]:
# Optional cleanup of the generated CSV.
# Set CLEANUP_OUTPUT to True to delete OUTPUT_CSV; the default leaves it in
# place so the file can still be inspected after the run.
CLEANUP_OUTPUT = False

# The existence check keeps a repeated run from failing once the file is gone
if CLEANUP_OUTPUT and os.path.exists(OUTPUT_CSV):
    os.remove(OUTPUT_CSV)
    print(f"Cleaned up {OUTPUT_CSV}")

print("Testing completed!")


Testing completed!
