# Agent Evaluation Demo with NovaEval

This notebook demonstrates how to:
1. Load agent trace data from dataset.json
2. Map trace spans to AgentData format
3. Create an AgentDataset 
4. Evaluate agent performance using AgentEvaluator with Gemini model
5. Analyze results with multiple scorers


In [24]:
import json
import re
from typing import Any, Dict, List, Optional, Union

# NovaEval imports
from novaeval.agents.agent_data import AgentData, ToolSchema, ToolCall, ToolResult
from novaeval.datasets.agent_dataset import AgentDataset
from novaeval.evaluators.agent_evaluator import AgentEvaluator
from novaeval.models.gemini import GeminiModel
from novaeval.scorers.agent_scorers import (
    context_relevancy_scorer,
    role_adherence_scorer,
    task_progression_scorer,
    tool_relevancy_scorer,
    tool_correctness_scorer,
    parameter_correctness_scorer
)

print("✅ All imports successful!")


✅ All imports successful!


## Step 1: Load and Examine Dataset Structure


In [25]:
# Load the dataset
with open('dataset.json', 'r') as f:
    spans_data = json.load(f)

print(f"📊 Loaded {len(spans_data)} spans from dataset.json")
print(f"\n🔍 Sample span types:")

# Analyze span types
span_types = {}
for span in spans_data[:10]:  # Look at first 10 spans
    span_name = span.get('name', 'unknown')
    if span_name not in span_types:
        span_types[span_name] = 0
    span_types[span_name] += 1

for span_type, count in span_types.items():
    print(f"  - {span_type}: {count}")


📊 Loaded 56 spans from dataset.json

🔍 Sample span types:
  - chain_start.unknown: 1
  - llm.openai: 5
  - tool:user_input:user_input: 4


## Step 2: Implement Field Mapping Logic


In [26]:
def parse_tools_from_prompt(prompt: str) -> List[ToolSchema]:
    """
    Parse tool definitions from LLM prompts using regex.
    
    Expected format: tool_name(param: type = default) -> return_type - description
    """
    # Pattern to match tool signatures
    pattern = r'(\w+)\(([^)]*)\)\s*->\s*(\w+)\s*-\s*(.+?)(?=\n\w+\(|$)'
    matches = re.findall(pattern, prompt, re.DOTALL)
    
    tools = []
    for match in matches:
        tool_name, params_str, return_type, description = match
        
        # Parse parameters
        args_schema = parse_params(params_str)
        
        tool = ToolSchema(
            name=tool_name,
            description=description.strip(),
            args_schema=args_schema,
            return_schema={"type": return_type}
        )
        tools.append(tool)
    
    return tools

def parse_params(params_str: str) -> Dict[str, Any]:
    """
    Parse parameter string into schema dictionary.
    
    Format: param_name: type = default_value
    """
    if not params_str.strip():
        return {}
    
    # Split parameters by comma
    params = [p.strip() for p in params_str.split(',') if p.strip()]
    schema = {}
    
    for param in params:
        if ':' in param:
            parts = param.split(':', 1)
            param_name = parts[0].strip()
            type_and_default = parts[1].strip()
            
            # Extract type and default value
            if '=' in type_and_default:
                type_part, default_part = type_and_default.split('=', 1)
                param_type = type_part.strip()
                default_val = default_part.strip().strip('"\'')
                schema[param_name] = {'type': param_type, 'default': default_val}
            else:
                param_type = type_and_default.strip()
                schema[param_name] = {'type': param_type}
    
    return schema

def identify_span_type(span: Dict[str, Any]) -> str:
    """
    Identify span type based on attributes.
    """
    attributes = span.get('attributes', {})
    
    # Check for agent attributes
    if any(key.startswith('agent.') for key in attributes.keys()):
        return 'agent'
    
    # Check for LLM attributes
    if any(key.startswith('llm.') for key in attributes.keys()):
        return 'llm'
    
    # Check for tool attributes
    if any(key.startswith('tool.') for key in attributes.keys()):
        return 'tool'
    
    return 'unknown'


In [27]:
def map_span_to_agent_data(span: Dict[str, Any]) -> AgentData:
    """
    Map a single span from dataset.json to AgentData format.
    """
    attributes = span.get('attributes', {})
    events = span.get('events', [])
    span_type = identify_span_type(span)
    
    # Base mappings
    data = {
        'user_id': span.get('metadata', {}).get('user_id', None),
        'task_id': span.get('trace_id'),
        'turn_id': span.get('span_id'),
        'ground_truth': None,
        'expected_tool_call': None,
        'agent_name': span_type,
        'agent_role': span_type,
        'system_prompt': None,
        'metadata': None,
        'exit_status': span.get('status'),
        'tools_available': [],
        'tool_calls': [],
        'parameters_passed': {},
        'tool_call_results': [],
        'retrieval_query': None,
        'retrieved_context': None,
        'agent_exit': False,
        'trace': None
    }
    
    # Span-specific mappings
    if span_type == 'agent':
        # Agent task
        chain_inputs = attributes.get('chain.inputs', {})
        if isinstance(chain_inputs, dict) and 'input' in chain_inputs:
            data['agent_task'] = chain_inputs['input']
        
        # Agent response
        finish_values = attributes.get('agent.output.finish.return_values', {})
        if isinstance(finish_values, dict) and 'output' in finish_values:
            data['agent_response'] = finish_values['output']
        
        # Tool calls from agent actions
        tool_name = attributes.get('agent.output.action.tool')
        tool_input = attributes.get('agent.output.action.tool_input')
        
        if tool_name:
            tool_call = ToolCall(
                tool_name=tool_name,
                parameters={'input': tool_input} if tool_input else {},
                call_id=span['span_id']
            )
            data['tool_calls'] = [tool_call]
            data['parameters_passed'] = {'input': tool_input} if tool_input else {}
            
            # Handle retrieval query for langchain_retriever
            if tool_name == 'langchain_retriever' and tool_input:
                data['retrieval_query'] = [tool_input]
        
        # Agent exit status
        data['agent_exit'] = any(event.get('name') == 'agent_finish' for event in events)
        
        # Trace (dump events as JSON)
        if events:
            data['trace'] = json.dumps(events)
    
    elif span_type == 'llm':
        # Agent response from LLM output
        llm_responses = attributes.get('llm.output.response', [])
        if llm_responses:
            data['agent_response'] = llm_responses[0]
        
        # Parse tools from prompt
        prompts = attributes.get('llm.input.prompts', [])
        if prompts:
            try:
                tools = parse_tools_from_prompt(prompts[0])
                data['tools_available'] = tools
            except Exception:
                # Fallback to empty list if parsing fails
                data['tools_available'] = []
        
        data['parameters_passed'] = {}
    
    elif span_type == 'tool':
        # Agent response from tool output
        tool_output = attributes.get('tool.output.output')
        if tool_output:
            data['agent_response'] = tool_output
        
        # Tool call results
        tool_name = attributes.get('tool.name')
        if tool_name and tool_output is not None:
            tool_result = ToolResult(
                call_id=span['span_id'],
                result=tool_output,
                success=span.get('status') == 'ok',
                error_message=None if span.get('status') == 'ok' else 'Tool execution failed'
            )
            data['tool_call_results'] = [tool_result]
            
            # Handle retrieved context for langchain_retriever
            if tool_name == 'langchain_retriever':
                data['retrieved_context'] = [[tool_output]]
        
        # Parameters from tool input
        tool_input_keys = [key for key in attributes.keys() if key.startswith('tool.input.')]
        tool_params = {}
        for key in tool_input_keys:
            param_name = key.replace('tool.input.', '')
            tool_params[param_name] = attributes[key]
        data['parameters_passed'] = tool_params
    
    return AgentData(**data)

print("✅ Field mapping functions defined!")


✅ Field mapping functions defined!


## Step 3: Create AgentDataset from Spans


In [28]:
# Convert spans to AgentData objects
print("🔄 Converting spans to AgentData objects...")

agent_data_list = []
errors = []

for i, span in enumerate(spans_data):
    try:
        agent_data = map_span_to_agent_data(span)
        agent_data_list.append(agent_data)
    except Exception as e:
        errors.append(f"Span {i}: {str(e)}")
        if len(errors) <= 5:  # Show first 5 errors only
            print(f"⚠️  Error processing span {i}: {e}")

print(f"\n✅ Successfully converted {len(agent_data_list)} spans to AgentData")
if errors:
    print(f"❌ {len(errors)} spans had errors")

# Create AgentDataset
dataset = AgentDataset()
dataset.data = agent_data_list

print(f"📊 AgentDataset created with {len(dataset.data)} records")


🔄 Converting spans to AgentData objects...

✅ Successfully converted 56 spans to AgentData
📊 AgentDataset created with 56 records


## Step 4: Examine Sample Data


In [29]:
# Show statistics about the dataset
print("📈 Dataset Statistics:")

agent_types = {}
tool_usage = {}
with_responses = 0
with_tool_calls = 0
with_retrieval = 0

for data in dataset.data:
    # Agent types
    if data.agent_name:
        agent_types[data.agent_name] = agent_types.get(data.agent_name, 0) + 1
    
    # Responses
    if data.agent_response:
        with_responses += 1
    
    # Tool calls
    if data.tool_calls:
        with_tool_calls += 1
        for tool_call in data.tool_calls:
            if hasattr(tool_call, 'tool_name'):
                tool_usage[tool_call.tool_name] = tool_usage.get(tool_call.tool_name, 0) + 1
    
    # Retrieval
    if data.retrieval_query:
        with_retrieval += 1

print(f"\nAgent Types: {dict(agent_types)}")
print(f"Records with responses: {with_responses}")
print(f"Records with tool calls: {with_tool_calls}")
print(f"Records with retrieval: {with_retrieval}")
print(f"Tool usage: {dict(tool_usage)}")

# Show sample records
print("\n🔍 Sample AgentData records:")
for i, data in enumerate(dataset.data[:3]):
    print(f"\n--- Record {i+1} ({data.agent_name}) ---")
    print(f"Task: {data.agent_task[:100] if data.agent_task else 'None'}...")
    print(f"Response: {data.agent_response[:100] if data.agent_response else 'None'}...")
    print(f"Tool calls: {len(data.tool_calls) if data.tool_calls else 0}")
    print(f"Exit status: {data.exit_status}")


📈 Dataset Statistics:

Agent Types: {'agent': 21, 'llm': 28, 'tool': 7}
Records with responses: 43
Records with tool calls: 20
Records with retrieval: 4
Tool usage: {'user_input': 12, 'escalate_to_human': 4, 'langchain_retriever': 4}

🔍 Sample AgentData records:

--- Record 1 (agent) ---
Task: Keep using user_input tool, until i tell you to exit...
Response: The user's query has been escalated to human customer support for further assistance....
Tool calls: 1
Exit status: ok

--- Record 2 (llm) ---
Task: None...
Response: I need to prompt the user for more information until instructed otherwise.
Action: user_input
Action...
Tool calls: 0
Exit status: ok

--- Record 3 (agent) ---
Task: None...
Response: None...
Tool calls: 1
Exit status: ok


## Step 5: Setup Gemini Model and Evaluator


In [30]:
import os

# Check for API key
if 'GEMINI_API_KEY' not in os.environ:
    print("⚠️  GEMINI_API_KEY environment variable not set!")
    print("Please set it before running evaluation:")
    print("export GEMINI_API_KEY='your-api-key-here'")
else:
    print("✅ GEMINI_API_KEY found in environment")

# Initialize Gemini model
try:
    gemini_model = GeminiModel(
        model_name="gemini-1.5-flash",  # Using flash model for cost efficiency
        temperature=0.1,  # Low temperature for consistent evaluation
        max_tokens=1024
    )
    print("✅ Gemini model initialized")
except Exception as e:
    print(f"❌ Error initializing Gemini model: {e}")
    gemini_model = None


✅ GEMINI_API_KEY found in environment
2025-09-12 03:55:19 - INFO - novaeval.models.base - Noveum tracing initialized successfully
✅ Gemini model initialized


In [31]:
# Initialize scoring functions for evaluation
scoring_functions = [
    task_progression_scorer,
    context_relevancy_scorer,
    role_adherence_scorer,
    tool_relevancy_scorer,
    tool_correctness_scorer,
    parameter_correctness_scorer
]

print(f"✅ Initialized {len(scoring_functions)} scoring functions:")
for func in scoring_functions:
    print(f"  - {func.__name__}")

# Create AgentEvaluator
if gemini_model:
    evaluator = AgentEvaluator(
        agent_dataset=dataset,
        models=[gemini_model],
        scoring_functions=scoring_functions,
        output_dir="./demo_results",
        stream=False,
        include_reasoning=True
    )
    print("\n✅ AgentEvaluator created with Gemini model and scoring functions")
else:
    print("\n❌ Cannot create evaluator - Gemini model not available")


✅ Initialized 6 scoring functions:
  - task_progression_scorer
  - context_relevancy_scorer
  - role_adherence_scorer
  - tool_relevancy_scorer
  - tool_correctness_scorer
  - parameter_correctness_scorer

✅ AgentEvaluator created with Gemini model and scoring functions


## Step 6: Run Evaluation (Sample)


In [36]:
# Run evaluation using the AgentEvaluator's run_all method
print("🚀 Running evaluation on sample data...")

if gemini_model and evaluator:
    try:
        # Create a smaller dataset for demo purposes
        sample_data = [data for data in dataset.data if data.agent_response][:10]
        print(f"\n📊 Evaluating {len(sample_data)} sample records...")
        
        # Create a temporary dataset with just the sample data
        sample_dataset = AgentDataset()
        sample_dataset.data = sample_data
        
        # Create a new evaluator with the sample dataset
        sample_evaluator = AgentEvaluator(
            agent_dataset=sample_dataset,
            models=[gemini_model],
            scoring_functions=scoring_functions,
            output_dir="./demo_results/sample_evaluation",
            stream=False,
            include_reasoning=True
        )
        
        # Run the evaluation
        sample_evaluator.run_all(save_every=1, file_type="csv")
        
        print("\n✅ Evaluation completed!")
        
        # Read and display results
        import pandas as pd
        results_file = "./demo_results/sample_evaluation/agent_evaluation_results.csv"
        
        if pd.io.common.file_exists(results_file):
            results_df = pd.read_csv(results_file)
            print(f"\n📊 Results Summary:")
            
            # Calculate averages for each scorer
            scorer_columns = [col for col in results_df.columns if col not in ['user_id', 'task_id', 'turn_id', 'agent_name'] and not col.endswith('_reasoning')]
            
            for col in scorer_columns:
                if results_df[col].dtype in ['float64', 'int64']:
                    avg_score = results_df[col].mean()
                    print(f"  - {col}: {avg_score:.2f}")
            
            # Show individual scores
            print(f"\n🔍 Individual Scores:")
            for i, row in results_df.iterrows():
                print(f"\n  Record {i+1} (Task: {row.get('task_id', 'N/A')}):")
                for col in scorer_columns:
                    if pd.notna(row[col]):
                        print(f"    - {col}: {row[col]}")
        else:
            print("❌ Results file not found")
        
    except Exception as e:
        print(f"❌ Error during evaluation: {e}")
        print(f"Error type: {type(e).__name__}")
        import traceback
        traceback.print_exc()
        
else:
    print("⚠️  Skipping evaluation - missing model or evaluator")


🚀 Running evaluation on sample data...

📊 Evaluating 10 sample records...
2025-09-12 03:57:37 - INFO - novaeval.evaluators.agent_evaluator - Starting agent evaluation process


Evaluating samples: 0it [00:00, ?it/s]

2025-09-12 03:57:37 - INFO - google_genai.models - AFC is enabled with max remote calls: 10.
2025-09-12 03:57:39 - INFO - google_genai.models - AFC remote call 1 is done.


2025-09-12 03:57:39 - noveum_trace.transport.http_transport - INFO - 📤 EXPORTING TRACE: auto_trace_generate (ID: 874be6a5-be94-4610-a19f-bfb30da97e00) - 1 spans
2025-09-12 03:57:39 - noveum_trace.transport.batch_processor - INFO - 📥 ADDING TRACE TO QUEUE: auto_trace_generate (ID: 874be6a5-be94-4610-a19f-bfb30da97e00) - 1 spans
2025-09-12 03:57:39 - noveum_trace.transport.batch_processor - INFO - ✅ Successfully queued trace 874be6a5-be94-4610-a19f-bfb30da97e00
2025-09-12 03:57:39 - noveum_trace.transport.http_transport - INFO - ✅ Trace 874be6a5-be94-4610-a19f-bfb30da97e00 successfully queued for export


2025-09-12 03:57:39 - INFO - google_genai.models - AFC is enabled with max remote calls: 10.
2025-09-12 03:57:41 - INFO - google_genai.models - AFC remote call 1 is done.


2025-09-12 03:57:41 - noveum_trace.transport.http_transport - INFO - 📤 EXPORTING TRACE: auto_trace_generate (ID: 0fafae84-5bb8-441b-8b75-45525249011b) - 1 spans
2025-09-12 03:57:41 - noveum_trace.transport.batch_processor - INFO - 📥 ADDING TRACE TO QUEUE: auto_trace_generate (ID: 0fafae84-5bb8-441b-8b75-45525249011b) - 1 spans
2025-09-12 03:57:41 - noveum_trace.transport.batch_processor - INFO - ✅ Successfully queued trace 0fafae84-5bb8-441b-8b75-45525249011b
2025-09-12 03:57:41 - noveum_trace.transport.http_transport - INFO - ✅ Trace 0fafae84-5bb8-441b-8b75-45525249011b successfully queued for export


2025-09-12 03:57:41 - INFO - google_genai.models - AFC is enabled with max remote calls: 10.
2025-09-12 03:57:42 - INFO - google_genai.models - AFC remote call 1 is done.


2025-09-12 03:57:42 - noveum_trace.transport.http_transport - INFO - 📤 EXPORTING TRACE: auto_trace_generate (ID: 2c23140a-365e-4197-93fc-91e24e585062) - 1 spans
2025-09-12 03:57:42 - noveum_trace.transport.batch_processor - INFO - 📥 ADDING TRACE TO QUEUE: auto_trace_generate (ID: 2c23140a-365e-4197-93fc-91e24e585062) - 1 spans
2025-09-12 03:57:42 - noveum_trace.transport.batch_processor - INFO - ✅ Successfully queued trace 2c23140a-365e-4197-93fc-91e24e585062
2025-09-12 03:57:42 - noveum_trace.transport.http_transport - INFO - ✅ Trace 2c23140a-365e-4197-93fc-91e24e585062 successfully queued for export


2025-09-12 03:57:42 - INFO - google_genai.models - AFC is enabled with max remote calls: 10.
2025-09-12 03:57:43 - INFO - google_genai.models - AFC remote call 1 is done.


2025-09-12 03:57:43 - noveum_trace.transport.http_transport - INFO - 📤 EXPORTING TRACE: auto_trace_generate (ID: 69c93676-ced1-42e7-9da8-2886c6035fcc) - 1 spans
2025-09-12 03:57:43 - noveum_trace.transport.batch_processor - INFO - 📥 ADDING TRACE TO QUEUE: auto_trace_generate (ID: 69c93676-ced1-42e7-9da8-2886c6035fcc) - 1 spans
2025-09-12 03:57:43 - noveum_trace.transport.batch_processor - INFO - ✅ Successfully queued trace 69c93676-ced1-42e7-9da8-2886c6035fcc
2025-09-12 03:57:43 - noveum_trace.transport.http_transport - INFO - ✅ Trace 69c93676-ced1-42e7-9da8-2886c6035fcc successfully queued for export


2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Saving intermediate results after 1 samples
2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Intermediate results saved to demo_results/sample_evaluation/agent_evaluation_results.csv


Evaluating samples: 1it [00:06,  6.49s/it]

2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Saving intermediate results after 2 samples
2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Intermediate results saved to demo_results/sample_evaluation/agent_evaluation_results.csv
2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Saving intermediate results after 3 samples
2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Intermediate results saved to demo_results/sample_evaluation/agent_evaluation_results.csv
2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Saving intermediate results after 4 samples
2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Intermediate results saved to demo_results/sample_evaluation/agent_evaluation_results.csv
2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Saving intermediate results after 5 samples
2025-09-12 03:57:44 - INFO - novaeval.evaluators.agent_evaluator - Intermediate resul

2025-09-12 03:57:45 - noveum_trace.transport.http_transport - INFO - 📤 EXPORTING TRACE: auto_trace_generate (ID: aa836924-71ce-42c4-b0fd-0d6e5dec735e) - 1 spans
2025-09-12 03:57:45 - noveum_trace.transport.batch_processor - INFO - 📥 ADDING TRACE TO QUEUE: auto_trace_generate (ID: aa836924-71ce-42c4-b0fd-0d6e5dec735e) - 1 spans
2025-09-12 03:57:45 - noveum_trace.transport.batch_processor - INFO - ✅ Successfully queued trace aa836924-71ce-42c4-b0fd-0d6e5dec735e
2025-09-12 03:57:45 - noveum_trace.transport.http_transport - INFO - ✅ Trace aa836924-71ce-42c4-b0fd-0d6e5dec735e successfully queued for export


2025-09-12 03:57:45 - INFO - google_genai.models - AFC is enabled with max remote calls: 10.
2025-09-12 03:57:46 - INFO - google_genai.models - AFC remote call 1 is done.


2025-09-12 03:57:46 - noveum_trace.transport.http_transport - INFO - 📤 EXPORTING TRACE: auto_trace_generate (ID: cd151e32-5147-467b-9c6a-e92e2becfe69) - 1 spans
2025-09-12 03:57:46 - noveum_trace.transport.batch_processor - INFO - 📥 ADDING TRACE TO QUEUE: auto_trace_generate (ID: cd151e32-5147-467b-9c6a-e92e2becfe69) - 1 spans
2025-09-12 03:57:46 - noveum_trace.transport.batch_processor - INFO - ✅ Successfully queued trace cd151e32-5147-467b-9c6a-e92e2becfe69
2025-09-12 03:57:46 - noveum_trace.transport.http_transport - INFO - ✅ Trace cd151e32-5147-467b-9c6a-e92e2becfe69 successfully queued for export


2025-09-12 03:57:46 - INFO - google_genai.models - AFC is enabled with max remote calls: 10.
2025-09-12 03:57:48 - INFO - google_genai.models - AFC remote call 1 is done.


2025-09-12 03:57:48 - noveum_trace.transport.http_transport - INFO - 📤 EXPORTING TRACE: auto_trace_generate (ID: 20fef7a0-c44e-49d3-9a83-93fb7bb8075c) - 1 spans
2025-09-12 03:57:48 - noveum_trace.transport.batch_processor - INFO - 📥 ADDING TRACE TO QUEUE: auto_trace_generate (ID: 20fef7a0-c44e-49d3-9a83-93fb7bb8075c) - 1 spans
2025-09-12 03:57:48 - noveum_trace.transport.batch_processor - INFO - ✅ Successfully queued trace 20fef7a0-c44e-49d3-9a83-93fb7bb8075c
2025-09-12 03:57:48 - noveum_trace.transport.http_transport - INFO - ✅ Trace 20fef7a0-c44e-49d3-9a83-93fb7bb8075c successfully queued for export


2025-09-12 03:57:48 - INFO - google_genai.models - AFC is enabled with max remote calls: 10.
2025-09-12 03:57:50 - INFO - google_genai.models - AFC remote call 1 is done.


2025-09-12 03:57:50 - noveum_trace.transport.http_transport - INFO - 📤 EXPORTING TRACE: auto_trace_generate (ID: c8f743bc-2c02-4c2a-93cb-3314c2998e68) - 1 spans
2025-09-12 03:57:50 - noveum_trace.transport.batch_processor - INFO - 📥 ADDING TRACE TO QUEUE: auto_trace_generate (ID: c8f743bc-2c02-4c2a-93cb-3314c2998e68) - 1 spans
2025-09-12 03:57:50 - noveum_trace.transport.batch_processor - INFO - ✅ Successfully queued trace c8f743bc-2c02-4c2a-93cb-3314c2998e68
2025-09-12 03:57:50 - noveum_trace.transport.http_transport - INFO - ✅ Trace c8f743bc-2c02-4c2a-93cb-3314c2998e68 successfully queued for export


2025-09-12 03:57:50 - INFO - novaeval.evaluators.agent_evaluator - Saving intermediate results after 9 samples
2025-09-12 03:57:50 - INFO - novaeval.evaluators.agent_evaluator - Intermediate results saved to demo_results/sample_evaluation/agent_evaluation_results.csv


Evaluating samples: 9it [00:12,  1.22s/it]

2025-09-12 03:57:50 - INFO - novaeval.evaluators.agent_evaluator - Saving intermediate results after 10 samples
2025-09-12 03:57:50 - INFO - novaeval.evaluators.agent_evaluator - Intermediate results saved to demo_results/sample_evaluation/agent_evaluation_results.csv


Evaluating samples: 10it [00:12,  1.26s/it]

2025-09-12 03:57:50 - INFO - novaeval.evaluators.agent_evaluator - Saving final results
2025-09-12 03:57:50 - INFO - novaeval.evaluators.agent_evaluator - Reloaded 10 results from CSV
2025-09-12 03:57:50 - INFO - novaeval.evaluators.agent_evaluator - Agent evaluation completed

✅ Evaluation completed!

📊 Results Summary:
  - task_progression: 0.00
  - context_relevancy: 1.10
  - role_adherence: 1.23
  - tool_relevancy: 1.30
  - tool_correctness: 0.00
  - parameter_correctness: 1.40

🔍 Individual Scores:

  Record 1 (Task: 27a8252b-43e5-48d1-8707-12c8537feff8):
    - task_progression: 0.0
    - context_relevancy: 4.2
    - role_adherence: 7.8
    - tool_relevancy: 6.5
    - tool_correctness: 0.0
    - parameter_correctness: 6.5

  Record 2 (Task: 27a8252b-43e5-48d1-8707-12c8537feff8):
    - task_progression: 0.0
    - context_relevancy: 0.0
    - role_adherence: 0.0
    - tool_relevancy: 0.0
    - tool_correctness: 0.0
    - parameter_correctness: 0.0

  Record 3 (Task: 27a8252b-43e5-48




## Step 7: Analysis and Insights


In [33]:
# Analyze the dataset characteristics
print("🔍 Dataset Analysis:")
print("\n=== Agent Behavior Patterns ===")

# Analyze tool usage patterns
tool_patterns = {}
task_types = {}
response_lengths = []

for data in dataset.data:
    # Tool usage
    if data.tool_calls:
        for tool_call in data.tool_calls:
            if hasattr(tool_call, 'tool_name'):
                tool_name = tool_call.tool_name
                if tool_name not in tool_patterns:
                    tool_patterns[tool_name] = {'count': 0, 'success_rate': 0}
                tool_patterns[tool_name]['count'] += 1
    
    # Task analysis
    if data.agent_task:
        # Simple categorization
        task_lower = data.agent_task.lower()
        if 'user_input' in task_lower:
            task_types['user_input'] = task_types.get('user_input', 0) + 1
        elif 'exit' in task_lower:
            task_types['exit_command'] = task_types.get('exit_command', 0) + 1
        else:
            task_types['other'] = task_types.get('other', 0) + 1
    
    # Response analysis
    if data.agent_response:
        response_lengths.append(len(data.agent_response))

print(f"\n📈 Tool Usage:")
for tool, stats in tool_patterns.items():
    print(f"  - {tool}: {stats['count']} uses")

print(f"\n📋 Task Types:")
for task_type, count in task_types.items():
    print(f"  - {task_type}: {count}")

if response_lengths:
    avg_response_length = sum(response_lengths) / len(response_lengths)
    print(f"\n📝 Response Statistics:")
    print(f"  - Average response length: {avg_response_length:.1f} characters")
    print(f"  - Min response length: {min(response_lengths)}")
    print(f"  - Max response length: {max(response_lengths)}")


🔍 Dataset Analysis:

=== Agent Behavior Patterns ===

📈 Tool Usage:
  - user_input: 12 uses
  - escalate_to_human: 4 uses
  - langchain_retriever: 4 uses

📋 Task Types:
  - user_input: 1
  - other: 4
  - exit_command: 3

📝 Response Statistics:
  - Average response length: 120.8 characters
  - Min response length: 3
  - Max response length: 239


## Step 8: Export Results (Optional)


In [34]:

# Export the processed dataset for future use
print("💾 Exporting processed dataset...")

try:
    # Export to JSON
    dataset.export_to_json('processed_agent_dataset.json')
    print("✅ Exported to processed_agent_dataset.json")
    
    # Export to CSV (optional)
    dataset.export_to_csv('processed_agent_dataset.csv')
    print("✅ Exported to processed_agent_dataset.csv")
    
except Exception as e:
    print(f"❌ Export error: {e}")

print("\n🎉 Demo completed successfully!")
print("\n📋 Summary:")
print(f"  - Processed {len(spans_data)} spans from dataset.json")
print(f"  - Created {len(dataset.data)} AgentData records")
print(f"  - Configured evaluation with Gemini model and 3 scorers")
if 'results' in locals():
    print(f"  - Successfully evaluated sample data")
print(f"  - Exported processed dataset for future use")


💾 Exporting processed dataset...
✅ Exported to processed_agent_dataset.json
✅ Exported to processed_agent_dataset.csv

🎉 Demo completed successfully!

📋 Summary:
  - Processed 56 spans from dataset.json
  - Created 56 AgentData records
  - Configured evaluation with Gemini model and 3 scorers
  - Exported processed dataset for future use
