In [None]:

from IPython.display import Image

# Render the agent architecture diagram inline.
# NOTE(review): this is an absolute, machine-specific path, so it only resolves
# on the original author's machine — confirm the intended repo-relative location.
# NOTE(review): "custmomer" looks like a typo of "customer"; verify the file
# name on disk before renaming anything.
Image("/Users/mramanindia/Documents/Work/NovaEval/noveum_customer_support_bt/custmomer_support_agent.png")


from IPython.display import Image

# NOTE(review): "path/to/image.png" looks like a template placeholder rather
# than a real file — confirm which image is meant to be shown here.
Image("path/to/image.png")


# Final Agent Evaluation Demo with NovaEval

This notebook demonstrates a streamlined approach to agent evaluation using modular utility functions:

1. **Load agent trace data** from JSON datasets
2. **Map trace spans** to AgentData format using utility functions
3. **Create and analyze** AgentDataset
4. **Evaluate agent performance** using AgentEvaluator with a Gemini model
5. **Analyze results** and export data



# Scorers Used

**context_relevancy_scorer** - Evaluates whether the agent response is appropriate and relevant given the agent's task and role.

**role_adherence_scorer** - Scores whether the agent's tool calls and response adhere to its assigned role and task.

**task_progression_scorer** - Measures whether the agent has made meaningful progress on the assigned task.

**tool_relevancy_scorer** - Assesses how relevant and appropriate the tool call is given the available tools and the agent's context.

**tool_correctness_scorer** - Compares actual tool calls against expected tool calls to evaluate correctness of tool usage and parameters.

**parameter_correctness_scorer** - Validates whether correct parameters were passed to tool calls by analyzing the tool results.

## Step 1: Import Dependencies and Utility Functions


In [None]:
# Helper functions for this demo, all provided by the local demo_utils module.
from demo_utils import (
    analyze_agent_behavior_patterns,
    analyze_dataset_statistics,
    convert_spans_to_agent_dataset,
    export_processed_dataset,
    list_dataset_files,
    load_and_analyze_dataset,
    print_demo_summary,
    run_evaluation,
    setup_agent_evaluator,
    setup_gemini_model,
    setup_logging,
    validate_environment,
)

# Reaching this line means every name above resolved without an ImportError.
print("✅ All utility functions imported successfully!")


In [1]:
# Preprocessing pipeline: each step consumes the file written by the previous
# one, so the three commands must run in this order.
# 1) Filter spans: dataset.json -> dataset_filtered.json
!python preprocess_filter.py dataset.json
# 2) Map spans: dataset_filtered.json -> dataset_filtered_mapped.json
!python preprocess_map.py dataset_filtered.json
# 3) Split by span name: one *_dataset.json per span name under split_datasets/
!python preprocess_split_data.py dataset_filtered_mapped.json

Reading dataset.json...
Original dataset: 2887 records
Filtering spans...
After filtering: 2383 records
Converting tool output format...
Writing dataset_filtered.json...
Filtering complete! Output: dataset_filtered.json

Success! Created dataset_filtered.json
Reading dataset_filtered.json...
Input dataset: 2383 records
Mapping spans...
Writing dataset_filtered_mapped.json...
Mapping complete! Output: dataset_filtered_mapped.json

Success! Created dataset_filtered_mapped.json
Input file: dataset_filtered_mapped.json
Output directory: split_datasets

Loading dataset from dataset_filtered_mapped.json...
Loaded 2383 objects
Found 16 unique span names
Using sanitized name: agent:research_coordinator:research_coordinator -> agent_research_coordinator_research_coordinator_dataset.json
  Wrote 9 objects to split_datasets/agent_research_coordinator_research_coordinator_dataset.json
Using sanitized name: agent:search_agent:search_agent -> agent_search_agent_search_agent_dataset.json
  Wrote 9 ob

In [None]:
from demo_utils import run_complete_agent_evaluation

# Evaluate the split datasets one at a time, starting with the
# comment-generation agent; every run uses the same output directory.
run_complete_agent_evaluation(
    'split_datasets/agent_comment_gen_dataset.json',
    evaluation_name="agent_comment_gen_dataset",
    output_dir="./demo_results",
)

In [None]:
# Evaluate the query-generation agent's split dataset.
run_complete_agent_evaluation(
    'split_datasets/agent_query_gen_dataset.json',
    evaluation_name="agent_query_gen_dataset",
    output_dir="./demo_results",
)

In [None]:
# Evaluate the email generate-and-send split dataset.
run_complete_agent_evaluation(
    'split_datasets/email_gen_send_dataset.json',
    evaluation_name="email_gen_send_dataset",
    output_dir="./demo_results",
)

In [None]:
# Evaluate the post-validation split dataset.
run_complete_agent_evaluation(
    'split_datasets/post_validation_dataset.json',
    evaluation_name="post_validation_dataset",
    output_dir="./demo_results",
)

In [None]:
# Evaluate the Tavily search-results split dataset.
run_complete_agent_evaluation(
    'split_datasets/tavily_search_results_dataset.json',
    evaluation_name="tavily_search_results_dataset",
    output_dir="./demo_results",
)

# Analysis of Poor Scores in the Comment Generation Agent

In [None]:
import pandas as pd

# Evaluation results CSV for the comment-generation agent.
comment_gen = pd.read_csv("demo_results/agent_comment_gen_dataset/agent_evaluation_results.csv")

# How many of the lowest-scoring rows to inspect.
split_size = 3

# Lowest task-progression scores first, keeping only the score and its reasoning.
task_progression = (
    comment_gen[['task_progression', 'task_progression_reasoning']]
    .sort_values(by='task_progression', ascending=True)
    .head(split_size)
)

print("Task Progression:", "", sep="\n")
for row_label, row in task_progression.iterrows():
    # Score line, reasoning line, then a blank separator line.
    print(
        f"Score = {row['task_progression']}",
        f"Reasoning = {row['task_progression_reasoning']}",
        "",
        sep="\n",
    )

In [None]:
# Context Relevancy Analysis
# Show the lowest context-relevancy scores with the scorer's reasoning.
# The row count comes from `split_size` (defined alongside `comment_gen` in the
# task-progression cell) instead of a repeated literal 3, so every "worst rows"
# view in this notebook is resized from a single place.
context_relevancy = (
    comment_gen[['context_relevancy', 'context_relevancy_reasoning']]
    .sort_values(by='context_relevancy', ascending=True)
    .head(split_size)
)

print("Context Relevancy Analysis:")
print("=" * 50)
for idx, row in context_relevancy.iterrows():
    print(f"Score = {row['context_relevancy']}")
    print(f"Reasoning = {row['context_relevancy_reasoning']}")
    print()  # blank line between entries

In [None]:
# Role Adherence Analysis
# Show the lowest role-adherence scores with the scorer's reasoning.
# The row count comes from `split_size` (defined alongside `comment_gen` in the
# task-progression cell) instead of a repeated literal 3, so every "worst rows"
# view in this notebook is resized from a single place.
role_adherence = (
    comment_gen[['role_adherence', 'role_adherence_reasoning']]
    .sort_values(by='role_adherence', ascending=True)
    .head(split_size)
)

print("Role Adherence Analysis:")
print("=" * 50)
for idx, row in role_adherence.iterrows():
    print(f"Score = {row['role_adherence']}")
    print(f"Reasoning = {row['role_adherence_reasoning']}")
    print()  # blank line between entries

In [None]:
from novapilot_utils import recommend_improvements

# Call recommend_improvements with explicit (non-default) arguments: the
# evaluation results directory, the agent description document, and a log
# directory, with verbose output enabled. The call returns three values.
final_analysis, summaries, log_file = recommend_improvements(
    demo_results_dir="demo_results/",
    agent_doc_path="reddit_agent.md",
    log_dir="log",
    verbose=True,
)

In [None]:
# Display the first of the three values returned by recommend_improvements.
print(final_analysis)