In [1]:
from typing import List
import phoenix as px
from phoenix.trace.dsl import SpanQuery
from datetime import datetime, timedelta
import pandas as pd
import json
from datetime import datetime
from pydantic import BaseModel
import os
import requests
from tqdm import tqdm
from collections import defaultdict
import difflib

# Enable tqdm for pandas apply
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
px_client = px.Client()



In [3]:
def get_trace_spans(trace_id: str, span_kinds: List[str]):

    query = SpanQuery().where(
        (
            f"context.trace_id == '{trace_id}'"
            f" and span_kind in {span_kinds}"
        )
    )
   
    trace_df = px_client.query_spans(
        query, 
        project_name="PLACEHOLDER_project_name",
    )

    trace_df = trace_df.reset_index(drop = True)
    trace_df.loc[trace_df['status_code'] == 'UNSET', 'status_code'] = 'OK'

    return trace_df

In [4]:
df = get_trace_spans(
    'caff362f4f72ae50db329105402c0dad',
    ["AGENT", "LLM", "TOOL"]
)

  trace_df = px_client.query_spans(


In [5]:
df.head()

Unnamed: 0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.status,attributes.tool_description,attributes.tool.description,attributes.tool.name,attributes.tool_args,attributes.tool_name,attributes.tool.parameters,attributes.sender_agent_class,attributes.sender_agent_type,attributes.available tools
0,User,AGENT,4c87ba0215be3731,2025-08-06 00:24:06.211350+00:00,2025-08-06 00:24:06.215081+00:00,OK,,[],71f994c0f4fc4c3c,caff362f4f72ae50db329105402c0dad,...,,,,,,,,,,
1,ChatCompletion,LLM,f29c5411109784bc,2025-08-06 00:24:06.338052+00:00,2025-08-06 00:24:07.262210+00:00,OK,,[],dfffe6e81bf0ada1,caff362f4f72ae50db329105402c0dad,...,,,,,,,,,,
2,delegate_tasks,TOOL,f29c5411109784bc,2025-08-06 00:24:07.270741+00:00,2025-08-06 00:24:07.271688+00:00,OK,,[],8fc93e5149b68018,caff362f4f72ae50db329105402c0dad,...,OK,Call this tool to delegate your task to other ...,Call this tool to delegate your task to other ...,delegate_tasks,"{""delegation_tasks"": [{""agent"": ""RAG"", ""task"":...",delegate_tasks,(delegation_tasks: Union[List[tools.delegate_t...,,,
3,Triage,AGENT,7f69755044f07108,2025-08-06 00:24:06.224531+00:00,2025-08-06 00:24:07.278150+00:00,OK,,[],f29c5411109784bc,caff362f4f72ae50db329105402c0dad,...,,,,,,,,UserAgent,User,"[{'name': 'delegate_tasks', 'description': ""Ca..."
4,ChatCompletion,LLM,7e7852e07865221b,2025-08-06 00:24:07.283162+00:00,2025-08-06 00:24:07.995236+00:00,OK,,[],b5cd5d2091e5737c,caff362f4f72ae50db329105402c0dad,...,,,,,,,,,,


In [6]:
df = df.merge(
    df[['context.span_id', 'span_kind', 'name', 'attributes.input.value']],
    how='left',
    left_on='parent_id',
    right_on='context.span_id',
    suffixes=('', '_parent')
)

# Rename the new column
df.rename(columns={'span_kind_parent': 'parent_span_kind'}, inplace=True)
df.rename(columns={'name_parent': 'parent_name'}, inplace=True)
df.rename(columns={'attributes.input.value_parent': 'parent_input'}, inplace=True)

# Optional: Drop the duplicate span_id column from the merge
df.drop(columns=['context.span_id_parent'], inplace=True)

df['start_time'] = pd.to_datetime(df['start_time'])
df = df.sort_values('start_time')

In [7]:
df.head()

Unnamed: 0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.tool.name,attributes.tool_args,attributes.tool_name,attributes.tool.parameters,attributes.sender_agent_class,attributes.sender_agent_type,attributes.available tools,parent_span_kind,parent_name,parent_input
0,User,AGENT,4c87ba0215be3731,2025-08-06 00:24:06.211350+00:00,2025-08-06 00:24:06.215081+00:00,OK,,[],71f994c0f4fc4c3c,caff362f4f72ae50db329105402c0dad,...,,,,,,,,,,
3,Triage,AGENT,7f69755044f07108,2025-08-06 00:24:06.224531+00:00,2025-08-06 00:24:07.278150+00:00,OK,,[],f29c5411109784bc,caff362f4f72ae50db329105402c0dad,...,,,,,UserAgent,User,"[{'name': 'delegate_tasks', 'description': ""Ca...",,,
1,ChatCompletion,LLM,f29c5411109784bc,2025-08-06 00:24:06.338052+00:00,2025-08-06 00:24:07.262210+00:00,OK,,[],dfffe6e81bf0ada1,caff362f4f72ae50db329105402c0dad,...,,,,,,,,AGENT,Triage,Tell me more about Donald Trump. Call the 'thi...
2,delegate_tasks,TOOL,f29c5411109784bc,2025-08-06 00:24:07.270741+00:00,2025-08-06 00:24:07.271688+00:00,OK,,[],8fc93e5149b68018,caff362f4f72ae50db329105402c0dad,...,delegate_tasks,"{""delegation_tasks"": [{""agent"": ""RAG"", ""task"":...",delegate_tasks,(delegation_tasks: Union[List[tools.delegate_t...,,,,AGENT,Triage,Tell me more about Donald Trump. Call the 'thi...
7,RAG,AGENT,51c212ed337b8c97,2025-08-06 00:24:07.282100+00:00,2025-08-06 00:24:09.962191+00:00,OK,,[],7e7852e07865221b,caff362f4f72ae50db329105402c0dad,...,,,,,BaseAgent,Triage,"[{'name': 'rag', 'description': 'Given the tas...",,,


In [8]:
agent_spans = df[df['span_kind'] == 'AGENT']

# Create the dictionary
agent_children_dict = {}

for _, parent_row in agent_spans.iterrows():
    if parent_row['name'] not in agent_children_dict:
        agent_children_dict[parent_row['name']] = {}
    
    # Filter and sort LLM spans
    completion_spans_columns_to_drop = [
       'events', 'attributes.output.mime_type',
       'attributes.tool.name', 'attributes.tool_name',
       'attributes.tool.description', 'attributes.tool_description',
       'attributes.tool.parameters', 'attributes.input.mime_type',
       'attributes.tool_args', 'attributes.message_context',
       'attributes.recipient_agent_type', 'attributes.recipient_agent_class',
       'attributes.messaging', 'attributes.sender_agent_type',
       'attributes.available tools', 'attributes.sender_agent_class',
       'attributes.message'
    ]
    completion_spans = df[
        (df['parent_id'] == parent_row['context.span_id']) & (df['span_kind'] == "LLM")
    ].sort_values('start_time')

    completion_spans.drop(completion_spans_columns_to_drop, axis=1, errors='ignore', inplace=True)

    # Extract latest completion and drop None keys
    if not completion_spans.empty:
        latest_completion_raw = completion_spans.iloc[-1].to_dict()
        latest_completion = {k: v for k, v in latest_completion_raw.items() if v is not None}
    else:
        latest_completion = None

    # Filter and sort TOOL spans
    tool_spans_columns_to_drop = [
       'events', 'attributes.output.mime_type',
       'attributes.llm.tools',
       'attributes.input.mime_type', 'attributes.llm.token_count.prompt',
       'attributes.llm.output_messages', 'attributes.llm.token_count.total',
       'attributes.llm.token_count.completion', 'attributes.llm.system',
       'attributes.llm.invocation_parameters', 'attributes.llm.input_messages', 
       'attributes.llm.model_name', 'attributes.llm.provider',
       'attributes.tool_name', 'attributes.tool_description',
       'attributes.status', 'attributes.tool_args',
       'attributes.message_context', 'attributes.recipient_agent_type',
       'attributes.recipient_agent_class', 'attributes.messaging',
       'attributes.sender_agent_type', 'attributes.available tools',
       'attributes.sender_agent_class',
       'attributes.message'
    ]
    tool_spans = df[
        (df['parent_id'] == parent_row['context.span_id']) & (df['span_kind'] == "TOOL")
    ].sort_values('start_time')

    tool_spans.drop(tool_spans_columns_to_drop, axis=1, errors='ignore', inplace=True)

    # Clean invocation field
    invocation_msg = parent_row.get("attributes.input.value", None)
    invocated_by = parent_row.get("attributes.sender_agent_type", None)
    invocation = {
        "invocation_msg": invocation_msg,
        "invoked_by": invocated_by
    }
    available_tools = parent_row.get("attributes.available tools", None)

    agent_children_dict[parent_row['name']][parent_row['context.span_id']] = {
        "tool_spans": tool_spans,
        "completion_spans": completion_spans,
        "agent_type": parent_row['name'],
        "invocation": invocation,
        'available_tools': available_tools
    }

    
# agent_children_dict = {
#     parent_row['name']: df[df['parent_name'] == parent_row['name']].sort_values('start_time')
#     for _, parent_row in agent_spans.iterrows()
# }

In [9]:
agent_spans.head()

Unnamed: 0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.tool.name,attributes.tool_args,attributes.tool_name,attributes.tool.parameters,attributes.sender_agent_class,attributes.sender_agent_type,attributes.available tools,parent_span_kind,parent_name,parent_input
0,User,AGENT,4c87ba0215be3731,2025-08-06 00:24:06.211350+00:00,2025-08-06 00:24:06.215081+00:00,OK,,[],71f994c0f4fc4c3c,caff362f4f72ae50db329105402c0dad,...,,,,,,,,,,
3,Triage,AGENT,7f69755044f07108,2025-08-06 00:24:06.224531+00:00,2025-08-06 00:24:07.278150+00:00,OK,,[],f29c5411109784bc,caff362f4f72ae50db329105402c0dad,...,,,,,UserAgent,User,"[{'name': 'delegate_tasks', 'description': ""Ca...",,,
7,RAG,AGENT,51c212ed337b8c97,2025-08-06 00:24:07.282100+00:00,2025-08-06 00:24:09.962191+00:00,OK,,[],7e7852e07865221b,caff362f4f72ae50db329105402c0dad,...,,,,,BaseAgent,Triage,"[{'name': 'rag', 'description': 'Given the tas...",,,
9,Triage,AGENT,a973efc0b8836a50,2025-08-06 00:24:09.969873+00:00,2025-08-06 00:24:11.586370+00:00,OK,,[],01b598577612af3d,caff362f4f72ae50db329105402c0dad,...,,,,,BaseAgent,RAG,"[{'name': 'delegate_tasks', 'description': ""Ca...",,,
11,User,AGENT,4148abe2024c2b9d,2025-08-06 00:24:11.594878+00:00,2025-08-06 00:24:11.598127+00:00,OK,,[],0c9835f836b26e69,caff362f4f72ae50db329105402c0dad,...,,,,,BaseAgent,Triage,,,,


In [10]:
type(agent_spans["attributes.available tools"].values[1])

str

In [11]:
agent_children_dict['RAG']['523e74185531258c']["available_tools"]

KeyError: '523e74185531258c'

In [190]:
# agent_children_dict['RAG']["1f36486616bbfad5"]["tool_spans"]

In [136]:
agent_children_dict['RAG']["1f36486616bbfad5"]["completion_spans"]["attributes.output.value"]

4    {"id":"chatcmpl-C1LzXDa4EWKsc6dE9fWPSc3dciApw"...
6    {"id":"chatcmpl-C1LzY11Pm0WYtKogLTYaoYWQ1QGA6"...
Name: attributes.output.value, dtype: object

In [137]:
output = agent_children_dict['RAG']["1f36486616bbfad5"]["completion_spans"]["attributes.llm.output_messages"].values.tolist()

# Agent Eval

## Tool Use

Things to eval
- tool success
- number of invalid tool calls
- number of calls per tool
- tool correctness (is the tool result expected? should eval using the LLM that is being used to generate tool call)

In [138]:
agent_children_dict['RAG']["1f36486616bbfad5"]["tool_spans"].columns

Index(['name', 'span_kind', 'parent_id', 'start_time', 'end_time',
       'status_code', 'status_message', 'context.span_id', 'context.trace_id',
       'attributes.session.id', 'attributes.input.value',
       'attributes.openinference.span.kind', 'attributes.output.value',
       'attributes.tool.name', 'attributes.tool.description',
       'attributes.tool.parameters', 'parent_span_kind', 'parent_name',
       'parent_input'],
      dtype='object')

### Agent tool success rate (Agent Eval)

In [46]:
# import pandas as pd
# from collections import defaultdict

# tool_success_rate = {}

# for agent, agent_traces in tqdm(agent_children_dict.items(), desc="Processing agents"):
#     agent_tool_metrics = {
#         "tool_calls": 0,
#         "successful_tool_calls": 0,
#         "tool_success_rate": 0.0,
#         "tools_invoked": {},
#         "invalid_tools_invoked": {},
#     }

#     all_tool_sequence = []

#     for invocation, invocation_trace in tqdm(agent_traces.items(), desc=f"Processing {agent}", leave=False):
#         tool_spans = invocation_trace["tool_spans"].copy()

#         tool_spans["start_time"] = pd.to_datetime(tool_spans["start_time"])
#         tool_spans["end_time"] = pd.to_datetime(tool_spans["end_time"])
#         tool_spans["latency"] = (tool_spans["end_time"] - tool_spans["start_time"]).dt.total_seconds()
#         tool_spans = tool_spans.sort_values(by="start_time")

#         all_tool_sequence.extend(tool_spans["attributes.tool.name"].tolist())

#         agent_tool_metrics["tool_calls"] += len(tool_spans)
#         agent_tool_metrics["successful_tool_calls"] += tool_spans[tool_spans['status_code'] != 'ERROR'].shape[0]

#         valid_tool_spans = tool_spans[tool_spans["attributes.tool.description"] != "Invalid tool"]
#         for tool_name, group in valid_tool_spans.groupby("attributes.tool.name"):
#             total_calls = group.shape[0]
#             successful_calls = group[group['status_code'] != 'ERROR'].shape[0]
#             total_latency = group["latency"].sum()

#             if tool_name not in agent_tool_metrics["tools_invoked"]:
#                 agent_tool_metrics["tools_invoked"][tool_name] = {
#                     "total_calls": total_calls,
#                     "successful_calls": successful_calls,
#                     "total_latency": total_latency,
#                 }
#             else:
#                 tool_stats = agent_tool_metrics["tools_invoked"][tool_name]
#                 tool_stats["total_calls"] += total_calls
#                 tool_stats["successful_calls"] += successful_calls
#                 tool_stats["total_latency"] += total_latency

#         invalid_tool_spans = tool_spans[tool_spans["attributes.tool.description"] == "Invalid tool"]
#         for tool_name, group in invalid_tool_spans.groupby("attributes.tool.name"):
#             total_calls = group.shape[0]
#             successful_calls = group[group['status_code'] != 'ERROR'].shape[0]
#             total_latency = group["latency"].sum()

#             if tool_name not in agent_tool_metrics["invalid_tools_invoked"]:
#                 agent_tool_metrics["invalid_tools_invoked"][tool_name] = {
#                     "total_calls": total_calls,
#                     "successful_calls": successful_calls,
#                     "total_latency": total_latency,
#                 }
#             else:
#                 tool_stats = agent_tool_metrics["invalid_tools_invoked"][tool_name]
#                 tool_stats["total_calls"] += total_calls
#                 tool_stats["successful_calls"] += successful_calls
#                 tool_stats["total_latency"] += total_latency

#     if agent_tool_metrics["tool_calls"]:
#         agent_tool_metrics["tool_success_rate"] = (
#             agent_tool_metrics["successful_tool_calls"] / agent_tool_metrics["tool_calls"]
#         ) * 100

#     for tool_name, stats in agent_tool_metrics["tools_invoked"].items():
#         total = stats["total_calls"]
#         success = stats["successful_calls"]
#         total_latency = stats["total_latency"]
#         stats["number_of_times_invoked"] = total
#         stats["success_rate"] = (success / total) * 100 if total else 0.0
#         stats["average_latency"] = total_latency / total if total else None
#         del stats["total_calls"]
#         del stats["successful_calls"]
#         del stats["total_latency"]

#     for tool_name, stats in agent_tool_metrics["invalid_tools_invoked"].items():
#         total = stats["total_calls"]
#         success = stats["successful_calls"]
#         total_latency = stats["total_latency"]
#         stats["number_of_times_invoked"] = total
#         stats["success_rate"] = (success / total) * 100 if total else 0.0
#         stats["average_latency"] = total_latency / total if total else None
#         del stats["total_calls"]
#         del stats["successful_calls"]
#         del stats["total_latency"]

#     # Per-tool entropy
#     tool_indices = defaultdict(list)
#     for idx, tool in enumerate(all_tool_sequence):
#         tool_indices[tool].append(idx)

#     for tool_name in agent_tool_metrics["tools_invoked"]:
#         indices = [i for i, t in enumerate(all_tool_sequence) if t == tool_name]

#         if not indices:
#             agent_tool_metrics["tools_invoked"][tool_name]["tool_entropy"] = 1.0
#             continue

#         successive_penalty = 0
#         num_sequences = 0
#         prev_index = None
#         run_length = 0

#         for index in indices:
#             if prev_index is not None and index == prev_index + 1:
#                 run_length += 1
#             else:
#                 if run_length > 1:
#                     successive_penalty += (run_length - 1)
#                 if run_length > 0:
#                     num_sequences += 1
#                 run_length = 1
#             prev_index = index

#         if run_length > 1:
#             successive_penalty += (run_length - 1)
#         if run_length > 0:
#             num_sequences += 1

#         total_calls = len(indices)
#         max_penalty = total_calls - num_sequences if total_calls > 1 else 1
#         entropy = 1 - (successive_penalty / max_penalty) if max_penalty > 0 else 1.0
#         entropy = max(min(entropy, 1.0), 0.0)

#         agent_tool_metrics["tools_invoked"][tool_name]["tool_entropy"] = entropy

#     tool_success_rate[agent] = agent_tool_metrics
#     agent_children_dict[agent]["tool_metrics"] = agent_tool_metrics

Processing agents:   0%|                                                        | 0/3 [00:00<?, ?it/s]
Processing User:   0%|                                                          | 0/2 [00:00<?, ?it/s][A
                                                                                                      [A
Processing Triage:   0%|                                                        | 0/2 [00:00<?, ?it/s][A
                                                                                                      [A
Processing RAG:   0%|                                                           | 0/1 [00:00<?, ?it/s][A
Processing agents: 100%|███████████████████████████████████████████████| 3/3 [00:00<00:00, 112.36it/s][A


In [12]:
def compute_tool_latencies(tool_spans: pd.DataFrame) -> pd.DataFrame:
    """Add latency column and sort by start_time."""
    tool_spans["start_time"] = pd.to_datetime(tool_spans["start_time"])
    tool_spans["end_time"] = pd.to_datetime(tool_spans["end_time"])
    tool_spans["latency"] = (tool_spans["end_time"] - tool_spans["start_time"]).dt.total_seconds()
    return tool_spans.sort_values(by="start_time")


def aggregate_tool_stats(group: pd.DataFrame) -> dict:
    """Aggregate total calls, successful calls, and latency for a tool group."""
    total_calls = group.shape[0]
    successful_calls = group[group['status_code'] != 'ERROR'].shape[0]
    total_latency = group["latency"].sum()
    return {
        "total_calls": total_calls,
        "successful_calls": successful_calls,
        "total_latency": total_latency
    }


def finalize_tool_stats(stats: dict) -> dict:
    """Convert aggregated totals to success rate and average latency."""
    total = stats["total_calls"]
    success = stats["successful_calls"]
    total_latency = stats["total_latency"]
    return {
        "number_of_times_invoked": total,
        "success_rate": (success / total) * 100 if total else 0.0,
        "average_latency": total_latency / total if total else 0.0
    }


def compute_jaccard_similarity(str1: str, str2: str) -> float:
    """
    Compute Jaccard similarity between two strings, based on token overlap.
    Much faster than difflib for long strings.
    """
    set1 = set(str1.split())
    set2 = set(str2.split())
    if not set1 and not set2:
        return 1.0
    return len(set1 & set2) / len(set1 | set2)


def compute_tool_entropy_fast(tool_df: pd.DataFrame) -> float:
    """
    Compute entropy with information gain using fast Jaccard similarity.
    tool_df: DataFrame with ['index', 'input_val', 'output_val']
    """
    if tool_df.empty:
        return 1.0

    indices = tool_df["index"].to_numpy()
    successive = (indices[1:] == indices[:-1] + 1)

    if not successive.any():
        return 1.0

    successive_penalty = 0
    num_sequences = 1  # at least one sequence exists

    for i in range(len(successive)):
        if successive[i]:
            in_sim = compute_jaccard_similarity(tool_df.iloc[i]["input_val"], tool_df.iloc[i+1]["input_val"])
            out_sim = compute_jaccard_similarity(tool_df.iloc[i]["output_val"], tool_df.iloc[i+1]["output_val"])
            avg_sim = (in_sim + out_sim) / 2.0
            info_gain = 1 - avg_sim
            successive_penalty += (1 - info_gain)
        else:
            num_sequences += 1

    total_calls = len(indices)
    max_penalty = total_calls - num_sequences if total_calls > 1 else 1
    entropy = 1 - (successive_penalty / max_penalty) if max_penalty > 0 else 1.0
    return max(min(entropy, 1.0), 0.0)


def process_agent(agent_name: str, agent_traces: dict) -> dict:
    """Process all traces for a single agent and return metrics."""
    metrics = {
        "tool_calls": 0,
        "successful_tool_calls": 0,
        "tool_success_rate": 0.0,
        "tools_invoked": {},
        "invalid_tools_invoked": {},
    }
    all_tool_sequence_full = []

    for invocation_trace in tqdm(agent_traces.values(), desc=f"Processing {agent_name}", leave=False):
        tool_spans = compute_tool_latencies(invocation_trace["tool_spans"].copy())

        all_tool_sequence_full.extend(
            list(zip(
                tool_spans["attributes.tool.name"],
                tool_spans["attributes.input.value"].astype(str),
                tool_spans["attributes.output.value"].astype(str)
            ))
        )

        metrics["tool_calls"] += len(tool_spans)
        metrics["successful_tool_calls"] += tool_spans[tool_spans['status_code'] != 'ERROR'].shape[0]

        valid_tool_spans = tool_spans[tool_spans["attributes.tool.description"] != "Invalid tool"]
        for tool_name, group in valid_tool_spans.groupby("attributes.tool.name"):
            agg_stats = aggregate_tool_stats(group)
            if tool_name not in metrics["tools_invoked"]:
                metrics["tools_invoked"][tool_name] = agg_stats
            else:
                for k in agg_stats:
                    metrics["tools_invoked"][tool_name][k] += agg_stats[k]

        invalid_tool_spans = tool_spans[tool_spans["attributes.tool.description"] == "Invalid tool"]
        for tool_name, group in invalid_tool_spans.groupby("attributes.tool.name"):
            agg_stats = aggregate_tool_stats(group)
            if tool_name not in metrics["invalid_tools_invoked"]:
                metrics["invalid_tools_invoked"][tool_name] = agg_stats
            else:
                for k in agg_stats:
                    metrics["invalid_tools_invoked"][tool_name][k] += agg_stats[k]

    if metrics["tool_calls"]:
        metrics["tool_success_rate"] = (
            metrics["successful_tool_calls"] / metrics["tool_calls"]
        ) * 100

    for tool_name, stats in metrics["tools_invoked"].items():
        metrics["tools_invoked"][tool_name] = finalize_tool_stats(stats)

    for tool_name, stats in metrics["invalid_tools_invoked"].items():
        metrics["invalid_tools_invoked"][tool_name] = finalize_tool_stats(stats)

    df_full_seq = pd.DataFrame(all_tool_sequence_full, columns=["tool_name", "input_val", "output_val"])
    df_full_seq.reset_index(inplace=True)

    for tool_name in metrics["tools_invoked"]:
        tool_df = df_full_seq[df_full_seq["tool_name"] == tool_name]
        metrics["tools_invoked"][tool_name]["tool_entropy"] = compute_tool_entropy_fast(tool_df)

    return metrics


# ==== MAIN LOOP ====
tool_success_rate = {}
for agent, traces in tqdm(agent_children_dict.items(), desc="Processing agents"):
    metrics = process_agent(agent, traces)
    tool_success_rate[agent] = metrics
    agent_children_dict[agent]["tool_metrics"] = metrics

Processing agents:   0%|                                                                                                                                         | 0/3 [00:00<?, ?it/s]
Processing User:   0%|                                                                                                                                           | 0/2 [00:00<?, ?it/s][A
                                                                                                                                                                                       [A
Processing Triage:   0%|                                                                                                                                         | 0/2 [00:00<?, ?it/s][A
                                                                                                                                                                                       [A
Processing RAG:   0%|                                               

In [13]:
print(json.dumps(tool_success_rate, indent=4))

{
    "User": {
        "tool_calls": 0,
        "successful_tool_calls": 0,
        "tool_success_rate": 0.0,
        "tools_invoked": {},
        "invalid_tools_invoked": {}
    },
    "Triage": {
        "tool_calls": 1,
        "successful_tool_calls": 1,
        "tool_success_rate": 100.0,
        "tools_invoked": {
            "delegate_tasks": {
                "number_of_times_invoked": 1,
                "success_rate": 100.0,
                "average_latency": 0.000947,
                "tool_entropy": 1.0
            }
        },
        "invalid_tools_invoked": {}
    },
    "RAG": {
        "tool_calls": 1,
        "successful_tool_calls": 0,
        "tool_success_rate": 0.0,
        "tools_invoked": {
            "rag": {
                "number_of_times_invoked": 1,
                "success_rate": 0.0,
                "average_latency": 0.144299,
                "tool_entropy": 1.0
            }
        },
        "invalid_tools_invoked": {}
    }
}


In [14]:
agent_children_dict["RAG"]["tool_metrics"]

{'tool_calls': 1,
 'successful_tool_calls': 0,
 'tool_success_rate': 0.0,
 'tools_invoked': {'rag': {'number_of_times_invoked': 1,
   'success_rate': 0.0,
   'average_latency': 0.144299,
   'tool_entropy': 1.0}},
 'invalid_tools_invoked': {}}

### Perceived tool usefulness (scaffolding eval)

In [95]:
tools_df = df[(df['span_kind'] == "TOOL") & (df['status_code'] == "OK")].sort_values('start_time')
tools_df.head()

Unnamed: 0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.status,attributes.tool.description,attributes.tool_description,attributes.tool_name,attributes.available tools,attributes.sender_agent_type,attributes.sender_agent_class,parent_span_kind,parent_name,parent_input
2,delegate_tasks,TOOL,7f04682983ae9f79,2025-08-06 11:11:11.195422+00:00,2025-08-06 11:11:11.195931+00:00,OK,,[],b6c3094d0a7f49a9,6c48a221fd5873db959d3e9c50dc8bef,...,OK,Call this tool to delegate your task to other ...,Call this tool to delegate your task to other ...,delegate_tasks,,,,AGENT,Triage,Tell me more about Donald Trump


In [326]:
tool_usefulness_prompt = """Tool Name: {tool_name}
Tool Description: {tool_description}

Tool Arguments: {tool_arguments}

Tool Results: {tool_results}

Based on the information above, score the quality of the tool based on the following criteria:
- expectedness: Is the tool results as expected based on given tool name and tool description?
- fidelity: The degree of exactness of the tool result
- usability: how usable is the result of the tool with regards to intention of the tool call inferred from the tool arguments
- completeness: how complete the tool result is based on the tool arguments and tool description

You should provide a final score on the scale of 1 - 5 based on the criteria, and the reason in a few short sentences.
"""

def score_tool_quality(row):
    class ToolScore(BaseModel):
        reason: str
        score: float

    schema = ToolScore.model_json_schema()
    api_key = os.environ.get("MODEL_API_KEY")
    model = os.environ.get("MODEL_NAME")
    model_endpoint = os.environ.get("MODEL_ENDPOINT") + "/chat/completions"
    
    # Headers
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "temperature": 0.7,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "tool_score_schema",
                "schema": schema
            }
        },
    }

    prompt = tool_usefulness_prompt.format(
        tool_name = row["attributes.tool.name"],
        tool_description = row["attributes.tool.description"],
        tool_arguments = row["attributes.input.value"],
        tool_results = row["attributes.output.value"]
    )

    try:
        messages = [
            {"role": "user", "content": prompt}
        ]
        payload["messages"] = messages
        response = requests.post(model_endpoint, headers=headers, data=json.dumps(payload))
        response_dict = response.json()
        response = json.loads(response_dict["choices"][0]["message"]["content"])
        score = response["score"]
        reason = response["reason"]
        print(response)
        print("-------------------------------------------------------------------------------------------------")
    except Exception as e:
        print(e)

    return pd.Series([score, reason])

In [327]:
tools_df[['tool.quality.score', 'tool.quality.reason']] = tools_df.progress_apply(score_tool_quality, axis=1)

100%|███████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.70s/it]


KeyboardInterrupt: 

In [18]:
tools_df.head()

Unnamed: 0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.tool.parameters,attributes.tool_args,attributes.sender_agent_type,attributes.sender_agent_class,attributes.available tools,parent_span_kind,parent_name,parent_input,tool.quality.score,tool.quality.reason
2,delegate_tasks,TOOL,1b712b840016e3b0,2025-08-05 10:00:10.495991+00:00,2025-08-05 10:00:10.496488+00:00,OK,,[],c84165db9dd9d438,609ab072e9ab6dcde09443043e375341,...,(delegation_tasks: Union[List[tools.delegate_t...,"{""delegation_tasks"": [{""agent"": ""RAG"", ""task"":...",,,,AGENT,Triage,Tell me more about Donald Trump,4.5,The tool 'delegate_tasks' effectively delegate...
5,rag,TOOL,3de41746169a2692,2025-08-05 10:00:11.900162+00:00,2025-08-05 10:00:12.112429+00:00,ERROR,Exception: Error occurred when retrieving docu...,"[{'name': 'exception', 'timestamp': '2025-08-0...",e32de80a1eb9f5b9,609ab072e9ab6dcde09443043e375341,...,"(query: str, name: str) -> List[str]","{""query"": ""Tell me more about Donald Trump"", ""...",,,,AGENT,RAG,Tell me more about Donald Trump,2.0,The tool's performance is suboptimal based on ...


In [19]:
# Group by tool name and aggregate
tool_quality_df = tools_df.groupby("name").agg({
    "tool.quality.score": "mean",  # average score
    "tool.quality.reason": lambda x: " | ".join(x.dropna().astype(str))  # join reasons
}).reset_index()

# Optional: Rename columns for clarity
tool_quality_df.rename(columns={
    "tool.quality.score": "avg_tool_quality_score",
    "tool.quality.reason": "combined_tool_quality_reasons"
}, inplace=True)

In [20]:
tool_quality_df.head()

Unnamed: 0,name,avg_tool_quality_score,combined_tool_quality_reasons
0,delegate_tasks,4.5,The tool 'delegate_tasks' effectively delegate...
1,rag,2.0,The tool's performance is suboptimal based on ...


## Agent Step Wise Eval

**Criteria**

- **Refusal like**: suggesting the task to be completed by another agent instead of performing it
- Choosing a strategy that is more likely to succeed eventually but likely to exceed token limit (inefficient tool calls etc)
- Frequently selecting the same particular action or subgoal that isn't helpful
- not using the correct tool from the selection of available tools

### Evaluating Agent Steps

In [194]:
agent_children_dict["Triage"]["7f04682983ae9f79"]["completion_spans"]

Unnamed: 0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,context.span_id,context.trace_id,attributes.input.value,...,attributes.llm.input_messages,attributes.llm.model_name,attributes.llm.token_count.total,attributes.llm.token_count.completion,attributes.llm.system,attributes.llm.invocation_parameters,attributes.status,parent_span_kind,parent_name,parent_input
1,ChatCompletion,LLM,7f04682983ae9f79,2025-08-06 11:11:10.122652+00:00,2025-08-06 11:11:11.192373+00:00,OK,,d44f06f495fafa25,6c48a221fd5873db959d3e9c50dc8bef,"{""messages"": [{""content"": ""You are a triage ag...",...,"[{'message.role': 'system', 'message.content':...",gpt-4o-2024-08-06,279.0,35.0,openai,"{""model"": ""gpt-4o"", ""stream"": false, ""temperat...",,AGENT,Triage,Tell me more about Donald Trump


In [195]:
completion_dict = agent_children_dict["RAG"]["523e74185531258c"]["completion_spans"].iloc[-1].to_dict()

In [196]:
completion_dict["attributes.llm.input_messages"]

[{'message.role': 'system',
  'message.content': 'You are a RAG agent. You are responsible for answering user\nqueries. You will receive user queries from the user agent and answer them using\nretrieved information.\n\nCurrent task context: Tell me more about Donald Trump'},
 {'message.role': 'user',
  'message.content': 'Tell me more about Donald Trump',
  'message.name': 'Triage'},
 {'message.tool_calls': [{'tool_call.function.name': 'rag',
    'tool_call.function.arguments': '{"query":"Tell me more about Donald Trump","name":"assistant"}',
    'tool_call.id': 'call_r8PjfaZjLAoebY4MTwUYw64s'}],
  'message.role': 'assistant'},
 {'message.tool_call_id': 'call_r8PjfaZjLAoebY4MTwUYw64s',
  'message.role': 'tool',
  'message.content': "Error occurred when retrieving documents -> HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /embeddings (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f0e7906e290>: Failed to establish a new co

In [197]:
completion_dict["attributes.llm.output_messages"]

[{'message.role': 'assistant',
  'message.content': 'I\'m currently unable to retrieve specific documents about Donald Trump. However, I can provide some general information:\n\nDonald Trump is a prominent American businessman, television personality, and politician who served as the 45th President of the United States from January 20, 2017, to January 20, 2021. Before his presidency, he was known for his real estate empire and as the host of the reality TV show "The Apprentice."\n\nDuring his presidency, Trump was known for his unconventional style, use of social media, and policies such as tax cuts, deregulation, and a focus on immigration control. His administration was marked by significant political polarization and controversy, including two impeachment trials, both of which resulted in acquittal by the Senate.\n\nAfter leaving office, Trump has remained a significant figure in American politics, continuing to influence the Republican Party and hinting at potential future politic

In [198]:
chat_history = completion_dict["attributes.llm.input_messages"] + completion_dict["attributes.llm.output_messages"]

In [199]:
chat_history

[{'message.role': 'system',
  'message.content': 'You are a RAG agent. You are responsible for answering user\nqueries. You will receive user queries from the user agent and answer them using\nretrieved information.\n\nCurrent task context: Tell me more about Donald Trump'},
 {'message.role': 'user',
  'message.content': 'Tell me more about Donald Trump',
  'message.name': 'Triage'},
 {'message.tool_calls': [{'tool_call.function.name': 'rag',
    'tool_call.function.arguments': '{"query":"Tell me more about Donald Trump","name":"assistant"}',
    'tool_call.id': 'call_r8PjfaZjLAoebY4MTwUYw64s'}],
  'message.role': 'assistant'},
 {'message.tool_call_id': 'call_r8PjfaZjLAoebY4MTwUYw64s',
  'message.role': 'tool',
  'message.content': "Error occurred when retrieving documents -> HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /embeddings (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f0e7906e290>: Failed to establish a new co

In [200]:
def parse_chat(chat):
    steps = []
    strategy = []
    system_prompt = None
    user_prompt = None
    previous_response = None

    for idx, msg in enumerate(chat):
        role = msg.get("message.role") or msg.get("role")

        if role == "system":
            system_prompt = msg.get("message.content") or msg.get("content")

        elif role == "user":
            user_prompt = msg.get("message.content") or msg.get("content")
            # Reset for new interaction
            previous_response = None

        elif role == "assistant":
            if msg.get("message.tool_calls") or msg.get("tool_calls"):
                tool_calls = msg.get("message.tool_calls") or msg.get("tool_calls")
                # Assistant is making a tool call
                tool_calls = [f"Tool Call: {tool_call['tool_call.function.name']}\nArguments: {tool_call['tool_call.function.arguments']}" for tool_call in tool_calls]
                tool_calls_str = "\n\n".join(tool_calls)

                strategy.extend([tool_call['tool_call.function.name'] for tool_call in tool_calls])

                tool_results = []

                for i in range(idx, len(chat)):
                    print(chat[i])
                    if chat[i].get("message.role") == "tool" or msg.get("role") == "tool":
                        tool_results.append(chat[i])
                
                if tool_results:
                    tool_results_str = [f"Tool ID: {result['message.tool_call_id']}\nTool Result: {result['message.content']}" for result in tool_results]
                    tool_results_str = "\n\n".join(tool_results_str)
                    current_response = f"Tool_calls:\n\n{tool_calls_str}\n\nTool_results:\n\n{tool_results_str}"
                else:
                    current_response = tool_calls_str

                step = {
                    "system_prompt": system_prompt,
                    "user_prompt": user_prompt,
                    "strategy": " -> ".join(strategy),
                    "previous_response": previous_response,
                    "current_response": current_response
                }
                steps.append(step)
                previous_response = current_response

            else:
                # Assistant gives a normal message
                assistant_response = msg.get("message.content") or msg.get("content")

                strategy.append("response")

                step = {
                    "system_prompt": system_prompt,
                    "user_prompt": user_prompt,
                    "strategy": " -> ".join(strategy),
                    "previous_response": previous_response,
                    "current_response": assistant_response
                }
                steps.append(step)

                previous_response = assistant_response

    return steps

In [15]:
def parse_chat_n(chat, n_previous=1):
    steps = []
    strategy = []
    system_prompt = None
    user_prompt = None
    previous_responses = []  # Rolling history of assistant responses

    for idx, msg in enumerate(chat):
        role = msg.get("message.role") or msg.get("role")

        if role == "system":
            system_prompt = msg.get("message.content") or msg.get("content")

        elif role == "user":
            user_prompt = msg.get("message.content") or msg.get("content")
            # Reset history for new user turn["step_score"]
            previous_responses = []

        elif role == "assistant":
            if msg.get("message.tool_calls") or msg.get("tool_calls"):
                tool_calls = msg.get("message.tool_calls") or msg.get("tool_calls")
                # Format tool calls
                tool_calls_strs = [
                    f"Tool Call: {tool_call.get('tool_call.function.name')}\nArguments: {tool_call.get('tool_call.function.arguments')}"
                    for tool_call in tool_calls
                ]
                tool_calls_str = "\n\n".join(tool_calls_strs)

                strategy.extend([tool_call['tool_call.function.name'] for tool_call in tool_calls])

                tool_results = []
               
                for i in range(idx, len(chat)):
                        print(chat[i])
                        if chat[i].get("message.role") == "tool" or msg.get("role") == "tool":
                            tool_results.append(chat[i])
                
                if tool_results:
                    tool_results_str = [f"Tool ID: {result['message.tool_call_id']}\nTool Result: {result['message.content']}" for result in tool_results]
                    tool_results_str = "\n\n".join(tool_results_str)
                    current_response = f"Tool_calls:\n\n{tool_calls_str}\n\nTool_results:\n\n{tool_results_str}"
                else:
                    # do not process tool calls if there are no tool results
                    continue

                previous_response_str = "" if previous_responses else None

                for idx, response in enumerate(previous_responses[-n_previous:]):
                    res_num = f"---------- Previous response {-(-len(previous_responses[-n_previous:]) + idx)} ----------\n\n"
                    previous_response_str += res_num + response + "\n\n"
                
                # Create step
                step = {
                    "system_prompt": system_prompt,
                    "user_prompt": user_prompt,
                    "strategy": " -> ".join(strategy),
                    "previous_responses": previous_response_str,  # last n
                    "current_response": current_response,
                    "chat_index": idx
                }
                steps.append(step)

                # Update history
                previous_responses.append(current_response)
                previous_responses = previous_responses[-n_previous:]

            else:
                # Normal assistant message
                assistant_response = msg.get("message.content") or msg.get("content")

                strategy.append("response")

                previous_response_str = "" if previous_responses else None

                for idx, response in enumerate(previous_responses[-n_previous:]):
                    res_num = f"---------- Previous response {-(-len(previous_responses[-n_previous:]) + idx)} ----------\n\n"
                    previous_response_str += res_num + response + "\n\n"
                
                step = {
                    "system_prompt": system_prompt,
                    "user_prompt": user_prompt,
                    "strategy": " -> ".join(strategy),
                    "previous_responses": previous_response_str,  # last n
                    "current_response": assistant_response,
                    "chat_index": idx
                }
                steps.append(step)

                previous_responses.append(assistant_response)
                previous_responses = previous_responses[-n_previous:]

    return steps


In [20]:
for agent, agent_traces in tqdm(agent_children_dict.items(), desc="Processing agents"):
    for trace_id, trace in agent_traces.items():
        if "completion_spans" in trace and len(trace["completion_spans"]) > 0:
            completion_dict = trace["completion_spans"].iloc[-1].to_dict()
            chat_history = completion_dict["attributes.llm.input_messages"] + completion_dict["attributes.llm.output_messages"]
            agent_steps = parse_chat_n(chat_history, 2)
            agent_children_dict[agent][trace_id]["agent_steps"] = agent_steps
            agent_children_dict[agent][trace_id]["chat_history"] = chat_history

Processing agents: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 302.58it/s]

{'message.role': 'assistant', 'message.tool_calls': [{'tool_call.id': 'call_bQUOW8iiQn5IfFElY4r9fqLV', 'tool_call.function.name': 'delegate_tasks', 'tool_call.function.arguments': '{"delegation_tasks":[{"agent":"RAG","task":"Tell me more about Donald Trump.","name":"Triage"}]}'}]}
{'message.role': 'assistant', 'message.tool_calls': [{'tool_call.id': 'call_bQUOW8iiQn5IfFElY4r9fqLV', 'tool_call.function.name': 'delegate_tasks', 'tool_call.function.arguments': '{"delegation_tasks":[{"agent":"RAG","task":"Tell me more about Donald Trump.","name":"Triage"}]}'}]}
{'message.role': 'tool', 'message.tool_call_id': 'call_bQUOW8iiQn5IfFElY4r9fqLV', 'message.content': '[{"agent": "RAG", "response": "It seems there was an issue retrieving detailed information about Donald Trump. However, I can provide a general overview based on my knowledge.\\n\\nDonald Trump is a businessman, television personality, and politician who served as the 45th President of the United States from January 20, 2017, to Jan




In [21]:
agent_children_dict["RAG"].keys()

dict_keys(['7e7852e07865221b', 'tool_metrics'])

In [24]:
agent_children_dict["RAG"]["7e7852e07865221b"]["chat_history"]

[{'message.role': 'system',
  'message.content': "You are a RAG agent. You are responsible for answering user\nqueries. You will receive user queries from the user agent and answer them using\nretrieved information.\n\nCurrent task context: Tell me more about Donald Trump. Call the 'think' tool first with your thoughts"},
 {'message.role': 'user',
  'message.name': 'Triage',
  'message.content': 'Tell me more about Donald Trump.'},
 {'message.role': 'assistant',
  'message.tool_calls': [{'tool_call.id': 'call_eWBpLfzE5DSg1BXBvjMKqFmW',
    'tool_call.function.name': 'rag',
    'tool_call.function.arguments': '{"query":"Provide detailed information about Donald Trump, including his background, career, and presidency.","name":"RAG Agent"}'}]},
 {'message.role': 'tool',
  'message.tool_call_id': 'call_eWBpLfzE5DSg1BXBvjMKqFmW',
  'message.content': "Error occurred when retrieving documents -> HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /embeddings (Caus

In [205]:
import re
import math

def logprob_to_confidence(logprob, base='e'):
    if base == 'e':  # natural log
        return math.exp(logprob)
    elif base == '10':  # base-10 log
        return 10 ** logprob
    else:
        raise ValueError("Unsupported log base. Use 'e' or '10'.")

def extract_score_confidence(response_dict, fields):
    """
    Extract numeric scores and logprobs from an OpenAI-compatible API response.

    Features:
      - Works regardless of tokenizer (OpenAI, vLLM, etc.)
      - Handles multi-token field names
      - Case-insensitive matching
      - Handles decimals, negative numbers, and scientific notation
      - Ignores extra spaces/quotes
      - Returns average logprob for multi-token numbers

    Args:
        response_dict (dict): Full API JSON response from an OpenAI-compatible API.
        fields (list[str]): List of exact field names to extract.

    Returns:
        dict: {
            field_name: float,
        }
    """
    logprobs_tokens = response_dict["choices"][0]["logprobs"]["content"]
    tokens = [t["token"] for t in logprobs_tokens]
    logprobs = [t["logprob"] for t in logprobs_tokens]

    # Reconstruct full raw string for searching
    raw_output = "".join(tokens)

    scores_with_logprobs = {}

    for field in fields:
        # Case-insensitive search
        search_pattern = re.compile(re.escape(field), re.IGNORECASE)

        for match in search_pattern.finditer(raw_output):
            char_pos = match.start()

            # Map char position -> token index
            char_count = 0
            matched_tokens = []
            token_index = None
            for i, tok in enumerate(tokens):
                char_count += len(tok)
                if char_count > char_pos:
                    token_index = i
                    matched_tokens.append(tok)
                    if "".join(matched_tokens) == field:
                        break

            if token_index is None:
                continue

            # Look forward for numeric value
            j = token_index + 1
            while j < len(tokens):
                tok_clean = tokens[j]
                if tok_clean.isnumeric():
                    # Gather numeric tokens (including -, ., e, digits)
                    num_tokens = []
                    num_logprobs = []
                    k = j
                    while k < len(tokens):
                        t_clean = tokens[k]
                        if re.match(r"^[0-9eE\.\+\-]+$", t_clean):
                            num_tokens.append(t_clean)
                            num_logprobs.append(logprobs[k])
                            k += 1
                        else:
                            break
                    try:
                        value = float("".join(num_tokens))
                        avg_logprob = sum(num_logprobs) / len(num_logprobs)
                        scores_with_logprobs[field] = {
                            "value": value,
                            "logprob": avg_logprob,
                            "tokens": num_tokens,
                            "token_logprobs": num_logprobs
                        }
                    except ValueError:
                        pass
                    break
                j += 1

    field_confidence = {}
    for field in fields:
        if field not in scores_with_logprobs:
            field_confidence[field] = 0.0
        else:
            field_confidence[field] = logprob_to_confidence(scores_with_logprobs[field]["logprob"])

    return field_confidence


In [212]:
stepwise_eval_prompt = """Current trajectory: {strategy}

Assistant's persona: {system_prompt}

Assistant's tools: {available_tools}

Current task: {user_prompt}

Previous steps: 

{previous_responses}

Current step: 

{current_response}

Based on the current trajectory taken by an agent, its current task, previous and current steps,
evaluate the current step taken by the agent based on the following criteria:
- invoked tool correctness: is the current tool invoked a good fit for the current task
- contextual coherence: ability to incorporate information from earlier context from previous steps
- response completeness: whether the agent response contains complete information from its previous steps
- tool result correctness: is the invoked tool giving the correct intended result
- role adherence: how well is the assistant adhering to its role given by its persona

Score each of the criteria on a scale of 1 - 5 (can be a float value).
The current trajectory shows the steps taken by the agent up till the current step.
Reason about the current step first before providing the scores.
"""

def stepwise_agent_eval(step: dict, available_tools: str):
    class StepwiseScore(BaseModel):
        reason: str
        invoked_tool_correctness: float
        contextual_coherence: float
        response_completeness: float
        tool_result_correctness: float
        role_adherence: float

    schema = StepwiseScore.model_json_schema()
    schema["additionalProperties"] = False
    api_key = os.environ.get("MODEL_API_KEY")
    model = os.environ.get("MODEL_NAME")
    model_endpoint = os.environ.get("MODEL_ENDPOINT") + "/chat/completions"
    
    # Headers
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "temperature": 0.7,
        "logprobs": True,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "stepwise_score_schema",
                "schema": schema,
                "strict": True
            },
        },
    }

    prompt = stepwise_eval_prompt.format(
        strategy = step["strategy"],
        system_prompt = step["system_prompt"],
        available_tools = available_tools,
        user_prompt = step["user_prompt"],
        previous_responses = step["previous_responses"],
        current_response = step["current_response"]
    )

    try:
        messages = [
            {"role": "user", "content": prompt}
        ]
        payload["messages"] = messages
        response = requests.post(model_endpoint, headers=headers, data=json.dumps(payload))
        response_dict = response.json()
        response = json.loads(response_dict["choices"][0]["message"]["content"])
        print(response)
        logprobs = extract_score_confidence(response_dict, list(response.keys()))
        new_response = {}
        for score, score_value in response.items():
            if not isinstance(score_value, str):
                new_response[score] = {
                    "score": score_value,
                    "confidence": logprobs.get(score)
                }
        response = new_response
        response["contextual_coherence"] = None if not step["previous_responses"] else response["contextual_coherence"]
        if step["strategy"].split(" -> ")[-1] == "response":
            response["invoked_tool_correctness"] = None
            response["tool_result_correctness"] = None
        else:
            response["response_completeness"] = None
        print("current step: ", step["current_response"])
        print("\n")
        print("previous responses: ", step["previous_responses"])
        print("\n")
        print(response)
        print("-------------------------------------------------------------------------------------------------")
    except Exception as e:
        print(e)

    return response

In [213]:
for agent, agent_traces in tqdm(agent_children_dict.items(), desc="Processing agents"):
    for trace_id, trace in agent_traces.items():
        if "agent_steps" in trace:
            available_tools = trace.get("available_tools", "")
            print(agent, trace_id)
            for step in trace["agent_steps"]:
                step["step_score"] = stepwise_agent_eval(step, available_tools)
                

Processing agents:   0%|                                                        | 0/3 [00:00<?, ?it/s]

User d871c830d5a70153


Processing agents:  33%|████████████████                                | 1/3 [00:07<00:14,  7.03s/it]

{'reason': "In this current step, the assistant was asked to provide information about Donald Trump. Here is an evaluation of the response based on the specified criteria:\n\n1. **Invoked Tool Correctness:**\n   - **Rating:** 5.0\n   - **Reasoning:** The assistant did not invoke any tools, as it was not necessary for the task. The response was generated based on pre-existing knowledge about Donald Trump, which is fitting for the task.\n\n2. **Contextual Coherence:**\n   - **Rating:** 5.0\n   - **Reasoning:** As this is the first step, there are no previous steps to incorporate. The response is coherent and self-contained.\n\n3. **Response Completeness:**\n   - **Rating:** 4.5\n   - **Reasoning:** The response provides a brief overview of Donald Trump's background, presidency, and post-presidency activities. While comprehensive, it could include more details about specific policies or controversies during his presidency.\n\n4. **Tool Result Correctness:**\n   - **Rating:** 5.0\n   - **R

Processing agents:  67%|████████████████████████████████                | 2/3 [00:24<00:13, 13.13s/it]

{'reason': "The current step involves evaluating the agent's performance in delegating a task about Donald Trump to another agent, RAG, and how well it adheres to its role and uses its tools correctly.\n\n1. **Invoked Tool Correctness**:\n   - The task was to provide more information about Donald Trump. Since gathering and providing detailed information fits the capabilities of the RAG agent, the choice to delegate this task was correct.\n   - **Score: 5**\n\n2. **Contextual Coherence**:\n   - The response after the tool invocation provides a coherent summary of the information retrieved by RAG. It aligns with the context of the question asked and the response from the RAG agent.\n   - **Score: 4.5**\n\n3. **Response Completeness**:\n   - The response includes a comprehensive summary of the results provided by the RAG. It adequately covers Trump's business background, presidency, and post-office influence, which were mentioned by the RAG agent.\n   - **Score: 4.5**\n\n4. **Tool Result 

Processing agents: 100%|████████████████████████████████████████████████| 3/3 [00:41<00:00, 13.84s/it]

{'reason': "In evaluating the agent's current step, several factors need to be considered:\n\n1. **Invoked Tool Correctness:** The agent attempted to use the 'rag' tool, which is designed to retrieve relevant information. Given the task of providing information about Donald Trump, this was an appropriate choice of tool. However, due to a technical issue, the tool could not provide the results.\n\n2. **Contextual Coherence:** The agent maintained coherence by addressing the user's query directly and providing general information despite the lack of specific document retrieval. It smoothly transitioned from the tool error to offering available knowledge.\n\n3. **Response Completeness:** Despite the failure of the tool to provide additional information, the agent gave a well-rounded summary of Donald Trump's career and influence. It also invited further questions, showing completeness in addressing the user's needs.\n\n4. **Tool Result Correctness:** The tool failed to deliver the intende




In [214]:
agent_children_dict["RAG"]["523e74185531258c"].keys()

dict_keys(['tool_spans', 'completion_spans', 'agent_type', 'invocation', 'available_tools', 'agent_steps'])

In [219]:
def compute_stepwise_metrics(agent_children_dict, min_score=1, max_score=5, low_quality_cutoff=2.5,
                   high_quality_cutoff=3.5, low_conf_flag=0.8):
    """
    Mutates agent_children_dict in place:
      - Adds step['step_score_aggregated'] and step['step_quality']
      - Adds agent_children_dict[agent]['stepwise_metrics']
    Scoring is confidence-calibrated and bounded within [min_score, max_score].
    """

    def calibrate(score, conf):
        # Clamp raw score
        s = max(min_score, min(score, max_score))
        mid = (min_score + max_score) / 2
        # Pull toward midpoint when confidence is low
        return conf * s + (1 - conf) * mid

    for agent_name, traces in agent_children_dict.items():
        # Agent-level accumulators
        metric_sum = {}
        metric_count = {}

        # Iterate traces safely
        for trace_id, trace_data in list(traces.items()):

            steps = trace_data.get("agent_steps", []) or []

            for step in steps:
                score_data = step.get("step_score", {}) or {}

                calibrated_scores = []
                low_flag = False

                for metric_key, result in score_data.items():
                    if not isinstance(result, dict):
                        continue
                    score = result.get("score")
                    conf = result.get("confidence")
                    
                    if score is None or conf is None:
                        continue

                    # Confidence-calibrated score toward midpoint
                    cal = calibrate(score, conf)
                    calibrated_scores.append(cal)

                    # Agent-level aggregation (by metric key)
                    metric_sum[metric_key] = metric_sum.get(metric_key, 0.0) + cal
                    metric_count[metric_key] = metric_count.get(metric_key, 0) + 1

                    # Keep your original low-quality rule confidence-gated
                    if conf > low_conf_flag and cal < low_quality_cutoff:
                        low_flag = True

                # Step aggregated score (simple mean of calibrated scores)
                agg = (sum(calibrated_scores) / len(calibrated_scores)) if calibrated_scores else None
                step["step_score_aggregated"] = agg

                # Step quality classification
                if low_flag:
                    quality = "low"
                elif agg is not None and agg > high_quality_cutoff:
                    quality = "high"
                elif agg is not None and agg > low_quality_cutoff:
                    quality = "medium"
                else:
                    quality = "low"
                step["step_quality"] = quality

        # Per-agent stepwise metrics: average calibrated score per metric
        stepwise_metrics = {
            m: (metric_sum[m] / metric_count[m]) for m in metric_sum.keys() if metric_count[m] > 0
        }
        agent_children_dict[agent_name]["stepwise_metrics"] = stepwise_metrics

In [220]:
compute_stepwise_metrics(agent_children_dict)

In [221]:
agent_children_dict["RAG"]["523e74185531258c"]["invocation"]

{'invocation_msg': 'Tell me more about Donald Trump', 'invocated_by': 'Triage'}

In [222]:
agent_children_dict["RAG"]["stepwise_metrics"]

{'invoked_tool_correctness': 4.974797546856905,
 'tool_result_correctness': 1.000009764556201,
 'role_adherence': 4.684128022774619,
 'contextual_coherence': 3.81474174813464,
 'response_completeness': 4.06401769135731}

## Agent Invocation