Skip to content
28 changes: 24 additions & 4 deletions apps/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,9 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
if metric_name in _NON_NORMALIZED_METRICS:
_NON_NORMALIZED_METRICS.add(col)

for comp_name, comp_value in metrics.aggregate_metrics.items():
row[comp_name] = comp_value

rows.append(row)

return rows, sorted(all_metric_names)
Expand Down Expand Up @@ -1294,8 +1297,9 @@ def render_run_overview(run_dir: Path):
leading_cols = ["record"]
if has_trials:
leading_cols.append("trial")
composite_cols = [c for c in _EVA_BAR_COMPOSITES if c in table_df.columns]
ordered_metrics = [m for m in metric_names if m in table_df.columns]
table_df = table_df[leading_cols + ordered_metrics]
table_df = table_df[leading_cols + composite_cols + ordered_metrics]

# Add link column to navigate to Record Detail
def _record_link(row):
Expand All @@ -1306,11 +1310,14 @@ def _record_link(row):

table_df = table_df.copy()
table_df.insert(0, "link", table_df.apply(_record_link, axis=1))
table_df = table_df.rename(columns=col_rename)
composite_rename = {c: _EVA_COMPOSITE_DISPLAY[c] for c in composite_cols}
table_df = table_df.rename(columns={**col_rename, **composite_rename})

renamed_composites = [composite_rename[c] for c in composite_cols]
renamed_metrics = [col_rename[m] for m in ordered_metrics]
styled = table_df.style.map(_color_cell, subset=renamed_metrics)
styled = styled.format(dict.fromkeys(renamed_metrics, "{:.3f}"), na_rep="—")
score_cols = renamed_composites + renamed_metrics
styled = table_df.style.map(_color_cell, subset=score_cols)
styled = styled.format(dict.fromkeys(score_cols, "{:.3f}"), na_rep="—")
st.dataframe(
styled,
hide_index=True,
Expand All @@ -1323,13 +1330,25 @@ def _record_link(row):
st.download_button("Download CSV", csv, file_name=f"{run_dir.name}_metrics.csv", mime="text/csv")


def _render_eva_composite_cards(metrics: RecordMetrics) -> None:
    """Render EVA composite scores (EVA-A/EVA-X pass@1 and Mean) as st.metric cards."""
    # Keep only the composites that actually carry a value for this record.
    present = [
        (key, metrics.aggregate_metrics.get(key))
        for key in _EVA_BAR_COMPOSITES
        if metrics.aggregate_metrics.get(key) is not None
    ]
    if not present:
        return
    # One st.metric card per available composite, laid out side by side.
    for column, (key, score) in zip(st.columns(len(present)), present):
        column.metric(_EVA_COMPOSITE_DISPLAY[key], f"{score:.3f}")


def render_metrics_tab(metrics: RecordMetrics | None):
"""Render the metrics tab with judge ratings and scores."""
if not metrics:
st.warning("No metrics.json found for this record")
return

st.markdown("### Metrics")
_render_eva_composite_cards(metrics)

# Group metrics by category, preserving insertion order within each group
grouped: dict[str, list[tuple[str, object]]] = {}
Expand Down Expand Up @@ -1546,6 +1565,7 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat
</style>"""
)
st.markdown("### Metrics Overview")
_render_eva_composite_cards(metrics)

grouped: dict[str, list[str]] = {}
for name in all_top_metrics:
Expand Down
75 changes: 75 additions & 0 deletions configs/prompts/judge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,81 @@ judge:
],
"explanation": "<string: overall summary of fidelity assessment>"
}}
s2s_user_prompt: |
You are an expert evaluator checking the **speech clarity and articulation** of entities spoken by an AI voice agent.

You will receive:
1. A conversation trace showing what the user said and what data the agent retrieved via tools. Assistant responses are redacted — you must listen to the audio to hear what the agent actually said.
2. An audio recording of the agent's side of the conversation only (the user is not audible).

## Conversation Trace
{conversation_trace_formatted}

## IMPORTANT: What This Metric Measures

This metric measures **speech fidelity** — whether entities are clearly and correctly articulated in the audio. The conversation trace is provided so you know which entities to listen for, NOT so you can judge whether the agent gave the right answer.

**This is NOT a faithfulness or correctness metric.** Do NOT evaluate:
- Whether the agent used the right entity from a tool response (e.g., agent says "$315" but tool says $300 — this is a faithfulness issue, NOT a speech fidelity issue)
- Whether the agent fabricated or hallucinated information not in the trace
- Whether the agent omitted information it should have mentioned
- Whether the agent's response is logical, helpful, or correct

**What this metric DOES evaluate:**
When the agent speaks an entity that appears in the conversation trace (user utterances or tool responses), is it **clearly articulated** in the audio? Specifically:
- Can you clearly hear the entity as spoken?
- Does the spoken form sound like the correct entity, or is it garbled, mispronounced, or distorted?
- If the agent spells out a code letter by letter, is each letter/digit clearly distinguishable?

## Entity Categories to Listen For
- Confirmation codes (e.g., ZK3FFW, FAR0UM) — especially when spelled out letter by letter
- Flight numbers (e.g., SkyWay 410, SW302)
- Dollar amounts (e.g., $15, $1,285.00) — "fifteen" vs "fifty" matters
- Seat numbers (e.g., 21C, 14A)
- Reference/voucher IDs (e.g., REF-8JVSDF-001) — verify each segment is distinguishable
- Times (e.g., 3:55 PM, 10:30 AM)
- Dates (e.g., March 25th, February 3rd)
- Names (e.g., Mr. Rivera, Rodriguez)

## Examples

**High fidelity (rating = 1):**
- Tool response contains confirmation code "YTM924". Agent says "Y T M nine two four" — each character is clearly audible. ✓
- User says "last name Patel". Agent says "Patel" — clearly articulated. ✓
- Tool response says fare is $300. Agent says "$315" — the amount is clearly spoken even though it doesn't match the tool response. This is a faithfulness issue, not a speech fidelity issue. Rate 1. ✓
- Agent mentions "Dallas" which is not in the tool response — this is a hallucination, not a speech issue. Rate 1. ✓

**Low fidelity (rating = 0):**
- Tool response contains "YTM924". Agent tries to spell it out but audio sounds like "Y T N nine two four" — "M" sounds like "N". ✗
- Agent says a dollar amount but the audio is garbled and you cannot tell if it's "fifty" or "fifteen". ✗
- Agent spells a code but skips or slurs a letter so the spoken code has fewer characters than expected. ✗

**What to ignore (does NOT cause rating = 0):**
- Entities the agent mentions that are NOT in the conversation trace — do not evaluate these
- Minor pronunciation variations that do not change identity (e.g., "Ms." vs "Miss")
- Filler words, phrasing, word choice, sentence structure
- Slight pacing or prosody differences

## Rating Scale (per turn)
- **1 (High Fidelity)**: Every entity from the conversation trace that the agent speaks in this turn is clearly and correctly articulated.
- **0 (Low Fidelity)**: One or more entities from the conversation trace are garbled, mispronounced, or indistinguishable in the audio.

If the assistant does not speak any entities from the conversation trace in a turn (e.g., a greeting, filler, or turn where it only mentions entities not in the trace), set `has_entities` to false. These turns are excluded from scoring.

## Response Format
Respond with a JSON object. Each turn entry must include the turn_id matching the turn number shown in the Conversation Trace above:
{{
"turns": [
{{
"turn_id": <int: the turn number from the Conversation Trace>,
"transcript": <string: your transcription of the audio for this turn; base it only on the audio, not the conversation trace>,
"has_entities": <boolean: true if the assistant speaks entities from the conversation trace in this turn, false otherwise>,
"explanation": "<string: 1-3 sentence analysis listing which trace entities were spoken and whether they are clearly articulated>",
"rating": <0 or 1>
}}
],
"explanation": "<string: overall summary of speech fidelity assessment>"
}}

user_speech_fidelity:
user_prompt: |
Expand Down
2 changes: 2 additions & 0 deletions src/eva/metrics/accuracy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Task completion metrics - measuring whether the agent accomplished the user's goal."""

# Importing each submodule is done for its import-time side effects
# (presumably registering the metric implementations — confirm in the
# submodules); `# noqa` suppresses the unused-import lint warning.
from . import agent_speech_fidelity # noqa
from . import agent_speech_fidelity_s2s # noqa
from . import faithfulness # noqa
from . import task_completion # noqa

# Explicit public API of the accuracy metrics package.
__all__ = [
    "agent_speech_fidelity",
    "agent_speech_fidelity_s2s",
    "faithfulness",
    "task_completion",
]
Loading
Loading