Skip to content
28 changes: 24 additions & 4 deletions apps/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,9 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]:
if metric_name in _NON_NORMALIZED_METRICS:
_NON_NORMALIZED_METRICS.add(col)

for comp_name, comp_value in metrics.aggregate_metrics.items():
row[comp_name] = comp_value

rows.append(row)

return rows, sorted(all_metric_names)
Expand Down Expand Up @@ -1294,8 +1297,9 @@ def render_run_overview(run_dir: Path):
leading_cols = ["record"]
if has_trials:
leading_cols.append("trial")
composite_cols = [c for c in _EVA_BAR_COMPOSITES if c in table_df.columns]
ordered_metrics = [m for m in metric_names if m in table_df.columns]
table_df = table_df[leading_cols + ordered_metrics]
table_df = table_df[leading_cols + composite_cols + ordered_metrics]

# Add link column to navigate to Record Detail
def _record_link(row):
Expand All @@ -1306,11 +1310,14 @@ def _record_link(row):

table_df = table_df.copy()
table_df.insert(0, "link", table_df.apply(_record_link, axis=1))
table_df = table_df.rename(columns=col_rename)
composite_rename = {c: _EVA_COMPOSITE_DISPLAY[c] for c in composite_cols}
table_df = table_df.rename(columns={**col_rename, **composite_rename})

renamed_composites = [composite_rename[c] for c in composite_cols]
renamed_metrics = [col_rename[m] for m in ordered_metrics]
styled = table_df.style.map(_color_cell, subset=renamed_metrics)
styled = styled.format(dict.fromkeys(renamed_metrics, "{:.3f}"), na_rep="—")
score_cols = renamed_composites + renamed_metrics
styled = table_df.style.map(_color_cell, subset=score_cols)
styled = styled.format(dict.fromkeys(score_cols, "{:.3f}"), na_rep="—")
st.dataframe(
styled,
hide_index=True,
Expand All @@ -1323,13 +1330,25 @@ def _record_link(row):
st.download_button("Download CSV", csv, file_name=f"{run_dir.name}_metrics.csv", mime="text/csv")


def _render_eva_composite_cards(metrics: RecordMetrics) -> None:
    """Render EVA composite scores (EVA-A/EVA-X pass@1 and Mean) as st.metric cards."""
    # Keep only the composites that actually carry a value for this record.
    present = [
        (key, metrics.aggregate_metrics.get(key))
        for key in _EVA_BAR_COMPOSITES
        if metrics.aggregate_metrics.get(key) is not None
    ]
    if not present:
        return
    # One st.metric card per available composite, laid out side by side.
    for column, (key, score) in zip(st.columns(len(present)), present):
        column.metric(_EVA_COMPOSITE_DISPLAY[key], f"{score:.3f}")


def render_metrics_tab(metrics: RecordMetrics | None):
"""Render the metrics tab with judge ratings and scores."""
if not metrics:
st.warning("No metrics.json found for this record")
return

st.markdown("### Metrics")
_render_eva_composite_cards(metrics)

# Group metrics by category, preserving insertion order within each group
grouped: dict[str, list[tuple[str, object]]] = {}
Expand Down Expand Up @@ -1546,6 +1565,7 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat
</style>"""
)
st.markdown("### Metrics Overview")
_render_eva_composite_cards(metrics)

grouped: dict[str, list[str]] = {}
for name in all_top_metrics:
Expand Down
75 changes: 75 additions & 0 deletions configs/prompts/judge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,81 @@ judge:
],
"explanation": "<string: overall summary of fidelity assessment>"
}}
s2s_user_prompt: |
You are an expert evaluator checking the **speech clarity and articulation** of entities spoken by an AI voice agent.

You will receive:
1. A conversation trace showing what the user said and what data the agent retrieved via tools. Assistant responses are redacted — you must listen to the audio to hear what the agent actually said.
2. An audio recording of the agent's side of the conversation only (the user is not audible).

## Conversation Trace
{conversation_trace_formatted}

## IMPORTANT: What This Metric Measures

This metric measures **speech fidelity** — whether entities are clearly and correctly articulated in the audio. The conversation trace is provided so you know which entities to listen for, NOT so you can judge whether the agent gave the right answer.

**This is NOT a faithfulness or correctness metric.** Do NOT evaluate:
- Whether the agent used the right entity from a tool response (e.g., agent says "$315" but tool says $300 — this is a faithfulness issue, NOT a speech fidelity issue)
- Whether the agent fabricated or hallucinated information not in the trace
- Whether the agent omitted information it should have mentioned
- Whether the agent's response is logical, helpful, or correct

**What this metric DOES evaluate:**
When the agent speaks an entity that appears in the conversation trace (user utterances or tool responses), is it **clearly articulated** in the audio? Specifically:
- Can you clearly hear the entity as spoken?
- Does the spoken form sound like the correct entity, or is it garbled, mispronounced, or distorted?
- If the agent spells out a code letter by letter, is each letter/digit clearly distinguishable?

## Entity Categories to Listen For
- Confirmation codes (e.g., ZK3FFW, FAR0UM) — especially when spelled out letter by letter
- Flight numbers (e.g., SkyWay 410, SW302)
- Dollar amounts (e.g., $15, $1,285.00) — "fifteen" vs "fifty" matters
- Seat numbers (e.g., 21C, 14A)
- Reference/voucher IDs (e.g., REF-8JVSDF-001) — verify each segment is distinguishable
- Times (e.g., 3:55 PM, 10:30 AM)
- Dates (e.g., March 25th, February 3rd)
- Names (e.g., Mr. Rivera, Rodriguez)

## Examples

**High fidelity (rating = 1):**
- Tool response contains confirmation code "YTM924". Agent says "Y T M nine two four" — each character is clearly audible. ✓
- User says "last name Patel". Agent says "Patel" — clearly articulated. ✓
- Tool response says fare is $300. Agent says "$315" — the amount is clearly spoken even though it doesn't match the tool response. This is a faithfulness issue, not a speech fidelity issue. Rate 1. ✓
- Agent mentions "Dallas" which is not in the tool response — this is a hallucination, not a speech issue. Rate 1. ✓

**Low fidelity (rating = 0):**
- Tool response contains "YTM924". Agent tries to spell it out but audio sounds like "Y T N nine two four" — "M" sounds like "N". ✗
- Agent says a dollar amount but the audio is garbled and you cannot tell if it's "fifty" or "fifteen". ✗
- Agent spells a code but skips or slurs a letter so the spoken code has fewer characters than expected. ✗

**What to ignore (does NOT cause rating = 0):**
- Entities the agent mentions that are NOT in the conversation trace — do not evaluate these
- Minor pronunciation variations that do not change identity (e.g., "Ms." vs "Miss")
- Filler words, phrasing, word choice, sentence structure
- Slight pacing or prosody differences

## Rating Scale (per turn)
- **1 (High Fidelity)**: Every entity from the conversation trace that the agent speaks in this turn is clearly and correctly articulated.
- **0 (Low Fidelity)**: One or more entities from the conversation trace are garbled, mispronounced, or indistinguishable in the audio.

If the assistant does not speak any entities from the conversation trace in a turn (e.g., a greeting, filler, or turn where it only mentions entities not in the trace), set `has_entities` to false. These turns are excluded from scoring.

## Response Format
Respond with a JSON object. Each turn entry must include the turn_id matching the turn number shown in the Conversation Trace above:
{{
"turns": [
{{
"turn_id": <int: the turn number from the Conversation Trace>,
"transcript": <string: your transcription of the audio for this turn; base it only on the audio, not the conversation trace>,
"has_entities": <boolean: true if the assistant speaks entities from the conversation trace in this turn, false otherwise>,
"explanation": "<string: 1-3 sentence analysis listing which trace entities were spoken and whether they are clearly articulated>",
"rating": <0 or 1>
}}
],
"explanation": "<string: overall summary of speech fidelity assessment>"
}}

user_speech_fidelity:
user_prompt: |
Expand Down
2 changes: 2 additions & 0 deletions src/eva/metrics/accuracy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Task completion metrics - measuring whether the agent accomplished the user's goal."""

# Importing each submodule is done for its import-time side effects
# (presumably registering the metric implementations — confirm in the
# submodules); `# noqa` suppresses the unused-import lint warning.
from . import agent_speech_fidelity # noqa
from . import agent_speech_fidelity_s2s # noqa
from . import faithfulness # noqa
from . import task_completion # noqa

# Explicit public API of the accuracy metrics package.
__all__ = [
    "agent_speech_fidelity",
    "agent_speech_fidelity_s2s",
    "faithfulness",
    "task_completion",
]
Loading
Loading