From af32842600627e3d0f6fa35c19a828dc525fed0e Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Wed, 15 Apr 2026 18:04:28 -0400 Subject: [PATCH 1/9] Add new speech fidelity metric for s2s --- configs/prompts/judge.yaml | 75 ++++ src/eva/metrics/accuracy/__init__.py | 2 + .../accuracy/agent_speech_fidelity_s2s.py | 239 +++++++++++ src/eva/metrics/runner.py | 8 + .../unit/metrics/test_speech_fidelity_s2s.py | 381 ++++++++++++++++++ 5 files changed, 705 insertions(+) create mode 100644 src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py create mode 100644 tests/unit/metrics/test_speech_fidelity_s2s.py diff --git a/configs/prompts/judge.yaml b/configs/prompts/judge.yaml index 4b2fd36e..5fb0b6d3 100644 --- a/configs/prompts/judge.yaml +++ b/configs/prompts/judge.yaml @@ -829,6 +829,81 @@ judge: ], "explanation": "" }} + s2s_user_prompt: | + You are an expert evaluator checking the **speech clarity and articulation** of entities spoken by an AI voice agent. + + You will receive: + 1. A conversation trace showing what the user said and what data the agent retrieved via tools. Assistant responses are redacted — you must listen to the audio to hear what the agent actually said. + 2. An audio recording of the agent's side of the conversation only (the user is not audible). + + ## Conversation Trace + {conversation_trace_formatted} + + ## IMPORTANT: What This Metric Measures + + This metric measures **speech fidelity** — whether entities are clearly and correctly articulated in the audio. The conversation trace is provided so you know which entities to listen for, NOT so you can judge whether the agent gave the right answer. + + **This is NOT a faithfulness or correctness metric.** Do NOT evaluate: + - Whether the agent used the right entity from a tool response (e.g., agent says "$315" but tool says $300 — this is a faithfulness issue, NOT a speech fidelity issue) + - Whether the agent fabricated or hallucinated information not in the trace + - Whether the agent omitted information it should have mentioned + - Whether the agent's response is logical, helpful, or correct + + **What this metric DOES evaluate:** + When the agent speaks an entity that appears in the conversation trace (user utterances or tool responses), is it **clearly articulated** in the audio? Specifically: + - Can you clearly hear the entity as spoken? + - Does the spoken form sound like the correct entity, or is it garbled, mispronounced, or distorted? + - If the agent spells out a code letter by letter, is each letter/digit clearly distinguishable? + + ## Entity Categories to Listen For + - Confirmation codes (e.g., ZK3FFW, FAR0UM) — especially when spelled out letter by letter + - Flight numbers (e.g., SkyWay 410, SW302) + - Dollar amounts (e.g., $15, $1,285.00) — "fifteen" vs "fifty" matters + - Seat numbers (e.g., 21C, 14A) + - Reference/voucher IDs (e.g., REF-8JVSDF-001) — verify each segment is distinguishable + - Times (e.g., 3:55 PM, 10:30 AM) + - Dates (e.g., March 25th, February 3rd) + - Names (e.g., Mr. Rivera, Rodriguez) + + ## Examples + + **High fidelity (rating = 1):** + - Tool response contains confirmation code "YTM924". Agent says "Y T M nine two four" — each character is clearly audible. ✓ + - User says "last name Patel". Agent says "Patel" — clearly articulated. ✓ + - Tool response says fare is $300. Agent says "$315" — the amount is clearly spoken even though it doesn't match the tool response. This is a faithfulness issue, not a speech fidelity issue. Rate 1. ✓ + - Agent mentions "Dallas" which is not in the tool response — this is a hallucination, not a speech issue. Rate 1. ✓ + + **Low fidelity (rating = 0):** + - Tool response contains "YTM924". Agent tries to spell it out but audio sounds like "Y T N nine two four" — "M" sounds like "N". ✗ + - Agent says a dollar amount but the audio is garbled and you cannot tell if it's "fifty" or "fifteen". ✗ + - Agent spells a code but skips or slurs a letter so the spoken code has fewer characters than expected. ✗ + + **What to ignore (does NOT cause rating = 0):** + - Entities the agent mentions that are NOT in the conversation trace — do not evaluate these + - Minor pronunciation variations that do not change identity (e.g., "Ms." vs "Miss") + - Filler words, phrasing, word choice, sentence structure + - Slight pacing or prosody differences + + ## Rating Scale (per turn) + - **1 (High Fidelity)**: Every entity from the conversation trace that the agent speaks in this turn is clearly and correctly articulated. + - **0 (Low Fidelity)**: One or more entities from the conversation trace are garbled, mispronounced, or indistinguishable in the audio. + + If the assistant does not speak any entities from the conversation trace in a turn (e.g., a greeting, filler, or turn where it only mentions entities not in the trace), set `has_entities` to false. These turns are excluded from scoring. + + ## Response Format + Respond with a JSON object. Each turn entry must include the turn_id matching the turn number shown in the Conversation Trace above: + {{ + "turns": [ + {{ + "turn_id": , + "transcript": , + "has_entities": , + "explanation": "", + "rating": <0 or 1> + }} + ], + "explanation": "" + }} user_speech_fidelity: user_prompt: | diff --git a/src/eva/metrics/accuracy/__init__.py b/src/eva/metrics/accuracy/__init__.py index 35e21ae4..ab2fff74 100644 --- a/src/eva/metrics/accuracy/__init__.py +++ b/src/eva/metrics/accuracy/__init__.py @@ -1,11 +1,13 @@ """Task completion metrics - measuring whether the agent accomplished the user's goal.""" from . import agent_speech_fidelity # noqa +from . import agent_speech_fidelity_s2s # noqa from . import faithfulness # noqa from . import task_completion # noqa __all__ = [ "agent_speech_fidelity", + "agent_speech_fidelity_s2s", "faithfulness", "task_completion", ] diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py new file mode 100644 index 00000000..c3ed2858 --- /dev/null +++ b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py @@ -0,0 +1,239 @@ +"""Agent speech fidelity metric for S2S models — entity-focused evaluation. + +For S2S (speech-to-speech) models, there is no intended text to compare against. +Instead, this metric verifies that key entities spoken by the agent (from tool +responses and user utterances) are accurate by sending a redacted conversation +trace alongside the agent audio to Gemini. +""" + +import json +from typing import Any + +from eva.metrics.base import MetricContext +from eva.metrics.speech_fidelity_base import SpeechFidelityBaseMetric +from eva.metrics.utils import aggregate_per_turn_scores, normalize_rating, resolve_turn_id +from eva.models.results import MetricScore + + +class AgentSpeechFidelityS2SMetric(SpeechFidelityBaseMetric): + """Audio-based entity fidelity metric for S2S agent speech. + + Evaluates whether key entities (from tool responses and user utterances) are + spoken correctly by the agent, without requiring intended text. + + Rating scale: 0 (entity error) or 1 (all entities accurate) + """ + + name = "agent_speech_fidelity" + description = "Audio-based evaluation of agent entity fidelity for S2S models" + category = "accuracy" + role = "assistant" + rating_scale = (0, 1) + pass_at_k_threshold = 0.95 + + async def compute(self, context: MetricContext) -> MetricScore: + """Compute entity fidelity score using redacted conversation trace + audio.""" + try: + audio_segment = self.load_role_audio(context, self.role) + if audio_segment is None: + return MetricScore( + name=self.name, + score=0.0, + normalized_score=0.0, + error=f"No {self.role} audio file available", + ) + + redacted_trace = self._build_redacted_trace(context) + assistant_turn_ids = self._get_assistant_turn_ids(redacted_trace) + + if not assistant_turn_ids: + return MetricScore( + name=self.name, + score=0.0, + normalized_score=0.0, + error="No assistant turns found in conversation trace", + ) + + num_turns = len(assistant_turn_ids) + trace_formatted = self._format_redacted_trace(redacted_trace) + audio_b64 = self.encode_audio_segment(audio_segment) + + prompt = self.get_judge_prompt( + prompt_key="s2s_user_prompt", + conversation_trace_formatted=trace_formatted, + ) + + messages = self.create_audio_message(audio_b64, prompt) + + per_turn_ratings: dict[int, int | None] = {} + per_turn_explanations: dict[int, str] = {} + per_turn_transcripts: dict[int, str] = {} + per_turn_normalized: dict[int, float] = {} + min_rating, max_rating = self.rating_scale + valid_ratings_range = list(range(min_rating, max_rating + 1)) + + response_text, turns = await self._call_and_parse(messages, context, audio_segment, prompt) + + if response_text is None: + return MetricScore( + name=self.name, + score=0.0, + normalized_score=0.0, + error="No response from judge", + ) + + self.logger.debug(f"Raw judge response: {response_text[:200]}") + + if len(turns) != num_turns: + self.logger.warning( + f"[{context.record_id}] Expected {num_turns} ratings for S2S entity fidelity, got {len(turns)}" + ) + + per_turn_has_entities: dict[int, bool] = {} + + for response_item in turns: + turn_id = resolve_turn_id(response_item, assistant_turn_ids, self.name) + if turn_id is None: + continue + rating = response_item.get("rating") + transcript = response_item.get("transcript") + explanation = response_item.get("explanation", "") + has_entities = response_item.get("has_entities", True) + + per_turn_has_entities[turn_id] = has_entities + + if not has_entities: + # Exclude turns with no entities from scoring + per_turn_ratings[turn_id] = rating + per_turn_explanations[turn_id] = explanation + per_turn_transcripts[turn_id] = transcript + continue + + if rating not in valid_ratings_range: + self.logger.warning(f"[{context.record_id}] Invalid rating {rating} for turn {turn_id}") + per_turn_ratings[turn_id] = None + per_turn_explanations[turn_id] = f"Invalid rating: {rating}" + continue + + per_turn_ratings[turn_id] = rating + per_turn_explanations[turn_id] = explanation + per_turn_transcripts[turn_id] = transcript + per_turn_normalized[turn_id] = normalize_rating(rating, min_rating, max_rating) + + aggregated_score = aggregate_per_turn_scores(list(per_turn_normalized.values()), self.aggregation) + + # Only count turns with entities toward the score + valid_ratings = [ + per_turn_ratings[tid] + for tid in per_turn_ratings + if per_turn_ratings[tid] is not None and per_turn_has_entities.get(tid, True) + ] + avg_rating = sum(valid_ratings) / len(valid_ratings) if valid_ratings else 0.0 + num_skipped_no_entities = sum(1 for v in per_turn_has_entities.values() if not v) + + details: dict[str, Any] = { + "variant": "s2s", + "aggregation": self.aggregation, + "num_turns": num_turns, + "num_evaluated": len(valid_ratings), + "num_skipped_no_entities": num_skipped_no_entities, + "per_turn_ratings": per_turn_ratings, + "per_turn_has_entities": per_turn_has_entities, + "per_turn_explanations": per_turn_explanations, + "judge_prompt": prompt, + "judge_raw_response": response_text, + } + + return MetricScore( + name=self.name, + score=round(avg_rating, 3), + normalized_score=round(aggregated_score, 3) if aggregated_score is not None else 0, + details=details, + error="Aggregation failed" if aggregated_score is None else None, + ) + + except Exception as e: + return self._handle_error(e, context) + + @staticmethod + def _build_redacted_trace(context: MetricContext) -> list[dict]: + """Build a redacted conversation trace for entity fidelity evaluation. + + Keeps user entries and tool responses as-is (entity sources). + Replaces assistant entries with a single placeholder per turn_id + (a turn can have multiple assistant entries, e.g. before/after tool calls). + Drops tool_call entries (parameters, not entity sources). + + Note: conversation trace entries use different schemas by type: + - user/assistant entries have ``role`` + ``content`` + - tool entries have ``type`` (tool_call/tool_response) + ``tool_name`` + data fields + """ + redacted = [] + seen_assistant_turns: set[int] = set() + for entry in context.conversation_trace or []: + role = entry.get("role") + entry_type = entry.get("type") + + if role == "assistant": + turn_id = entry.get("turn_id") + if turn_id not in seen_assistant_turns: + seen_assistant_turns.add(turn_id) + redacted.append( + { + "role": "assistant", + "turn_id": turn_id, + "redacted": True, + } + ) + elif role == "user": + redacted.append( + { + "role": "user", + "content": entry.get("content", ""), + "turn_id": entry.get("turn_id"), + } + ) + elif entry_type == "tool_response": + redacted.append( + { + "role": "tool_response", + "tool_name": entry.get("tool_name", "unknown"), + "content": entry.get("tool_response", {}), + "turn_id": entry.get("turn_id"), + } + ) + # Skip tool_call entries — parameters are not entity sources + + return redacted + + @staticmethod + def _get_assistant_turn_ids(redacted_trace: list[dict]) -> list[int]: + """Extract sorted unique assistant turn IDs from the redacted trace.""" + turn_ids = set() + for entry in redacted_trace: + if entry.get("role") == "assistant" and entry.get("turn_id") is not None: + turn_ids.add(entry["turn_id"]) + return sorted(turn_ids) + + @staticmethod + def _format_redacted_trace(redacted_trace: list[dict]) -> str: + """Format the redacted trace as text for the prompt.""" + lines = [] + for entry in redacted_trace: + turn_id = entry.get("turn_id", "?") + role = entry["role"] + + if role == "user": + lines.append(f"Turn {turn_id} - User: {entry['content']}") + elif role == "assistant": + lines.append(f"Turn {turn_id} - [Assistant speaks]") + elif role == "tool_response": + tool_name = entry.get("tool_name", "unknown") + content = entry.get("content", {}) + if isinstance(content, (dict, list)): + content_str = json.dumps(content, indent=None) + else: + content_str = str(content) + lines.append(f"Turn {turn_id} - Tool Response ({tool_name}): {content_str}") + + return "\n".join(lines) diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index deabd3a9..40e3b003 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -9,6 +9,7 @@ import yaml +from eva.metrics.accuracy.agent_speech_fidelity_s2s import AgentSpeechFidelityS2SMetric from eva.metrics.aggregation import compute_record_aggregates, compute_run_level_aggregates from eva.metrics.base import BaseMetric, MetricContext from eva.metrics.processor import MetricsContextProcessor @@ -118,6 +119,13 @@ def __init__( else: logger.warning(f"Metric '{name}' not found, skipping") + # For S2S pipelines, swap agent_speech_fidelity with entity-focused variant + if self._pipeline_type == PipelineType.S2S: + self.metrics = [ + AgentSpeechFidelityS2SMetric(config=m.config) if m.name == "agent_speech_fidelity" else m + for m in self.metrics + ] + logger.info(f"Metrics runner initialized with {len(self.metrics)} metrics") def _load_agent_config(self) -> dict[str, Any]: diff --git a/tests/unit/metrics/test_speech_fidelity_s2s.py b/tests/unit/metrics/test_speech_fidelity_s2s.py new file mode 100644 index 00000000..25fbacd3 --- /dev/null +++ b/tests/unit/metrics/test_speech_fidelity_s2s.py @@ -0,0 +1,381 @@ +"""Tests for agent_speech_fidelity S2S variant (entity-focused evaluation).""" + +import json +import logging +from unittest.mock import MagicMock, patch + +import pytest + +from eva.metrics.accuracy.agent_speech_fidelity_s2s import AgentSpeechFidelityS2SMetric +from eva.models.config import PipelineType + +from .conftest import make_judge_metric, make_metric_context + + +def make_judge_response(turns: list[dict]) -> str: + """Create a JSON judge response with a ``turns`` wrapper.""" + return json.dumps({"turns": turns}) + + +@pytest.fixture +def s2s_metric(): + return make_judge_metric( + AgentSpeechFidelityS2SMetric, + mock_llm=True, + logger_name="test_agent_speech_fidelity_s2s", + ) + + +# --- Sample conversation traces --- + +# Conversation trace entries use different schemas: +# - user/assistant: have "role" + "content" + "type" (intended/transcribed) +# - tool entries: have "type" (tool_call/tool_response) + "tool_name" + data fields, no "role" + +SIMPLE_TRACE = [ + {"role": "user", "content": "Check reservation ABC123, last name Smith", "type": "intended", "turn_id": 0}, + {"role": "assistant", "content": "Looking that up for you.", "type": "transcribed", "turn_id": 1}, + { + "tool_name": "get_reservation", + "parameters": {"confirmation_number": "ABC123"}, + "type": "tool_call", + "turn_id": 1, + }, + { + "tool_name": "get_reservation", + "tool_response": {"confirmation_number": "ABC123", "last_name": "Smith", "flight": "UA456"}, + "type": "tool_response", + "turn_id": 1, + }, + {"role": "assistant", "content": "Your flight is UA456.", "type": "transcribed", "turn_id": 1}, + {"role": "user", "content": "Thanks", "type": "intended", "turn_id": 2}, + {"role": "assistant", "content": "You're welcome!", "type": "transcribed", "turn_id": 3}, +] + +MULTI_ASSISTANT_SAME_TURN_TRACE = [ + {"role": "user", "content": "Book me a flight", "type": "intended", "turn_id": 0}, + {"role": "assistant", "content": "Let me search.", "type": "transcribed", "turn_id": 1}, + {"tool_name": "search_flights", "parameters": {}, "type": "tool_call", "turn_id": 1}, + {"tool_name": "search_flights", "tool_response": {"flights": ["SW302"]}, "type": "tool_response", "turn_id": 1}, + {"role": "assistant", "content": "I found flight SW302.", "type": "transcribed", "turn_id": 1}, + {"role": "user", "content": "Great, book it", "type": "intended", "turn_id": 2}, + {"role": "assistant", "content": "Done!", "type": "transcribed", "turn_id": 3}, +] + +NO_TOOL_TRACE = [ + {"role": "user", "content": "Hello", "type": "intended", "turn_id": 0}, + {"role": "assistant", "content": "Hi there!", "type": "transcribed", "turn_id": 1}, +] + + +def _default_context(**overrides): + """Context for S2S speech fidelity tests.""" + defaults = { + "audio_assistant_path": "/fake/audio_assistant.wav", + "audio_user_path": "/fake/audio_user.wav", + "pipeline_type": PipelineType.S2S, + "conversation_trace": SIMPLE_TRACE, + } + defaults.update(overrides) + return make_metric_context(**defaults) + + +class TestClassAttributes: + def test_s2s_metric_attributes(self, s2s_metric): + assert s2s_metric.name == "agent_speech_fidelity" + assert s2s_metric.category == "accuracy" + assert s2s_metric.role == "assistant" + assert s2s_metric.rating_scale == (0, 1) + assert s2s_metric.pass_at_k_threshold == 0.95 + + +class TestBuildRedactedTrace: + def test_assistant_entries_are_redacted(self, s2s_metric): + redacted = s2s_metric._build_redacted_trace(_default_context()) + assistant_entries = [e for e in redacted if e["role"] == "assistant"] + for entry in assistant_entries: + assert entry.get("redacted") is True + assert "content" not in entry + + def test_user_entries_preserved(self, s2s_metric): + redacted = s2s_metric._build_redacted_trace(_default_context()) + user_entries = [e for e in redacted if e["role"] == "user"] + assert len(user_entries) == 2 + assert user_entries[0]["content"] == "Check reservation ABC123, last name Smith" + assert user_entries[1]["content"] == "Thanks" + + def test_tool_responses_preserved(self, s2s_metric): + redacted = s2s_metric._build_redacted_trace(_default_context()) + tool_entries = [e for e in redacted if e["role"] == "tool_response"] + assert len(tool_entries) == 1 + assert tool_entries[0]["tool_name"] == "get_reservation" + assert tool_entries[0]["content"]["confirmation_number"] == "ABC123" + assert tool_entries[0]["content"]["flight"] == "UA456" + + def test_tool_calls_dropped(self, s2s_metric): + """Tool call entries (type=tool_call, no role) should not appear in redacted trace.""" + redacted = s2s_metric._build_redacted_trace(_default_context()) + tool_call_entries = [e for e in redacted if e.get("type") == "tool_call" or e.get("role") == "tool_call"] + assert len(tool_call_entries) == 0 + + def test_multiple_assistant_entries_same_turn_deduplicated(self, s2s_metric): + """Multiple assistant entries in the same turn should produce one placeholder.""" + context = _default_context(conversation_trace=MULTI_ASSISTANT_SAME_TURN_TRACE) + redacted = s2s_metric._build_redacted_trace(context) + assistant_entries = [e for e in redacted if e["role"] == "assistant"] + # Turn 1 has two assistant entries, but should be deduplicated to one + turn_1_entries = [e for e in assistant_entries if e["turn_id"] == 1] + assert len(turn_1_entries) == 1 + + def test_empty_trace(self, s2s_metric): + context = _default_context(conversation_trace=[]) + redacted = s2s_metric._build_redacted_trace(context) + assert redacted == [] + + def test_none_trace(self, s2s_metric): + context = _default_context(conversation_trace=None) + redacted = s2s_metric._build_redacted_trace(context) + assert redacted == [] + + +class TestGetAssistantTurnIds: + def test_extracts_unique_turn_ids(self, s2s_metric): + redacted = s2s_metric._build_redacted_trace(_default_context()) + turn_ids = s2s_metric._get_assistant_turn_ids(redacted) + assert turn_ids == [1, 3] + + def test_deduplicates_same_turn(self, s2s_metric): + context = _default_context(conversation_trace=MULTI_ASSISTANT_SAME_TURN_TRACE) + redacted = s2s_metric._build_redacted_trace(context) + turn_ids = s2s_metric._get_assistant_turn_ids(redacted) + assert turn_ids == [1, 3] + + def test_empty_trace(self, s2s_metric): + turn_ids = s2s_metric._get_assistant_turn_ids([]) + assert turn_ids == [] + + +class TestFormatRedactedTrace: + def test_format_simple_trace(self, s2s_metric): + redacted = s2s_metric._build_redacted_trace(_default_context()) + formatted = s2s_metric._format_redacted_trace(redacted) + lines = formatted.split("\n") + + assert lines[0] == "Turn 0 - User: Check reservation ABC123, last name Smith" + assert lines[1] == "Turn 1 - [Assistant speaks]" + assert "Turn 1 - Tool Response (get_reservation):" in lines[2] + assert '"confirmation_number": "ABC123"' in lines[2] + assert lines[3] == "Turn 2 - User: Thanks" + assert lines[4] == "Turn 3 - [Assistant speaks]" + + def test_format_no_duplicate_assistant_lines(self, s2s_metric): + """Even with multiple assistant entries per turn, only one line appears.""" + context = _default_context(conversation_trace=MULTI_ASSISTANT_SAME_TURN_TRACE) + redacted = s2s_metric._build_redacted_trace(context) + formatted = s2s_metric._format_redacted_trace(redacted) + assert formatted.count("Turn 1 - [Assistant speaks]") == 1 + + +class TestNoAudio: + @pytest.mark.asyncio + async def test_no_audio_returns_error(self, s2s_metric): + context = _default_context(audio_assistant_path=None) + result = await s2s_metric.compute(context) + assert result.score == 0.0 + assert "No assistant audio" in result.error + + +class TestNoAssistantTurns: + @pytest.mark.asyncio + async def test_no_assistant_turns_returns_error(self, s2s_metric): + trace = [ + {"role": "user", "content": "Hello", "type": "intended", "turn_id": 0}, + ] + context = _default_context(conversation_trace=trace) + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + result = await s2s_metric.compute(context) + assert result.score == 0.0 + assert "No assistant turns" in result.error + + +class TestNoJudgeResponse: + @pytest.mark.asyncio + async def test_no_response_returns_error(self, s2s_metric): + s2s_metric.llm_client.generate_text.return_value = None + context = _default_context() + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + with patch.object(s2s_metric, "encode_audio_segment", return_value="base64audio"): + result = await s2s_metric.compute(context) + assert result.score == 0.0 + assert result.error == "No response from judge" + + +class TestS2SCompute: + @pytest.mark.asyncio + async def test_all_high_fidelity(self, s2s_metric): + """All turns rated 1 -> perfect score.""" + response = make_judge_response( + [ + {"turn_id": 1, "rating": 1, "explanation": "All entities correct"}, + {"turn_id": 3, "rating": 1, "explanation": "No entities to check"}, + ] + ) + s2s_metric.llm_client.generate_text.return_value = response + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + with patch.object(s2s_metric, "encode_audio_segment", return_value="base64audio"): + context = _default_context() + result = await s2s_metric.compute(context) + + assert result.score == 1.0 + assert result.normalized_score == 1.0 + assert result.details["num_turns"] == 2 + assert result.details["num_evaluated"] == 2 + assert result.details["variant"] == "s2s" + assert result.error is None + + @pytest.mark.asyncio + async def test_all_low_fidelity(self, s2s_metric): + """All turns rated 0 -> zero score.""" + response = make_judge_response( + [ + {"turn_id": 1, "rating": 0, "explanation": "Said UA465 instead of UA456"}, + {"turn_id": 3, "rating": 0, "explanation": "Wrong name"}, + ] + ) + s2s_metric.llm_client.generate_text.return_value = response + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + with patch.object(s2s_metric, "encode_audio_segment", return_value="base64audio"): + context = _default_context() + result = await s2s_metric.compute(context) + + assert result.score == 0.0 + assert result.normalized_score == 0.0 + + @pytest.mark.asyncio + async def test_mixed_ratings(self, s2s_metric): + """One turn correct, one incorrect -> 0.5.""" + response = make_judge_response( + [ + {"turn_id": 1, "rating": 1, "explanation": "Correct"}, + {"turn_id": 3, "rating": 0, "explanation": "Wrong entity"}, + ] + ) + s2s_metric.llm_client.generate_text.return_value = response + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + with patch.object(s2s_metric, "encode_audio_segment", return_value="base64audio"): + context = _default_context() + result = await s2s_metric.compute(context) + + assert result.score == 0.5 + assert result.normalized_score == 0.5 + + @pytest.mark.asyncio + async def test_invalid_rating_excluded(self, s2s_metric): + """Invalid ratings are excluded from aggregation.""" + response = make_judge_response( + [ + {"turn_id": 1, "rating": 1, "has_entities": True, "explanation": "Good"}, + {"turn_id": 3, "rating": 5, "has_entities": True, "explanation": "Invalid"}, + ] + ) + s2s_metric.llm_client.generate_text.return_value = response + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + with patch.object(s2s_metric, "encode_audio_segment", return_value="base64audio"): + context = _default_context() + result = await s2s_metric.compute(context) + + assert result.details["num_evaluated"] == 1 + assert result.details["per_turn_ratings"][3] is None + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_no_entity_turns_excluded_from_score(self, s2s_metric): + """Turns with has_entities=false should not count toward the score.""" + response = make_judge_response( + [ + {"turn_id": 1, "rating": 0, "has_entities": False, "explanation": "Greeting, no entities"}, + {"turn_id": 3, "rating": 1, "has_entities": True, "explanation": "Flight number correct"}, + ] + ) + s2s_metric.llm_client.generate_text.return_value = response + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + with patch.object(s2s_metric, "encode_audio_segment", return_value="base64audio"): + context = _default_context() + result = await s2s_metric.compute(context) + + # Only turn 3 (has_entities=True) should be evaluated + assert result.details["num_evaluated"] == 1 + assert result.details["num_skipped_no_entities"] == 1 + assert result.score == 1.0 + assert result.normalized_score == 1.0 + + @pytest.mark.asyncio + async def test_all_turns_no_entities(self, s2s_metric): + """If all turns have no entities, score should be 0 with no evaluated turns.""" + response = make_judge_response( + [ + {"turn_id": 1, "rating": 1, "has_entities": False, "explanation": "No entities"}, + {"turn_id": 3, "rating": 1, "has_entities": False, "explanation": "No entities"}, + ] + ) + s2s_metric.llm_client.generate_text.return_value = response + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + with patch.object(s2s_metric, "encode_audio_segment", return_value="base64audio"): + context = _default_context() + result = await s2s_metric.compute(context) + + assert result.details["num_evaluated"] == 0 + assert result.details["num_skipped_no_entities"] == 2 + assert result.score == 0.0 + + @pytest.mark.asyncio + async def test_has_entities_defaults_to_true(self, s2s_metric): + """If has_entities is missing from response, default to True (include in scoring).""" + response = make_judge_response( + [ + {"turn_id": 1, "rating": 1, "explanation": "Good"}, + {"turn_id": 3, "rating": 0, "explanation": "Wrong entity"}, + ] + ) + s2s_metric.llm_client.generate_text.return_value = response + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + with patch.object(s2s_metric, "encode_audio_segment", return_value="base64audio"): + context = _default_context() + result = await s2s_metric.compute(context) + + assert result.details["num_evaluated"] == 2 + assert result.details["num_skipped_no_entities"] == 0 + assert result.score == 0.5 + + +class TestTurnCountMismatch: + @pytest.mark.asyncio + async def test_fewer_turns_returned(self, s2s_metric, caplog): + """Fewer turns than expected logs a warning but still computes.""" + response = make_judge_response( + [ + {"turn_id": 1, "rating": 1, "explanation": "Good"}, + ] + ) + s2s_metric.llm_client.generate_text.return_value = response + with patch.object(s2s_metric, "load_role_audio", return_value=MagicMock()): + with patch.object(s2s_metric, "encode_audio_segment", return_value="base64audio"): + context = _default_context() + with caplog.at_level(logging.WARNING): + result = await s2s_metric.compute(context) + + assert "Expected 2 ratings" in caplog.text + assert result.details["num_evaluated"] == 1 + assert result.score == 1.0 + + +class TestErrorHandling: + @pytest.mark.asyncio + async def test_exception_returns_error_score(self, s2s_metric): + with patch.object(s2s_metric, "load_role_audio", side_effect=RuntimeError("boom")): + context = _default_context() + result = await s2s_metric.compute(context) + + assert result.score == 0.0 + assert result.normalized_score == 0.0 + assert "boom" in result.error From 5962066ff8f1d392c13e43f1db2f0ba39f7d1f8d Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Fri, 17 Apr 2026 17:54:09 -0400 Subject: [PATCH 2/9] Fix aggregation when there is no entities --- src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py | 8 ++++++-- tests/unit/metrics/test_speech_fidelity_s2s.py | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py index c3ed2858..ac7638f0 100644 --- a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py +++ b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py @@ -131,12 +131,17 @@ async def compute(self, context: MetricContext) -> MetricScore: avg_rating = sum(valid_ratings) / len(valid_ratings) if valid_ratings else 0.0 num_skipped_no_entities = sum(1 for v in per_turn_has_entities.values() if not v) + # No valid scores to aggregate — not an error, just nothing to score + skipped = not valid_ratings + details: dict[str, Any] = { "variant": "s2s", "aggregation": self.aggregation, "num_turns": num_turns, "num_evaluated": len(valid_ratings), "num_skipped_no_entities": num_skipped_no_entities, + "skipped": skipped, + "skipped_reason": "No valid ratings to aggregate" if skipped else None, "per_turn_ratings": per_turn_ratings, "per_turn_has_entities": per_turn_has_entities, "per_turn_explanations": per_turn_explanations, @@ -147,9 +152,8 @@ async def compute(self, context: MetricContext) -> MetricScore: return MetricScore( name=self.name, score=round(avg_rating, 3), - normalized_score=round(aggregated_score, 3) if aggregated_score is not None else 0, + normalized_score=round(aggregated_score, 3) if aggregated_score is not None else None, details=details, - error="Aggregation failed" if aggregated_score is None else None, ) except Exception as e: diff --git a/tests/unit/metrics/test_speech_fidelity_s2s.py b/tests/unit/metrics/test_speech_fidelity_s2s.py index 25fbacd3..c94d987e 100644 --- a/tests/unit/metrics/test_speech_fidelity_s2s.py +++ b/tests/unit/metrics/test_speech_fidelity_s2s.py @@ -311,7 +311,7 @@ async def test_no_entity_turns_excluded_from_score(self, s2s_metric): @pytest.mark.asyncio async def test_all_turns_no_entities(self, s2s_metric): - """If all turns have no entities, score should be 0 with no evaluated turns.""" + """If all turns have no entities, it is not an error — normalized score is None.""" response = make_judge_response( [ {"turn_id": 1, "rating": 1, "has_entities": False, "explanation": "No entities"}, @@ -326,7 +326,9 @@ async def test_all_turns_no_entities(self, s2s_metric): assert result.details["num_evaluated"] == 0 assert result.details["num_skipped_no_entities"] == 2 - assert result.score == 0.0 + assert result.normalized_score is None + assert result.error is None + assert result.details.get("skipped") is True @pytest.mark.asyncio async def test_has_entities_defaults_to_true(self, s2s_metric): From 580749bc04fefd6a91d431a926ad1079bf30d252 Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Sun, 19 Apr 2026 19:39:13 -0400 Subject: [PATCH 3/9] Add skipped field to MetricScore Makes score optional and introduces an explicit `skipped: bool` flag so metrics can signal "no applicable data" distinctly from an error. Allows downstream consumers to treat skipped metrics as a first-class state instead of inferring it from None scores. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/eva/models/results.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/eva/models/results.py b/src/eva/models/results.py index 8679c7e3..7a64e3a4 100644 --- a/src/eva/models/results.py +++ b/src/eva/models/results.py @@ -82,10 +82,14 @@ class MetricScore(BaseModel): """Score for a single metric.""" name: str = Field(..., description="Metric name") - score: float = Field(..., description="Raw score value") + score: float | None = Field(None, description="Raw score value (None when the metric was skipped)") normalized_score: float | None = Field(None, description="Normalized score (0-1 scale)") details: dict[str, Any] = Field(default_factory=dict, description="Additional metric details") error: str | None = Field(None, description="Error message if metric computation failed") + skipped: bool = Field( + False, + description="True when the metric had no applicable data to score (distinct from errored)", + ) sub_metrics: dict[str, "MetricScore"] | None = Field( None, description="Optional sub-metric breakdowns, aggregated generically by the runner" ) From d6315005d6afee200eebf09b07276a561b36211b Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Sun, 19 Apr 2026 19:39:41 -0400 Subject: [PATCH 4/9] Mark metrics as skipped when no applicable data Agent speech fidelity (S2S) and transcription accuracy key entities both have legitimate cases where no entities exist to score. Previously these returned score=0.0 with error="Aggregation failed" (for S2S) or a zero-valued score that conflated with real zero scores. Now they set skipped=True with score=None so consumers can handle the case correctly. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py | 6 +++--- .../diagnostic/transcription_accuracy_key_entities.py | 6 +++--- tests/unit/metrics/test_speech_fidelity_s2s.py | 5 +++-- .../metrics/test_transcription_accuracy_key_entities.py | 6 +++--- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py index ac7638f0..e280a7fa 100644 --- a/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py +++ b/src/eva/metrics/accuracy/agent_speech_fidelity_s2s.py @@ -128,11 +128,11 @@ async def compute(self, context: MetricContext) -> MetricScore: for tid in per_turn_ratings if per_turn_ratings[tid] is not None and per_turn_has_entities.get(tid, True) ] - avg_rating = sum(valid_ratings) / len(valid_ratings) if valid_ratings else 0.0 num_skipped_no_entities = sum(1 for v in per_turn_has_entities.values() if not v) # No valid scores to aggregate — not an error, just nothing to score skipped = not valid_ratings + avg_rating = sum(valid_ratings) / len(valid_ratings) if valid_ratings else None details: dict[str, Any] = { "variant": "s2s", @@ -140,7 +140,6 @@ async def compute(self, context: MetricContext) -> MetricScore: "num_turns": num_turns, "num_evaluated": len(valid_ratings), "num_skipped_no_entities": num_skipped_no_entities, - "skipped": skipped, "skipped_reason": "No valid ratings to aggregate" if skipped else None, "per_turn_ratings": per_turn_ratings, "per_turn_has_entities": per_turn_has_entities, @@ -151,9 +150,10 @@ async def compute(self, context: MetricContext) -> MetricScore: return MetricScore( name=self.name, - score=round(avg_rating, 3), + score=round(avg_rating, 3) if avg_rating is not None else None, normalized_score=round(aggregated_score, 3) if aggregated_score is not None else None, details=details, + skipped=skipped, ) except Exception as e: diff --git a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py index c8232867..e00678f9 100644 --- a/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py +++ b/src/eva/metrics/diagnostic/transcription_accuracy_key_entities.py @@ -116,7 +116,7 @@ async def compute(self, context: MetricContext) -> MetricScore: # Compute average raw score valid_ratings = [r for r in per_turn_ratings.values() if r is not None and r != -1.0] not_applicable = [r for r in per_turn_ratings.values() if r == -1.0] - avg_rating = sum(valid_ratings) / len(valid_ratings) if valid_ratings else 0.0 + avg_rating = sum(valid_ratings) / len(valid_ratings) if valid_ratings else None # All turns had no entities to evaluate — not an error, just nothing to score skipped = not applicable_normalized @@ -127,7 +127,7 @@ async def compute(self, context: MetricContext) -> MetricScore: return MetricScore( name=self.name, - score=round(avg_rating, 3), + score=round(avg_rating, 3) if avg_rating is not None else None, normalized_score=round(aggregated_score, 3) if aggregated_score is not None else None, details={ "judge_prompt": prompt, @@ -135,7 +135,6 @@ async def compute(self, context: MetricContext) -> MetricScore: "num_turns": len(turns_to_evaluate), "num_evaluated": num_evaluated, "num_not_applicable": len(not_applicable), - "skipped": skipped, "skipped_reason": "No key entities found in any evaluated turn" if skipped else None, "per_turn_ratings": per_turn_ratings, "per_turn_normalized": per_turn_normalized, @@ -143,6 +142,7 @@ async def compute(self, context: MetricContext) -> MetricScore: "per_turn_entity_details": per_turn_entity_details, "judge_raw_response": response_text, }, + skipped=skipped, ) except Exception as e: diff --git a/tests/unit/metrics/test_speech_fidelity_s2s.py b/tests/unit/metrics/test_speech_fidelity_s2s.py index c94d987e..d53ad39d 100644 --- a/tests/unit/metrics/test_speech_fidelity_s2s.py +++ b/tests/unit/metrics/test_speech_fidelity_s2s.py @@ -311,7 +311,7 @@ async def test_no_entity_turns_excluded_from_score(self, s2s_metric): @pytest.mark.asyncio async def test_all_turns_no_entities(self, s2s_metric): - """If all turns have no entities, it is not an error — normalized score is None.""" + """If all turns have no entities, it is not an error — scores are None.""" response = make_judge_response( [ {"turn_id": 1, "rating": 1, "has_entities": False, "explanation": "No entities"}, @@ -326,9 +326,10 @@ async def test_all_turns_no_entities(self, s2s_metric): assert result.details["num_evaluated"] == 0 assert result.details["num_skipped_no_entities"] == 2 + assert result.score is None assert result.normalized_score is None assert result.error is None - assert result.details.get("skipped") is True + assert result.skipped is True @pytest.mark.asyncio async def test_has_entities_defaults_to_true(self, s2s_metric): diff --git a/tests/unit/metrics/test_transcription_accuracy_key_entities.py b/tests/unit/metrics/test_transcription_accuracy_key_entities.py index e0a80d68..28f3e8e8 100644 --- a/tests/unit/metrics/test_transcription_accuracy_key_entities.py +++ b/tests/unit/metrics/test_transcription_accuracy_key_entities.py @@ -192,7 +192,7 @@ async def test_all_turns_have_entities(self, metric): assert result.details["num_turns"] == 2 assert result.details["num_evaluated"] == 2 assert result.details["num_not_applicable"] == 0 - assert result.details["skipped"] is False + assert result.skipped is False assert result.score == 1.0 assert result.normalized_score == 1.0 @@ -225,7 +225,7 @@ async def test_one_turn_no_entities(self, metric): assert result.details["num_turns"] == 2 assert result.details["num_evaluated"] == 2 assert result.details["num_not_applicable"] == 1 - assert result.details["skipped"] is False + assert result.skipped is False assert result.score == 1.0 @pytest.mark.asyncio @@ -248,7 +248,7 @@ async def test_all_turns_no_entities(self, metric): assert result.details["num_turns"] == 2 assert result.details["num_evaluated"] == 2 assert result.details["num_not_applicable"] == 2 - assert result.details["skipped"] is True + assert result.skipped is True assert result.normalized_score is None @pytest.mark.asyncio From 5f2f9478afb071e84a62878ee2378784625ff887 Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Sun, 19 Apr 2026 19:40:55 -0400 Subject: [PATCH 5/9] Honor skipped metric signal in validation and pass@k - validation_runner: skipped metrics no longer fail validation; they are excluded from threshold checks. - pass_at_k: skipped trials are excluded from n/c so pass@k is computed over the remaining valid trials. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/eva/orchestrator/validation_runner.py | 5 +++++ src/eva/utils/pass_at_k.py | 3 +++ tests/unit/utils/test_pass_at_k.py | 26 +++++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/src/eva/orchestrator/validation_runner.py b/src/eva/orchestrator/validation_runner.py index b94d1151..57558c5c 100644 --- a/src/eva/orchestrator/validation_runner.py +++ b/src/eva/orchestrator/validation_runner.py @@ -189,6 +189,11 @@ def _evaluate_record( failed_metrics.append(metric_name) continue + # Skipped metric (no applicable data) — exclude from validation, not a failure. + if metric_score.skipped: + logger.debug(f"Record {record_id}: Metric '{metric_name}' was skipped") + continue + score = metric_score.normalized_score if metric_score.normalized_score is not None else metric_score.score scores[metric_name] = score diff --git a/src/eva/utils/pass_at_k.py b/src/eva/utils/pass_at_k.py index b4195a2e..ade92017 100644 --- a/src/eva/utils/pass_at_k.py +++ b/src/eva/utils/pass_at_k.py @@ -162,6 +162,9 @@ def compute_pass_at_k_for_scores( for ms in per_trial_scores: if ms.error is not None: continue + # Skipped trials contribute no pass/fail signal to pass@k — exclude them. + if ms.skipped: + continue val = ms.normalized_score if ms.normalized_score is not None else ms.score valid_scores.append(val) valid_passed.append(val >= threshold) diff --git a/tests/unit/utils/test_pass_at_k.py b/tests/unit/utils/test_pass_at_k.py index 8a935b63..86e63912 100644 --- a/tests/unit/utils/test_pass_at_k.py +++ b/tests/unit/utils/test_pass_at_k.py @@ -304,3 +304,29 @@ def test_falls_back_to_raw_score(self): result = compute_pass_at_k_for_scores("test", [score], threshold=0.5, k=1) assert result.per_trial_scores == [0.7] assert result.per_trial_passed == [True] + + def test_skipped_trials_excluded_but_others_still_counted(self): + """Skipped trials excluded while others still count. + + Verifies pass@k is still computed from the remaining valid trials. + """ + scores = [ + self._make_score(0.8), # pass + MetricScore(name="test_metric", score=None, normalized_score=None, skipped=True), + self._make_score(0.3), # fail + ] + result = compute_pass_at_k_for_scores("test", scores, threshold=0.5, k=1) + + assert result is not None + assert result.n == 2 + assert result.c == 1 + assert result.per_trial_passed == [True, False] + + def test_all_trials_skipped_returns_none(self): + """If every trial was skipped (no valid scores), the metric contributes no pass@k.""" + scores = [ + MetricScore(name="test_metric", score=None, normalized_score=None, skipped=True), + MetricScore(name="test_metric", score=None, normalized_score=None, skipped=True), + ] + result = compute_pass_at_k_for_scores("test", scores, threshold=0.5, k=1) + assert result is None From 7093a0ac0216aa95ebb872f5a0b12b6f01dd620e Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Sun, 19 Apr 2026 19:41:38 -0400 Subject: [PATCH 6/9] Exclude skipped components from EVA-A_pass instead of collapsing composite Previously any None component (missing, errored, or legitimately skipped) would collapse EVA-A_pass to None, excluding the record from composite pass statistics. Now a skipped component is excluded from the pass check while remaining applicable components still determine pass/fail. Missing or errored components still collapse the composite to None, since that represents genuine data absence. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/eva/metrics/aggregation.py | 17 +++++--- tests/unit/metrics/test_aggregation.py | 56 ++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/src/eva/metrics/aggregation.py b/src/eva/metrics/aggregation.py index 6c0a203d..cd3ad296 100644 --- a/src/eva/metrics/aggregation.py +++ b/src/eva/metrics/aggregation.py @@ -113,18 +113,23 @@ def compute_record_aggregates( for comp in composites: if comp.aggregation_type == "pass": - # All components must be present; if any missing -> None + # Components missing or errored collapse the composite to None (genuine data + # absence). Components flagged as skipped are excluded from the pass check + # so they don't mask applicable components. scores: list[tuple[float, str, float]] = [] - missing = False + has_error_or_missing = False for metric_name in comp.component_metrics: - val = record_metrics.get_score(metric_name) - if val is None: - missing = True + metric = record_metrics.metrics.get(metric_name) + if metric is None or metric.error: + has_error_or_missing = True break + if metric.skipped: + continue + val = metric.normalized_score if metric.normalized_score is not None else metric.score op, thresh = comp.thresholds[metric_name] scores.append((val, op, thresh)) - if missing: + if has_error_or_missing or not scores: results[comp.name] = None else: all_pass = all(_check_threshold(v, op, th) for v, op, th in scores) diff --git a/tests/unit/metrics/test_aggregation.py b/tests/unit/metrics/test_aggregation.py index 73886260..e876142a 100644 --- a/tests/unit/metrics/test_aggregation.py +++ b/tests/unit/metrics/test_aggregation.py @@ -157,6 +157,62 @@ def test_error_metric_excluded(self): # Mean only includes non-error scores assert agg["EVA-A_mean"] == pytest.approx((0.5 + 0.95) / 2) + def test_skipped_component_excluded_from_pass(self): + """Skipped component excluded from pass check. + + Remaining components still determine pass/fail. + """ + rm = RecordMetrics( + record_id="1.1.1", + metrics={ + "task_completion": MetricScore(name="task_completion", score=1.0, normalized_score=1.0), + "faithfulness": MetricScore(name="faithfulness", score=0.8, normalized_score=0.8), + "agent_speech_fidelity": MetricScore( + name="agent_speech_fidelity", + score=None, + normalized_score=None, + skipped=True, + ), + }, + ) + agg = compute_record_aggregates(rm) + + # Skipped component is excluded; the two remaining components both pass + assert agg["EVA-A_pass"] == 1.0 + + def test_skipped_component_still_respects_other_failures(self): + """A skipped component does not mask a real failure in another component.""" + rm = RecordMetrics( + record_id="1.1.1", + metrics={ + "task_completion": MetricScore(name="task_completion", score=0.5, normalized_score=0.5), + "faithfulness": MetricScore(name="faithfulness", score=0.8, normalized_score=0.8), + "agent_speech_fidelity": MetricScore( + name="agent_speech_fidelity", score=None, normalized_score=None, skipped=True + ), + }, + ) + agg = compute_record_aggregates(rm) + + # task_completion fails (0.5 != 1.0) -> EVA-A_pass is 0.0 + assert agg["EVA-A_pass"] == 0.0 + + def test_all_components_skipped_pass_is_none(self): + """If every component is skipped, the composite is None (nothing to evaluate).""" + rm = RecordMetrics( + record_id="1.1.1", + metrics={ + "task_completion": MetricScore(name="task_completion", score=None, normalized_score=None, skipped=True), + "faithfulness": MetricScore(name="faithfulness", score=None, normalized_score=None, skipped=True), + "agent_speech_fidelity": MetricScore( + name="agent_speech_fidelity", score=None, normalized_score=None, skipped=True + ), + }, + ) + agg = compute_record_aggregates(rm) + + assert agg["EVA-A_pass"] is None + def test_agent_speech_fidelity_threshold_boundary(self): """agent_speech_fidelity uses > 0.9 (not >=), so 0.9 exactly fails.""" rm = make_record_metrics( From 691e63571d24651074e8a5ff08e3a282d5aa152e Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Mon, 20 Apr 2026 13:39:35 -0400 Subject: [PATCH 7/9] Fix turn boundary with realtime where user transcription could be too early --- src/eva/metrics/processor.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/eva/metrics/processor.py b/src/eva/metrics/processor.py index 104f011d..e5e66034 100644 --- a/src/eva/metrics/processor.py +++ b/src/eva/metrics/processor.py @@ -106,15 +106,19 @@ class _TurnExtractionState: # user_speech lands at the same turn. rollback_advance_consumed_by_user: bool = False - def advance_turn_if_needed(self, from_audio_start: bool = False) -> None: + def advance_turn_if_needed(self, from_audio_start: bool = False, bypass_hold: bool = False) -> None: """Advance turn if the assistant responded since the last user event. Called on audio_start(elevenlabs_user) and audit_log/user events. After an interruption, hold_turn suppresses one advance from audit_log/user (late STT from the interrupted session) but never blocks audio_start (the user speaking again always starts a new turn). + + bypass_hold=True is used by S2S pipelines, where audit_log/user carries the + S2S model's own transcription of the current utterance, not a late STT chunk + from the previous (interrupted) session. """ - if self.hold_turn: + if self.hold_turn and not bypass_hold: if from_audio_start: # New user speech — clear hold_turn but still advance self.hold_turn = False @@ -216,7 +220,7 @@ def _handle_audit_log_event( # advance so that hold_turn is consumed if set. if state.user_audio_open: state.assistant_spoke_in_turn = False - state.advance_turn_if_needed() + state.advance_turn_if_needed(bypass_hold=pipeline_type == PipelineType.S2S) turn = state.turn_num entry = get_entry_for_audit_log(event, turn) existing = context.transcribed_user_turns.get(turn, "") From 4596762ccc88c17ddc100204bde8060260bb107d Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Mon, 20 Apr 2026 14:00:58 -0400 Subject: [PATCH 8/9] Add bug in test --- tests/fixtures/processor_histories.json | 42 +++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/fixtures/processor_histories.json b/tests/fixtures/processor_histories.json index dbc15119..d75622d6 100644 --- a/tests/fixtures/processor_histories.json +++ b/tests/fixtures/processor_histories.json @@ -1023,5 +1023,47 @@ "assistant_interrupted_turns": [1], "user_interrupted_turns": [] } + }, + { + "id": "s2s_audit_user_cotimestamped_with_audio_start_after_interruption", + "description": "Based on record 1.1.2/trial_0. S2S pipeline with a short-overlap assistant interruption at turn 1 (sets hold_turn). The next user utterance's audit_log/user event is co-timestamped with audio_start(elevenlabs_user) and sorts first in the stable-sorted history. hold_turn is a cascade-pipeline concept (suppresses late STT chunks from the interrupted session) and must not cause audit_log/user in S2S to consume the turn advance. The 'Yeah, my confirmation is ZK3FFW.' utterance must land at turn 2, not turn 1.", + "pipeline_type": "s2s", + "history": [ + {"timestamp_ms": 1000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 1.5}}, + {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you?"}}}, + {"timestamp_ms": 2050, "source": "audit_log", "event_type": "assistant", "data": "Hello, how can I help you?"}, + {"timestamp_ms": 2500, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.5}}, + {"timestamp_ms": 2600, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Hi, I need to change my flight."}}}, + {"timestamp_ms": 2700, "source": "audit_log", "event_type": "user", "data": "Hi, I need to change my flight."}, + {"timestamp_ms": 3000, "source": "audit_log", "event_type": "assistant", "data": "Of course, what is your confirmation?"}, + {"timestamp_ms": 3001, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "framework_agent", "audio_timestamp": 3.001}}, + {"timestamp_ms": 3162, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 3.162}}, + {"timestamp_ms": 4800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "framework_agent", "audio_timestamp": 4.8}}, + {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Of course, what is your confirmation?"}}}, + {"timestamp_ms": 5800, "source": "audit_log", "event_type": "user", "data": "Yeah, my confirmation is ZK3FFW."}, + {"timestamp_ms": 5800, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 5.8}}, + {"timestamp_ms": 5900, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Yeah, my confirmation is ZK3FFW."}}}, + {"timestamp_ms": 6500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.5}}, + {"timestamp_ms": 7000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} + ], + "expected": { + "transcribed_assistant_turns": {"0": "Hello, how can I help you?", "1": "[assistant interrupts] Of course, what is your confirmation?"}, + "transcribed_user_turns": {"1": "Hi, I need to change my flight.", "2": "Yeah, my confirmation is ZK3FFW."}, + "intended_user_turns": {"1": "Hi, I need to change my flight.", "2": "Yeah, my confirmation is ZK3FFW."}, + "audio_timestamps_assistant_turns": {"0": [[1.0, 1.5]], "1": [[3.001, 4.8]]}, + "audio_timestamps_user_turns": {"1": [[2.5, 3.162]], "2": [[5.8, 6.5]]}, + "num_assistant_turns": 2, + "num_user_turns": 2, + "num_tool_calls": 0, + "conversation_trace": [ + {"role": "assistant", "content": "Hello, how can I help you?", "type": "transcribed", "turn_id": 0}, + {"role": "user", "content": "Hi, I need to change my flight.", "type": "intended", "turn_id": 1}, + {"role": "assistant", "content": "[assistant interrupts] Of course, what is your confirmation?", "type": "transcribed", "turn_id": 1}, + {"role": "user", "content": "Yeah, my confirmation is ZK3FFW.", "type": "intended", "turn_id": 2} + ], + "assistant_interrupted_turns": [1], + "user_interrupted_turns": [] + } } ] From e927574379648ab749ced158ba27bd442fe24ad5 Mon Sep 17 00:00:00 2001 From: Gabrielle Gauthier-Melancon Date: Mon, 20 Apr 2026 23:01:21 -0400 Subject: [PATCH 9/9] Add composite scores per record --- apps/analysis.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index cb2d7deb..c4d69963 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -568,6 +568,9 @@ def _collect_run_metrics(run_dir: Path) -> tuple[list[dict], list[str]]: if metric_name in _NON_NORMALIZED_METRICS: _NON_NORMALIZED_METRICS.add(col) + for comp_name, comp_value in metrics.aggregate_metrics.items(): + row[comp_name] = comp_value + rows.append(row) return rows, sorted(all_metric_names) @@ -1294,8 +1297,9 @@ def render_run_overview(run_dir: Path): leading_cols = ["record"] if has_trials: leading_cols.append("trial") + composite_cols = [c for c in _EVA_BAR_COMPOSITES if c in table_df.columns] ordered_metrics = [m for m in metric_names if m in table_df.columns] - table_df = table_df[leading_cols + ordered_metrics] + table_df = table_df[leading_cols + composite_cols + ordered_metrics] # Add link column to navigate to Record Detail def _record_link(row): @@ -1306,11 +1310,14 @@ def _record_link(row): table_df = table_df.copy() table_df.insert(0, "link", table_df.apply(_record_link, axis=1)) - table_df = table_df.rename(columns=col_rename) + composite_rename = {c: _EVA_COMPOSITE_DISPLAY[c] for c in composite_cols} + table_df = table_df.rename(columns={**col_rename, **composite_rename}) + renamed_composites = [composite_rename[c] for c in composite_cols] renamed_metrics = [col_rename[m] for m in ordered_metrics] - styled = table_df.style.map(_color_cell, subset=renamed_metrics) - styled = styled.format(dict.fromkeys(renamed_metrics, "{:.3f}"), na_rep="—") + score_cols = renamed_composites + renamed_metrics + styled = table_df.style.map(_color_cell, subset=score_cols) + styled = styled.format(dict.fromkeys(score_cols, "{:.3f}"), na_rep="—") st.dataframe( styled, hide_index=True, @@ -1323,6 +1330,17 @@ def _record_link(row): st.download_button("Download CSV", csv, file_name=f"{run_dir.name}_metrics.csv", mime="text/csv") +def _render_eva_composite_cards(metrics: RecordMetrics) -> None: + """Render EVA composite scores (EVA-A/EVA-X pass@1 and Mean) as st.metric cards.""" + composites = [(c, metrics.aggregate_metrics.get(c)) for c in _EVA_BAR_COMPOSITES] + composites = [(c, v) for c, v in composites if v is not None] + if not composites: + return + cols = st.columns(len(composites)) + for col, (name, value) in zip(cols, composites): + col.metric(_EVA_COMPOSITE_DISPLAY[name], f"{value:.3f}") + + def render_metrics_tab(metrics: RecordMetrics | None): """Render the metrics tab with judge ratings and scores.""" if not metrics: @@ -1330,6 +1348,7 @@ def render_metrics_tab(metrics: RecordMetrics | None): return st.markdown("### Metrics") + _render_eva_composite_cards(metrics) # Group metrics by category, preserving insertion order within each group grouped: dict[str, list[tuple[str, object]]] = {} @@ -1546,6 +1565,7 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat """ ) st.markdown("### Metrics Overview") + _render_eva_composite_cards(metrics) grouped: dict[str, list[str]] = {} for name in all_top_metrics: