2 changes: 1 addition & 1 deletion configs/prompts/simulation.yaml
@@ -359,7 +359,7 @@ user_simulator:

IMPORTANT: Before ending the conversation, confirm with the agent that there are no outstanding actions. The end_call tool should only be called in a turn that is a brief goodbye — never in the same turn where you are providing the agent with data, an identifier, a request to transfer to a live agent, an approval to proceed, or any kind of additional information.

-system_prompt_hr: |
+system_prompt_medical_hr: |
You are an employee or credentialed provider at a medical organization calling HR to complete an administrative task.

You are communicating through a voice channel. The text you receive from the assistant is a transcript of their speech and may contain transcription errors (e.g., misheard words, garbled phrases). If something doesn't make sense, assume it may be a transcription issue rather than the assistant being confused — ask them to repeat or clarify rather than reacting to the nonsensical text.
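The renamed key ties the prompt to the domain name used by the CLI changes below. A minimal sketch of how such a domain-keyed prompt might be resolved (the load path, the user_simulator section, and the system_prompt_<domain> pattern are assumptions here, not EVA's confirmed API):

import yaml  # PyYAML

def system_prompt_for(domain: str, path: str = "configs/prompts/simulation.yaml") -> str:
    # Hypothetical helper: assumes prompt keys follow system_prompt_<domain>,
    # e.g. system_prompt_medical_hr as renamed in this file.
    with open(path) as f:
        prompts = yaml.safe_load(f)
    return prompts["user_simulator"][f"system_prompt_{domain}"]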
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "eva"
version = "0.1.2"
version = "0.1.3"
description = "A New End-to-end Framework for Evaluating Voice Agents (EVA)"
authors = [
{ name = "Tara Bogavelli" },
97 changes: 91 additions & 6 deletions src/eva/cli.py
@@ -5,26 +5,111 @@
"""

import asyncio
import os
import sys

from pydantic import ValidationError


def _extract_domain_spec() -> tuple[str | None, bool]:
"""Return (raw_domain_spec, came_from_argv).

Looks at --domain / --domain=... in sys.argv first, then EVA_DOMAIN env var.
"""
    argv = sys.argv
    for i, arg in enumerate(argv):
        if arg == "--domain" and i + 1 < len(argv):
            return argv[i + 1], True
        if arg.startswith("--domain="):
            return arg.split("=", 1)[1], True
    env = os.environ.get("EVA_DOMAIN")
    if env is not None:
        return env, False
    return None, False


def _strip_domain_from_argv() -> None:
"""Remove --domain (and its value) from sys.argv in place."""
    argv = sys.argv
    out = [argv[0]]
    i = 1
    while i < len(argv):
        a = argv[i]
        if a == "--domain":
            i += 2
            continue
        if a.startswith("--domain="):
            i += 1
            continue
        out.append(a)
        i += 1
    sys.argv = out


def _has_explicit_run_id() -> bool:
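    # True when the user pinned a run id via --run-id or EVA_RUN_ID;
    # main() then suffixes it per domain so multi-domain runs don't collide.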
return any(a == "--run-id" or a.startswith("--run-id=") for a in sys.argv) or "EVA_RUN_ID" in os.environ


def main():
"""Entry point for the `eva` console script."""
# Import config first (lightweight) for fast --help and validation errors.
# Heavy deps (pipecat, litellm, etc.) are imported only in run_benchmark.
from eva.models.config import RunConfig

-    try:
-        config = RunConfig(_cli_parse_args=True, _env_file=".env")
-    except ValidationError as e:
-        print(e, file=sys.stderr)
-        sys.exit(1)
    spec, _ = _extract_domain_spec()
    domains = [d.strip() for d in spec.split(",")] if spec and "," in spec else None
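    # A single --domain value (no comma) leaves domains as None, so it flows
    # through the unchanged single-domain path below.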

    if domains is None:
        # Single-domain path — unchanged behavior.
        try:
            config = RunConfig(_cli_parse_args=True, _env_file=".env")
        except ValidationError as e:
            print(e, file=sys.stderr)
            sys.exit(1)

        from eva.run_benchmark import run_benchmark

        sys.exit(asyncio.run(run_benchmark(config)))

    # Multi-domain path: loop, one RunConfig + run_benchmark per domain.
    # Dedupe preserving order, drop empties.
    seen: set[str] = set()
    ordered: list[str] = []
    for d in domains:
        if d and d not in seen:
            seen.add(d)
            ordered.append(d)
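    # Illustrative: "medical_hr, retail,,medical_hr" -> ["medical_hr", "retail"]
    # (where "retail" stands in for any hypothetical second domain).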

    explicit_run_id = _has_explicit_run_id()
    original_env = os.environ.get("EVA_DOMAIN")
    _strip_domain_from_argv()  # prevent pydantic-settings from re-reading the comma list

    from eva.run_benchmark import run_benchmark

-    sys.exit(asyncio.run(run_benchmark(config)))
    worst_exit = 0
    try:
        for domain in ordered:
            os.environ["EVA_DOMAIN"] = domain
            try:
                config = RunConfig(_cli_parse_args=True, _env_file=".env")
            except ValidationError as e:
                print(f"[domain={domain}] {e}", file=sys.stderr)
                worst_exit = max(worst_exit, 1)
                continue

            if explicit_run_id:
                # Suffix a user-pinned run id so per-domain runs don't collide
                # (auto-generated ids are assumed to differ per invocation).
                config.run_id = f"{config.run_id}_{domain}"

print(f"\n=== Running domain: {domain} ===\n", file=sys.stderr)
exit_code = asyncio.run(run_benchmark(config))
worst_exit = max(worst_exit, exit_code)
    finally:
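        # Restore the caller's EVA_DOMAIN (or unset it) so the loop leaves the
        # process environment as it found it.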
        if original_env is None:
            os.environ.pop("EVA_DOMAIN", None)
        else:
            os.environ["EVA_DOMAIN"] = original_env

    sys.exit(worst_exit)


if __name__ == "__main__":
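Taken together, the resulting CLI behavior (illustrative; "retail" is a hypothetical second domain, and per _extract_domain_spec the flag takes precedence over the env var when both are set):

# eva --domain medical_hr                     -> single run, behavior unchanged
# eva --domain medical_hr,retail              -> one run per domain, worst exit code returned
# EVA_DOMAIN=medical_hr,retail eva            -> same as the previous line
# eva --domain medical_hr,retail --run-id r1  -> run ids r1_medical_hr and r1_retail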