From f279d46f5d73d72ac90dc6c8f45298d4383ab57d Mon Sep 17 00:00:00 2001 From: Arvind Rk Date: Thu, 16 Oct 2025 11:13:39 -0700 Subject: [PATCH 1/8] updates --- fern/docs.yml | 15 + fern/observability/evals-advanced.mdx | 1124 ++++++++++++++++++ fern/observability/evals-quickstart.mdx | 1404 +++++++++++++++++++++++ 3 files changed, 2543 insertions(+) create mode 100644 fern/observability/evals-advanced.mdx create mode 100644 fern/observability/evals-quickstart.mdx diff --git a/fern/docs.yml b/fern/docs.yml index 02b352f8f..1ea186d7b 100644 --- a/fern/docs.yml +++ b/fern/docs.yml @@ -280,6 +280,21 @@ navigation: path: assistants/examples/multilingual-agent.mdx icon: fa-light fa-globe + - section: Observability + contents: + - section: Evals + icon: fa-light fa-clipboard-check + contents: + - page: Quickstart + path: observability/evals-quickstart.mdx + icon: fa-light fa-clipboard-check + - page: Advanced + path: observability/evals-advanced.mdx + icon: fa-light fa-clipboard-check + - page: Boards + path: observability/boards-quickstart.mdx + icon: fa-light fa-chart-line + - section: Squads contents: - page: Quickstart diff --git a/fern/observability/evals-advanced.mdx b/fern/observability/evals-advanced.mdx new file mode 100644 index 000000000..1e6b801d3 --- /dev/null +++ b/fern/observability/evals-advanced.mdx @@ -0,0 +1,1124 @@ +--- +title: Advanced eval testing +subtitle: Master testing strategies and best practices for production AI agents +slug: observability/evals-advanced +--- + +## Overview + +This guide covers advanced evaluation strategies, testing patterns, and best practices for building robust test suites that ensure your AI agents work reliably in production. + +**You'll learn:** + +- Strategic testing approaches (smoke, regression, edge case) +- Testing patterns for different scenarios +- Performance optimization techniques +- Maintenance and CI/CD integration strategies +- Advanced troubleshooting methods + +## Testing strategies + +### Smoke tests + +Quick validation that core functionality works. Run these first to catch obvious issues. + +**Purpose:** Verify assistant responds and basic conversation flow works. + +```json +{ + "name": "Smoke Test - Basic Response", + "description": "Verify assistant responds to simple greeting", + "type": "chat.mockConversation", + "messages": [ + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "judgePlan": { + "type": "regex", + "content": ".+" + } + } + ] +} +``` + +**Characteristics:** + +- Minimal validation (just check for any response) +- Fast execution (1-2 turns) +- Run before detailed tests +- Exit early if smoke tests fail + +**When to use:** + +- Before running expensive test suites +- After deploying configuration changes +- As health checks in monitoring +- Quick validation during development + +### Regression tests + +Ensure fixes and updates don't break existing functionality. + +**Purpose:** Validate that known issues stay fixed and features keep working. + + + + 1. Create evaluation named with "Regression: " prefix + 2. Include issue ticket number in description + 3. Add exact scenario that previously failed + 4. 
Validate the fix still works + + Example: + - Name: "Regression: Date Parsing Bug #1234" + - Description: "Verify dates like '3/15' parse correctly after bug fix" + + + +```bash +curl -X POST "https://api.vapi.ai/eval" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Regression: Date Parsing Bug #1234", + "description": "Verify dates like 3/15 are parsed correctly after fix", + "type": "chat.mockConversation", + "messages": [ + { + "role": "user", + "content": "Book me for 3/15" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [{ + "name": "bookAppointment", + "arguments": { + "date": "2025-03-15" + } + }] + } + } + ] + }' +``` + + + +**Best practices:** + +- Name tests after bugs they prevent +- Include ticket/issue numbers in descriptions +- Add regression tests when fixing bugs +- Run full regression suite before major releases +- Archive tests only when features are removed + +### Edge case testing + +Test boundary conditions and unusual inputs. + +**Common edge cases to test:** + + + + ```json + { + "messages": [ + {"role": "user", "content": ""}, + {"role": "assistant", "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [{ + "role": "system", + "content": "PASS if response asks for clarification politely. Output: pass or fail" + }] + } + }} + ] + } + ``` + + + + ```json + { + "messages": [ + { + "role": "user", + "content": "I need help with... (repeat 1000 times)" + }, + { + "role": "assistant", + "judgePlan": { + "type": "regex", + "content": ".+" + }, + "continuePlan": { + "exitOnFailureEnabled": true + } + } + ] + } + ``` + + + + ```json + { + "messages": [ + { + "role": "user", + "content": "My name is François José 王明" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [{ + "role": "system", + "content": "PASS if response correctly acknowledges the name with special characters. Output: pass or fail" + }] + } + } + } + ] + } + ``` + + + + ```json + { + "messages": [ + { + "role": "user", + "content": "asdfghjkl" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [{ + "role": "system", + "content": "PASS if response asks for clarification without being rude. Output: pass or fail" + }] + } + } + } + ] + } + ``` + + + + ```json + { + "messages": [ + {"role": "user", "content": "Book appointment"}, + {"role": "assistant", "judgePlan": {"type": "regex", "content": ".*appointment.*"}}, + {"role": "user", "content": "Actually, cancel that. I need tech support."}, + {"role": "assistant", "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [{ + "role": "system", + "content": "PASS if response pivots to tech support without confusion. Output: pass or fail" + }] + } + }} + ] + } + ``` + + + +**Edge case categories:** + +- **Input boundaries:** Empty, maximum length, special characters +- **Data formats:** Invalid dates, malformed phone numbers, unusual names +- **Conversation patterns:** Interruptions, topic changes, contradictions +- **Timing:** Very fast responses, long pauses, timeout scenarios + +## Testing patterns + +### Happy path testing + +Validate ideal user journeys where everything works correctly. + +**Structure:** + +1. User provides clear, complete information +2. Assistant responds appropriately +3. 
Tools execute successfully +4. Conversation completes with desired outcome + +**Example: Perfect booking flow** + +```json +{ + "name": "Happy Path - Complete Booking", + "description": "User provides all info clearly, booking succeeds", + "type": "chat.mockConversation", + "messages": [ + { + "role": "user", + "content": "I'd like to book an appointment" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "system", + "content": "PASS if response asks for date/time preferences. Output: pass or fail" + } + ] + } + } + }, + { + "role": "user", + "content": "Next Monday at 2pm please" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [ + { + "name": "bookAppointment", + "arguments": { + "date": "2025-01-20", + "time": "14:00" + } + } + ] + } + }, + { + "role": "tool", + "content": "{\"status\": \"success\", \"confirmationId\": \"APT-12345\"}" + }, + { + "role": "assistant", + "judgePlan": { + "type": "regex", + "content": ".*(confirmed|booked).*APT-12345.*" + } + } + ] +} +``` + +**Happy path coverage:** + +- Test primary user goals +- Verify expected tool executions +- Validate success messages +- Confirm data accuracy + +### Error handling testing + +Test how your assistant handles failures gracefully. + +**Tool failure scenarios:** + +```json +{ + "name": "Error Handling - Booking Unavailable", + "messages": [ + { + "role": "user", + "content": "Book Monday at 2pm" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [{ "name": "bookAppointment" }] + } + }, + { + "role": "tool", + "content": "{\"status\": \"error\", \"message\": \"Time slot unavailable\"}" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "system", + "content": "Evaluate: {{messages[-1]}}\n\nPASS if:\n- Response acknowledges the time is unavailable\n- Response offers alternatives or asks for different time\n- Tone remains helpful (not apologetic to excess)\n\nFAIL if:\n- Response ignores the error\n- Response doesn't offer next steps\n- Tone is frustrated or rude\n\nOutput: pass or fail" + } + ] + } + } + } + ] +} +``` + +**Invalid input handling:** + +```json +{ + "name": "Error Handling - Invalid Date Format", + "messages": [ + { + "role": "user", + "content": "Book me for the 45th of Octember" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "system", + "content": "PASS if response politely asks for valid date without mocking user. Output: pass or fail" + } + ] + } + } + } + ] +} +``` + +**API timeout simulation:** + +```json +{ + "name": "Error Handling - Tool Timeout", + "messages": [ + { + "role": "user", + "content": "Check my order status" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [{ "name": "checkOrderStatus" }] + } + }, + { + "role": "tool", + "content": "{\"status\": \"error\", \"message\": \"Request timeout\"}" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "system", + "content": "PASS if response acknowledges technical issue and suggests retry or alternative. 
Output: pass or fail" + } + ] + } + } + } + ] +} +``` + +**Error categories to test:** + +- Tool/API failures +- Invalid user input +- Timeout scenarios +- Rate limit errors +- Partial data availability +- Permission/authorization issues + +### Boundary testing + +Test limits and thresholds of your system. + +**Maximum conversation length:** + +```json +{ + "name": "Boundary - Max Turns", + "description": "Test assistant handles long conversations (20+ turns)", + "messages": [ + { "role": "user", "content": "Question 1" }, + { "role": "assistant", "judgePlan": { "type": "regex", "content": ".+" } }, + { "role": "user", "content": "Question 2" }, + { "role": "assistant", "judgePlan": { "type": "regex", "content": ".+" } }, + // ... repeat up to boundary ... + { "role": "user", "content": "Final question" }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "system", + "content": "PASS if response is coherent and maintains context from earlier conversation. Output: pass or fail" + } + ] + } + } + } + ] +} +``` + +**Rate limits:** + +Test behavior at or near rate limits: + +- Multiple tool calls in succession +- Rapid user input +- Large data processing requests + +**Data size boundaries:** + +```json +{ + "name": "Boundary - Large Data Response", + "messages": [ + { + "role": "user", + "content": "Get all customer records" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [{ "name": "getAllCustomers" }] + } + }, + { + "role": "tool", + "content": "{\"customers\": [/* 1000 customer objects */]}" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "system", + "content": "PASS if response summarizes data rather than reading full list. Output: pass or fail" + } + ] + } + } + } + ] +} +``` + +## Best practices + +### Evaluation design principles + + + + Each evaluation should test one specific behavior or feature. + + ✅ **Good:** "Test greeting acknowledgment" + + ❌ **Bad:** "Test greeting, booking, and error handling" + + +{" "} + + + Use descriptive names that explain what's being tested. ✅ **Good:** "Booking + - Validates Date Format" ❌ **Bad:** "Test 1" or "Eval ABC" + + +{" "} + + + Document why the test exists and what it validates. Include context: business + requirement, bug ticket, or feature spec. + + + + Keep evaluations focused (5-10 turns max). + + Split complex scenarios into multiple targeted tests. 
+ + + +### Validation approach selection + +Choose the right judge type for each scenario: + + + + **Ideal for:** + - Critical business data (confirmation IDs, totals, dates) + - Tool call validation with specific arguments + - Compliance-required exact wording + - Success/failure status messages + + **Example:** Booking confirmation ID must be exact + ```json + { + "judgePlan": { + "type": "exact", + "content": "Your confirmation ID is APT-12345" + } + } + ``` + + + + **Ideal for:** + - Responses with variable data (names, dates, IDs) + - Pattern matching (email formats, phone numbers) + - Flexible phrasing with specific keywords + - Multiple acceptable phrasings + + **Example:** Confirmation with variable ID format + ```json + { + "judgePlan": { + "type": "regex", + "content": ".*confirmation (ID|number|code): [A-Z]{3}-[0-9]{5}.*" + } + } + ``` + + + + **Ideal for:** + - Semantic meaning validation + - Tone and sentiment evaluation + - Contextual appropriateness + - Complex multi-factor criteria + - Helpfulness assessment + + **Example:** Validate polite rejection + ```json + { + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [{ + "role": "system", + "content": "PASS if response politely declines without being rude and offers alternative. Output: pass or fail" + }] + } + } + } + ``` + + + +**Decision tree:** + +``` +Is the exact wording critical? +├─ Yes → Use Exact Match +└─ No → Does it follow a pattern? + ├─ Yes → Use Regex + └─ No → Does it require understanding context/tone? + ├─ Yes → Use AI Judge + └─ No → Use Regex with flexible pattern +``` + +### Performance optimization + +**Minimize test execution time:** + +1. **Use exit-on-failure for early steps:** + +```json +{ + "continuePlan": { + "exitOnFailureEnabled": true + } +} +``` + +Stops test immediately when critical validation fails. + +2. **Run critical tests first:** + Organize test suites so smoke tests and critical validations run before expensive tests. + +3. **Keep conversations focused:** + Aim for 5-10 turns maximum. Split longer scenarios into multiple tests. + +4. **Batch related tests:** + Group similar evaluations to run sequentially rather than one-off. + +5. **Optimize AI judge prompts:** + +- Use faster models (gpt-3.5-turbo) for simple validations +- Use advanced models (gpt-4o) only for complex semantic evaluation +- Keep prompts concise and specific + +**Performance comparison:** + +| Judge Type | Speed | Cost | Use Case | +| ------------ | ----------- | ---------- | ---------------------- | +| Exact | ⚡⚡⚡ Fast | $ Low | Critical exact matches | +| Regex | ⚡⚡ Fast | $ Low | Pattern matching | +| AI (GPT-3.5) | ⚡ Medium | $$ Medium | Simple semantic checks | +| AI (GPT-4) | ⏱ Slower | $$$ Higher | Complex evaluation | + +### Maintenance strategies + +**Version control your evaluations:** + +Store evaluation definitions alongside your codebase: + +```bash +/tests + /evals + /greeting + - basic-greeting.json + - multilingual-greeting.json + /booking + - happy-path-booking.json + - error-handling-booking.json + /regression + - date-parsing-bug-1234.json +``` + +**Regular review cycle:** + + + + Investigate all failures. Update tests if expectations changed, or fix assistant if behavior regressed. + + +{" "} + + + Review test suite completeness: - All critical user flows covered? - New + features have tests? - Deprecated features removed? 
+ + + + - Remove duplicate tests + - Update outdated validation criteria + - Optimize slow-running tests + - Document test rationale + + + +**Update tests when:** + +- Assistant prompts or behavior change intentionally +- New features are added +- Bugs are fixed (add regression tests) +- User feedback reveals edge cases +- Business requirements evolve + +**Deprecation strategy:** + +Don't delete tests immediately when features change: + +1. Mark test as "deprecated" in description +2. Update expected behavior to match new requirements +3. Run for one release cycle to verify +4. Archive after confirmed stable + +### CI/CD integration + +Automate evaluation runs in your deployment pipeline. + +**Basic workflow:** + +```yaml +# .github/workflows/test-assistant.yml +name: Test Assistant Changes + +on: + pull_request: + paths: + - "assistants/**" + - "prompts/**" + +jobs: + run-evals: + runs-on: ubuntu-latest + steps: + - name: Run critical evals + run: | + # Run smoke tests + curl -X POST "https://api.vapi.ai/eval/run" \ + -H "Authorization: Bearer ${{ secrets.VAPI_API_KEY }}" \ + -d '{"evalId": "$SMOKE_TEST_ID", "target": {...}}' + + # Check results + # Fail build if tests fail +``` + +**Advanced patterns:** + + + + Run full eval suite against staging before production deploy: + + ```bash + # Run all evals against staging assistant + for eval_id in $EVAL_IDS; do + run_result=$(curl -X POST "https://api.vapi.ai/eval/run" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -d "{\"evalId\": \"$eval_id\", \"target\": {\"type\": \"assistant\", \"assistantId\": \"$STAGING_ASSISTANT_ID\"}}") + + # Check if passed + status=$(echo $run_result | jq -r '.results[0].status') + if [ "$status" != "pass" ]; then + echo "Eval $eval_id failed!" + exit 1 + fi + done + ``` + + + + Run multiple evals concurrently to speed up CI: + + ```bash + # Run evals in parallel + for eval_id in $EVAL_IDS; do + (curl -X POST "https://api.vapi.ai/eval/run" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -d "{\"evalId\": \"$eval_id\", ...}" > results_$eval_id.json) & + done + wait + + # Aggregate results + for result_file in results_*.json; do + # Check each result + done + ``` + + +{" "} + + + Block deployment if test pass rate falls below threshold: ```bash # Calculate + pass rate total_tests=10 passed_tests=$(grep -c '"status":"pass"' + all_results.json) pass_rate=$((passed_tests * 100 / total_tests)) if [ + $pass_rate -lt 95 ]; then echo "Pass rate $pass_rate% below threshold 95%" + exit 1 fi ``` + + + + Run full regression suite nightly: + + ```yaml + # .github/workflows/nightly-regression.yml + on: + schedule: + - cron: '0 2 * * *' # 2 AM daily + + jobs: + regression-suite: + runs-on: ubuntu-latest + steps: + - name: Run regression tests + run: ./scripts/run-regression-suite.sh + + - name: Notify on failures + if: failure() + run: | + # Send Slack notification + # Create GitHub issue + ``` + + + +## Advanced troubleshooting + +### Debugging failed evaluations + +**Step-by-step investigation:** + + + + Check `judge.failureReason` for specific details: + + ```json + { + "judge": { + "status": "fail", + "failureReason": "Expected exact match: 'confirmed' but got: 'booked'" + } + } + ``` + + This tells you exactly what differed. + + +{" "} + + + Look at `results[0].messages` to see complete interaction: - What did the user + actually say? - How did the assistant respond? - Were tool calls made + correctly? - Did tool responses contain expected data? 
+ + +{" "} + + + For exact match failures: - Check for extra spaces or newlines - Verify + punctuation matches exactly - Look for case sensitivity issues For tool call + failures: - Verify argument types (string vs number) - Check for extra/missing + arguments - Validate argument values + + +{" "} + + + For regex: - Test pattern with online validators - Try pattern against actual + response - Check for escaped special characters For AI judge: - Test prompt + with known good/bad examples - Verify binary pass/fail criteria - Check for + ambiguous requirements + + + + Test the assistant interactively: + - Use same input as eval + - Compare live behavior to eval results + - Check if issue is with assistant or eval validation + + + +### Common failure patterns + + + + **Problem:** Expected "Hello, how can I help?" but got "Hello, how may I help?" + + **Solutions:** + - Switch to regex for flexibility: `Hello, how (can|may) I help\?` + - Use AI judge for semantic matching + - Update expected value if new phrasing is acceptable + + +{" "} + + + **Problem:** Arguments have different types or extra fields **Solutions:** - + Check argument types: `"14:00"` (string) vs `14` (number) - Use partial + matching: omit `arguments` to match only function name - Normalize data + formats in tool implementation + + +{" "} + + + **Problem:** Same response sometimes passes, sometimes fails **Solutions:** - + Make criteria more specific and binary - Add explicit examples of pass/fail + cases in prompt - Use temperature=0 for deterministic evaluation - Switch to + regex if pattern-based validation works + + +{" "} + + + **Problem:** Eval status stuck in "running" **Solutions:** - Check assistant + configuration for errors - Verify tool endpoints are accessible - Reduce + conversation complexity - Check for infinite loops in assistant logic + + + + **Problem:** Pattern seems correct but fails + + **Solutions:** + - Escape special regex characters: `.`, `?`, `*`, `+`, `(`, `)` + - Use `.*` for flexible matching around keywords + - Test pattern with online regex validators + - Check for hidden characters or unicode + + + +### Debugging tools and techniques + +**Use structured logging:** + +Track eval executions systematically: + +```javascript +{ + "timestamp": "2024-01-15T10:30:00Z", + "evalId": "eval-123", + "evalName": "Booking Flow Test", + "runId": "run-456", + "target": "assistant-789", + "result": "fail", + "failedStep": 3, + "failureReason": "Tool call mismatch", + "actualBehavior": "Called cancelAppointment instead of bookAppointment" +} +``` + +**Isolate variables:** + +When tests fail inconsistently: + +1. Run same eval multiple times +2. Test with different assistants (A/B comparison) +3. Simplify conversation to minimum reproduction +4. 
Check for race conditions or timing issues + +**Progressive validation:** + +Build up complexity gradually: + +```json +// Step 1: Verify basic response +{"judgePlan": {"type": "regex", "content": ".+"}} + +// Step 2: Verify contains keyword +{"judgePlan": {"type": "regex", "content": ".*appointment.*"}} + +// Step 3: Verify exact format +{"judgePlan": {"type": "exact", "content": "Appointment confirmed"}} +``` + +## Troubleshooting reference + +### Status and error codes + +| Status | Ended Reason | Meaning | Action | +| ------- | --------------------- | -------------------------------- | ------------------------------------- | +| ended | mockConversation.done | ✅ Test completed normally | Check results[0].status for pass/fail | +| ended | assistant-error | ❌ Assistant configuration error | Fix assistant setup, re-run | +| ended | pipeline-error-\* | ❌ Provider API error | Check provider status, API keys | +| running | - | ⏳ Test in progress | Wait or check for timeout | +| queued | - | ⏳ Test waiting to start | Normal, should start soon | + +### Quick diagnostic checklist + + + **When an eval fails, check:** - [ ] `endedReason` is "mockConversation.done" + - [ ] Assistant works correctly in manual testing - [ ] Tool endpoints are + accessible - [ ] Validation criteria match actual behavior - [ ] Regex + patterns are properly escaped - [ ] AI judge prompts are specific and binary - + [ ] Arguments match expected types (string vs number) - [ ] API keys and + permissions are valid - [ ] No rate limits or quota issues + + +### Getting help + +**Include these details when reporting issues:** + +- Eval ID and run ID +- Full `endedReason` value +- Conversation transcript (`results[0].messages`) +- Expected vs actual behavior +- Assistant/squad configuration +- Provider and model being used + +**Resources:** + +- [Eval API Reference](/api-reference/eval/create) +- [Discord Community](https://discord.gg/pUFNcf2WmH) - #testing channel +- [Support](mailto:support@vapi.ai) - Include eval run ID + +## Next steps + + + + Return to quickstart guide for basic evaluation setup + + +{" "} + + Learn to build and configure assistants for testing + + +{" "} + + Build custom tools and test their behavior + + + + Complete API documentation for evaluations + + + +## Summary + + +**Key takeaways for advanced eval testing:** + +**Testing strategy:** + +- Use smoke tests before comprehensive suites +- Build regression tests when fixing bugs +- Cover edge cases systematically + +**Validation selection:** + +- Exact match for critical data +- Regex for pattern matching +- AI judge for semantic evaluation + +**Performance:** + +- Exit early on critical failures +- Keep conversations focused (5-10 turns) +- Batch related tests together + +**Maintenance:** + +- Version control evaluations +- Review failures promptly +- Update tests with features +- Document test purpose clearly + +**CI/CD:** + +- Automate critical tests in pipelines +- Use staging for full suite validation +- Set quality gate thresholds +- Run regression suites regularly + diff --git a/fern/observability/evals-quickstart.mdx b/fern/observability/evals-quickstart.mdx new file mode 100644 index 000000000..600c68e1e --- /dev/null +++ b/fern/observability/evals-quickstart.mdx @@ -0,0 +1,1404 @@ +--- +title: Evals quickstart +subtitle: Get started with AI agent testing in 5 minutes +slug: observability/evals-quickstart +--- + +## Overview + +This quickstart guide will help you set up automated testing for your AI assistants and squads. 
In just a few minutes, you'll create mock conversations, define expected behaviors, and validate your agents work correctly before production. + +### What are Evals? + +Evals is Vapi's AI agent testing framework that enables you to systematically test assistants and squads using mock conversations with automated validation. Test your agents by: + +1. **Creating mock conversations** - Define user messages and expected assistant responses +2. **Validating behavior** - Use exact match, regex patterns, or AI-powered judging +3. **Testing tool calls** - Verify function calls with specific arguments +4. **Running automated tests** - Execute tests and receive detailed pass/fail results +5. **Debugging failures** - Review full conversation transcripts with evaluation details + +### When are Evals useful? + +Evals help you maintain quality and catch issues early: + +- **Pre-deployment testing** - Validate new assistant configurations before going live +- **Regression testing** - Ensure prompt or tool changes don't break existing behaviors +- **Conversation flow validation** - Test multi-turn interactions and complex scenarios +- **Tool calling verification** - Validate function calls with correct arguments +- **Squad handoff testing** - Ensure smooth transitions between squad members +- **CI/CD integration** - Automate quality gates in your deployment pipeline + +### What you'll build + +An evaluation suite for an appointment booking assistant that tests: + +- Greeting and initial response validation +- Tool call execution with specific arguments +- Response pattern matching with regex +- Semantic validation using AI judges +- Multi-turn conversation flows + +## Prerequisites + + + + Sign up at [dashboard.vapi.ai](https://dashboard.vapi.ai) + + + Get your API key from **API Keys** in sidebar + + + + + You'll also need an existing assistant or squad to test. You can create one in + the Dashboard or use the API. + + +## Step 1: Create your first evaluation + +Define a mock conversation to test your assistant's greeting behavior. + + + + + + 1. Log in to [dashboard.vapi.ai](https://dashboard.vapi.ai) + 2. Click on **Evals** in the left sidebar (under Observability) + 3. Click **Create Evaluation** + + + + 1. **Name**: Enter "Greeting Test" + 2. **Description**: Add "Verify assistant greets users appropriately" + 3. **Type**: Automatically set to "chat.mockConversation" + + + + 1. Click **Add Message** + 2. Select **User** message type + 3. Enter content: "Hello" + 4. Click **Add Message** again + 5. Select **Assistant** message type + 6. Click **Enable Evaluation** toggle + 7. Select **Exact Match** as judge type + 8. Enter expected content: "Hello! How can I help you today?" + 9. Click **Save Evaluation** + + + + + Your evaluation is now saved. You can run it against any assistant or squad. + + + + + +```bash +curl -X POST "https://api.vapi.ai/eval" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Greeting Test", + "description": "Verify assistant greets users appropriately", + "type": "chat.mockConversation", + "messages": [ + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "content": "Hello! How can I help you today?" 
+ } + } + ] + }' +``` + +**Response:** + +```json +{ + "id": "550e8400-e29b-41d4-a716-446655440000", + "orgId": "org-123", + "type": "chat.mockConversation", + "name": "Greeting Test", + "description": "Verify assistant greets users appropriately", + "messages": [...], + "createdAt": "2024-01-15T09:30:00Z", + "updatedAt": "2024-01-15T09:30:00Z" +} +``` + +Save the returned `id` - you'll need it to run the evaluation. + +For complete API details, see [Create Eval](/api-reference/eval/create). + + + + + + **Message structure:** Each conversation turn has a `role` (user, assistant, + system, or tool). Assistant messages with `judgePlan` define what to validate. + + +## Step 2: Run your evaluation + +Execute the evaluation against your assistant or squad. + + + + + + 1. Navigate to **Evals** in the sidebar + 2. Click on "Greeting Test" from your evaluations list + + + + 1. In the evaluation detail page, find the **Run Test** section + 2. Select **Assistant** or **Squad** as the target type + 3. Choose your assistant/squad from the dropdown + 4. Click **Run Evaluation** + 5. Watch real-time progress as the test executes + + + + Results appear automatically when the test completes: + - ✅ **Green checkmark** indicates evaluation passed + - ❌ **Red X** indicates evaluation failed + - Click **View Details** to see full conversation transcript + + + + + + +**Create an eval run:** + +```bash +curl -X POST "https://api.vapi.ai/eval/run" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "evalId": "550e8400-e29b-41d4-a716-446655440000", + "target": { + "type": "assistant", + "assistantId": "your-assistant-id" + } + }' +``` + +**Response:** + +```json +{ + "id": "eval-run-123", + "evalId": "550e8400-e29b-41d4-a716-446655440000", + "orgId": "org-123", + "status": "queued", + "createdAt": "2024-01-15T09:35:00Z", + "updatedAt": "2024-01-15T09:35:00Z" +} +``` + +**Check results:** + +```bash +curl -X GET "https://api.vapi.ai/eval/run/eval-run-123" \ + -H "Authorization: Bearer $VAPI_API_KEY" +``` + +For complete API details, see [Create Eval Run](/api-reference/eval/run) and [Get Eval Run](/api-reference/eval/get-run). + + + + + + You can also run evaluations with transient assistant or squad configurations + by providing `assistant` or `squad` objects instead of IDs in the target. + + +## Step 3: Understand test results + +Learn to interpret evaluation results and identify issues. + +### Successful evaluation + +When all checks pass, you'll see: + +```json +{ + "id": "eval-run-123", + "evalId": "550e8400-e29b-41d4-a716-446655440000", + "status": "ended", + "endedReason": "mockConversation.done", + "results": [ + { + "status": "pass", + "messages": [ + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I help you today?", + "judge": { + "status": "pass" + } + } + ] + } + ] +} +``` + +**Pass criteria:** + +- `status` is "ended" +- `endedReason` is "mockConversation.done" +- `results[0].status` is "pass" +- All `judge.status` values are "pass" + +### Failed evaluation + +When validation fails, you'll see details: + +```json +{ + "status": "ended", + "endedReason": "mockConversation.done", + "results": [ + { + "status": "fail", + "messages": [ + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hi there! What can I do for you?", + "judge": { + "status": "fail", + "failureReason": "Expected exact match: 'Hello! How can I help you today?' but got: 'Hi there! 
What can I do for you?'" + } + } + ] + } + ] +} +``` + +**Failure indicators:** + +- `results[0].status` is "fail" +- `judge.status` is "fail" +- `judge.failureReason` explains why validation failed + + + If `endedReason` is not "mockConversation.done", the test encountered an error + (like "assistant-error" or "pipeline-error-openai-llm-failed"). Check your + assistant configuration. + + +## Step 4: Test tool/function calls + +Validate that your assistant calls functions with correct arguments. + +### Basic tool call validation + +Test appointment booking with exact argument matching: + + + + 1. Create new evaluation: "Appointment Booking Test" + 2. Add user message: "Book me an appointment for next Monday at 2pm" + 3. Add assistant message with evaluation enabled + 4. Select **Exact Match** judge type + 5. Click **Add Tool Call** + 6. Enter function name: "bookAppointment" + 7. Add arguments: + - `date`: "2025-01-20" + - `time`: "14:00" + 8. Add tool response message: + - Type: **Tool** + - Content: `{"status": "success", "confirmationId": "APT-12345"}` + 9. Add final assistant message to verify confirmation + 10. Save evaluation + + + +```bash +curl -X POST "https://api.vapi.ai/eval" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Appointment Booking Test", + "type": "chat.mockConversation", + "messages": [ + { + "role": "user", + "content": "Book me an appointment for next Monday at 2pm" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [{ + "name": "bookAppointment", + "arguments": { + "date": "2025-01-20", + "time": "14:00" + } + }] + } + }, + { + "role": "tool", + "content": "{\"status\": \"success\", \"confirmationId\": \"APT-12345\"}" + }, + { + "role": "assistant", + "judgePlan": { + "type": "regex", + "content": ".*confirmed.*APT-12345.*" + } + } + ] + }' +``` + +For API details, see [Create Eval](/api-reference/eval/create). + + + + +### Tool call validation modes + +**Exact match - Full validation:** + +```json +{ + "judgePlan": { + "type": "exact", + "toolCalls": [ + { + "name": "bookAppointment", + "arguments": { + "date": "2025-01-20", + "time": "14:00" + } + } + ] + } +} +``` + +Validates both function name AND all argument values exactly. + +**Partial match - Name only:** + +```json +{ + "judgePlan": { + "type": "regex", + "toolCalls": [ + { + "name": "bookAppointment" + } + ] + } +} +``` + +Validates only that the function was called (arguments can vary). + +**Multiple tool calls:** + +```json +{ + "judgePlan": { + "type": "exact", + "toolCalls": [ + { + "name": "checkAvailability", + "arguments": { "date": "2025-01-20" } + }, + { + "name": "bookAppointment", + "arguments": { "date": "2025-01-20", "time": "14:00" } + } + ] + } +} +``` + +Validates multiple function calls in sequence. + + + Tool calls are validated in the order they're defined. Use `type: "exact"` for + strict validation or `type: "regex"` for flexible validation. + + +## Step 5: Use regex for flexible validation + +When responses vary slightly (like names, dates, or IDs), use regex patterns for flexible matching. + +### Common regex patterns + +**Greeting variations:** + +```json +{ + "judgePlan": { + "type": "regex", + "content": "^(Hello|Hi|Hey),? (I can|I'll|let me) help.*" + } +} +``` + +Matches: "Hello, I can help...", "Hi I'll help...", "Hey let me help..." 
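Before relying on a pattern in an eval, it helps to try it against a few sample responses locally. A minimal sketch using `grep -E` (note that grep's regex dialect differs slightly from the judge's engine; for example, the `(?i)` flag shown further below is not supported by `grep -E`):

```bash
# Quick local check of a judge pattern against sample assistant responses
pattern="^(Hello|Hi|Hey),? (I can|I'll|let me) help"
for sample in "Hello, I can help you book that." "Hey let me help with that." "Good morning!"; do
  if printf '%s' "$sample" | grep -Eq "$pattern"; then
    echo "match:    $sample"
  else
    echo "no match: $sample"
  fi
done
```

The first two samples match and the third does not, confirming the pattern accepts the intended phrasings while rejecting unrelated replies.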
+ +**Responses with variables:** + +```json +{ + "judgePlan": { + "type": "regex", + "content": ".*appointment.*confirmed.*[A-Z]{3}-[0-9]{5}.*" + } +} +``` + +Matches any confirmation message with appointment ID format. + +**Date patterns:** + +```json +{ + "judgePlan": { + "type": "regex", + "content": ".*scheduled for (Monday|Tuesday|Wednesday|Thursday|Friday).*" + } +} +``` + +Matches responses mentioning weekdays. + +**Case-insensitive matching:** + +```json +{ + "judgePlan": { + "type": "regex", + "content": "(?i)booking confirmed" + } +} +``` + +The `(?i)` flag makes matching case-insensitive. + +### Example: Flexible booking confirmation + + + + 1. Add assistant message with evaluation enabled + 2. Select **Regex** as judge type + 3. Enter pattern: `.*appointment.*(confirmed|booked).*\d{1,2}:\d{2}.*` + 4. This matches various confirmation phrasings with time mentions + + + +```bash +curl -X POST "https://api.vapi.ai/eval" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Flexible Booking Test", + "type": "chat.mockConversation", + "messages": [ + { + "role": "user", + "content": "I need to schedule an appointment" + }, + { + "role": "assistant", + "judgePlan": { + "type": "regex", + "content": ".*(schedule|book|set up).*appointment.*" + } + } + ] + }' +``` + + + + + **Regex tips:** - Use `.*` to match any characters - Use `(option1|option2)` + for alternatives - Use `\d` for digits, `\s` for whitespace - Use `.*?` for + non-greedy matching - Test your patterns with sample responses first + + +## Step 6: Use AI judge for semantic validation + +For complex validation criteria beyond pattern matching, use AI-powered judges to evaluate responses semantically. + +### AI judge structure + +```json +{ + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "system", + "content": "Your evaluation prompt here" + } + ] + } + } +} +``` + +### Writing effective judge prompts + +**Template structure:** + +``` +You are an LLM-Judge. Evaluate ONLY the last assistant message in the mock conversation: {{messages[-1]}}. + +Include the full conversation history for context: {{messages[0:-1]}} + +Decision rule: +- PASS if ALL "pass criteria" are satisfied AND NONE of the "fail criteria" are triggered. +- Otherwise FAIL. + +Pass criteria: +- [Specific requirement 1] +- [Specific requirement 2] + +Fail criteria (any one triggers FAIL): +- [Specific failure condition 1] +- [Specific failure condition 2] + +Output format: respond with exactly one word: pass or fail +- No explanations +- No punctuation +- No additional text +``` + +### Example: Evaluate helpfulness and tone + + + + 1. Add assistant message with evaluation enabled + 2. Select **AI Judge** as judge type + 3. Choose provider: **OpenAI** + 4. Select model: **gpt-4o** + 5. Enter evaluation prompt (see template above) + 6. Customize pass/fail criteria for your use case + + + +```bash +curl -X POST "https://api.vapi.ai/eval" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Helpfulness Test", + "type": "chat.mockConversation", + "messages": [ + { + "role": "user", + "content": "I need help with my account" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [{ + "role": "system", + "content": "You are an LLM-Judge. 
Evaluate ONLY the last assistant message: {{messages[-1]}}.\n\nInclude context: {{messages[0:-1]}}\n\nDecision rule:\n- PASS if ALL pass criteria are met AND NO fail criteria are triggered.\n- Otherwise FAIL.\n\nPass criteria:\n- Response acknowledges the user request\n- Response offers specific help or next steps\n- Tone is professional and friendly\n\nFail criteria (any triggers FAIL):\n- Response is rude or dismissive\n- Response ignores the user request\n- Response provides no actionable information\n\nOutput format: respond with exactly one word: pass or fail" + }] + } + } + } + ] + }' +``` + + + +### Supported AI judge providers + + + + **Models:** gpt-4o, gpt-4-turbo, gpt-3.5-turbo + + Best for general-purpose evaluation + + +{" "} + + + **Models:** claude-3-5-sonnet-20241022, claude-3-opus-20240229 Best for + nuanced evaluation + + +{" "} + + + **Models:** gemini-1.5-pro, gemini-1.5-flash Best for multilingual content + + + + **Models:** llama-3.1-70b-versatile, mixtral-8x7b-32768 + + Best for fast evaluation + + + +**Custom LLM:** + +```json +{ + "model": { + "provider": "custom-llm", + "model": "your-model-name", + "url": "https://your-api-endpoint.com/chat/completions", + "messages": [...] + } +} +``` + +### AI judge best practices + + + **Tips for reliable AI judging:** - Be specific with pass/fail criteria (avoid + ambiguous requirements) - Use "ALL pass criteria must be met" logic - Use "ANY + fail criteria triggers fail" logic - Include conversation context with ` + {{ messages }}` syntax - Request exact "pass" or "fail" output (no + explanations) - Test criteria with known good/bad responses before production + - Use consistent evaluation standards across similar tests + + +## Step 7: Control flow with Continue Plan + +Define what happens after an evaluation passes or fails using `continuePlan`. + +### Exit on failure + +Stop the test immediately if a critical check fails: + +```json +{ + "role": "assistant", + "judgePlan": { + "type": "exact", + "content": "I can help you with that." + }, + "continuePlan": { + "exitOnFailureEnabled": true + } +} +``` + +**Use case:** Skip expensive subsequent tests when initial validation fails. + +### Override responses on failure + +Provide fallback responses to continue testing even when validation fails: + +```json +{ + "role": "assistant", + "judgePlan": { + "type": "exact", + "content": "I've processed your request." + }, + "continuePlan": { + "exitOnFailureEnabled": false, + "contentOverride": "Let me rephrase that...", + "toolCallsOverride": [ + { + "name": "retryProcessing", + "arguments": { "retry": "true" } + } + ] + } +} +``` + +**Use case:** Test error recovery paths or force specific tool calls for subsequent validation. + +### Example: Multi-step with exit control + + + + 1. Create evaluation with multiple conversation turns + 2. For each assistant message with critical validation: + - Enable evaluation + - Configure judge plan (exact, regex, or AI) + - Toggle **Exit on Failure** to stop test early + 3. For non-critical checks, leave **Exit on Failure** disabled + + + +```bash +curl -X POST "https://api.vapi.ai/eval" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Multi-Step with Control", + "type": "chat.mockConversation", + "messages": [ + { + "role": "user", + "content": "I want to book an appointment" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "content": "I can help you book an appointment." 
+ }, + "continuePlan": { + "exitOnFailureEnabled": true + } + }, + { + "role": "user", + "content": "Monday at 2pm" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [{"name": "bookAppointment"}] + }, + "continuePlan": { + "exitOnFailureEnabled": false, + "contentOverride": "Booking confirmed for Monday at 2pm.", + "toolCallsOverride": [{ + "name": "bookAppointment", + "arguments": {"date": "2025-01-20", "time": "14:00"} + }] + } + } + ] + }' +``` + + + + + If `exitOnFailureEnabled` is `true` and validation fails, the test stops + immediately. Subsequent conversation turns are not executed. Use this for + critical checkpoints. + + +## Step 8: Test complete conversation flows + +Validate multi-turn interactions that simulate real user conversations. + +### Complete booking flow example + + + + Create a comprehensive test: + + 1. **Turn 1 - Initial request:** + - User: "I need to schedule an appointment" + - Assistant evaluation: AI judge checking acknowledgment + + 2. **Turn 2 - Provide details:** + - User: "Next Monday at 2pm" + - Assistant evaluation: Exact match on tool call `bookAppointment` + + 3. **Turn 3 - Tool response:** + - Tool: `{"status": "success", "confirmationId": "APT-12345"}` + + 4. **Turn 4 - Confirmation:** + - Assistant evaluation: Regex matching confirmation with ID + + 5. **Turn 5 - Follow-up:** + - User: "Can I get that via email?" + - Assistant evaluation: Exact match on tool call `sendEmail` + + + +```bash +curl -X POST "https://api.vapi.ai/eval" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Complete Booking Flow", + "description": "Test full appointment booking conversation", + "type": "chat.mockConversation", + "messages": [ + { + "role": "user", + "content": "I need to schedule an appointment" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [{ + "role": "system", + "content": "Evaluate: {{messages[-1]}}\n\nPASS if:\n- Response acknowledges appointment request\n- Response asks for details or preferences\n\nFAIL if:\n- Response is dismissive\n- Response ignores request\n\nOutput: pass or fail" + }] + } + } + }, + { + "role": "user", + "content": "Next Monday at 2pm" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [{ + "name": "bookAppointment", + "arguments": { + "date": "2025-01-20", + "time": "14:00" + } + }] + } + }, + { + "role": "tool", + "content": "{\"status\": \"success\", \"confirmationId\": \"APT-12345\"}" + }, + { + "role": "assistant", + "judgePlan": { + "type": "regex", + "content": ".*confirmed.*APT-12345.*" + } + }, + { + "role": "user", + "content": "Can I get that via email?" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [{ + "name": "sendEmail" + }] + } + } + ] + }' +``` + +For API details, see [Create Eval](/api-reference/eval/create). + + + + +### System message injection + +Inject system prompts mid-conversation to test dynamic behavior changes: + +```json +{ + "messages": [ + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "judgePlan": { + "type": "regex", + "content": ".*help.*" + } + }, + { + "role": "system", + "content": "You are now in urgent mode. Prioritize speed." 
+ }, + { + "role": "user", + "content": "I need immediate help" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "system", + "content": "PASS if response shows urgency. FAIL if response is casual. Output: pass or fail" + } + ] + } + } + } + ] +} +``` + + + **Multi-turn testing tips:** - Keep conversations focused (5-10 turns for most + tests) - Use exit-on-failure for early turns to save time - Test one primary + flow per evaluation - Mix judge types (exact, regex, AI) for comprehensive + validation - Include tool responses to simulate real interactions + + +## Step 9: Manage evaluations + +List, update, and organize your evaluation suite. + +### List all evaluations + + + + 1. Navigate to **Evals** in the sidebar + 2. View all evaluations in a table with: + - Name and description + - Created date + - Last run status + - Actions (Edit, Run, Delete) + 3. Use search to filter by name + 4. Sort by date or status + + + +```bash +curl -X GET "https://api.vapi.ai/eval" \ + -H "Authorization: Bearer $VAPI_API_KEY" +``` + +**Response:** + +```json +{ + "results": [ + { + "id": "550e8400-e29b-41d4-a716-446655440000", + "name": "Greeting Test", + "description": "Verify assistant greets users appropriately", + "type": "chat.mockConversation", + "createdAt": "2024-01-15T09:30:00Z", + "updatedAt": "2024-01-15T09:30:00Z" + } + ], + "page": 1, + "total": 1 +} +``` + +For API details, see [List Evals](/api-reference/eval/list). + + + + +### Update an evaluation + + + + 1. Navigate to **Evals** and click on an evaluation + 2. Click **Edit** button + 3. Modify conversation turns, judge plans, or settings + 4. Click **Save Changes** + 5. Previous test runs remain unchanged + + + +```bash +curl -X PATCH "https://api.vapi.ai/eval/550e8400-e29b-41d4-a716-446655440000" \ + -H "Authorization: Bearer $VAPI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Updated Greeting Test", + "description": "Enhanced greeting validation", + "messages": [ + { + "role": "user", + "content": "Hi there" + }, + { + "role": "assistant", + "judgePlan": { + "type": "regex", + "content": "^(Hello|Hi|Hey).*" + } + } + ] + }' +``` + +For API details, see [Update Eval](/api-reference/eval/update). + + + + +### Delete an evaluation + + + + 1. Navigate to **Evals** + 2. Click on an evaluation + 3. Click **Delete** button + 4. Confirm deletion + + + Deleting an evaluation does NOT delete its run history. Past run results remain accessible. + + + + +```bash +curl -X DELETE "https://api.vapi.ai/eval/550e8400-e29b-41d4-a716-446655440000" \ + -H "Authorization: Bearer $VAPI_API_KEY" +``` + +For API details, see [Delete Eval](/api-reference/eval/delete). + + + + +### View run history + + + + 1. Navigate to **Evals** + 2. Click on an evaluation + 3. View **Runs** tab showing: + - Run timestamp + - Target (assistant/squad) + - Status (pass/fail) + - Duration + 4. Click any run to view detailed results + + + +**List all runs:** +```bash +curl -X GET "https://api.vapi.ai/eval/run" \ + -H "Authorization: Bearer $VAPI_API_KEY" +``` + +**Filter by eval ID:** + +```bash +curl -X GET "https://api.vapi.ai/eval/run?evalId=550e8400-e29b-41d4-a716-446655440000" \ + -H "Authorization: Bearer $VAPI_API_KEY" +``` + +For API details, see [List Eval Runs](/api-reference/eval/list-runs). 
+ + + + +## Expected output + +### Successful run + +```json +{ + "id": "eval-run-123", + "evalId": "550e8400-e29b-41d4-a716-446655440000", + "orgId": "org-123", + "status": "ended", + "endedReason": "mockConversation.done", + "createdAt": "2024-01-15T09:35:00Z", + "updatedAt": "2024-01-15T09:35:45Z", + "results": [ + { + "status": "pass", + "messages": [ + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I help you today?", + "judge": { + "status": "pass" + } + } + ] + } + ], + "target": { + "type": "assistant", + "assistantId": "your-assistant-id" + } +} +``` + +**Indicators of success:** + +- ✅ `status` is "ended" +- ✅ `endedReason` is "mockConversation.done" +- ✅ `results[0].status` is "pass" +- ✅ All `judge.status` values are "pass" + +### Failed run + +```json +{ + "id": "eval-run-124", + "status": "ended", + "endedReason": "mockConversation.done", + "results": [ + { + "status": "fail", + "messages": [ + { + "role": "user", + "content": "Book an appointment for Monday at 2pm" + }, + { + "role": "assistant", + "content": "Sure, let me help you with that.", + "toolCalls": [ + { + "name": "bookAppointment", + "arguments": { + "date": "2025-01-20", + "time": "2:00 PM" + } + } + ], + "judge": { + "status": "fail", + "failureReason": "Tool call arguments mismatch. Expected time: '14:00' but got: '2:00 PM'" + } + } + ] + } + ] +} +``` + +**Indicators of failure:** + +- ❌ `results[0].status` is "fail" +- ❌ `judge.status` is "fail" +- ❌ `judge.failureReason` provides specific details + + + Full conversation transcripts show both expected and actual values, making + debugging straightforward. + + +## Common patterns + +### Multiple validation types in one eval + +Combine exact, regex, and AI judges for comprehensive testing: + +```json +{ + "messages": [ + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "content": "Hello! How can I help you?" + } + }, + { + "role": "user", + "content": "Book appointment for Monday" + }, + { + "role": "assistant", + "judgePlan": { + "type": "regex", + "content": ".*(Monday|next week).*" + } + }, + { + "role": "user", + "content": "Thanks for your help" + }, + { + "role": "assistant", + "judgePlan": { + "type": "ai", + "model": { + "provider": "openai", + "model": "gpt-4o", + "messages": [ + { + "role": "system", + "content": "PASS if response is polite and acknowledges thanks. Output: pass or fail" + } + ] + } + } + } + ] +} +``` + +### Test squad handoffs + +Validate smooth transitions between squad members: + +```json +{ + "name": "Squad Handoff Test", + "messages": [ + { + "role": "user", + "content": "I need technical support" + }, + { + "role": "assistant", + "judgePlan": { + "type": "exact", + "toolCalls": [ + { + "name": "transferToSquadMember", + "arguments": { + "destination": "technical-support-agent" + } + } + ] + } + } + ], + "target": { + "type": "squad", + "squadId": "your-squad-id" + } +} +``` + +### Regression test suite + +Organize related tests for systematic validation: + +```json +{ + "name": "Greeting Regression Suite", + "tests": [ + "Greeting Test - Formal", + "Greeting Test - Casual", + "Greeting Test - Multilingual" + ] +} +``` + +Run multiple evals sequentially to validate all greeting scenarios. 
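The suite grouping above is organizational — to execute it, trigger each eval in turn with the run endpoint from Step 2. A minimal sketch (the eval IDs and assistant ID are placeholders):

```bash
# Run each eval in the suite against the same assistant, one at a time
ASSISTANT_ID="your-assistant-id"
for eval_id in "formal-greeting-eval-id" "casual-greeting-eval-id" "multilingual-greeting-eval-id"; do
  curl -s -X POST "https://api.vapi.ai/eval/run" \
    -H "Authorization: Bearer $VAPI_API_KEY" \
    -H "Content-Type: application/json" \
    -d "{\"evalId\": \"$eval_id\", \"target\": {\"type\": \"assistant\", \"assistantId\": \"$ASSISTANT_ID\"}}"
  echo
done
```

Each request creates an independent run, so results can be checked individually as shown in Step 2.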
+ +## Troubleshooting + +| Issue | Solution | +| ----------------------- | --------------------------------------------------------------------------------------- | +| Eval always fails | Verify exact match strings character-by-character. Consider using regex for flexibility | +| AI judge inconsistent | Make pass/fail criteria more specific and binary. Test with known examples | +| Tool calls not matching | Check argument types (string vs number). Ensure exact spelling of function names | +| Run stuck in "running" | Verify assistant configuration. Check for errors in assistant's tools or prompts | +| Timeout errors | Reduce conversation length or simplify evaluations. Check assistant response times | +| Regex not matching | Test regex patterns separately. Remember to escape special characters like `.` or `?` | +| Empty results array | Check `endedReason` field. Assistant may have encountered an error before completion | +| Missing judge results | Verify `judgePlan` is properly configured in assistant messages | + +### Common errors + +**"mockConversation.done" not reached:** + +- Check `endedReason` for actual error (e.g., "assistant-error", "pipeline-error-openai-llm-failed") +- Verify assistant configuration (model, voice, tools) +- Check API key validity and rate limits + +**Judge validation fails unexpectedly:** + +- Review actual vs expected output in `failureReason` +- For exact match: Check for extra spaces, punctuation, or case differences +- For regex: Test pattern with online regex validators +- For AI judge: Verify prompt clarity and binary pass/fail logic + +**Tool calls not validated:** + +- Ensure tool is properly configured in assistant +- Check argument types match exactly (string "14:00" vs number 14) +- Verify tool function names are spelled correctly + + + If you see `endedReason: "assistant-error"`, your assistant configuration has + issues. Test the assistant manually first before running evals. + + +## Next steps + + + + Learn testing patterns, best practices, and CI/CD integration + + +{" "} + + Create and configure assistants to test + + +{" "} + + Build custom tools and validate their behavior + + + + Complete API documentation for evals + + + +## Tips for success + + + **Best practices for reliable testing:** - Start simple with exact matches, + then add complexity - One behavior per evaluation turn keeps tests focused - + Use descriptive names that explain what's being tested - Test both happy paths + and edge cases - Version control your evals alongside assistant configs - Run + critical tests first to fail fast - Review failure reasons promptly and + iterate - Document why each test exists (use descriptions) + + +## Get help + +Need assistance? 
We're here to help: + +- [Eval API Reference](/api-reference/eval/create) +- [Discord Community](https://discord.gg/pUFNcf2WmH) +- [Support](mailto:support@vapi.ai) From a468bf1dda4badd6b7a82887e607f65e5d4f9f6a Mon Sep 17 00:00:00 2001 From: Arvind Rk Date: Thu, 16 Oct 2025 11:17:52 -0700 Subject: [PATCH 2/8] updates --- fern/docs.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/fern/docs.yml b/fern/docs.yml index 1ea186d7b..dd6e12dbf 100644 --- a/fern/docs.yml +++ b/fern/docs.yml @@ -291,9 +291,6 @@ navigation: - page: Advanced path: observability/evals-advanced.mdx icon: fa-light fa-clipboard-check - - page: Boards - path: observability/boards-quickstart.mdx - icon: fa-light fa-chart-line - section: Squads contents: From 2c9c39a4d7e96656da415f96422f6423b67aa5cb Mon Sep 17 00:00:00 2001 From: Arvind Rk Date: Thu, 16 Oct 2025 11:27:00 -0700 Subject: [PATCH 3/8] updates --- fern/observability/evals-advanced.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fern/observability/evals-advanced.mdx b/fern/observability/evals-advanced.mdx index 1e6b801d3..956c07830 100644 --- a/fern/observability/evals-advanced.mdx +++ b/fern/observability/evals-advanced.mdx @@ -1067,11 +1067,13 @@ Build up complexity gradually: {" "} + Learn to build and configure assistants for testing {" "} + Build custom tools and test their behavior From 4fb38acc4e8a9e663b073cd6963e9c379384314d Mon Sep 17 00:00:00 2001 From: Arvind Rk Date: Thu, 16 Oct 2025 11:46:09 -0700 Subject: [PATCH 4/8] updates --- fern/observability/evals-advanced.mdx | 49 ++++++++++++++------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/fern/observability/evals-advanced.mdx b/fern/observability/evals-advanced.mdx index 956c07830..e3fbdcb09 100644 --- a/fern/observability/evals-advanced.mdx +++ b/fern/observability/evals-advanced.mdx @@ -1090,37 +1090,38 @@ Build up complexity gradually: ## Summary -**Key takeaways for advanced eval testing:** + **Key takeaways for advanced eval testing:** -**Testing strategy:** + **Testing strategy:** -- Use smoke tests before comprehensive suites -- Build regression tests when fixing bugs -- Cover edge cases systematically + - Use smoke tests before comprehensive suites + - Build regression tests when fixing bugs + - Cover edge cases systematically -**Validation selection:** + **Validation selection:** -- Exact match for critical data -- Regex for pattern matching -- AI judge for semantic evaluation + - Exact match for critical data + - Regex for pattern matching + - AI judge for semantic evaluation -**Performance:** + **Performance:** -- Exit early on critical failures -- Keep conversations focused (5-10 turns) -- Batch related tests together + - Exit early on critical failures + - Keep conversations focused (5-10 turns) + - Batch related tests together -**Maintenance:** + **Maintenance:** -- Version control evaluations -- Review failures promptly -- Update tests with features -- Document test purpose clearly + - Version control evaluations + - Review failures promptly + - Update tests with features + - Document test purpose clearly -**CI/CD:** + **CI/CD:** -- Automate critical tests in pipelines -- Use staging for full suite validation -- Set quality gate thresholds -- Run regression suites regularly - + - Automate critical tests in pipelines + - Use staging for full suite validation + - Set quality gate thresholds + - Run regression suites regularly + + From 18313cb6705bb7f05453527243fffefc0331a20d Mon Sep 17 00:00:00 2001 From: Arvind Rk Date: Thu, 16 Oct 2025 12:09:22 -0700 
Subject: [PATCH 5/8] updates

---
 fern/docs.yml | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fern/docs.yml b/fern/docs.yml
index 4085a8374..1ea186d7b 100644
--- a/fern/docs.yml
+++ b/fern/docs.yml
@@ -291,6 +291,9 @@ navigation:
           - page: Advanced
             path: observability/evals-advanced.mdx
             icon: fa-light fa-clipboard-check
+          - page: Boards
+            path: observability/boards-quickstart.mdx
+            icon: fa-light fa-chart-line

   - section: Squads
     contents:
@@ -343,12 +346,6 @@ navigation:
             path: test/voice-testing.mdx
             icon: fa-light fa-volume-high

-  - section: Observability
-    contents:
-      - page: Boards
-        path: observability/boards-quickstart.mdx
-        icon: fa-light fa-chart-line
-
   - section: Phone numbers
     contents:
       - page: Free Vapi number

From c10de048b9ff183e54daeee1d2f343536634ba94 Mon Sep 17 00:00:00 2001
From: Arvind Rk
Date: Thu, 16 Oct 2025 13:05:15 -0700
Subject: [PATCH 6/8] updates

---
 fern/observability/evals-quickstart.mdx | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/fern/observability/evals-quickstart.mdx b/fern/observability/evals-quickstart.mdx
index 600c68e1e..f48b2c20d 100644
--- a/fern/observability/evals-quickstart.mdx
+++ b/fern/observability/evals-quickstart.mdx
@@ -576,7 +576,7 @@ For complex validation criteria beyond pattern matching, use AI-powered judges t

 ```
 You are an LLM-Judge. Evaluate ONLY the last assistant message in the mock conversation: {{messages[-1]}}.
-Include the full conversation history for context: {{messages[0:-1]}}
+Include the full conversation history for context: {{messages}}

 Decision rule:
 - PASS if ALL "pass criteria" are satisfied AND NONE of the "fail criteria" are triggered.
 - Otherwise FAIL.

 Pass criteria:

 Fail criteria (any triggers FAIL):

 Output format: respond with exactly one word: pass or fail
 - No additional text
 ```
+
+**Template variables:**
+- `{{messages}}` - The entire conversation history (all messages exchanged)
+- `{{messages[-1]}}` - The last assistant message only
+
+

 ### Example: Evaluate helpfulness and tone

@@ -630,7 +636,7 @@ curl -X POST "https://api.vapi.ai/eval" \
         "model": "gpt-4o",
         "messages": [{
           "role": "system",
-          "content": "You are an LLM-Judge. Evaluate ONLY the last assistant message: {{messages[-1]}}.\n\nInclude context: {{messages[0:-1]}}\n\nDecision rule:\n- PASS if ALL pass criteria are met AND NO fail criteria are triggered.\n- Otherwise FAIL.\n\nPass criteria:\n- Response acknowledges the user request\n- Response offers specific help or next steps\n- Tone is professional and friendly\n\nFail criteria (any triggers FAIL):\n- Response is rude or dismissive\n- Response ignores the user request\n- Response provides no actionable information\n\nOutput format: respond with exactly one word: pass or fail"
+          "content": "You are an LLM-Judge. Evaluate ONLY the last assistant message: {{messages[-1]}}.\n\nInclude context: {{messages}}\n\nDecision rule:\n- PASS if ALL pass criteria are met AND NO fail criteria are triggered.\n- Otherwise FAIL.\n\nPass criteria:\n- Response acknowledges the user request\n- Response offers specific help or next steps\n- Tone is professional and friendly\n\nFail criteria (any triggers FAIL):\n- Response is rude or dismissive\n- Response ignores the user request\n- Response provides no actionable information\n\nOutput format: respond with exactly one word: pass or fail"
         }]
       }
    }
@@ -1366,11 +1372,13 @@ Run multiple evals sequentially to validate all greeting scenarios.
{" "} + Create and configure assistants to test {" "} + Build custom tools and validate their behavior From 710cb62ba4fc5c643ecf8bf9f5c7818f0b9cf492 Mon Sep 17 00:00:00 2001 From: Arvind Rk Date: Thu, 16 Oct 2025 13:08:33 -0700 Subject: [PATCH 7/8] updates --- fern/docs.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fern/docs.yml b/fern/docs.yml index 1ea186d7b..3f6b946bc 100644 --- a/fern/docs.yml +++ b/fern/docs.yml @@ -222,16 +222,6 @@ navigation: - page: Migrating from Trieve path: knowledge-base/migrating-from-trieve.mdx icon: fa-light fa-triangle-exclamation - - section: Structured outputs - icon: fa-light fa-database - path: assistants/structured-outputs.mdx - contents: - - page: Quickstart - path: assistants/structured-outputs-quickstart.mdx - icon: fa-light fa-rocket - - page: Examples - path: assistants/structured-outputs-examples.mdx - icon: fa-light fa-code - page: Custom keywords path: customization/custom-keywords.mdx icon: fa-light fa-bullseye @@ -294,6 +284,16 @@ navigation: - page: Boards path: observability/boards-quickstart.mdx icon: fa-light fa-chart-line + - section: Structured outputs + icon: fa-light fa-database + path: assistants/structured-outputs.mdx + contents: + - page: Quickstart + path: assistants/structured-outputs-quickstart.mdx + icon: fa-light fa-rocket + - page: Examples + path: assistants/structured-outputs-examples.mdx + icon: fa-light fa-code - section: Squads contents: From b6e9ab3a8cddd75f3a053e1afb53b15014945ca8 Mon Sep 17 00:00:00 2001 From: Arvind Rk Date: Thu, 16 Oct 2025 13:11:32 -0700 Subject: [PATCH 8/8] updates --- fern/observability/evals-quickstart.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fern/observability/evals-quickstart.mdx b/fern/observability/evals-quickstart.mdx index f48b2c20d..9b5e20de7 100644 --- a/fern/observability/evals-quickstart.mdx +++ b/fern/observability/evals-quickstart.mdx @@ -1407,6 +1407,6 @@ Run multiple evals sequentially to validate all greeting scenarios. Need assistance? We're here to help: -- [Eval API Reference](/api-reference/eval/create) +- [Eval API Reference](/api-reference/eval/eval-controller-create) - [Discord Community](https://discord.gg/pUFNcf2WmH) - [Support](mailto:support@vapi.ai)