From 485a3bb6573f48a47b059b2061dc5274f04a6d13 Mon Sep 17 00:00:00 2001 From: "Hanna Paasivirta (OpenFn)" Date: Mon, 27 Apr 2026 21:59:52 +0900 Subject: [PATCH 01/16] update plan --- agent-team-architecture-plan/1-unit-tests.md | 198 +++++++++ .../2-service-tests.md | 377 ++++++++++++++++ .../3-integration-tests.md | 416 ++++++++++++++++++ .../4-acceptance-tests.md | 377 ++++++++++++++++ agent-team-architecture-plan/5-overview.md | 205 +++++++++ 5 files changed, 1573 insertions(+) create mode 100644 agent-team-architecture-plan/1-unit-tests.md create mode 100644 agent-team-architecture-plan/2-service-tests.md create mode 100644 agent-team-architecture-plan/3-integration-tests.md create mode 100644 agent-team-architecture-plan/4-acceptance-tests.md create mode 100644 agent-team-architecture-plan/5-overview.md diff --git a/agent-team-architecture-plan/1-unit-tests.md b/agent-team-architecture-plan/1-unit-tests.md new file mode 100644 index 00000000..962a321a --- /dev/null +++ b/agent-team-architecture-plan/1-unit-tests.md @@ -0,0 +1,198 @@ +# Apollo Testing Architecture — Layer 1: Unit Tests + +**Scope:** `services/global_chat/`, `services/workflow_chat/`, `services/job_chat/`, their tools, and any future sub-agent / tool service. + +--- + +## 1. What qualifies as a unit test in Apollo + +A test is a unit test iff ALL of: + +1. Exercises **exactly one function, method, or pure class behaviour**. +2. **Zero I/O** — no network (Anthropic, Pinecone, Langfuse export, Sentry), no subprocess, no DB, no file writes outside `tmp_path`. +3. **Zero LLM calls**, not even through a mock HTTP client. If the test needs a mocked Anthropic response to make sense, it's a service test. +4. **Deterministic and fast** — target <50 ms per test; whole unit suite <15 s. +5. **Free** — no API keys required. + +Tests that don't meet all five get moved to service or integration — see §7. + +--- + +## 2. Directory and file layout + +Per-service: one flat `tests/` folder holding all four tiers. This tier owns `*_unit.py` filenames. + +``` +services//tests/ + __init__.py + conftest.py # thin — auto-applies tier marker by filename suffix and folder name + test__unit.py # ← this tier + test__service.py # tier 2 + test__integration.py # tier 3 + acceptance/ # tier 4 — markdown specs + *.md + fixtures/ # optional; static JSON/YAML sample inputs (per-service only) +``` + +Cross-service util tests (`services/util.py` — `AdaptorSpecifier`, `DictObj`, `sum_usage`, `add_page_prefix`) live under `services/tests/test_util_unit.py` — a shared test directory at the services root, since these helpers don't relate to a specific chat service. + +**Why suffix + auto-marker, not subdirs?** While each service has <20 test files, subdirs add bureaucracy for no gain. Tier marker is applied by filename suffix in the per-service conftest. If a service grows past ~30 test files, promoting to subdirs is a mechanical rename. + +--- + +## 3. Shared helper package: `apollo/testing/` + +Lives at the apollo root as a peer of `services/` and `platform/` — it's test infrastructure, not a service. + +``` +apollo/testing/ + __init__.py + anthropic_mock.py # owned by service tier; unit tier does not import + fixtures.py # pytest fixtures: sample payloads, fake api keys, yaml-assertion helpers + server.py # owned by integration tier; unit tier does not import + judge.py # owned by acceptance tier; unit tier does not import + fixtures/ # sample YAML / JSON shared across services + workflows/*.yaml + histories/*.json +``` + +Unit-tier import rules: + +- ✅ Import from `testing.fixtures` — pytest fixtures, YAML assertion helpers, payload builders, fixture loaders. +- ❌ Do not import from `testing.anthropic_mock` — that's the service tier's territory. +- ❌ Do not import from `testing.server` or `testing.judge` — integration / acceptance tiers only. + +`testing/fixtures.py` owns (same module, flat — split into files only when this one grows past ~500 lines): + +- Pytest fixtures (`sample_workflow_yaml`, `sample__chat_payload`, `fake_api_key`, `anthropic_client_no_network`). +- YAML assertion helpers migrated from the currently-duplicated `services/global_chat/tests/test_utils.py` and `services/workflow_chat/tests/test_utils.py` (`path_matches`, `assert_yaml_equal_except`, `assert_yaml_section_contains_all`, `assert_yaml_has_ids`, `assert_yaml_jobs_have_body`, `assert_no_special_chars`). +- Payload builders (`make__chat_payload`) — shared with integration tier. +- Fixture loaders (`load_fixture_json`, `load_fixture_yaml`). +- `set_unit_test_env()` — dummy keys, disable langfuse/sentry. + +These are pure functions, so they themselves deserve unit tests in `services/tests/test_fixtures_unit.py`. + +--- + +## 4. Test runner and configuration + +pytest. Add to `pyproject.toml`: + +```toml +[tool.pytest.ini_options] +minversion = "8.0" +pythonpath = ["services", "."] +testpaths = ["services"] +python_files = ["test_*.py"] +markers = [ + "unit: fast, isolated, no I/O, no LLM, no network", + "service: main() with mocked anthropic HTTP client", + "integration: real LLM via bun server", + "acceptance: LLM-judged quality/voice tests", +] +addopts = ["-ra"] +``` + +`pythonpath = ["services", "."]` kills the `sys.path.insert(...)` boilerplate (the `"services"` entry lets you write `from global_chat.global_chat import main`; the `"."` entry lets you write `from testing.fixtures import ...`). + +Commands: + +```bash +poetry run pytest -m unit +poetry run pytest services/global_chat/tests -m unit +poetry run pytest services/workflow_chat/tests/test_workflow_chat_functions_unit.py +``` + +TypeScript platform tests continue to use `bun:test`. Unchanged. + +--- + +## 5. Conftest strategy + +Two levels: + +### 5.1 Root: `apollo/conftest.py` + +- Calls `testing.fixtures.set_unit_test_env()` at import time. +- Exposes `pytest_plugins = ["testing.fixtures"]` so shared fixtures are globally available. + +### 5.2 Per-service: `services//tests/conftest.py` + +- Service-specific fixtures (e.g. a `global_chat_router_decision` factory). Usually <30 lines. +- Auto-applies the tier marker by filename suffix: + + ```python + def pytest_collection_modifyitems(config, items): + for item in items: + name = item.fspath.basename + if name.endswith("_unit.py"): + item.add_marker(pytest.mark.unit) + elif name.endswith("_service.py"): + item.add_marker(pytest.mark.service) + ``` + +Same loop can live in the root `apollo/conftest.py` instead — one copy rather than per-service. Minor detail for implementation. + +--- + +## 6. CI integration + +Unit and service both run in `.github/workflows/tests.yaml` under one job with `-m "unit or service"`. Triggers on every push / PR. No secrets. 10-minute timeout. See overview §6. + +Coverage: generate `--cov=services --cov-report=xml` and upload as artifact for visibility; no threshold gate. + +--- + +## 7. Migration path for existing tests + +- `services/workflow_chat/tests/test_functions.py` — all eight tests are already unit-shaped. Rename to `test_workflow_chat_functions_unit.py`, delete `sys.path.insert(...)`, replace local `client` fixture with `anthropic_client_no_network`. No assertion changes. +- `services/job_chat/tests/test_functions.py` — misclassified. `test_generate_system_message_loads_adaptor_docs_when_missing` hits Postgres → integration. `test_generate_queries_returns_valid_structure` hits real Anthropic → service (with mocked client) or integration. `test_search_docs_returns_general_docs_only` hits Pinecone → integration. A new `test_prompt_unit.py` covers the pure helpers (`build_prompt`, `build_error_correction_prompt`, `extract_page_prefix_from_last_turn`). +- `services/global_chat/tests/test_utils.py` + `services/workflow_chat/tests/test_utils.py` — YAML helpers migrate to `testing/fixtures.py`. The `call__service` subprocess helpers are replaced by the integration tier's `ApolloClient`. Old files are deleted after all callers are updated. +- `*_pass_fail.py`, `*_qualitative.py`, `*_langfuse_tracing.py`, `*_adaptor_version_passthrough.py`, `*_planner_*.py`, `*_good_morning_*.py` — owned by service/integration/acceptance tiers. + +**Migration order:** + +1. Create `testing/` skeleton + root `apollo/conftest.py` (empty fixtures — just env guard + network block). +2. Add `[tool.pytest.ini_options]` to pyproject. +3. Migrate `workflow_chat/tests/test_functions.py` (smallest, cleanest — the worked example). +4. Wire `.github/workflows/tests.yaml` with `-m unit` initially. +5. Only then start adding net-new unit tests for `yaml_utils.py`, `config_loader.py`, `tool_definitions.py`, etc. + +--- + +## 8. Extensibility — new sub-agent or tool + +A new service `services/my_new_agent/`: + +1. Create `services/my_new_agent/tests/` with a conftest (copy the auto-marker loop). +2. Add `test__unit.py` files. + +A new tool in `services//tools/`: + +1. Add `services//tests/test__unit.py`. Tool schema dicts are ideal targets — assert shape, required keys, JSON-schema validity. + +No pyproject, CI, or shared-package edits required. + +--- + +## 9. Boundaries with other tiers + +- **Unit stops / service begins:** the moment a test needs `main()` to run, or needs to verify "logs appended to prompt" / "api key ended up in headers" / "router picked workflow_agent when given X", it's a service test. +- **Unit stops / integration begins:** if a test needs a live Anthropic response (even for shape), it's integration. If it needs the bun server, it's integration. +- **Unit stops / acceptance begins:** no overlap — acceptance is about answer quality, unit is about code correctness. + +--- + +## 10. Things deliberately NOT done + +- No per-tier subdirectories per service. +- No separate `env.py` / `loaders.py` / `yaml_assertions.py` modules in `testing/` — one `fixtures.py` until a concrete split is warranted. +- No coverage gates. +- No property-based testing (Hypothesis) in the initial plan — add a `strategies.py` module when the need arises. +- No pytest plugin dev dep beyond what's already in poetry.lock. + +--- + +## Summary + +Unit layer is small, boring, and fast. One flat `tests/` folder per service with filename-suffixed tiers, one `testing/` package with three files, dummy env vars in the root conftest, CI on every push with no secrets. Adding a unit test for a new function is a one-line import and a `def test_...`. diff --git a/agent-team-architecture-plan/2-service-tests.md b/agent-team-architecture-plan/2-service-tests.md new file mode 100644 index 00000000..a0c30ce8 --- /dev/null +++ b/agent-team-architecture-plan/2-service-tests.md @@ -0,0 +1,377 @@ +# Section 2 — Service Tests Architecture + +> Scope: `services/global_chat/`, `services/workflow_chat/`, `services/job_chat/`, and the tools / sub-agents they invoke. + +--- + +## 1. Naming and position + +**Service tests** — direct calls to `main()` with Anthropic HTTP calls mocked. One rung above unit, one below integration. + +| Tier | Scope | LLM calls | HTTP layer | Cost | Runs on PR push | +|------------|-------------------|-----------|------------|------|-----------------| +| unit | single function | no | no | free | yes | +| **service** | **`main()` end-to-end** | **mocked** | **no (direct python)** | **free** | **yes** | +| integration | `main()` via server | real | yes | $$ | manual / label | +| acceptance | behaviour spec | real | yes | $$$ | nightly / manual | + +Service tests verify *logic and information flow*: payload validation, routing, prompt assembly, tool-call orchestration, sub-agent invocation, history/usage aggregation, error paths, headers, api_key passthrough. They do **not** verify model quality. + +--- + +## 2. The `test_hooks` second argument + +### 2.1 Signature change + +```python +def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: ... +``` + +`entry.py` keeps calling `m.main(data)` with one positional arg — the HTTP path never sees `test_hooks`. `test_hooks` is a test-only affordance. + +### 2.2 The `test_hooks` dict — minimum viable shape + +Plain Python `dict`. No `TypedDict`, no pydantic model — just a dict with documented keys. The recognised keys are documented as a docstring in `testing/anthropic_mock.py`: + +```python +# testing/anthropic_mock.py +""" +The `test_hooks` dict accepts (all optional; all default to absent): + +- "anthropic_http_client": an httpx.Client backed by httpx.MockTransport. + When present, threaded into every Anthropic(...) constructor site. +- "tool_calls": a list[dict] the test allocates. Production code appends + breadcrumbs via record_tool_call(test_hooks, entry). +- "tool_stubs": dict[str, Callable] keyed by tool name. When the planner + dispatches a tool, if a stub exists for that name, the stub is called + with the tool input and its return value used as the tool result. Today + only used for "search_documentation" — see §5. +""" +``` + +Start with three keys. Add more only when a concrete test can't be written without one. Things intentionally left out until needed: + +- Sub-agent stub registry. Default behaviour: the sub-agent's `main()` runs under the same mock HTTP client — that's usually what a test wants. A stub registry (`test_hooks["subagent_stubs"]`) can be added if a test needs to bypass the sub-agent's logic entirely. +- `seed`, `disable_langfuse`, `scratch`. Add when a test fails without them. + +### 2.3 Threading `test_hooks` through + +Each chat service's `main()` passes `test_hooks` into the agent / client constructors it creates. Everywhere that currently calls `Anthropic(api_key=...)` swaps to `build_anthropic_client(api_key, test_hooks)` (new factory — see §3). + +Sites that change: + +- `services/job_chat/job_chat.py` — `AnthropicClient.__init__`. +- `services/workflow_chat/workflow_chat.py` — `AnthropicClient.__init__`. +- `services/global_chat/router.py` — `RouterAgent.__init__`. +- `services/global_chat/planner.py` — `PlannerAgent.__init__`. +- `services/global_chat/subagent_caller.py` — accepts `test_hooks` and forwards to sub-agent `main()` calls. + +Production behaviour when `test_hooks is None` is byte-identical to today. Every new kwarg defaults to `None`. + +--- + +## 3. Mock Anthropic HTTP client + +### 3.1 Factory in `services/util.py` + +```python +def build_anthropic_client(api_key: str, test_hooks: Optional[dict] = None) -> Anthropic: + http_client = (test_hooks or {}).get("anthropic_http_client") + kwargs = {"api_key": api_key} + if http_client is not None: + kwargs["http_client"] = http_client + return Anthropic(**kwargs) +``` + +Every `AnthropicClient` / `RouterAgent` / `PlannerAgent` constructor swaps `Anthropic(api_key=...)` for `build_anthropic_client(api_key, test_hooks)`. + +### 3.2 `testing/anthropic_mock.py` + +Single file — `MockAnthropicClient` class, canned response-body builders, docstring documenting recognised `test_hooks` keys, and the `record_tool_call(test_hooks, entry)` helper. Split later if it grows unwieldy. + +The `anthropic` Python SDK accepts a custom `http_client` — we build ours from `httpx.MockTransport`: + +```python +class MockAnthropicClient: + """Thin wrapper over httpx.Client + httpx.MockTransport. + + Usage: + mc = MockAnthropicClient.always(response=text_response("hello")) + mc = MockAnthropicClient.script([resp1, resp2, resp3]) # multi-turn + mc = MockAnthropicClient.streaming(events=[...]) # SSE + + After the call: + mc.requests # list[RecordedRequest] + mc.last_request.json["messages"] + mc.last_request.headers["x-api-key"] + """ + @classmethod + def always(cls, response) -> "MockAnthropicClient": ... + @classmethod + def script(cls, responses) -> "MockAnthropicClient": ... + @classmethod + def streaming(cls, events) -> "MockAnthropicClient": ... + + @property + def httpx_client(self) -> httpx.Client: ... + @property + def requests(self) -> list[RecordedRequest]: ... + @property + def last_request(self) -> RecordedRequest: ... +``` + +Response-body builders in the same file: + +- `text_response(text, model=..., usage=...)` +- `tool_use_response(tool_name, tool_input, tool_use_id="toolu_01")` +- `mixed_response(text, tool_uses=[...])` +- `router_decision_response(destination, confidence=4, job_key=None)` +- `stream_events(text="", tool_uses=None)` +- `usage_block(input_tokens=100, output_tokens=50, cache_creation=0, cache_read=0)` + +No new runtime dep — `httpx.MockTransport` is built into httpx, which is already in `poetry.lock`. + +### 3.3 Scripted multi-turn example + +```python +def test_planner_calls_workflow_then_job_agents(test_hooks_factory): + mock = MockAnthropicClient.script([ + router_decision_response("planner", confidence=5), + tool_use_response("call_workflow_agent", {"message": "create workflow"}), + tool_use_response("call_job_code_agent", {"message": "code for step", "job_key": "fetch"}), + text_response("All done."), + ]) + test_hooks = test_hooks_factory(anthropic=mock) + result = global_chat_main(make_global_chat_payload("create a workflow"), test_hooks) + + assert [c["tool"] for c in test_hooks["tool_calls"]] == [ + "router_decision", "call_workflow_agent", "call_job_code_agent", + ] +``` + +When a test wants to bypass a sub-agent's real code, it scripts responses for the planner's `/v1/messages` calls and lets the sub-agent's own `main()` run under the same mock client. Stub registries aren't needed for the common case. + +--- + +## 4. Tool-call breadcrumbs (`test_hooks["tool_calls"]`) + +`test_hooks["tool_calls"]` is a list the test allocates and production code appends to. One helper in `testing/anthropic_mock.py`: + +```python +def record_tool_call(test_hooks: Optional[dict], entry: dict) -> None: + if test_hooks is None: + return + crumbs = test_hooks.get("tool_calls") + if crumbs is not None: + crumbs.append(entry) +``` + +Dispatch sites (`planner._execute_tool`, `router.route_and_execute`) call `record_tool_call(test_hooks, {"tool": ..., "input": ...})`. Two dict lookups per call when `test_hooks is None` — negligible. + +Tests read: + +```python +assert [c["tool"] for c in test_hooks["tool_calls"]] == ["router_decision", "call_workflow_agent"] +``` + +--- + +## 5. Tool stubs (`test_hooks["tool_stubs"]`) + +Most planner tools don't need stubbing. `call_workflow_agent` and `call_job_code_agent` inherit `test_hooks` and run the sub-agent's own mocked `main()`. `inspect_job_code` is pure local code with no network. The one tool that does need stubbing is **`search_documentation`** — without a stub it would hit Pinecone (vector store) and OpenAI (embeddings) on every service test. + +Production change in `services/global_chat/planner.py::_execute_tool` — one if/else at the top of the dispatch: + +```python +stub = (self._test_hooks or {}).get("tool_stubs", {}).get(tool_use_block.name) +if stub is not None: + tool_result = stub(tool_use_block.input) +else: + # original dispatch by name follows + ... +``` + +Test usage: + +```python +test_hooks = { + "anthropic_http_client": mock.httpx_client, + "tool_calls": [], + "tool_stubs": { + "search_documentation": lambda tool_input: "Cron triggers run on a schedule...", + }, +} +result = main(payload, test_hooks) +``` + +The stub returns whatever shape the real tool returns (here a string — the planner feeds it back into the next Anthropic call). + +A `build_search_documentation_stub(docs=[...])` helper in `testing/anthropic_mock.py` can emerge once a second test reuses the same shape — not preemptively. + +--- + +## 6. Directory layout + +``` +services//tests/ + __init__.py + conftest.py # re-exports shared fixtures; auto-marks by filename suffix + test__unit.py # tier 1 (unit-tests-architect) + test__service.py # tier 2 (this tier) + fixtures/ # per-service fixture data (optional) +``` + +Test filenames this tier will add (illustrative, not exhaustive): + +- `services/global_chat/tests/test_router_service.py` — router decisions by intent. +- `services/global_chat/tests/test_planner_service.py` — tool dispatch order, test_hooks propagation. +- `services/global_chat/tests/test_subagent_passthrough_service.py` — global → workflow / job wiring. +- `services/workflow_chat/tests/test_workflow_chat_service.py` — YAML extraction, retry loop, streaming events. +- `services/job_chat/tests/test_job_chat_service.py` — RAG injection (with stubbed retriever), suggest-code response shape, page-prefix detection, error-correction loop. + +Cross-service end-to-end flow tests (planner chain over mocks) also live under `services/global_chat/tests/` since `global_chat` owns the planner. + +--- + +## 7. Shared helpers in `testing/` + +``` +testing/ + __init__.py + anthropic_mock.py # MockAnthropicClient, response builders, test_hooks-keys docstring, record_tool_call + fixtures.py # pytest fixtures + YAML assertion helpers + payload builders + loaders + fixtures/ + workflows/*.yaml + histories/*.json +``` + +`fixtures.py` is the flat home for: + +- `make_global_chat_payload`, `make_workflow_chat_payload`, `make_job_chat_payload`. +- `get_workflow_yaml_attachment`, `get_suggested_code_attachment`, `get_usage`. +- `assert_yaml_has_ids`, `assert_yaml_jobs_have_body`, `assert_yaml_equal_except`, `path_matches`, `assert_no_special_chars`. +- Pytest fixtures: `mock_anthropic`, `test_hooks_factory`, `fake_api_key`, `sample_workflow_yaml`, `anthropic_client_no_network`. +- `set_unit_test_env` (dummy keys, disable langfuse/sentry). +- `load_fixture_json`, `load_fixture_yaml`. + +One file until it gets unwieldy (~500 lines). Split then, not pre-emptively. + +Key fixture: + +```python +@pytest.fixture +def test_hooks_factory(): + def _factory(*, anthropic=None, **overrides): + opts = {"tool_calls": []} + if anthropic is not None: + opts["anthropic_http_client"] = anthropic.httpx_client + opts.update(overrides) + return opts + return _factory +``` + +Per-service `conftest.py` just exposes `pytest_plugins = ["testing.fixtures"]` (inherited from root) plus any per-service niche fixtures. + +--- + +## 8. Pytest configuration + +Owned initially by this tier in PR #1 (bootstrap). See overview §5 for the full block. Relevant keys: + +```toml +[tool.pytest.ini_options] +pythonpath = ["services", "."] +testpaths = ["services"] +python_files = ["test_*.py"] +markers = [ + "unit: ...", + "service: main() tests with mocked LLM HTTP client", + "integration: ...", + "acceptance: ...", +] +addopts = ["-ra"] +``` + +Markers applied by filename suffix in the root `apollo/conftest.py` — authors don't decorate manually. + +--- + +## 9. CI integration + +Service runs in the same `tests.yaml` workflow as unit, via `pytest -m "unit or service"`. No secrets on this job (invariant). See overview §6. + +--- + +## 10. Migration recipe for existing `pass_fail` tests + +1. **Classify the assertion.** Content-sensitive (`"response mentions Salesforce"`) → integration or acceptance. Structural (`"workflow_yaml has 2 jobs"`) → service (with a canned mock producing that structure). +2. **Replace the call site.** Swap `subprocess.run([..., "entry.py", ...])` for `from . import main; main(payload, test_hooks)`. +3. **Build the mock.** Hand-craft an Anthropic response fixture (or a script for planner multi-turn) that produces the shape under test. +4. **Assert on structure + breadcrumbs.** Replace content asserts with routing / shape asserts; keep content in acceptance. +5. **Delete the old test** once the new one is stable. + +Expect ~50–70% of `pass_fail` tests to become service tests; the rest stay in integration. + +--- + +## 11. Production-code edits (summary) + +| File | Edit | +|------|------| +| `services/global_chat/global_chat.py` | `main(data_dict, test_hooks=None)`; thread through | +| `services/workflow_chat/workflow_chat.py` | same | +| `services/job_chat/job_chat.py` | same | +| `services/global_chat/router.py` | accept `test_hooks` in `__init__`; pass into sub-agent calls | +| `services/global_chat/planner.py` | accept `test_hooks`; use in `_execute_tool`; thread into subagent_caller | +| `services/global_chat/subagent_caller.py` | accept `test_hooks`; pass to sub-agent `main()` | +| `services/util.py` | add `build_anthropic_client()` | + +Everywhere: backward-compatible defaults. `test_hooks is None` ⇒ existing behaviour, byte-for-byte. + +--- + +## 12. Extensibility — new sub-agent or tool + +**New sub-agent:** + +1. `services/my_new_agent/my_new_agent.py` with `def main(data, test_hooks=None)`. +2. Every `Anthropic(...)` site uses `build_anthropic_client(api_key, test_hooks)`. +3. Thread `test_hooks` through internal calls. +4. Add `services/my_new_agent/tests/test_*_service.py`. Conftest auto-inherits. + +**New tool in the planner:** + +1. Append to `services/global_chat/tools/tool_definitions.py`. +2. Dispatch branch in `planner._execute_tool` calls `record_tool_call(test_hooks, ...)`. +3. Write service tests. + +Pattern: **one arg, one call to `record_tool_call`, one test**. No framework changes. + +--- + +## 13. What this tier deliberately does NOT do + +- **No sub-agent stub registry on day one.** Default behaviour (sub-agent runs under shared mock client) is what tests usually want. +- **No `test_hooks["seed"]` / `["disable_langfuse"]` / `["scratch"]`.** Add when a test fails without them. +- **No `pytest-asyncio`, `pytest-randomly`, or other dev deps.** Add when needed. +- **No frozen public API contract between tiers.** Shared helpers live in one package; rename when the signature improves. + +--- + +## 14. What else belongs in this tier + +Good service-test targets: + +- **API key threading** — payload `api_key` ends up in `mock.last_request.headers["x-api-key"]`; absent → env var is used. +- **Cache-control regression** — planner system prompt has `cache_control: {"type": "ephemeral"}`; assert on outbound request body. +- **Context-management beta** — planner sets `context-management-2025-06-27` header and `context_management` field on every call. +- **History round-trip** — returned `history` equals input + this turn's user/assistant messages. +- **`AdaptorSpecifier` propagation** — payload `context.adaptor = "@openfn/language-http@3.1.11"` shows up in the prompt. +- **Retry loops** — `workflow_chat` retries once on YAML parse failure; script invalid-then-valid and assert count. +- **Negative paths** — missing `content` → `ApolloError(400)`; malformed tool-use response → graceful fallback. + +--- + +## Summary + +`test_hooks` second arg on `main()` + `build_anthropic_client(api_key, test_hooks)` factory + `MockAnthropicClient` with `always`/`script`/`streaming` constructors + three `test_hooks` keys (`anthropic_http_client`, `tool_calls`, `tool_stubs` — the last only used for `search_documentation` today). Three files in `testing/`, filename-suffix markers, shared workflow with the unit tier. Add sub-agent stub infrastructure the first time a test can't be written without it. diff --git a/agent-team-architecture-plan/3-integration-tests.md b/agent-team-architecture-plan/3-integration-tests.md new file mode 100644 index 00000000..2cd4c26c --- /dev/null +++ b/agent-team-architecture-plan/3-integration-tests.md @@ -0,0 +1,416 @@ +# Section 3 — Integration Tests Architecture + +**Scope:** `services/global_chat/`, `services/workflow_chat/`, `services/job_chat/` and future sub-agents / tools. + +--- + +## 1. Purpose and boundaries + +Integration tests exercise a chat service **end-to-end through the running bun server, with real model calls**. First tier in the pyramid where: + +- Full stack: TypeScript `bridge.ts` → `spawn poetry run python services/entry.py` → `services//.py:main()` → Anthropic / OpenAI / Pinecone network calls. +- Assertions are loose-to-moderate on content (regex, keyword presence, structural shape) and strict on payload shape (required fields, types, SSE event sequence, WS lifecycle). +- Slow (seconds to a minute each) and cost real money. + +**What belongs here:** "does the service return a valid workflow yaml end-to-end over HTTP?", "does `/services/global_chat/stream` emit Anthropic-formatted SSE events in order?", "does WS `start → log* → complete` work for a real LLM run?", "does `test_errors` still map `ApolloError` to the right HTTP status?". + +**What doesn't:** + +| Concern | Correct tier | +|---|---| +| "Is this specific LLM answer good quality?" | acceptance | +| "Is the router prompt formatted right?" | service (mocked Anthropic) | +| "Does `AdaptorSpecifier.parse()` handle shorthand?" | unit | + +**Dividing lines:** + +- Service → integration: assertion depends on LLM-generated content. +- Integration → acceptance: assertion needs a judge LLM to evaluate. + +--- + +## 2. Directory layout + +Integration tests live alongside their service. **No top-level `tests/` tree.** One `*_integration.py` file per service: + +``` +services//tests/ + test__integration.py # sync POST + SSE stream + WS in one file +``` + +Concretely: + +``` +services/global_chat/tests/test_global_chat_integration.py +services/workflow_chat/tests/test_workflow_chat_integration.py +services/job_chat/tests/test_job_chat_integration.py +``` + +**Cross-service / planner-chain end-to-end tests** live under `services/global_chat/tests/test_global_chat_integration.py` because `global_chat` owns the planner that dispatches to `workflow_chat` and `job_chat` over real HTTP. There's no separate "cross-service" home — the orchestrator hosts them. + +**Smoke tests** for the platform itself (healthcheck `GET /`, 404 for unknown service, `test_errors` mapping) live under `services/echo/tests/test_echo_integration.py` since `echo` is the test-pipeline-verification service. + +**One file per service** covers sync POST, SSE streaming, and WS in one place. Split by transport (`test__stream_integration.py`) only if a single file gets unwieldy (>500 lines). + +--- + +## 3. Server lifecycle + +The session-scoped bun server fixture lives in **`testing/server.py`** (the shared helper package), not in a top-level conftest. Registered globally via `pytest_plugins = ["testing.server"]` in the root `apollo/conftest.py`. + +### 3.1 Fixture + +```python +@pytest.fixture(scope="session") +def apollo_server() -> ApolloServerHandle: + ... +``` + +Handle: + +```python +@dataclass +class ApolloServerHandle: + base_url: str # "http://127.0.0.1:" + port: int + log_path: Path +``` + +**Startup:** + +1. If `APOLLO_TEST_BASE_URL` is set, return a handle pointing at that URL. Lets us reuse a deployed staging server or a local `bun dev` during debugging. +2. Otherwise, allocate a port (bind-port-0 trick). +3. `subprocess.Popen(["bun", "run", "start"], env={..., "PORT": str(port)}, cwd=repo_root)`. Use `start` (not `dev`) — no hot-reload noise; matches production. +4. Drain stdout/stderr to `tmp/test-logs//bun.log`. +5. Poll `GET http://127.0.0.1:/` with exponential backoff, 60s total cap. +6. Yield handle. + +**Teardown:** SIGTERM → `wait(timeout=5)` → SIGKILL if needed. Flush log threads. + +### 3.2 Scope + +Session-scoped. Cold-start is ~2–5s; per-file or per-test would double runtime. Bun server is stateless between requests (each invocation spawns a fresh Python subprocess via `bridge.ts`), so sharing is safe. + +No `pytest-xdist` parallelism on day one — Anthropic rate limits per API key. Add `-n auto --dist loadfile` with per-file scope when there's demand + headroom. + +--- + +## 4. Environment and secrets + +| Var | Purpose | Missing (dev) | Missing (CI) | +|---|---|---|---| +| `ANTHROPIC_API_KEY` | All chat services | skip | fail | +| `OPENAI_API_KEY` | embeddings, search_docsite | skip | fail | +| `PINECONE_API_KEY` | search_docsite | skip | fail | +| `POSTGRES_URL` | services using `util.get_db_connection()` | skip | fail | +| `APOLLO_TEST_BASE_URL` | reuse deployed server | spawn | spawn | +| `LANGFUSE_TRACING` | flip Langfuse export on/off without changing credentials | unset → off | unset → off | + +`require_env(*names)` helper in `testing/server.py`: + +```python +def require_env(*names: str) -> None: + missing = [n for n in names if not os.environ.get(n)] + if not missing: + return + if os.environ.get("CI") == "true": + pytest.fail(f"Missing env vars: {missing}") + pytest.skip(f"Missing env vars: {missing}", allow_module_level=True) +``` + +Per-module: `require_env("ANTHROPIC_API_KEY")` at module top. + +### 4.1 Secret sourcing in CI + +- Repo secrets: `ANTHROPIC_API_KEY_TEST`, `OPENAI_API_KEY_TEST`, `PINECONE_API_KEY_TEST`, `POSTGRES_URL_TEST`. +- Dedicated test Pinecone namespace; ephemeral test Postgres. Never share the production `apollo-mappings` index / DB. +- Anthropic: separate API key with a low monthly cap, scoped to a test project. + +### 4.2 Langfuse opt-in + +Integration runs *can* trace to Langfuse but don't have to. Mechanism: + +- `LANGFUSE_TRACING=true` → existing `@observe` decorators on production `main()` export traces. +- `LANGFUSE_TRACING=false` (or unset) → `@observe` short-circuits, no export. + +In the CI workflow, leave `LANGFUSE_TRACING` unset by default — integration runs are clean. Add a `workflow_dispatch` input "Trace to Langfuse" (or a separate `run-integration-traced` PR label) to flip it on for the runs where someone wants to inspect traces afterwards. Credentials stay configured either way. + +Integration writes traces only — no scores. Scoring is acceptance-tier territory. + +--- + +## 5. HTTP client + +`ApolloClient` lives in **`testing/server.py`** alongside the server fixture: + +```python +class ApolloClient: + def __init__(self, base_url: str, *, timeout: float = 120.0): ... + + def call(self, service: str, payload: dict) -> dict: ... + # POST /services/. Raises ApolloHTTPError on non-2xx. + + def stream(self, service: str, payload: dict) -> Iterator[SSEEvent]: ... + # POST /services//stream. Yields parsed SSE events. + + def ws(self, service: str, payload: dict) -> Iterator[WSEvent]: ... + # WS /services/. Yields events until "complete". +``` + +Dataclasses `SSEEvent(type, data)` and `WSEvent(event, type, data)` in the same file. + +Dependencies: `httpx` (already in `poetry.lock`) and `websockets`. Install via a new `test-integration` poetry group: + +```toml +[tool.poetry.group.test-integration] +optional = true + +[tool.poetry.group.test-integration.dependencies] +websockets = "^13" +``` + +Installed in CI with `poetry install --with test-integration`. + +Exposed fixture: + +```python +@pytest.fixture +def client(apollo_server) -> ApolloClient: + return ApolloClient(apollo_server.base_url) +``` + +Both integration and acceptance tiers use the same `client` fixture. + +--- + +## 6. Assertion helpers + +All in `testing/server.py` (or a small `testing/helpers.py` if it crowds the file): + +- `collect_until_complete(stream, *, timeout=120)` → `(list[SSEEvent], final_payload)`. +- `assert_event_sequence(events, expected_types, *, strict=False)` — subsequence match by default. +- `accumulate_text_deltas(events)` — concatenate `content_block_delta` text. +- `assert_response_shape(response, required)` — `{"response": str, "usage": dict, "history": list}` style. +- `assert_response_contains(response, *regexes, field="response", flags=re.IGNORECASE)`. + +YAML helpers (`assert_yaml_has_ids`, etc.) come from `testing/fixtures.py` — one canonical location. + +--- + +## 7. Pytest configuration + +Markers declared in `pyproject.toml [tool.pytest.ini_options].markers` — see overview §5 and service-tests plan §7. + +Marker auto-applied by filename suffix `_integration.py` in the per-service or root conftest: + +```python +def pytest_collection_modifyitems(config, items): + for item in items: + name = item.fspath.basename + if name.endswith("_integration.py"): + item.add_marker(pytest.mark.integration) +``` + +Authors don't write `@pytest.mark.integration`. Filename suffix is sufficient. + +--- + +## 8. Opt-in commands + +```bash +# Run the whole integration tier +poetry run pytest -m integration + +# Run one service's +poetry run pytest services/global_chat/tests -m integration + +# Smoke only +poetry run pytest services/echo/tests + +# Against a deployed server +APOLLO_TEST_BASE_URL=https://apollo-staging.openfn.org poetry run pytest -m integration + +# With Langfuse tracing on +LANGFUSE_TRACING=true poetry run pytest -m integration +``` + +--- + +## 9. CI integration + +Integration and acceptance share one workflow: `.github/workflows/llm-tests.yaml`. Integration job runs on PR label `run-integration`, push to `main`, or `workflow_dispatch`. Acceptance job runs on PR label `run-acceptance` or `workflow_dispatch` only — never cron. + +```yaml +name: llm-tests +on: + pull_request: + types: [labeled] + push: + branches: [main] + workflow_dispatch: + inputs: + trace_langfuse: + type: boolean + default: false + description: "Export traces to Langfuse" + +concurrency: + group: llm-${{ github.ref }} + cancel-in-progress: true + +jobs: + integration: + if: >- + (github.event_name == 'pull_request' && github.event.label.name == 'run-integration') + || github.event_name == 'push' + || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + timeout-minutes: 20 + env: + CI: "true" + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_TEST }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY_TEST }} + PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY_TEST }} + POSTGRES_URL: ${{ secrets.POSTGRES_URL_TEST }} + LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY_TEST }} + LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY_TEST }} + LANGFUSE_BASE_URL: ${{ secrets.LANGFUSE_BASE_URL }} + LANGFUSE_TRACING: ${{ inputs.trace_langfuse == true && 'true' || 'false' }} + steps: + - uses: actions/checkout@v4 + - uses: oven-sh/setup-bun@v2 + - uses: actions/setup-python@v5 + with: { python-version: "3.11" } + - uses: snok/install-poetry@v1 + - run: bun install + - run: poetry install --with test-integration + - run: poetry run pytest -m integration -v --maxfail=5 + - uses: actions/upload-artifact@v4 + if: always() + with: + name: integration-logs-${{ github.run_id }} + path: tmp/test-logs/ + retention-days: 14 + + acceptance: + # see 4-acceptance-tests.md §6 + ... +``` + +`--maxfail=5` so a broken prompt doesn't burn the budget running every test. No `--timeout` plugin on day one. + +--- + +## 10. Cost, retries, flakiness — not on day one + +Deliberately deferred until a concrete bill or flake makes them necessary: + +- No `cost_tracker.py`, `pricing.py`, budget env var, circuit breaker. +- No `pytest-rerunfailures` — flakes are signal. +- No `pytest-timeout` — `--maxfail=5` + 20-min workflow timeout is enough defence. +- No per-test `@pytest.mark.budget(usd=...)`. +- No `slow`, `flaky_model`, `needs_pinecone` markers — add individually when first needed. + +Each of these has a real cost (learning curve, config churn, false alarms). Add one the first time its absence bites — not preemptively. + +--- + +## 11. Streaming and WebSocket testing + +Three transports need coverage. All three live in the same `test__integration.py` file per service. + +### Sync POST + +```python +def test_workflow_chat_cron_trigger(client, sample_workflow_yaml): + payload = make_workflow_chat_payload( + existing_yaml=sample_workflow_yaml, + content="Change the trigger to every day at midnight", + ) + r = client.call("workflow_chat", payload) + assert_response_shape(r, {"response": str, "response_yaml": str, "usage": dict}) + assert_yaml_has_ids(r["response_yaml"]) + yml = yaml.safe_load(r["response_yaml"]) + assert "cron" in yml["triggers"] +``` + +### SSE + +```python +def test_workflow_chat_streams_events(client): + payload = make_workflow_chat_payload(content="Generate a basic workflow", stream=True) + events, final = collect_until_complete(client.stream("workflow_chat", payload)) + assert_event_sequence(events, [ + "message_start", "content_block_start", "content_block_delta", + "content_block_stop", "message_delta", "message_stop", + ]) + assert final is not None + assert final["response"] == accumulate_text_deltas(events) +``` + +### WS + +```python +def test_workflow_chat_ws_lifecycle(client): + payload = make_workflow_chat_payload(content="Generate a basic workflow") + events = list(client.ws("workflow_chat", payload)) + start = [e for e in events if e.event == "start"] + complete = [e for e in events if e.event == "complete"] + assert len(start) == 1 and len(complete) == 1 +``` + +### Error paths + +Lives in the smoke tests for the test_errors service: + +```python +# services/echo/tests/test_echo_integration.py +def test_stream_emits_error_event(client): + events = list(client.stream("test_errors", {"trigger": "RATE_LIMIT"})) + err = next(e for e in events if e.type == "error") + assert err.data["code"] == 429 +``` + +--- + +## 12. Migration of existing tests + +| Existing file | Destination | +|---|---| +| `services/*/tests/test_functions.py` (pure) | unit tier | +| `services/*/tests/test_pass_fail*.py` using `subprocess.run(entry.py)` | `services//tests/test__integration.py` — swap subprocess for `client.call()` | +| `services/*/tests/test_pass_fail*.py` using mocked sub-agents (`test_adaptor_version_passthrough.py`, `test_planner_subagent_clarification.py`) | service tier (mocked LLM) | +| `services/*/tests/test_qualitative.py` | mostly acceptance; a few with machine-checkable shape → integration | +| `services/*/tests/test_langfuse_tracing.py` | `services//tests/test__integration.py` — swap subprocess for `client.call()` | + +Migration strategy: one service at a time, smallest first (workflow_chat). Keep old file for one PR with `pytest.mark.skip("migrated")`, delete in follow-up. + +--- + +## 13. Extensibility + +A new sub-agent or tool is a ~10-minute add: + +1. Create `services//tests/test__integration.py`. +2. Use `client.call() / .stream() / .ws()` — no middleware changes; `describe-modules.ts` auto-mounts. +3. Add `make__payload` to `testing/fixtures.py` if needed. +4. No CI changes — workflow already collects `-m integration` across all services. + +If a new tool requires a new external dep (new API key), add it to `require_env` and the CI secrets list. + +--- + +## 14. What this tier deliberately does NOT do + +- No top-level `tests/` tree — everything lives with the service. +- No cost tracker / pricing table / budget enforcement on day one. +- No retry plugin. +- No dual-API fixture (pytest fixture + context manager). Acceptance imports the same `client` fixture. +- No `APOLLO_TEST_QUICK` skip mode — `-m "not slow"` pattern can be added if needed, but no tests are marked `slow` on day one. +- No contract testing / VCR / recorded HAR. Possible future addition. + +--- + +## Summary + +One file per service: `services//tests/test__integration.py` covering sync + SSE + WS. Session-scoped `apollo_server` + `ApolloClient` in `testing/server.py`. Label-gated CI workflow shared with acceptance. Langfuse tracing is opt-in via `LANGFUSE_TRACING=true` (workflow input or env var). No cost tracking, no retries, no parallelism on day one. Add each when it's actually missed. diff --git a/agent-team-architecture-plan/4-acceptance-tests.md b/agent-team-architecture-plan/4-acceptance-tests.md new file mode 100644 index 00000000..6c3899c3 --- /dev/null +++ b/agent-team-architecture-plan/4-acceptance-tests.md @@ -0,0 +1,377 @@ +# Section 4 — Acceptance Tests Architecture + +**Scope:** quality-, voice-, and style-focused evaluation of Apollo chat services (`global_chat`, `workflow_chat`, `job_chat`) against product-owner-authored hero questions. Judged by an LLM-as-judge, reviewable by a human (Joe or Brandon), optionally logged to Langfuse for trend analysis. + +**Non-goals (other tiers):** + +- Unit tests of pure functions → Section 1. +- Mocked-LLM `main()` invocations → Section 2. +- Functional flow with regex content assertions → Section 3. + +Acceptance answers a different question than integration: not "does the system function end-to-end?" but "does the answer sound like us, read well, satisfy the user's intent, and not regress in voice as we bump model versions?" + +--- + +## 1. Guiding principles + +1. **Specs are markdown.** A PO edits a text file, not Python. YAML frontmatter + markdown sections. +2. **HTTP is internal plumbing.** Specs never mention ports, payload shapes, or service internals. +3. **Live models.** The whole point is to audit the real production path after model upgrades. +4. **LLM-as-judge with receipts.** Every evaluation records the judge's reasoning so a human can spot-check. +5. **pytest is the runner.** Same as every other tier. Spec files are collected via a tiny `pytest_collect_file` hook. No custom CLI, no `bless`/`differ`/`migrate-questions` subcommands. +6. **Human-triggered only.** Never on every push, never on a schedule. Humans decide when a change is big enough to warrant an acceptance run — via PR label or manual `workflow_dispatch`. + +--- + +## 2. Directory layout + +Acceptance specs live alongside the service they test, in an `acceptance/` subfolder of `services//tests/`. **No top-level `tests/` tree.** + +``` +services//tests/ + acceptance/ + *.md # one spec per file + _template.md # copy-paste starter (underscore = skipped by collector) +``` + +Concretely: + +``` +services/global_chat/tests/acceptance/ + hero-patient-sync.md + voice-concise-answers.md + refuse-non-openfn-questions.md +services/workflow_chat/tests/acceptance/ + *.md +services/job_chat/tests/acceptance/ + *.md +``` + +**Cross-service specs** (refusals, safety, "hero" questions that test the orchestrator end-to-end) live under `services/global_chat/tests/acceptance/` since global_chat is the entry point everyone hits. + +The judge helper and the markdown-spec pytest collector live in **`testing/`** (the shared package): + +``` +testing/ + judge.py # LLM-as-judge helper (~150 lines) + fixtures.py # already there; nothing new +``` + +The pytest_collect_file hook is registered in the **root `apollo/conftest.py`** so it picks up `*.md` under any `acceptance/` folder anywhere in `services/`. + +**No `golden/` tree, no `reports/` folder in git.** Langfuse is the trend / comparison backend. Local test output (pass/fail + judge reasoning) comes from pytest stdout and `--junitxml`. If a run needs an HTML report, generate it with `pytest-html` when someone asks for it — not preemptively. + +**No `services/llm_evaluator/` service.** The judge is a helper module in `testing/judge.py` that calls Anthropic directly via the SDK. Promote to a service only when a non-test caller needs to invoke it. + +--- + +## 3. Spec format + +One spec per markdown file. YAML frontmatter + named markdown sections. + +### 3.1 Frontmatter + +```yaml +--- +id: global-chat.hero.patient-sync +title: "Build a CommCare to DHIS2 sync" +service: global_chat # global_chat | workflow_chat | job_chat +tags: [hero, voice, multi-turn] +runs: 3 # default 1 — number of times to run the same spec +judge_model: claude-sonnet-4-6 # defaults to the same in conftest +--- +``` + +Only `id` and `service` are required. Everything else inherits sensible defaults from the root `apollo/conftest.py`. + +### 3.2 Body sections (top-level markdown headers, case-insensitive) + +| Section | Required | Purpose | +|---|---|---| +| `# conversation` | one of conversation/question | `- user:` / `- assistant:` list. Last user line is tested; earlier lines become `history`. | +| `# question` | one of conversation/question | Shorthand for a single-turn conversation. | +| `# context` | optional | YAML block merged into payload: `workflow_yaml`, `page`, `context`, `attachments`, etc. | +| `# must_include` | optional | Substrings or `/regex/` that must appear in `response`. Deterministic; failure short-circuits before judge runs. | +| `# must_not_include` | optional | Opposite. | +| `# assertions` | required | Natural-language criteria, one per bullet — each passed to the LLM judge. | +| `# notes` | optional | Reviewer context, not sent to judge. | + +### 3.3 Example + +```markdown +--- +id: global-chat.hero.patient-sync +title: "Build a CommCare to DHIS2 sync" +service: global_chat +tags: [hero, planner] +runs: 3 +--- + +# conversation + +- user: "I want to create a workflow that fetches new patient registrations from CommCare every hour and creates matching tracked entities in DHIS2." + +# must_include +- /commcare/i +- /dhis2/i + +# must_not_include +- "I cannot help with that" + +# assertions +- The response proposes a workflow with at least two jobs. +- The tone is warm and collaborative, not clinical. +- An attached workflow_yaml is present and syntactically valid. +- The response does not leak the user's api_key or any secret-looking string. +``` + +--- + +## 4. Collection: spec → pytest item + +The root `apollo/conftest.py` implements a standard pytest collection hook that finds markdown specs in any `acceptance/` folder under `services/`: + +```python +def pytest_collect_file(parent, file_path): + if ( + file_path.suffix == ".md" + and not file_path.name.startswith("_") + and file_path.parent.name == "acceptance" + ): + return SpecFile.from_parent(parent, path=file_path) + +class SpecFile(pytest.File): + def collect(self): + spec = parse_spec(self.path) + for run_index in range(spec.runs): + yield SpecItem.from_parent( + self, name=f"{spec.id}[run={run_index}]", + spec=spec, run_index=run_index, + ) + +class SpecItem(pytest.Item): + def runtest(self): + payload = build_payload(self.spec, self.run_index) + response = self.client.call(self.spec.service, payload) + check_must_include(self.spec, response) # hard precondition; raises on fail + verdict = judge.evaluate(self.spec, response, model=self.spec.judge_model) + if not verdict.passed: + raise AssertionError(verdict.summary) +``` + +Each run is a separate pytest item. Benefits: `pytest -m acceptance` works, `--junitxml` works, `pytest-xdist` works, filtering with `-k hero` works. No new runner. + +A tiny `pytest_sessionfinish` hook counts, per spec, how many of the N runs the judge marked `passed=True` and prints `spec-id: 2/3 passed` to stdout. No pass/fail policy is applied — the count is raw output for humans to read. (The pytest exit code still reflects individual item pass/fail in the usual way.) + +--- + +## 5. The judge + +`testing/judge.py` is a single module (~150 lines). Not an Apollo service. + +### 5.1 Interface + +```python +@dataclass +class Verdict: + passed: bool + score: float # 0..1 — fraction of criteria passed + criteria: list[CriterionResult] + reasoning_summary: str # shown on pytest failure + judge_usage: dict # input/output tokens + +@dataclass +class CriterionResult: + criterion: str + passed: bool + reasoning: str + evidence: str # verbatim span from candidate + +def evaluate(spec: Spec, response: dict, *, model: str) -> Verdict: ... +``` + +### 5.2 Prompt strategy + +- System prompt instructs the judge to be strict and to quote evidence verbatim. +- User prompt: numbered criteria + candidate response (+ attachments if present) + demand for JSON output with per-criterion pass/fail + reasoning. +- Prefilled `{` to force JSON. +- Malformed JSON / judge refusal → verdict with `passed=False`, reason `judge_error`. Surfaced loudly. + +### 5.3 Why a helper and not a service + +The judge only has one caller today (acceptance tier). A whole Apollo service + `/services/llm_evaluator` HTTP endpoint + per-service test directory is overkill for that. If future callers appear (a ranker for search_docsite, a sanity-check step in gen_job), promote `judge.py` to `services/llm_evaluator/llm_evaluator.py` — it's a ~50-line reshape, not a redesign. + +### 5.4 Self-tests + +`services/tests/test_judge_unit.py` exercises the prompt builder and JSON parser with canned inputs. Uses `MockAnthropicClient` from the service tier's shared helpers. + +--- + +## 6. Langfuse integration + +Langfuse is already wired on `add-langfuse` — acceptance leans on it lightly for cross-run comparison. The runner does NOT rebuild Langfuse's dataset / score UI. + +### 6.1 Already in place (we reuse) + +- `services/langfuse_util.py::should_track()` gates trace export. Payloads set `user.employee=True` to stay inside the employee window. +- `@observe` on each chat service's `main()` — every acceptance run is automatically traced when `LANGFUSE_TRACING=true`. + +### 6.2 What we add + +1. **Session tagging.** Each run sets `session_id = f"acceptance-{spec.id}-run{i}"` and `tags = ["acceptance", spec.id, ...spec.tags]`. Done via Langfuse's `propagate_attributes`. +2. **Score push.** After the judge returns, write one score per run: `acceptance_pass` (0/1) and `acceptance_score` (0..1). Use Langfuse's Scores API directly from `judge.py` — no `langfuse_sink.py` wrapper. +3. **Cross-version comparison.** Native Langfuse dataset-runs view does this. Runner surfaces the URL in stdout. + +### 6.3 What we don't do via Langfuse + +- No Langfuse-hosted eval (we own the prompt). +- No hard dependency — acceptance runs offline if `LANGFUSE_PUBLIC_KEY` is unset OR `LANGFUSE_TRACING=false`; scores are skipped, runs still complete. + +--- + +## 7. Multi-run sampling + +Specs that benefit from sampling (tone, voice, any criterion where the LLM varies between runs) declare `runs: N` in the frontmatter. Default is `1`. + +Mechanics: + +- The collection hook yields N separate pytest items per spec, named `[run=0]` ... `[run=N-1]`. +- Each run calls the service independently and is judged independently by `testing/judge.py`. +- A `pytest_sessionfinish` hook tallies per spec: `: / passed` to stdout. + +That's it — no `all` / `majority` / `any` policies, no named aggregator, no default that says "2 out of 3 is good enough." Whoever reads the output decides whether the ratio is acceptable for that spec. A refusal spec with 2/3 is probably a regression; a voice spec with 2/3 might be fine. Humans make the call, the tooling just reports the count. + +Per-run pass/fail comes from the LLM judge (`Verdict.passed`, see §5.1). The count in `2/3 passed` is simply the number of runs whose judge verdict was `passed=True`. + +--- + +## 8. Human review loop + +**Primary: Langfuse UI.** Joe / Brandon open the dashboard, filter by `tags:acceptance`, review candidate + judge reasoning + score, override with a human annotation if they disagree. + +**Secondary: pytest stdout / JUnit.** CI logs show `FAIL global-chat.hero.patient-sync[run=1]` with the judge's reasoning summary as the pytest message. Enough for a quick triage. + +No dedicated HTML report on day one. Add `pytest-html` the first time someone asks for it. + +--- + +## 9. Triggers + +Acceptance is **never triggered automatically**. A human decides when a change is big enough to warrant spending the money on a run. + +| Trigger | Mechanism | +|---|---| +| Local manual | `poetry run pytest -m acceptance` | +| CI manual (any branch) | GH Actions `workflow_dispatch` on `llm-tests.yaml` | +| PR label | Apply `run-acceptance` label to a PR | + +Explicitly excluded: no cron, no nightly, no push-to-main, no tag-push, no scheduled runs of any kind. If the team later decides they want continuous drift monitoring, that's a deliberate policy change — not a default. + +Shares `.github/workflows/llm-tests.yaml` with the integration tier — see Section 3 §9 for the skeleton. Acceptance is a second job in the same workflow, gated by its own label condition. + +--- + +## 10. CI workflow (acceptance job) + +Appended to `.github/workflows/llm-tests.yaml`: + +```yaml + acceptance: + # Human-triggered only. No 'schedule' trigger — by design. + if: >- + (github.event_name == 'pull_request' && github.event.label.name == 'run-acceptance') + || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + timeout-minutes: 45 + env: + CI: "true" + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_TEST }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY_TEST }} + LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY_TEST }} + LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY_TEST }} + LANGFUSE_BASE_URL: ${{ secrets.LANGFUSE_BASE_URL }} + LANGFUSE_TRACING: "true" # acceptance runs always trace; that's the point + steps: + - uses: actions/checkout@v4 + - uses: oven-sh/setup-bun@v2 + - uses: actions/setup-python@v5 + with: { python-version: "3.11" } + - uses: snok/install-poetry@v1 + - run: bun install + - run: poetry install --with test-integration + - run: poetry run pytest -m acceptance -v --junitxml=tmp/test-logs/acceptance-junit.xml + - uses: actions/upload-artifact@v4 + if: always() + with: + name: acceptance-junit-${{ github.run_id }} + path: tmp/test-logs/acceptance-junit.xml + retention-days: 14 +``` + +--- + +## 11. Cost control + +Day-one approach is human-gated triggering + sensible defaults, not elaborate budget code: + +- Never automatic — every run is a deliberate human action. +- `runs: 1` default — specs must opt into sampling. +- Judge defaults to `claude-sonnet-4-6` (not opus). +- Prompt caching on candidate calls — preserved across the N runs of one spec by shared `session_id`. +- 45-minute workflow timeout as a hard ceiling. + +A budget env + soft circuit breaker can be added the first time a manual run surprises someone. Not on day one. + +--- + +## 12. Extensibility + +Adding a new sub-agent or tool — no Python required: + +1. Ensure the new service exposes `main()` at `services//.py` (auto-mounts). +2. Create `services//tests/acceptance/` and drop markdown specs. + +Adding a new judge model: list it in `apollo/conftest.py` (or let it be free-form — strings all the way). `judge_model:` in frontmatter. + +--- + +## 13. Relationship to integration + +| Concern | Integration | Acceptance | +|---|---|---| +| Goal | Functional correctness | Quality, voice, style | +| Assertions | Regex + shape | Natural-language criteria + LLM judge | +| Trigger | PR label / push to main / manual | PR label / manual — **never automatic** | +| Stability | Deterministic | Probabilistic (N runs) | +| Runner | pytest | pytest | +| Marker | `@pytest.mark.integration` | `@pytest.mark.acceptance` | +| Location | `services//tests/test__integration.py` | `services//tests/acceptance/*.md` | + +**Overlap rule:** a test lives in exactly one tier. An "acceptance" spec that merely asserts a YAML attachment exists belongs in integration. An integration test that checks "the tone feels terse enough" belongs in acceptance. + +--- + +## 14. Migration of existing artefacts + +- `services/job_chat/evaluation/questions.md` — mostly-compatible format. One-time manual conversion (split per entry, add frontmatter, drop into `services/job_chat/tests/acceptance/`). No migration CLI needed — it's a one-shot editor task. +- `services/global_chat/tests/test_*_qualitative.py` — the prose at the top of each test (in `print()` statements) becomes `# notes`; `content`/`context` become spec sections; qualitative asserts become `# assertions` bullets. Drop the resulting markdown files into `services/global_chat/tests/acceptance/` (or the relevant sub-agent's acceptance folder if the test is targeting a sub-agent specifically). Any machine-checkable asserts move to integration. +- Migration is opt-in, one file at a time. + +--- + +## 15. What this tier deliberately does NOT do + +- **No top-level `tests/` tree.** Specs live under their service. +- **No `services/llm_evaluator/` service.** Judge is a helper module. +- **No custom acceptance runner.** Pytest collects specs; that's it. +- **No `bless` / `differ` / `migrate-questions` / `review` subcommands.** The first two make sense if we adopt golden-file diffing; we don't on day one (Langfuse's dataset-runs comparison is the primitive). The last two are one-off editor tasks. +- **No `golden/` git tree.** Model drift is tracked in Langfuse. +- **No HTML reporter.** `pytest-html` is a line in `pyproject.toml` the day we want it. +- **No per-spec cost caps, budget estimator, `list`/`lint` commands, skip-on-no-change mode.** Defer until bills say otherwise. +- **No `criteria_mode: weighted` with per-criterion weights.** `all` or `any` across criteria. Add weighting when a spec genuinely needs it. + +--- + +## Summary + +Acceptance = markdown specs in `services//tests/acceptance/` + pytest collector + tiny judge helper in `testing/judge.py` + Langfuse scores. No new Apollo service, no custom runner, no golden tree, no top-level `tests/` directory. Runs via the standard pytest mechanism under a shared `llm-tests.yaml` workflow. Adding a sub-agent means dropping markdown files under that service's `acceptance/` folder. The judge promotes to a service the day it has a second caller; nothing else changes. diff --git a/agent-team-architecture-plan/5-overview.md b/agent-team-architecture-plan/5-overview.md new file mode 100644 index 00000000..36d0fa28 --- /dev/null +++ b/agent-team-architecture-plan/5-overview.md @@ -0,0 +1,205 @@ +# Apollo Testing Architecture — Overview + +**Supervisor:** team-lead +**Scope:** architecture (files, utils, fixtures, CI wiring) for a four-tier test framework covering `global_chat`, `workflow_chat`, `job_chat`, their tools, and any future sub-agent / tool services. Specific tests are NOT designed here. + +Deep detail lives in: + +- `1-unit-tests.md` — pure-function tests, free, every push. +- `2-service-tests.md` — `main()` with a mocked Anthropic HTTP client, free, every push. +- `3-integration-tests.md` — HTTP to the running bun server with real LLMs, label-gated. +- `4-acceptance-tests.md` — markdown specs judged by an LLM helper, manually triggered only. + +--- + +## 1. The four tiers at a glance + +| Tier | Transport | LLM | Asserts on | Runs on | +|---|---|---|---|---| +| **Unit** | direct import of one function | none | function output | every push | +| **Service** | direct `main(data, test_hooks)` | mocked via injected `httpx.MockTransport` | code path, mock call args, payload shape | every push | +| **Integration** | HTTP / SSE / WS via running bun server | real | response shape + loose regex on content | push to `main`, PR label `run-integration`, manual | +| **Acceptance** | HTTP via running bun server | real + judge LLM | natural-language criteria graded by judge | PR label `run-acceptance`, manual (`workflow_dispatch`). **Never automatic** — humans decide when a change is big enough to warrant a run. | + +**Sharp dividing lines:** + +- Content assertion depends on *what the LLM wrote* → integration or acceptance (not service). +- Content assertion needs a judge LLM → acceptance (not integration). +- Test exercises `main()` → service or integration (not unit). +- Test needs the bun server → integration or acceptance (not unit or service). + +--- + +## 2. Directory layout + +Every test lives under the service it relates to. One `tests/` folder per service holds all four tiers. + +``` +apollo/ +├── conftest.py # root: dummy api keys, register shared fixtures, markdown spec collector +│ +├── testing/ # Shared test helpers — peer to services/, not a service itself +│ ├── __init__.py +│ ├── anthropic_mock.py # MockAnthropicClient + canned response builders; documents the `test_hooks` dict keys +│ ├── fixtures.py # pytest fixtures (mock client, test_hooks factory, payloads, yaml assertions) +│ ├── server.py # apollo_server fixture + ApolloClient (sync / sse / ws) +│ ├── judge.py # small LLM-as-judge helper for acceptance specs +│ └── fixtures/ # sample yaml/json shared across services +│ ├── workflows/*.yaml +│ └── histories/*.json +│ +├── services/ +│ └── / # global_chat, workflow_chat, job_chat +│ ├── .py # main() gains optional `test_hooks` 2nd arg (default None) +│ └── tests/ +│ ├── conftest.py # auto-marks by filename suffix and acceptance/ folder +│ ├── test__unit.py # tier 1 — pure functions, marker: unit +│ ├── test__service.py # tier 2 — main() with mocked anthropic, marker: service +│ ├── test__integration.py # tier 3 — sync + stream + ws, marker: integration +│ └── acceptance/ # tier 4 — markdown specs, marker: acceptance +│ └── *.md # question + data + natural-language assertions +│ +├── .github/workflows/ +│ ├── tests.yaml # unit + service, every push (free) +│ └── llm-tests.yaml # integration + acceptance, label-gated / manual only (no cron, no schedule) +│ +└── pyproject.toml # [tool.pytest.ini_options]: markers, pythonpath=["services", "."] +``` + +**Decisions worth knowing:** + +- **All tests live with their service.** `services//tests/` holds unit + service + integration tests for that service, plus an `acceptance/` subfolder of markdown specs. No top-level `tests/` tree. +- **Cross-service work lives with the orchestrator.** Planner-chain end-to-end integration tests live under `services/global_chat/tests/test_global_chat_integration.py` because `global_chat` owns the planner. Cross-service acceptance specs (refusals, safety) live under `services/global_chat/tests/acceptance/`. +- **One shared helper package: `apollo/testing/`.** A peer of `services/` and `platform/`, not a service. Server fixture, ApolloClient, mock anthropic, judge, payload builders, yaml assertions — all here. +- **Filename suffix + folder name carry tier intent.** `*_unit.py` → unit, `*_service.py` → service, `*_integration.py` → integration, `acceptance/*.md` → acceptance. Marker auto-applied by conftest. Authors never write `@pytest.mark.X` manually. +- **No `llm_evaluator` Apollo service.** The judge is `testing/judge.py`. Promote to a service the day it has a non-test caller. +- **`test_hooks` second-arg stays.** Load-bearing production change; cheap. `test_hooks is None` is byte-identical to today. + +--- + +## 3. The `test_hooks` second-arg pattern (service tier's anchor) + +Every chat service's `main()` gains an optional second argument: + +```python +def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: ... +``` + +- **HTTP path is untouched.** `entry.py` calls `m.main(data)` with one positional arg. Because `test_hooks` defaults to `None`, the bridge never sees it. +- **Only direct-Python callers (pytest) set `test_hooks`.** Integration and acceptance tiers don't use it (they go through HTTP, which strips `test_hooks`). Unit tier doesn't use it (unit tests never call `main()`). + +`test_hooks` is a plain Python `dict` (not a formal type). Its recognised keys are documented as a docstring in `testing/anthropic_mock.py`: + +- `anthropic_http_client` — an `httpx.Client` backed by `httpx.MockTransport`; threaded into every `Anthropic(api_key=..., http_client=...)` constructor site via a new `services/util.py::build_anthropic_client()` factory. +- `tool_calls` — a test-allocated `list[dict]` that production code appends to as breadcrumbs when present. +- `tool_stubs` — a `dict[str, Callable]` keyed by tool name. The planner consults it before dispatching a tool; if a stub is registered, it's called instead of the real tool. Today only used for `search_documentation` (which otherwise hits Pinecone + OpenAI). See `2-service-tests.md` §5. + +No sub-agent stub registry, no tool stub registry, no `seed`, no `disable_langfuse` — the shape starts minimal and grows only when a test can't be written without it. + +Minimal production-code edits: the three chat services' `main()`, their `AnthropicClient`/`RouterAgent`/`PlannerAgent` constructors (one-line swap to `build_anthropic_client(api_key, test_hooks)`), and the new factory in `services/util.py`. + +--- + +## 4. Server lifecycle + +`testing/server.py` exposes a session-scoped pytest fixture `apollo_server` that: + +- Honours `APOLLO_TEST_BASE_URL` to reuse a running staging server or local `bun dev`. +- Otherwise spawns `bun run start` on an OS-allocated port, polls `GET /` until 200, yields `(base_url, port)`. +- On teardown: SIGTERM, 5s wait, SIGKILL if needed. Drains stdout/stderr to `tmp/test-logs/`. + +`ApolloClient` (also in `testing/server.py`) wraps `.call()`, `.stream()`, `.ws()`. Function-scoped `client` fixture binds to the session server. Both integration and acceptance use the same fixture. + +Registered globally via `pytest_plugins = ["testing.fixtures", "testing.server"]` in the root `apollo/conftest.py`. + +--- + +## 5. Test runner and commands + +Everything is pytest. Acceptance markdown specs are collected by a `pytest_collect_file` hook in the root `apollo/conftest.py`. No custom runner. + +```bash +# Dev default — free, fast +poetry run pytest -m "unit or service" + +# Run just one tier +poetry run pytest -m unit +poetry run pytest -m service +poetry run pytest -m integration # opt-in +poetry run pytest -m acceptance # opt-in + +# Run all tests for a single service (any tier) +poetry run pytest services/global_chat/tests + +# Run one tier within one service +poetry run pytest services/global_chat/tests -m integration +poetry run pytest services/global_chat/tests/acceptance +``` + +TypeScript platform tests continue to run via `bun test`. Unchanged. + +--- + +## 6. CI integration + +Two workflow files: + +| File | Runs | Trigger | Secrets | Timeout | +|---|---|---|---|---| +| `.github/workflows/tests.yaml` | `pytest -m "unit or service"` | every push, every PR | **none** (deliberate — mocks must not hit real APIs; missing `ANTHROPIC_API_KEY` is a loud failure) | 10 min | +| `.github/workflows/llm-tests.yaml` | `pytest -m "integration or acceptance"` | **Integration:** PR label `run-integration`, push to `main`, `workflow_dispatch`. **Acceptance:** PR label `run-acceptance` or `workflow_dispatch` only — never cron, schedule, or push. | `ANTHROPIC_API_KEY_TEST`, `OPENAI_API_KEY_TEST`, `PINECONE_API_KEY_TEST`, `POSTGRES_URL_TEST`, `LANGFUSE_*_TEST` | 45 min | + +Design invariant: **the fast job has no `ANTHROPIC_API_KEY` env var**. If a service test accidentally constructs a real Anthropic client (because someone forgot to thread `test_hooks`), the SDK errors on missing-key — a loud failure is free defence-in-depth against mocked-test leaks. + +Cost controls for the LLM workflow are intentionally out of day-one scope: start with `--maxfail=5` and a 45-min timeout; add budget tracking the first time a bill surprises someone. + +--- + +## 7. Conventions that hold across all tiers + +- **Layer marker auto-apply.** Per-service `services//tests/conftest.py` uses `pytest_collection_modifyitems` to tag tests by filename suffix (`_unit.py` / `_service.py` / `_integration.py`) and folder (`acceptance/`). Authors never write `@pytest.mark.X` manually. +- **No Langfuse export from free-tier tests.** `LANGFUSE_TRACING=false` set in unit/service runs. +- **Dummy API keys for free-tier tests.** `ANTHROPIC_API_KEY="unit-test-dummy"` so imports don't crash but any accidentally-real call fails loud. +- **`pythonpath = ["services"]`.** Kills the `sys.path.insert(...)` boilerplate at the top of every existing test file. + +--- + +## 8. Extensibility — adding a new sub-agent or tool + +1. Add `services//.py` with `main(data, test_hooks=None)`. Auto-mounted by `describe-modules.ts`. +2. Create `services//tests/conftest.py` — copy from an existing service. +3. Add files as needed: `test__unit.py`, `test__service.py`, `test__integration.py`, `acceptance/*.md`. + +No changes needed in pyproject, CI, or shared helpers. Discovery is zero-config — pytest already crawls `services/*/tests/` and the markdown collector finds any `acceptance/` folder. + +--- + +## 9. Implementation order (recommended) + +1. **Scaffolding.** Create `testing/` (skeleton — `anthropic_mock.py`, `fixtures.py`, `server.py`, `judge.py`), root `apollo/conftest.py`, `[tool.pytest.ini_options]` block in pyproject. One PR — unblocks everything else. +2. **Unit tier.** Migrate `services/workflow_chat/tests/test_functions.py` → `test_workflow_chat_functions_unit.py` as the worked example. Wire `tests.yaml` with just `-m unit`. Green CI on every push. +3. **Service tier.** Add `test_hooks=None` to the three chat services' `main()`. Add `services/util.py::build_anthropic_client()`. Build `MockAnthropicClient`. Extend `tests.yaml` to `-m "unit or service"`. Migrate the first `pass_fail` test whose assertion doesn't depend on content. +4. **Integration tier.** Add `testing/server.py` (server fixture + `ApolloClient`). Create `llm-tests.yaml`. Migrate the first cross-service end-to-end test into `services/global_chat/tests/test_global_chat_integration.py`. Secrets wired. +5. **Acceptance tier.** Add `testing/judge.py` and the markdown collector hook in the root conftest. Drop the first 2–3 hero specs into `services/global_chat/tests/acceptance/`. First manual run green (`workflow_dispatch`). + +Each step is an independent PR. Nothing here blocks shipping production features in between. + +--- + +## 10. What this architecture deliberately does NOT do + +- Does not design specific tests. That's the next phase. +- Does not modify production code beyond the `test_hooks` second-arg + `build_anthropic_client` factory. +- Does not replace `bun:test` for TypeScript. +- Does not create a top-level `tests/` tree. Everything lives with the service. +- Does not create a separate `llm_evaluator` service. Judge is a helper module. +- Does not build a custom acceptance runner with `bless`/`differ`/`migrate-questions`/etc. Specs are markdown, collection is pytest, judge writes scores to Langfuse via its existing SDK — that's it. +- Does not track golden answers as a separate git tree. If we want regression diffing later, Langfuse's native dataset-run comparison is the primitive. +- Does not build cost tracking, pricing tables, retry plugins, or circuit breakers on day one. Add each the first time its absence costs something. +- Does not gate merges on coverage percentages. + +--- + +## Summary + +Four tiers, one home (`services//tests/`), one shared Python helper package (`testing/`), zero new Apollo services, two CI workflows. The `test_hooks` second-arg pattern is the single architectural change to production code; everything else is additive. Adding a new sub-agent or tool is a ~10-minute task with no framework edits. From f219414afd47e8199cce3b80a119eb8d17191c10 Mon Sep 17 00:00:00 2001 From: "Hanna Paasivirta (OpenFn)" Date: Tue, 28 Apr 2026 01:19:25 +0900 Subject: [PATCH 02/16] update plan 2 --- agent-team-architecture-plan/1-unit-tests.md | 4 +- .../2-service-tests.md | 133 +++++++++++------- agent-team-architecture-plan/5-overview.md | 6 +- 3 files changed, 91 insertions(+), 52 deletions(-) diff --git a/agent-team-architecture-plan/1-unit-tests.md b/agent-team-architecture-plan/1-unit-tests.md index 962a321a..903dcd9b 100644 --- a/agent-team-architecture-plan/1-unit-tests.md +++ b/agent-team-architecture-plan/1-unit-tests.md @@ -64,7 +64,7 @@ Unit-tier import rules: `testing/fixtures.py` owns (same module, flat — split into files only when this one grows past ~500 lines): -- Pytest fixtures (`sample_workflow_yaml`, `sample__chat_payload`, `fake_api_key`, `anthropic_client_no_network`). +- Pytest fixtures (`sample_workflow_yaml`, `sample__chat_payload`, `fake_api_key`). - YAML assertion helpers migrated from the currently-duplicated `services/global_chat/tests/test_utils.py` and `services/workflow_chat/tests/test_utils.py` (`path_matches`, `assert_yaml_equal_except`, `assert_yaml_section_contains_all`, `assert_yaml_has_ids`, `assert_yaml_jobs_have_body`, `assert_no_special_chars`). - Payload builders (`make__chat_payload`) — shared with integration tier. - Fixture loaders (`load_fixture_json`, `load_fixture_yaml`). @@ -145,7 +145,7 @@ Coverage: generate `--cov=services --cov-report=xml` and upload as artifact for ## 7. Migration path for existing tests -- `services/workflow_chat/tests/test_functions.py` — all eight tests are already unit-shaped. Rename to `test_workflow_chat_functions_unit.py`, delete `sys.path.insert(...)`, replace local `client` fixture with `anthropic_client_no_network`. No assertion changes. +- `services/workflow_chat/tests/test_functions.py` — all eight tests are already unit-shaped. Rename to `test_workflow_chat_functions_unit.py`, delete `sys.path.insert(...)`, delete the local `client` fixture entirely (unit tests don't need an Anthropic client per §1's "zero LLM calls" rule). No assertion changes. - `services/job_chat/tests/test_functions.py` — misclassified. `test_generate_system_message_loads_adaptor_docs_when_missing` hits Postgres → integration. `test_generate_queries_returns_valid_structure` hits real Anthropic → service (with mocked client) or integration. `test_search_docs_returns_general_docs_only` hits Pinecone → integration. A new `test_prompt_unit.py` covers the pure helpers (`build_prompt`, `build_error_correction_prompt`, `extract_page_prefix_from_last_turn`). - `services/global_chat/tests/test_utils.py` + `services/workflow_chat/tests/test_utils.py` — YAML helpers migrate to `testing/fixtures.py`. The `call__service` subprocess helpers are replaced by the integration tier's `ApolloClient`. Old files are deleted after all callers are updated. - `*_pass_fail.py`, `*_qualitative.py`, `*_langfuse_tracing.py`, `*_adaptor_version_passthrough.py`, `*_planner_*.py`, `*_good_morning_*.py` — owned by service/integration/acceptance tiers. diff --git a/agent-team-architecture-plan/2-service-tests.md b/agent-team-architecture-plan/2-service-tests.md index a0c30ce8..e79f584a 100644 --- a/agent-team-architecture-plan/2-service-tests.md +++ b/agent-team-architecture-plan/2-service-tests.md @@ -85,71 +85,110 @@ def build_anthropic_client(api_key: str, test_hooks: Optional[dict] = None) -> A Every `AnthropicClient` / `RouterAgent` / `PlannerAgent` constructor swaps `Anthropic(api_key=...)` for `build_anthropic_client(api_key, test_hooks)`. -### 3.2 `testing/anthropic_mock.py` +### 3.2 `MockAnthropic` in `testing/anthropic_mock.py` -Single file — `MockAnthropicClient` class, canned response-body builders, docstring documenting recognised `test_hooks` keys, and the `record_tool_call(test_hooks, entry)` helper. Split later if it grows unwieldy. - -The `anthropic` Python SDK accepts a custom `http_client` — we build ours from `httpx.MockTransport`: +`MockAnthropic` is a thin wrapper over `httpx.MockTransport`. Tests register regex → response pairs; on each Anthropic request the mock matches the latest user message text against registered patterns and returns the first match. No new runtime dep — `httpx.MockTransport` is built into httpx, which is already in `poetry.lock`. ```python -class MockAnthropicClient: - """Thin wrapper over httpx.Client + httpx.MockTransport. +class MockAnthropic: + """Mock Anthropic API backed by httpx.MockTransport. + + Tests register regex → response pairs. Each request is matched against + the latest user message text (including tool_result content); the first + matching pattern wins. No match raises AssertionError. Usage: - mc = MockAnthropicClient.always(response=text_response("hello")) - mc = MockAnthropicClient.script([resp1, resp2, resp3]) # multi-turn - mc = MockAnthropicClient.streaming(events=[...]) # SSE - - After the call: - mc.requests # list[RecordedRequest] - mc.last_request.json["messages"] - mc.last_request.headers["x-api-key"] + mock = MockAnthropic() + mock.set_response(r"haiku", "sure, here's a haiku") + mock.set_response(r"create workflow", tool_use("call_workflow_agent", {...})) + + test_hooks = test_hooks_factory(anthropic=mock) + main(payload, test_hooks) + + assert mock.last_request.headers["x-api-key"] == "sk-test" """ - @classmethod - def always(cls, response) -> "MockAnthropicClient": ... - @classmethod - def script(cls, responses) -> "MockAnthropicClient": ... - @classmethod - def streaming(cls, events) -> "MockAnthropicClient": ... + + def __init__(self): + self._responses: list[tuple[re.Pattern, str | list[dict]]] = [] + self.requests: list[httpx.Request] = [] + + def set_response(self, pattern: str, response: str | list[dict]) -> None: + """Register a response for any request whose latest user message + text matches `pattern`. `response` is either: + - str: returned as a single text content block. + - list[dict]: returned as content blocks (use for tool_use, mixed). + """ + self._responses.append((re.compile(pattern), response)) @property - def httpx_client(self) -> httpx.Client: ... - @property - def requests(self) -> list[RecordedRequest]: ... + def httpx_client(self) -> httpx.Client: + return httpx.Client(transport=httpx.MockTransport(self._handle)) + @property - def last_request(self) -> RecordedRequest: ... + def last_request(self) -> httpx.Request: + return self.requests[-1] + + def _handle(self, request: httpx.Request) -> httpx.Response: + self.requests.append(request) + body = json.loads(request.content) + user_text = _latest_user_text(body.get("messages", [])) + for pattern, resp in self._responses: + if pattern.search(user_text): + return httpx.Response(200, json=_build_message_body(resp)) + raise AssertionError( + f"MockAnthropic: no pattern matched user message {user_text!r}. " + f"Registered patterns: {[p.pattern for p, _ in self._responses]}" + ) + + +def tool_use(name: str, input: dict, id: str = "toolu_test") -> list[dict]: + """Build a single tool_use content block for `set_response`.""" + return [{"type": "tool_use", "id": id, "name": name, "input": input}] ``` -Response-body builders in the same file: +Two private helpers in the same file: -- `text_response(text, model=..., usage=...)` -- `tool_use_response(tool_name, tool_input, tool_use_id="toolu_01")` -- `mixed_response(text, tool_uses=[...])` -- `router_decision_response(destination, confidence=4, job_key=None)` -- `stream_events(text="", tool_uses=None)` -- `usage_block(input_tokens=100, output_tokens=50, cache_creation=0, cache_read=0)` +- `_latest_user_text(messages)` walks `messages` in reverse, takes the last `role=user` message, and concatenates its `text` blocks plus any `tool_result` content. Matching against `tool_result` text is what makes the planner's internal loop work — see §3.3. +- `_build_message_body(response)` wraps a string in `[{"type": "text", "text": ...}]` (or passes a list through), then assembles the standard Anthropic message envelope: `id`, `type`, `role: "assistant"`, `model`, `content`, `stop_reason`, `stop_sequence`, `usage`. -No new runtime dep — `httpx.MockTransport` is built into httpx, which is already in `poetry.lock`. +Design choices worth knowing: -### 3.3 Scripted multi-turn example +- **First-match-wins.** Order specific patterns before general ones. +- **Loud no-match.** Raises `AssertionError` with the unmatched text and the list of registered patterns. Beats silent fallbacks that mask test drift. +- **Captured requests.** `mock.requests` (list) and `mock.last_request` for assertions on outbound headers, body shape, system prompt, `cache_control`, beta headers, history round-trip. + +Tool-call breadcrumbs (`record_tool_call`) live in the same file — see §4. + +### 3.3 The planner's internal loop + +`main()` is one user turn, but the planner agent internally calls Anthropic multiple times within that turn — once per round of the tool-use loop (`call → tool_use → run tool → call with tool_result → ...` until `end_turn`). The regex matcher handles this naturally because each round has different content in the latest user message: ```python def test_planner_calls_workflow_then_job_agents(test_hooks_factory): - mock = MockAnthropicClient.script([ - router_decision_response("planner", confidence=5), - tool_use_response("call_workflow_agent", {"message": "create workflow"}), - tool_use_response("call_job_code_agent", {"message": "code for step", "job_key": "fetch"}), - text_response("All done."), - ]) + mock = MockAnthropic() + mock.set_response(r"create a workflow", + tool_use("call_workflow_agent", {"message": "create workflow"})) + mock.set_response(r"workflow created", + tool_use("call_job_code_agent", {"message": "code", "job_key": "fetch"})) + mock.set_response(r"code generated", "all done") + test_hooks = test_hooks_factory(anthropic=mock) - result = global_chat_main(make_global_chat_payload("create a workflow"), test_hooks) + main(make_global_chat_payload("create a workflow"), test_hooks) assert [c["tool"] for c in test_hooks["tool_calls"]] == [ - "router_decision", "call_workflow_agent", "call_job_code_agent", + "call_workflow_agent", "call_job_code_agent", ] ``` -When a test wants to bypass a sub-agent's real code, it scripts responses for the planner's `/v1/messages` calls and lets the sub-agent's own `main()` run under the same mock client. Stub registries aren't needed for the common case. +- Round 1: latest user message = the user's prompt → matches `r"create a workflow"`, returns the first `tool_use`. +- Round 2: latest user message = the `tool_result` from `call_workflow_agent` → matches `r"workflow created"`, returns the next `tool_use`. +- Round 3: latest user message = the `tool_result` from `call_job_code_agent` → matches `r"code generated"`, returns the final text. + +When a sub-agent (`workflow_chat`, `job_chat`) gets invoked inside this loop, its own `main()` runs under the *same* mock client. Tests register regexes covering both the parent's and the child's expected user messages on one mock — no sub-agent stub registry needed. + +### 3.4 Streaming (deferred) + +Streaming endpoints (`/services//stream`, `/services/` WS) emit Anthropic-formatted SSE events through `bridge.ts` (`StreamManager` in `services/streaming_util.py`). Detailed mock support — likely a `set_stream_response(pattern, events)` method or a sibling `MockAnthropicStream` class — is out of scope for this section. Defer until the first service test for a streaming code path actually needs it; meanwhile the integration tier covers stream behaviour against the real bun server. --- @@ -238,7 +277,7 @@ Cross-service end-to-end flow tests (planner chain over mocks) also live under ` ``` testing/ __init__.py - anthropic_mock.py # MockAnthropicClient, response builders, test_hooks-keys docstring, record_tool_call + anthropic_mock.py # MockAnthropic + tool_use helper, test_hooks-keys docstring, record_tool_call fixtures.py # pytest fixtures + YAML assertion helpers + payload builders + loaders fixtures/ workflows/*.yaml @@ -250,7 +289,7 @@ testing/ - `make_global_chat_payload`, `make_workflow_chat_payload`, `make_job_chat_payload`. - `get_workflow_yaml_attachment`, `get_suggested_code_attachment`, `get_usage`. - `assert_yaml_has_ids`, `assert_yaml_jobs_have_body`, `assert_yaml_equal_except`, `path_matches`, `assert_no_special_chars`. -- Pytest fixtures: `mock_anthropic`, `test_hooks_factory`, `fake_api_key`, `sample_workflow_yaml`, `anthropic_client_no_network`. +- Pytest fixtures: `mock_anthropic`, `test_hooks_factory`, `fake_api_key`, `sample_workflow_yaml`. - `set_unit_test_env` (dummy keys, disable langfuse/sentry). - `load_fixture_json`, `load_fixture_yaml`. @@ -306,7 +345,7 @@ Service runs in the same `tests.yaml` workflow as unit, via `pytest -m "unit or 1. **Classify the assertion.** Content-sensitive (`"response mentions Salesforce"`) → integration or acceptance. Structural (`"workflow_yaml has 2 jobs"`) → service (with a canned mock producing that structure). 2. **Replace the call site.** Swap `subprocess.run([..., "entry.py", ...])` for `from . import main; main(payload, test_hooks)`. -3. **Build the mock.** Hand-craft an Anthropic response fixture (or a script for planner multi-turn) that produces the shape under test. +3. **Build the mock.** Register one or more `set_response(pattern, response)` pairs on `MockAnthropic` so each Anthropic call in the path under test gets a matching canned response. 4. **Assert on structure + breadcrumbs.** Replace content asserts with routing / shape asserts; keep content in acceptance. 5. **Delete the old test** once the new one is stable. @@ -363,7 +402,7 @@ Pattern: **one arg, one call to `record_tool_call`, one test**. No framework cha Good service-test targets: - **API key threading** — payload `api_key` ends up in `mock.last_request.headers["x-api-key"]`; absent → env var is used. -- **Cache-control regression** — planner system prompt has `cache_control: {"type": "ephemeral"}`; assert on outbound request body. +- **Cache-control regression** — planner system prompt has `cache_control: {"type": "ephemeral"}`; assert on outbound request body via `json.loads(mock.last_request.content)` (httpx `Request.content` is bytes — there's no `.json` accessor on the request side). - **Context-management beta** — planner sets `context-management-2025-06-27` header and `context_management` field on every call. - **History round-trip** — returned `history` equals input + this turn's user/assistant messages. - **`AdaptorSpecifier` propagation** — payload `context.adaptor = "@openfn/language-http@3.1.11"` shows up in the prompt. @@ -374,4 +413,4 @@ Good service-test targets: ## Summary -`test_hooks` second arg on `main()` + `build_anthropic_client(api_key, test_hooks)` factory + `MockAnthropicClient` with `always`/`script`/`streaming` constructors + three `test_hooks` keys (`anthropic_http_client`, `tool_calls`, `tool_stubs` — the last only used for `search_documentation` today). Three files in `testing/`, filename-suffix markers, shared workflow with the unit tier. Add sub-agent stub infrastructure the first time a test can't be written without it. +`test_hooks` second arg on `main()` + `build_anthropic_client(api_key, test_hooks)` factory + `MockAnthropic` (regex → response pairs, `AssertionError` on no match) with a `tool_use(...)` helper for tool-use content blocks + three `test_hooks` keys (`anthropic_http_client`, `tool_calls`, `tool_stubs` — the last only used for `search_documentation` today). One mock file in `testing/`, filename-suffix markers, shared workflow with the unit tier. Streaming and any sub-agent stub infrastructure are deferred until a real test needs them. diff --git a/agent-team-architecture-plan/5-overview.md b/agent-team-architecture-plan/5-overview.md index 36d0fa28..05617e05 100644 --- a/agent-team-architecture-plan/5-overview.md +++ b/agent-team-architecture-plan/5-overview.md @@ -40,7 +40,7 @@ apollo/ │ ├── testing/ # Shared test helpers — peer to services/, not a service itself │ ├── __init__.py -│ ├── anthropic_mock.py # MockAnthropicClient + canned response builders; documents the `test_hooks` dict keys +│ ├── anthropic_mock.py # MockAnthropic (regex → response) + tool_use helper + record_tool_call; documents the `test_hooks` dict keys │ ├── fixtures.py # pytest fixtures (mock client, test_hooks factory, payloads, yaml assertions) │ ├── server.py # apollo_server fixture + ApolloClient (sync / sse / ws) │ ├── judge.py # small LLM-as-judge helper for acceptance specs @@ -90,7 +90,7 @@ def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: ... `test_hooks` is a plain Python `dict` (not a formal type). Its recognised keys are documented as a docstring in `testing/anthropic_mock.py`: -- `anthropic_http_client` — an `httpx.Client` backed by `httpx.MockTransport`; threaded into every `Anthropic(api_key=..., http_client=...)` constructor site via a new `services/util.py::build_anthropic_client()` factory. +- `anthropic_http_client` — an `httpx.Client` backed by `httpx.MockTransport`. Built by `MockAnthropic`, which matches each request's latest user message text against test-registered regex → response pairs (no match → `AssertionError`). Threaded into every `Anthropic(api_key=..., http_client=...)` constructor site via a new `services/util.py::build_anthropic_client()` factory. The planner's internal tool-use loop (multiple Anthropic calls per `main()`) is covered by the same mechanism — each round has different latest-user-message text, so different regexes match. See `2-service-tests.md` §3. - `tool_calls` — a test-allocated `list[dict]` that production code appends to as breadcrumbs when present. - `tool_stubs` — a `dict[str, Callable]` keyed by tool name. The planner consults it before dispatching a tool; if a stub is registered, it's called instead of the real tool. Today only used for `search_documentation` (which otherwise hits Pinecone + OpenAI). See `2-service-tests.md` §5. @@ -178,7 +178,7 @@ No changes needed in pyproject, CI, or shared helpers. Discovery is zero-config 1. **Scaffolding.** Create `testing/` (skeleton — `anthropic_mock.py`, `fixtures.py`, `server.py`, `judge.py`), root `apollo/conftest.py`, `[tool.pytest.ini_options]` block in pyproject. One PR — unblocks everything else. 2. **Unit tier.** Migrate `services/workflow_chat/tests/test_functions.py` → `test_workflow_chat_functions_unit.py` as the worked example. Wire `tests.yaml` with just `-m unit`. Green CI on every push. -3. **Service tier.** Add `test_hooks=None` to the three chat services' `main()`. Add `services/util.py::build_anthropic_client()`. Build `MockAnthropicClient`. Extend `tests.yaml` to `-m "unit or service"`. Migrate the first `pass_fail` test whose assertion doesn't depend on content. +3. **Service tier.** Add `test_hooks=None` to the three chat services' `main()`. Add `services/util.py::build_anthropic_client()`. Build `MockAnthropic`. Extend `tests.yaml` to `-m "unit or service"`. Migrate the first `pass_fail` test whose assertion doesn't depend on content. 4. **Integration tier.** Add `testing/server.py` (server fixture + `ApolloClient`). Create `llm-tests.yaml`. Migrate the first cross-service end-to-end test into `services/global_chat/tests/test_global_chat_integration.py`. Secrets wired. 5. **Acceptance tier.** Add `testing/judge.py` and the markdown collector hook in the root conftest. Drop the first 2–3 hero specs into `services/global_chat/tests/acceptance/`. First manual run green (`workflow_dispatch`). From dce9dffe8f0b8b0e5675efe4803354863c8b00af Mon Sep 17 00:00:00 2001 From: "Hanna Paasivirta (OpenFn)" Date: Tue, 28 Apr 2026 01:49:15 +0900 Subject: [PATCH 03/16] add unit testing architecture --- .github/workflows/unit-tests.yaml | 41 ++++ conftest.py | 72 ++++++ pyproject.toml | 40 +++ services/global_chat/tests/test_utils.py | 201 +-------------- services/global_chat/tests/unit/__init__.py | 0 .../job_chat/tests/integration/__init__.py | 0 .../test_adaptor_docs_pipeline.py} | 14 +- services/job_chat/tests/unit/__init__.py | 0 services/testing/README.md | 22 ++ services/testing/__init__.py | 0 services/testing/yaml_assertions.py | 124 ++++++++++ .../search_documentation/tests/__init__.py | 0 .../tests/unit/__init__.py | 0 services/workflow_chat/tests/conftest.py | 15 ++ .../workflow_chat/tests/test_functions.py | 170 ------------- services/workflow_chat/tests/test_utils.py | 229 +++--------------- services/workflow_chat/tests/unit/__init__.py | 0 .../unit/test_gen_project_prompt_build.py | 49 ++++ .../tests/unit/test_isolation_guard.py | 17 ++ .../unit/test_workflow_chat_client_extract.py | 28 +++ .../test_workflow_chat_client_sanitize.py | 40 +++ 21 files changed, 491 insertions(+), 571 deletions(-) create mode 100644 .github/workflows/unit-tests.yaml create mode 100644 conftest.py create mode 100644 services/global_chat/tests/unit/__init__.py create mode 100644 services/job_chat/tests/integration/__init__.py rename services/job_chat/tests/{test_functions.py => integration/test_adaptor_docs_pipeline.py} (98%) create mode 100644 services/job_chat/tests/unit/__init__.py create mode 100644 services/testing/README.md create mode 100644 services/testing/__init__.py create mode 100644 services/testing/yaml_assertions.py create mode 100644 services/tools/search_documentation/tests/__init__.py create mode 100644 services/tools/search_documentation/tests/unit/__init__.py create mode 100644 services/workflow_chat/tests/conftest.py delete mode 100644 services/workflow_chat/tests/test_functions.py create mode 100644 services/workflow_chat/tests/unit/__init__.py create mode 100644 services/workflow_chat/tests/unit/test_gen_project_prompt_build.py create mode 100644 services/workflow_chat/tests/unit/test_isolation_guard.py create mode 100644 services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py create mode 100644 services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml new file mode 100644 index 00000000..9d5c29eb --- /dev/null +++ b/.github/workflows/unit-tests.yaml @@ -0,0 +1,41 @@ +name: Unit tests + +on: + pull_request: + branches: ["**"] + push: + branches: [main] + +jobs: + unit: + name: Python unit tests + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install Poetry + run: pipx install poetry==1.8.3 + + - name: Cache Poetry virtualenv + uses: actions/cache@v4 + with: + path: .venv + key: poetry-${{ runner.os }}-${{ hashFiles('poetry.lock') }} + + - name: Install dependencies (main group only, no ft) + run: poetry install --no-interaction --no-root + + - name: Run unit tests + env: + # No secrets injected — unit tests must not need them. + # If a test passes locally but fails on CI for a missing key, it + # was never a unit test. See conftest.py at the repo root. + PYTHONDONTWRITEBYTECODE: "1" + run: poetry run pytest -m unit --maxfail=5 diff --git a/conftest.py b/conftest.py new file mode 100644 index 00000000..dd03bd84 --- /dev/null +++ b/conftest.py @@ -0,0 +1,72 @@ +"""Repo-root pytest configuration. + +- Auto-applies a tier marker (`unit` / `service` / `integration` / + `acceptance`) based on the test's path. The directory IS the marker. +- For tests marked `unit`, blocks network, subprocess, DB, and LLM client + construction so accidental I/O fails loud instead of timing out. +""" + +from unittest.mock import patch + +import pytest + + +_TIER_DIRS = ("unit", "service", "integration", "acceptance") + +_BLOCKED_TARGETS = ( + ("socket.socket.connect", "socket.connect()"), + ("subprocess.run", "subprocess.run()"), + ("subprocess.Popen", "subprocess.Popen()"), + ("psycopg2.connect", "psycopg2.connect()"), + # Block LLM client construction, not first request — earlier failure, + # easier to trace. + ("anthropic.Anthropic.__init__", "anthropic.Anthropic()"), + ("anthropic.AsyncAnthropic.__init__", "anthropic.AsyncAnthropic()"), + ("openai.OpenAI.__init__", "openai.OpenAI()"), + ("openai.AsyncOpenAI.__init__", "openai.AsyncOpenAI()"), +) + + +class UnitTestViolation(RuntimeError): + """Raised when a unit test attempts a forbidden operation.""" + + +def _make_blocker(operation): + def _block(*_args, **_kwargs): + raise UnitTestViolation( + f"Unit tests may not perform `{operation}`. Move this test to " + "tests/service/ or tests/integration/ if real I/O is needed. " + "See conftest.py at the repo root for the policy." + ) + + return _block + + +def pytest_collection_modifyitems(items): + for item in items: + for tier in _TIER_DIRS: + if tier in item.path.parts: + item.add_marker(getattr(pytest.mark, tier)) + break + + +@pytest.fixture(autouse=True) +def _enforce_unit_isolation(request): + if "unit" not in request.keywords: + yield + return + + patches = [] + for target, label in _BLOCKED_TARGETS: + try: + p = patch(target, side_effect=_make_blocker(label)) + p.start() + patches.append(p) + except (AttributeError, ModuleNotFoundError): + continue + + try: + yield + finally: + for p in patches: + p.stop() diff --git a/pyproject.toml b/pyproject.toml index 07ad572e..53ce1743 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,46 @@ ruff = "^0.15.10" requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" +[tool.pytest.ini_options] +# Make `services/` importable the same way `services/entry.py` does it. +# Without this, `from workflow_chat.workflow_chat import ...` fails inside tests. +pythonpath = ["services"] + +# Discovery roots. pytest walks these for test_*.py files. +testpaths = [ + "services/global_chat/tests", + "services/workflow_chat/tests", + "services/job_chat/tests", + "services/tools", +] + +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] + +markers = [ + "unit: fast, isolated, no I/O. Runs on every PR push.", + "service: mocks HTTP/LLM clients; exercises service handlers. Runs on merge.", + "integration: hits real external services (LLM, Pinecone, Postgres). Manual/nightly.", + "acceptance: end-to-end acceptance criteria. Manual/nightly.", +] + +addopts = [ + "--strict-markers", + "--strict-config", + "-ra", + "--tb=short", +] + +filterwarnings = [ + "default::DeprecationWarning", + "ignore::DeprecationWarning:anthropic.*", + "ignore::DeprecationWarning:pydantic.*", +] + +[tool.black] +line-length = 120 + [tool.ruff] select = [ "E", # pycodestyle diff --git a/services/global_chat/tests/test_utils.py b/services/global_chat/tests/test_utils.py index 3ddb7b43..a48df39d 100644 --- a/services/global_chat/tests/test_utils.py +++ b/services/global_chat/tests/test_utils.py @@ -3,10 +3,20 @@ import yaml import tempfile import subprocess -import difflib from pathlib import Path from typing import Dict, List, Optional, Any +# YAML helpers live in `testing.yaml_assertions`; re-exported so existing +# imports from this module keep working. New code should import directly. +from testing.yaml_assertions import ( # noqa: F401 + assert_no_special_chars, + assert_yaml_equal_except, + assert_yaml_has_ids, + assert_yaml_jobs_have_body, + assert_yaml_section_contains_all, + path_matches, +) + def call_global_chat_service(service_input: Dict[str, Any]) -> Dict[str, Any]: """ @@ -250,192 +260,3 @@ def print_response_details(response: Dict[str, Any], test_name: str = None, cont print(f" job_key: {job_key}") print(f" message:\n {msg.strip().replace(chr(10), chr(10) + ' ')}") - -# ===== YAML validation helpers from workflow_chat ===== - -def path_matches(path, allowed_paths): - """Check if a path (list of keys) matches any allowed path pattern.""" - for allowed in allowed_paths: - allowed_parts = allowed.split('.') - if len(path) != len(allowed_parts): - continue - match = True - for p, a in zip(path, allowed_parts): - if a == '*': - continue - if p != a: - match = False - break - if match: - return True - return False - - -def assert_yaml_equal_except(orig, new, allowed_paths: List[str], context=''): - """ - Assert that two YAML objects are equal except for allowed paths. - allowed_paths: e.g. ['triggers', 'jobs.*.body'] - """ - def compare(o, n, path): - if path_matches(path, allowed_paths): - return # allowed to differ - if type(o) != type(n): - raise AssertionError(f"Type mismatch at {'.'.join(path)}: {type(o)} != {type(n)}") - if isinstance(o, dict): - for k in set(o.keys()).union(n.keys()): - if k not in o or k not in n: - raise AssertionError(f"Key '{k}' missing at {'.'.join(path)}") - compare(o[k], n[k], path + [k]) - elif isinstance(o, list): - if len(o) != len(n): - raise AssertionError(f"List length mismatch at {'.'.join(path)}: {len(o)} != {len(n)}") - for i, (oi, ni) in enumerate(zip(o, n)): - compare(oi, ni, path + [str(i)]) - else: - if o != n: - diff = "\n".join(difflib.unified_diff( - [str(o)], [str(n)], - fromfile='original', tofile='response', lineterm='' - )) - raise AssertionError(f"Value mismatch at {'.'.join(path)}:\n{diff}") - - try: - compare(orig, new, []) - except AssertionError as e: - orig_str = yaml.dump(orig, sort_keys=True) - new_str = yaml.dump(new, sort_keys=True) - diff = "\n".join(difflib.unified_diff( - orig_str.splitlines(), new_str.splitlines(), - fromfile='original', tofile='response', lineterm='' - )) - raise AssertionError(f"{context}\n{e}\nFull YAML diff:\n{diff}") - - -def assert_yaml_section_contains_all(orig, new, section, context=''): - """ - Assert that all items in orig[section] are present and unchanged in new[section]. - Allows new items to be added in new[section]. - - Args: - orig: Original YAML as string or dict - new: New YAML as string or dict - section: Section name to compare - context: Context string for error messages - """ - # Handle both string and dict inputs - if isinstance(orig, str): - orig_data = yaml.safe_load(orig) - else: - orig_data = orig - - if isinstance(new, str): - new_data = yaml.safe_load(new) - else: - new_data = new - - orig_section = orig_data.get(section, {}) - new_section = new_data.get(section, {}) - - for key, value in orig_section.items(): - assert key in new_section, f"{context}: Key '{key}' missing in '{section}'" - assert new_section[key] == value, ( - f"{context}: Value for '{section}.{key}' changed.\nOriginal: {value}\nNew: {new_section[key]}" - ) - - -def assert_yaml_has_ids(yaml_str_or_dict, context=''): - """ - Assert that every job, trigger, and edge in the YAML has a non-empty 'id' field. - Args: - yaml_str_or_dict: YAML as string or dict - context: Context string for error messages - """ - if isinstance(yaml_str_or_dict, str): - data = yaml.safe_load(yaml_str_or_dict) - else: - data = yaml_str_or_dict - - # Check jobs - jobs = data.get('jobs', {}) - for job_key, job_data in jobs.items(): - assert 'id' in job_data, f"{context}: Job '{job_key}' missing 'id' field." - assert job_data['id'] not in (None, '', []), f"{context}: Job '{job_key}' has empty 'id' field." - - # Check triggers - triggers = data.get('triggers', {}) - for trig_key, trig_data in triggers.items(): - assert 'id' in trig_data, f"{context}: Trigger '{trig_key}' missing 'id' field." - assert trig_data['id'] not in (None, '', []), f"{context}: Trigger '{trig_key}' has empty 'id' field." - - # Check edges - edges = data.get('edges', {}) - for edge_key, edge_data in edges.items(): - assert 'id' in edge_data, f"{context}: Edge '{edge_key}' missing 'id' field." - assert edge_data['id'] not in (None, '', []), f"{context}: Edge '{edge_key}' has empty 'id' field." - - -def assert_yaml_jobs_have_body(yaml_str_or_dict, context=''): - """ - Assert that every job in the YAML has a non-empty 'body' field. - Args: - yaml_str_or_dict: YAML as string or dict - context: Context string for error messages - """ - if isinstance(yaml_str_or_dict, str): - data = yaml.safe_load(yaml_str_or_dict) - else: - data = yaml_str_or_dict - - jobs = data.get('jobs', {}) - for job_key, job_data in jobs.items(): - assert 'body' in job_data, f"{context}: Job '{job_key}' missing 'body' field." - assert job_data['body'] not in (None, '', []), f"{context}: Job '{job_key}' has empty 'body' field." - - -def assert_no_special_chars(yaml_str_or_dict, context=''): - """ - Assert that there are no special characters in job names, source_job and target_job fields. - Special characters are anything that's not alphanumeric, space, hyphen, or underscore. - - Args: - yaml_str_or_dict: YAML as string or dict - context: Context string for error messages - """ - import re - if isinstance(yaml_str_or_dict, str): - data = yaml.safe_load(yaml_str_or_dict) - else: - data = yaml_str_or_dict - - # Pattern matches any character that is NOT alphanumeric, space, hyphen, or underscore - special_char_pattern = re.compile(r'[^a-zA-Z0-9\s\-_]') - - # Check job names - jobs = data.get('jobs', {}) - for job_key, job_data in jobs.items(): - if 'name' in job_data and job_data['name']: - name = str(job_data['name']) - match = special_char_pattern.search(name) - assert not match, f"{context}: Job '{job_key}' name '{name}' contains special character '{match.group(0)}'" - - # Check edge: source_job, target_job and edge key - edges = data.get('edges', {}) - for edge_key, edge_data in edges.items(): - if 'source_job' in edge_data and edge_data['source_job']: - source = str(edge_data['source_job']) - match = special_char_pattern.search(source) - assert not match, f"{context}: Edge '{edge_key}' source_job '{source}' contains special character '{match.group(0)}'" - - if 'target_job' in edge_data and edge_data['target_job']: - target = str(edge_data['target_job']) - match = special_char_pattern.search(target) - assert not match, f"{context}: Edge '{edge_key}' target_job '{target}' contains special character '{match.group(0)}'" - - if "->" in edge_key: - source_part, target_part = edge_key.split("->", 1) - - match = special_char_pattern.search(source_part) - assert not match, f"{context}: Edge key '{edge_key}' source part '{source_part}' contains special character '{match.group(0)}'" - - match = special_char_pattern.search(target_part) - assert not match, f"{context}: Edge key '{edge_key}' target part '{target_part}' contains special character '{match.group(0)}'" diff --git a/services/global_chat/tests/unit/__init__.py b/services/global_chat/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/job_chat/tests/integration/__init__.py b/services/job_chat/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/job_chat/tests/test_functions.py b/services/job_chat/tests/integration/test_adaptor_docs_pipeline.py similarity index 98% rename from services/job_chat/tests/test_functions.py rename to services/job_chat/tests/integration/test_adaptor_docs_pipeline.py index 73f953cd..78ec9979 100644 --- a/services/job_chat/tests/test_functions.py +++ b/services/job_chat/tests/integration/test_adaptor_docs_pipeline.py @@ -1,17 +1,13 @@ -import sys import os -from pathlib import Path + import psycopg2 +import pytest from dotenv import load_dotenv -load_dotenv() - -services_dir = Path(__file__).parent.parent.parent -sys.path.insert(0, str(services_dir)) - -import pytest from job_chat.prompt import generate_system_message +load_dotenv() + def get_db_connection(): """Get database connection from POSTGRES_URL environment variable.""" @@ -212,4 +208,4 @@ def test_search_docs_returns_general_docs_only(): if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"]) \ No newline at end of file + pytest.main([__file__, "-v", "-s"]) diff --git a/services/job_chat/tests/unit/__init__.py b/services/job_chat/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/testing/README.md b/services/testing/README.md new file mode 100644 index 00000000..8b30f9c3 --- /dev/null +++ b/services/testing/README.md @@ -0,0 +1,22 @@ +# services/testing + +Shared helpers for the test suite. + +This directory is on the Python path via `pyproject.toml` +(`pythonpath = ["services"]`), so tests import as +`from testing.yaml_assertions import assert_yaml_equal_except`. + +## Modules + +- `yaml_assertions.py` — pure-function YAML structural assertions, safe for + every tier (unit included). + +## Why under `services/` and not a top-level `tests/`? + +Imports across the codebase resolve relative to `services/` (see +`CLAUDE.md` → "Python Import Patterns"). Putting helpers here means tests +import without path-munging hacks, the same way services do `from util +import …`. + +The Bun service auto-discovery in `platform/src/util/describe-modules.ts` +skips this directory because it has no `testing.py` index file. diff --git a/services/testing/__init__.py b/services/testing/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/testing/yaml_assertions.py b/services/testing/yaml_assertions.py new file mode 100644 index 00000000..c2136e34 --- /dev/null +++ b/services/testing/yaml_assertions.py @@ -0,0 +1,124 @@ +"""Pure-function YAML structural assertions, safe for every test tier.""" + +import difflib +import re + +import yaml + + +def path_matches(path, allowed_paths: list[str]) -> bool: + """Match a path (list of keys) against patterns of dotted keys with `*` wildcards.""" + for allowed in allowed_paths: + allowed_parts = allowed.split(".") + if len(path) != len(allowed_parts): + continue + if all(a == "*" or a == p for p, a in zip(path, allowed_parts)): + return True + return False + + +def assert_yaml_equal_except(orig, new, allowed_paths: list[str], context: str = "") -> None: + """Assert two YAML structures are equal except at `allowed_paths`. + + Patterns are dotted keys with `*` wildcards, e.g. `['triggers', 'jobs.*.body']`. + """ + + def compare(o, n, path): + if path_matches(path, allowed_paths): + return + if type(o) is not type(n): + raise AssertionError(f"Type mismatch at {'.'.join(path)}: {type(o)} != {type(n)}") + if isinstance(o, dict): + for k in set(o.keys()).union(n.keys()): + if k not in o or k not in n: + raise AssertionError(f"Key '{k}' missing at {'.'.join(path)}") + compare(o[k], n[k], path + [k]) + elif isinstance(o, list): + if len(o) != len(n): + raise AssertionError(f"List length mismatch at {'.'.join(path)}: {len(o)} != {len(n)}") + for i, (oi, ni) in enumerate(zip(o, n)): + compare(oi, ni, path + [str(i)]) + elif o != n: + diff = "\n".join( + difflib.unified_diff([str(o)], [str(n)], fromfile="original", tofile="response", lineterm="") + ) + raise AssertionError(f"Value mismatch at {'.'.join(path)}:\n{diff}") + + try: + compare(orig, new, []) + except AssertionError as e: + diff = "\n".join( + difflib.unified_diff( + yaml.dump(orig, sort_keys=True).splitlines(), + yaml.dump(new, sort_keys=True).splitlines(), + fromfile="original", + tofile="response", + lineterm="", + ) + ) + raise AssertionError(f"{context}\n{e}\nFull YAML diff:\n{diff}") + + +def _as_dict(yaml_str_or_dict): + return yaml.safe_load(yaml_str_or_dict) if isinstance(yaml_str_or_dict, str) else yaml_str_or_dict + + +def assert_yaml_section_contains_all(orig, new, section: str, context: str = "") -> None: + """Assert all items in `orig[section]` are present and unchanged in `new[section]`. + + New items in `new[section]` are allowed. + """ + orig_section = _as_dict(orig).get(section, {}) + new_section = _as_dict(new).get(section, {}) + + for key, value in orig_section.items(): + assert key in new_section, f"{context}: Key '{key}' missing in '{section}'" + assert new_section[key] == value, ( + f"{context}: Value for '{section}.{key}' changed.\n" + f"Original: {value}\nNew: {new_section[key]}" + ) + + +def assert_yaml_has_ids(yaml_str_or_dict, context: str = "") -> None: + """Assert every job, trigger, and edge has a non-empty `id`.""" + data = _as_dict(yaml_str_or_dict) + for kind in ("jobs", "triggers", "edges"): + singular = kind[:-1].title() + for item_key, item_data in data.get(kind, {}).items(): + assert "id" in item_data, f"{context}: {singular} '{item_key}' missing 'id' field." + assert item_data["id"] not in (None, "", []), ( + f"{context}: {singular} '{item_key}' has empty 'id' field." + ) + + +def assert_yaml_jobs_have_body(yaml_str_or_dict, context: str = "") -> None: + """Assert every job has a non-empty `body`.""" + for job_key, job_data in _as_dict(yaml_str_or_dict).get("jobs", {}).items(): + assert "body" in job_data, f"{context}: Job '{job_key}' missing 'body' field." + assert job_data["body"] not in (None, "", []), f"{context}: Job '{job_key}' has empty 'body' field." + + +_SPECIAL_CHAR = re.compile(r"[^a-zA-Z0-9\s\-_]") + + +def assert_no_special_chars(yaml_str_or_dict, context: str = "") -> None: + """Assert job names and edge source/target/keys use only [A-Za-z0-9 _-].""" + data = _as_dict(yaml_str_or_dict) + + def check(value, descriptor): + match = _SPECIAL_CHAR.search(value) + assert not match, f"{context}: {descriptor} '{value}' contains special character '{match.group(0)}'" + + for job_key, job_data in data.get("jobs", {}).items(): + if job_data.get("name"): + check(str(job_data["name"]), f"Job '{job_key}' name") + + for edge_key, edge_data in data.get("edges", {}).items(): + for field in ("source_job", "target_job"): + if edge_data.get(field): + check(str(edge_data[field]), f"Edge '{edge_key}' {field}") + + if "->" in edge_key: + source_part, target_part = edge_key.split("->", 1) + check(source_part, f"Edge key '{edge_key}' source part") + check(target_part, f"Edge key '{edge_key}' target part") diff --git a/services/tools/search_documentation/tests/__init__.py b/services/tools/search_documentation/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/tools/search_documentation/tests/unit/__init__.py b/services/tools/search_documentation/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/workflow_chat/tests/conftest.py b/services/workflow_chat/tests/conftest.py new file mode 100644 index 00000000..e56c1c30 --- /dev/null +++ b/services/workflow_chat/tests/conftest.py @@ -0,0 +1,15 @@ +import pytest + +from workflow_chat.workflow_chat import AnthropicClient + + +@pytest.fixture +def workflow_chat_client(): + """An `AnthropicClient` for testing pure helper methods. + + `AnthropicClient.__init__` instantiates `anthropic.Anthropic(...)`, which + the unit-tier guard blocks. The helpers we exercise never touch + `self.client`, so we bypass `__init__` to skip SDK construction. Tests + that need `self.client` belong in the service tier. + """ + return object.__new__(AnthropicClient) diff --git a/services/workflow_chat/tests/test_functions.py b/services/workflow_chat/tests/test_functions.py deleted file mode 100644 index e7f698e2..00000000 --- a/services/workflow_chat/tests/test_functions.py +++ /dev/null @@ -1,170 +0,0 @@ -import sys -from pathlib import Path - -services_dir = Path(__file__).parent.parent.parent -sys.path.insert(0, str(services_dir)) - -import pytest -from workflow_chat.workflow_chat import AnthropicClient, ChatConfig -from workflow_chat.gen_project_prompt import build_prompt - - -@pytest.fixture -def client(): - """Create a client instance for testing.""" - return AnthropicClient(ChatConfig(api_key="fake-key")) - -def test_extract_job_codes_preserves_real_code(client): - """Test that actual code is extracted and preserved.""" - - yaml_data = { - "jobs": { - "job1": {"body": "console.log('hello world')"}, - "job2": {"body": "const data = fetchData();\nprocessData(data);"} - } - } - - preserved_values, processed_yaml = client.extract_and_preserve_components(yaml_data) - - assert len(preserved_values) == 2 - assert preserved_values["__CODE_BLOCK_job1__"] == "console.log('hello world')" - assert preserved_values["__CODE_BLOCK_job2__"] == "const data = fetchData();\nprocessData(data);" - - -def test_extract_job_codes_ignores_default_placeholder(client): - """Test that default placeholder is not preserved as code.""" - - yaml_data = { - "jobs": { - "job1": {"body": "// Add operations here"}, - "job2": {"body": "real code here"}, - "job3": {"body": " // Add operations here "} # with whitespace - } - } - - preserved_values, processed_yaml = client.extract_and_preserve_components(yaml_data) - - assert len(preserved_values) == 1 - assert "__CODE_BLOCK_job1__" not in preserved_values - assert "__CODE_BLOCK_job3__" not in preserved_values - assert preserved_values["__CODE_BLOCK_job2__"] == "real code here" - - -def test_sanitize_job_names_removes_diacritics(client): - """Test that diacritics are properly normalized and removed.""" - - yaml_data = { - "jobs": { - "job1": {"name": "Café München"}, - "job2": {"name": "Naïve résumé"} - } - } - - client.sanitize_job_names(yaml_data) - - assert yaml_data["jobs"]["job1"]["name"] == "Cafe Munchen" - assert yaml_data["jobs"]["job2"]["name"] == "Naive resume" - - -def test_sanitize_job_names_removes_special_characters(client): - """Test that special characters are properly removed.""" - - yaml_data = { - "jobs": { - "job1": {"name": "Job@#$%Name!"}, - "job2": {"name": "Process&Data*With+Symbols"} - } - } - - client.sanitize_job_names(yaml_data) - - assert yaml_data["jobs"]["job1"]["name"] == "JobName" - assert yaml_data["jobs"]["job2"]["name"] == "ProcessDataWithSymbols" - - -def test_sanitize_job_names_preserves_allowed_characters(client): - """Test that alphanumeric, spaces, hyphens, and underscores are preserved.""" - - yaml_data = { - "jobs": { - "job1": {"name": "Valid Job-Name_123"} - } - } - - client.sanitize_job_names(yaml_data) - - assert yaml_data["jobs"]["job1"]["name"] == "Valid Job-Name_123" - - -def test_sanitize_job_names_handles_empty_data(client): - """Test that function handles edge cases gracefully.""" - - # Should not raise exceptions and return without error - result1 = client.sanitize_job_names(None) - result2 = client.sanitize_job_names({}) - result3 = client.sanitize_job_names({"jobs": {}}) - - assert result1 is None - assert result2 is None - assert result3 is None - - -def test_build_prompt_normal_mode(): - """Test that normal mode uses correct configuration.""" - system_msg, prompt = build_prompt( - content='Create a workflow', - existing_yaml='name: test-workflow', - history=[{'role': 'user', 'content': 'Hello'}] - ) - - # Should use normal mode configuration - assert 'talk to a client with the goal of converting' in system_msg # normal_mode_intro - assert 'You can either' in system_msg # normal_mode_answering_instructions - assert 'user is currently editing this YAML' in system_msg - assert 'name: test-workflow' in system_msg - - # Prompt structure - assert len(prompt) == 2 - assert prompt[-1]['content'] == 'Create a workflow' - - -def test_build_prompt_error_mode(): - """Test that error mode uses correct configuration and appends error message.""" - system_msg, prompt = build_prompt( - content='Fix the workflow', - existing_yaml='name: broken-workflow', - errors='Invalid trigger type', - history=[] - ) - - # Should use error mode configuration - assert 'Your previous suggestion produced an invalid' in system_msg # error_mode_intro - assert 'Answer with BOTH the' in system_msg # error_mode_answering_instructions - assert 'YAML causing the error' in system_msg - assert 'name: broken-workflow' in system_msg - - # User content should have error appended - assert prompt[-1]['content'] == 'Fix the workflow\nThis is the error message:\nInvalid trigger type' - - -def test_build_prompt_readonly_mode(): - """Test that readonly mode uses unstructured output format.""" - system_msg, prompt = build_prompt( - content='What does this workflow do?', - existing_yaml='name: readonly-workflow', - read_only=True, - history=[] - ) - - # Should use readonly mode configuration - assert 'Read-only Mode' in system_msg # unstructured_output_format - assert 'triple-backticked YAML code blocks' in system_msg # readonly_mode_answering_instructions - assert 'user is viewing this read-only YAML' in system_msg - assert 'name: readonly-workflow' in system_msg - - # User content should be unchanged - assert prompt[-1]['content'] == 'What does this workflow do?' - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/services/workflow_chat/tests/test_utils.py b/services/workflow_chat/tests/test_utils.py index 535b7db2..0fb8de6b 100644 --- a/services/workflow_chat/tests/test_utils.py +++ b/services/workflow_chat/tests/test_utils.py @@ -1,106 +1,60 @@ +"""Service-tier helpers for workflow_chat. + +YAML assertion helpers live in `testing.yaml_assertions`; they are re-exported +here so existing service/integration tests keep working. New code should +import from `testing.yaml_assertions` directly. +""" + import json +import subprocess import sys -import yaml import tempfile -import subprocess -import yaml from pathlib import Path -import difflib -from typing import List - - -def path_matches(path, allowed_paths): - """Check if a path (list of keys) matches any allowed path pattern.""" - for allowed in allowed_paths: - allowed_parts = allowed.split('.') - if len(path) != len(allowed_parts): - continue - match = True - for p, a in zip(path, allowed_parts): - if a == '*': - continue - if p != a: - match = False - break - if match: - return True - return False - -def assert_yaml_equal_except(orig, new, allowed_paths: List[str], context=''): - """ - Assert that two YAML objects are equal except for allowed paths. - allowed_paths: e.g. ['triggers', 'jobs.*.body'] - """ - def compare(o, n, path): - if path_matches(path, allowed_paths): - return # allowed to differ - if type(o) != type(n): - raise AssertionError(f"Type mismatch at {'.'.join(path)}: {type(o)} != {type(n)}") - if isinstance(o, dict): - for k in set(o.keys()).union(n.keys()): - if k not in o or k not in n: - raise AssertionError(f"Key '{k}' missing at {'.'.join(path)}") - compare(o[k], n[k], path + [k]) - elif isinstance(o, list): - if len(o) != len(n): - raise AssertionError(f"List length mismatch at {'.'.join(path)}: {len(o)} != {len(n)}") - for i, (oi, ni) in enumerate(zip(o, n)): - compare(oi, ni, path + [str(i)]) - else: - if o != n: - diff = "\n".join(difflib.unified_diff( - [str(o)], [str(n)], - fromfile='original', tofile='response', lineterm='' - )) - raise AssertionError(f"Value mismatch at {'.'.join(path)}:\n{diff}") - - try: - compare(orig, new, []) - except AssertionError as e: - orig_str = yaml.dump(orig, sort_keys=True) - new_str = yaml.dump(new, sort_keys=True) - diff = "\n".join(difflib.unified_diff( - orig_str.splitlines(), new_str.splitlines(), - fromfile='original', tofile='response', lineterm='' - )) - raise AssertionError(f"{context}\n{e}\nFull YAML diff:\n{diff}") +from testing.yaml_assertions import ( # noqa: F401 + assert_no_special_chars, + assert_yaml_equal_except, + assert_yaml_has_ids, + assert_yaml_jobs_have_body, + assert_yaml_section_contains_all, + path_matches, +) def call_workflow_chat_service(service_input): - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_input: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as temp_input: json.dump(service_input, temp_input, indent=2) temp_input_path = temp_input.name - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_output: + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as temp_output: temp_output_path = temp_output.name try: cmd = [ sys.executable, str(Path(__file__).parent.parent.parent / "entry.py"), "workflow_chat", - "--input", temp_input_path, - "--output", temp_output_path + "--input", + temp_input_path, + "--output", + temp_output_path, ] result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path(__file__).parent.parent) if result.returncode != 0: raise Exception(f"Service call failed: {result.stderr}") - with open(temp_output_path, 'r') as f: + with open(temp_output_path, "r") as f: response = json.load(f) return response finally: import os + try: os.unlink(temp_input_path) os.unlink(temp_output_path) - except: + except Exception: pass def make_service_input(existing_yaml, history, content=None, errors=None, context=None, meta=None): - service_input = { - "existing_yaml": existing_yaml, - "history": history - } + service_input = {"existing_yaml": existing_yaml, "history": history} if content is not None: service_input["content"] = content if errors is not None: @@ -113,7 +67,7 @@ def make_service_input(existing_yaml, history, content=None, errors=None, contex def print_response_details(response, content=None, errors=None): - """Print detailed response information like the original script, including content or errors if provided.""" + """Print detailed response information including content or errors if provided.""" if content is not None: print("\nUSER CONTENT:") print(content) @@ -138,132 +92,3 @@ def print_response_details(response, content=None, errors=None): if "usage" in response: print("\nTOKEN USAGE:") print(json.dumps(response["usage"], indent=2)) - - -def assert_yaml_section_contains_all(orig, new, section, context=''): - """ - Assert that all items in orig[section] are present and unchanged in new[section]. - Allows new items to be added in new[section]. - - Args: - orig: Original YAML as string or dict - new: New YAML as string or dict - section: Section name to compare - context: Context string for error messages - """ - # Handle both string and dict inputs - if isinstance(orig, str): - orig_data = yaml.safe_load(orig) - else: - orig_data = orig - - if isinstance(new, str): - new_data = yaml.safe_load(new) - else: - new_data = new - - orig_section = orig_data.get(section, {}) - new_section = new_data.get(section, {}) - - for key, value in orig_section.items(): - assert key in new_section, f"{context}: Key '{key}' missing in '{section}'" - assert new_section[key] == value, ( - f"{context}: Value for '{section}.{key}' changed.\nOriginal: {value}\nNew: {new_section[key]}" - ) - - -def assert_yaml_has_ids(yaml_str_or_dict, context=''): - """ - Assert that every job, trigger, and edge in the YAML has a non-empty 'id' field. - Args: - yaml_str_or_dict: YAML as string or dict - context: Context string for error messages - """ - if isinstance(yaml_str_or_dict, str): - data = yaml.safe_load(yaml_str_or_dict) - else: - data = yaml_str_or_dict - - # Check jobs - jobs = data.get('jobs', {}) - for job_key, job_data in jobs.items(): - assert 'id' in job_data, f"{context}: Job '{job_key}' missing 'id' field." - assert job_data['id'] not in (None, '', []), f"{context}: Job '{job_key}' has empty 'id' field." - - # Check triggers - triggers = data.get('triggers', {}) - for trig_key, trig_data in triggers.items(): - assert 'id' in trig_data, f"{context}: Trigger '{trig_key}' missing 'id' field." - assert trig_data['id'] not in (None, '', []), f"{context}: Trigger '{trig_key}' has empty 'id' field." - - # Check edges - edges = data.get('edges', {}) - for edge_key, edge_data in edges.items(): - assert 'id' in edge_data, f"{context}: Edge '{edge_key}' missing 'id' field." - assert edge_data['id'] not in (None, '', []), f"{context}: Edge '{edge_key}' has empty 'id' field." - - -def assert_yaml_jobs_have_body(yaml_str_or_dict, context=''): - """ - Assert that every job in the YAML has a non-empty 'body' field. - Args: - yaml_str_or_dict: YAML as string or dict - context: Context string for error messages - """ - if isinstance(yaml_str_or_dict, str): - data = yaml.safe_load(yaml_str_or_dict) - else: - data = yaml_str_or_dict - - jobs = data.get('jobs', {}) - for job_key, job_data in jobs.items(): - assert 'body' in job_data, f"{context}: Job '{job_key}' missing 'body' field." - assert job_data['body'] not in (None, '', []), f"{context}: Job '{job_key}' has empty 'body' field." - -def assert_no_special_chars(yaml_str_or_dict, context=''): - """ - Assert that there are no special characters in job names, source_job and target_job fields. - Special characters are anything that's not alphanumeric, space, hyphen, or underscore. - - Args: - yaml_str_or_dict: YAML as string or dict - context: Context string for error messages - """ - import re - if isinstance(yaml_str_or_dict, str): - data = yaml.safe_load(yaml_str_or_dict) - else: - data = yaml_str_or_dict - - # Pattern matches any character that is NOT alphanumeric, space, hyphen, or underscore - special_char_pattern = re.compile(r'[^a-zA-Z0-9\s\-_]') - - # Check job names - jobs = data.get('jobs', {}) - for job_key, job_data in jobs.items(): - if 'name' in job_data and job_data['name']: - name = str(job_data['name']) - match = special_char_pattern.search(name) - assert not match, f"{context}: Job '{job_key}' name '{name}' contains special character '{match.group(0)}'" - - # Check edge: source_job, target_job and edge key - edges = data.get('edges', {}) - for edge_key, edge_data in edges.items(): - if 'source_job' in edge_data and edge_data['source_job']: - source = str(edge_data['source_job']) - match = special_char_pattern.search(source) - assert not match, f"{context}: Edge '{edge_key}' source_job '{source}' contains special character '{match.group(0)}'" - - if 'target_job' in edge_data and edge_data['target_job']: - target = str(edge_data['target_job']) - match = special_char_pattern.search(target) - assert not match, f"{context}: Edge '{edge_key}' target_job '{target}' contains special character '{match.group(0)}'" - - if "->" in edge_key: - source_part, target_part = edge_key.split("->", 1) - - match = special_char_pattern.search(source_part) - assert not match, f"{context}: Edge key '{edge_key}' source part '{source_part}' contains special character '{match.group(0)}'" - - match = special_char_pattern.search(target_part) - assert not match, f"{context}: Edge key '{edge_key}' target part '{target_part}' contains special character '{match.group(0)}'" \ No newline at end of file diff --git a/services/workflow_chat/tests/unit/__init__.py b/services/workflow_chat/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/workflow_chat/tests/unit/test_gen_project_prompt_build.py b/services/workflow_chat/tests/unit/test_gen_project_prompt_build.py new file mode 100644 index 00000000..a848a180 --- /dev/null +++ b/services/workflow_chat/tests/unit/test_gen_project_prompt_build.py @@ -0,0 +1,49 @@ +from workflow_chat.gen_project_prompt import build_prompt + + +def test_build_prompt_normal_mode(): + system_msg, prompt = build_prompt( + content="Create a workflow", + existing_yaml="name: test-workflow", + history=[{"role": "user", "content": "Hello"}], + ) + + assert "talk to a client with the goal of converting" in system_msg + assert "You can either" in system_msg + assert "user is currently editing this YAML" in system_msg + assert "name: test-workflow" in system_msg + + assert len(prompt) == 2 + assert prompt[-1]["content"] == "Create a workflow" + + +def test_build_prompt_error_mode(): + system_msg, prompt = build_prompt( + content="Fix the workflow", + existing_yaml="name: broken-workflow", + errors="Invalid trigger type", + history=[], + ) + + assert "Your previous suggestion produced an invalid" in system_msg + assert "Answer with BOTH the" in system_msg + assert "YAML causing the error" in system_msg + assert "name: broken-workflow" in system_msg + + assert prompt[-1]["content"] == "Fix the workflow\nThis is the error message:\nInvalid trigger type" + + +def test_build_prompt_readonly_mode(): + system_msg, prompt = build_prompt( + content="What does this workflow do?", + existing_yaml="name: readonly-workflow", + read_only=True, + history=[], + ) + + assert "Read-only Mode" in system_msg + assert "triple-backticked YAML code blocks" in system_msg + assert "user is viewing this read-only YAML" in system_msg + assert "name: readonly-workflow" in system_msg + + assert prompt[-1]["content"] == "What does this workflow do?" diff --git a/services/workflow_chat/tests/unit/test_isolation_guard.py b/services/workflow_chat/tests/unit/test_isolation_guard.py new file mode 100644 index 00000000..8986939d --- /dev/null +++ b/services/workflow_chat/tests/unit/test_isolation_guard.py @@ -0,0 +1,17 @@ +"""Smoke tests for the unit-tier isolation guard in the root conftest.py.""" + +import subprocess + +import pytest + + +def test_subprocess_run_is_blocked(): + with pytest.raises(RuntimeError, match="Unit tests may not perform"): + subprocess.run(["echo", "hello"]) + + +def test_anthropic_construction_is_blocked(): + from anthropic import Anthropic + + with pytest.raises(RuntimeError, match="Unit tests may not perform"): + Anthropic(api_key="fake-key") diff --git a/services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py b/services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py new file mode 100644 index 00000000..530704a8 --- /dev/null +++ b/services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py @@ -0,0 +1,28 @@ +def test_extract_job_codes_preserves_real_code(workflow_chat_client): + yaml_data = { + "jobs": { + "job1": {"body": "console.log('hello world')"}, + "job2": {"body": "const data = fetchData();\nprocessData(data);"}, + } + } + + preserved_values, _ = workflow_chat_client.extract_and_preserve_components(yaml_data) + + assert preserved_values == { + "__CODE_BLOCK_job1__": "console.log('hello world')", + "__CODE_BLOCK_job2__": "const data = fetchData();\nprocessData(data);", + } + + +def test_extract_job_codes_ignores_default_placeholder(workflow_chat_client): + yaml_data = { + "jobs": { + "job1": {"body": "// Add operations here"}, + "job2": {"body": "real code here"}, + "job3": {"body": " // Add operations here "}, + } + } + + preserved_values, _ = workflow_chat_client.extract_and_preserve_components(yaml_data) + + assert preserved_values == {"__CODE_BLOCK_job2__": "real code here"} diff --git a/services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py b/services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py new file mode 100644 index 00000000..61b8e798 --- /dev/null +++ b/services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py @@ -0,0 +1,40 @@ +def test_sanitize_job_names_removes_diacritics(workflow_chat_client): + yaml_data = { + "jobs": { + "job1": {"name": "Café München"}, + "job2": {"name": "Naïve résumé"}, + } + } + + workflow_chat_client.sanitize_job_names(yaml_data) + + assert yaml_data["jobs"]["job1"]["name"] == "Cafe Munchen" + assert yaml_data["jobs"]["job2"]["name"] == "Naive resume" + + +def test_sanitize_job_names_removes_special_characters(workflow_chat_client): + yaml_data = { + "jobs": { + "job1": {"name": "Job@#$%Name!"}, + "job2": {"name": "Process&Data*With+Symbols"}, + } + } + + workflow_chat_client.sanitize_job_names(yaml_data) + + assert yaml_data["jobs"]["job1"]["name"] == "JobName" + assert yaml_data["jobs"]["job2"]["name"] == "ProcessDataWithSymbols" + + +def test_sanitize_job_names_preserves_allowed_characters(workflow_chat_client): + yaml_data = {"jobs": {"job1": {"name": "Valid Job-Name_123"}}} + + workflow_chat_client.sanitize_job_names(yaml_data) + + assert yaml_data["jobs"]["job1"]["name"] == "Valid Job-Name_123" + + +def test_sanitize_job_names_handles_empty_data(workflow_chat_client): + assert workflow_chat_client.sanitize_job_names(None) is None + assert workflow_chat_client.sanitize_job_names({}) is None + assert workflow_chat_client.sanitize_job_names({"jobs": {}}) is None From c269b000e77860049f55dd5dc3088a67a81ffa1a Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Fri, 1 May 2026 13:49:50 +0100 Subject: [PATCH 04/16] simplify action --- .github/workflows/unit-tests.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 9d5c29eb..758a14c4 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -1,10 +1,8 @@ name: Unit tests -on: - pull_request: - branches: ["**"] - push: - branches: [main] +# Runs when a pull request is created or its head is updated +# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request +on: pull_request jobs: unit: From c77f332aa6f25bc2852163320960cb0d5a0a0bac Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Fri, 1 May 2026 13:50:32 +0100 Subject: [PATCH 05/16] update action version --- .github/workflows/unit-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 758a14c4..5bad0e99 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -11,7 +11,7 @@ jobs: timeout-minutes: 5 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python 3.11 uses: actions/setup-python@v5 From 81edd5c00c9b630534919d1f229020fc4a84b2db Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Fri, 1 May 2026 13:52:39 +0100 Subject: [PATCH 06/16] more action updates --- .github/workflows/unit-tests.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 5bad0e99..9e1f5fc4 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -31,9 +31,4 @@ jobs: run: poetry install --no-interaction --no-root - name: Run unit tests - env: - # No secrets injected — unit tests must not need them. - # If a test passes locally but fails on CI for a missing key, it - # was never a unit test. See conftest.py at the repo root. - PYTHONDONTWRITEBYTECODE: "1" - run: poetry run pytest -m unit --maxfail=5 + run: poetry run pytest -m unit From f3dbc041fa8bd5298f9d97c4eb24b466587ea060 Mon Sep 17 00:00:00 2001 From: "Hanna Paasivirta (OpenFn)" Date: Tue, 5 May 2026 11:59:19 +0900 Subject: [PATCH 07/16] address first batch of comments --- .../tests/test_good_morning_workflow.py | 10 +++--- .../tests/test_planner_multistep.py | 10 +++--- services/global_chat/tests/test_utils.py | 12 ------- .../tests/test_workflow_chat_pass_fail.py | 14 +++++++- .../tests/test_workflow_chat_qualitative.py | 14 +++++++- services/workflow_chat/tests/conftest.py | 15 --------- .../workflow_chat/tests/test_pass_fail.py | 12 ++++++- .../workflow_chat/tests/test_qualitative.py | 12 ++++++- services/workflow_chat/tests/test_utils.py | 16 +-------- .../tests/unit/gen_project/__init__.py | 0 .../test_prompt_build.py} | 0 .../tests/unit/test_isolation_guard.py | 17 ---------- .../unit/test_workflow_chat_client_extract.py | 19 ++++++++--- .../test_workflow_chat_client_sanitize.py | 33 +++++++++++++------ 14 files changed, 99 insertions(+), 85 deletions(-) delete mode 100644 services/workflow_chat/tests/conftest.py create mode 100644 services/workflow_chat/tests/unit/gen_project/__init__.py rename services/workflow_chat/tests/unit/{test_gen_project_prompt_build.py => gen_project/test_prompt_build.py} (100%) delete mode 100644 services/workflow_chat/tests/unit/test_isolation_guard.py diff --git a/services/global_chat/tests/test_good_morning_workflow.py b/services/global_chat/tests/test_good_morning_workflow.py index 60f8adcb..5888b150 100644 --- a/services/global_chat/tests/test_good_morning_workflow.py +++ b/services/global_chat/tests/test_good_morning_workflow.py @@ -1,13 +1,15 @@ import pytest import yaml +from testing.yaml_assertions import ( + assert_yaml_has_ids, + assert_yaml_jobs_have_body, +) from .test_utils import ( + assert_routed_to, call_global_chat_service, + get_attachment, make_service_input, print_response_details, - assert_routed_to, - get_attachment, - assert_yaml_has_ids, - assert_yaml_jobs_have_body, ) diff --git a/services/global_chat/tests/test_planner_multistep.py b/services/global_chat/tests/test_planner_multistep.py index 18b01bb6..9022cbac 100644 --- a/services/global_chat/tests/test_planner_multistep.py +++ b/services/global_chat/tests/test_planner_multistep.py @@ -1,13 +1,15 @@ import pytest import yaml +from testing.yaml_assertions import ( + assert_yaml_has_ids, + assert_yaml_jobs_have_body, +) from .test_utils import ( + assert_routed_to, call_global_chat_service, + get_attachment, make_service_input, print_response_details, - assert_routed_to, - get_attachment, - assert_yaml_has_ids, - assert_yaml_jobs_have_body, ) diff --git a/services/global_chat/tests/test_utils.py b/services/global_chat/tests/test_utils.py index a48df39d..5d1e9b0e 100644 --- a/services/global_chat/tests/test_utils.py +++ b/services/global_chat/tests/test_utils.py @@ -6,18 +6,6 @@ from pathlib import Path from typing import Dict, List, Optional, Any -# YAML helpers live in `testing.yaml_assertions`; re-exported so existing -# imports from this module keep working. New code should import directly. -from testing.yaml_assertions import ( # noqa: F401 - assert_no_special_chars, - assert_yaml_equal_except, - assert_yaml_has_ids, - assert_yaml_jobs_have_body, - assert_yaml_section_contains_all, - path_matches, -) - - def call_global_chat_service(service_input: Dict[str, Any]) -> Dict[str, Any]: """ Call the global_chat service with the given input and return the response. diff --git a/services/global_chat/tests/test_workflow_chat_pass_fail.py b/services/global_chat/tests/test_workflow_chat_pass_fail.py index 7f11f62c..b5545bd1 100644 --- a/services/global_chat/tests/test_workflow_chat_pass_fail.py +++ b/services/global_chat/tests/test_workflow_chat_pass_fail.py @@ -7,7 +7,19 @@ from pathlib import Path import difflib from typing import List -from .test_utils import assert_yaml_equal_except, call_global_chat_service, make_service_input, print_response_details, assert_no_special_chars, assert_yaml_jobs_have_body, assert_yaml_has_ids, assert_routed_to, get_response_yaml +from testing.yaml_assertions import ( + assert_no_special_chars, + assert_yaml_equal_except, + assert_yaml_has_ids, + assert_yaml_jobs_have_body, +) +from .test_utils import ( + assert_routed_to, + call_global_chat_service, + get_response_yaml, + make_service_input, + print_response_details, +) def test_change_trigger(): print("==================TEST==================") diff --git a/services/global_chat/tests/test_workflow_chat_qualitative.py b/services/global_chat/tests/test_workflow_chat_qualitative.py index 857ba1b0..3997bc1b 100644 --- a/services/global_chat/tests/test_workflow_chat_qualitative.py +++ b/services/global_chat/tests/test_workflow_chat_qualitative.py @@ -7,7 +7,19 @@ import subprocess import yaml from pathlib import Path -from .test_utils import call_global_chat_service, make_service_input, print_response_details, assert_routed_to, assert_yaml_section_contains_all, assert_yaml_has_ids, assert_yaml_jobs_have_body, assert_no_special_chars, get_response_yaml +from testing.yaml_assertions import ( + assert_no_special_chars, + assert_yaml_has_ids, + assert_yaml_jobs_have_body, + assert_yaml_section_contains_all, +) +from .test_utils import ( + assert_routed_to, + call_global_chat_service, + get_response_yaml, + make_service_input, + print_response_details, +) # ---- TESTS ---- def test_basic_input(): diff --git a/services/workflow_chat/tests/conftest.py b/services/workflow_chat/tests/conftest.py deleted file mode 100644 index e56c1c30..00000000 --- a/services/workflow_chat/tests/conftest.py +++ /dev/null @@ -1,15 +0,0 @@ -import pytest - -from workflow_chat.workflow_chat import AnthropicClient - - -@pytest.fixture -def workflow_chat_client(): - """An `AnthropicClient` for testing pure helper methods. - - `AnthropicClient.__init__` instantiates `anthropic.Anthropic(...)`, which - the unit-tier guard blocks. The helpers we exercise never touch - `self.client`, so we bypass `__init__` to skip SDK construction. Tests - that need `self.client` belong in the service tier. - """ - return object.__new__(AnthropicClient) diff --git a/services/workflow_chat/tests/test_pass_fail.py b/services/workflow_chat/tests/test_pass_fail.py index eeb83de3..832d8621 100644 --- a/services/workflow_chat/tests/test_pass_fail.py +++ b/services/workflow_chat/tests/test_pass_fail.py @@ -7,7 +7,17 @@ from pathlib import Path import difflib from typing import List -from .test_utils import assert_yaml_equal_except, call_workflow_chat_service, make_service_input, print_response_details, assert_no_special_chars, assert_yaml_jobs_have_body, assert_yaml_has_ids +from testing.yaml_assertions import ( + assert_no_special_chars, + assert_yaml_equal_except, + assert_yaml_has_ids, + assert_yaml_jobs_have_body, +) +from .test_utils import ( + call_workflow_chat_service, + make_service_input, + print_response_details, +) def test_change_trigger(): print("==================TEST==================") diff --git a/services/workflow_chat/tests/test_qualitative.py b/services/workflow_chat/tests/test_qualitative.py index 0285b9da..285f5e1f 100644 --- a/services/workflow_chat/tests/test_qualitative.py +++ b/services/workflow_chat/tests/test_qualitative.py @@ -7,7 +7,17 @@ import subprocess import yaml from pathlib import Path -from .test_utils import call_workflow_chat_service, make_service_input, print_response_details, assert_yaml_section_contains_all, assert_yaml_has_ids, assert_yaml_jobs_have_body, assert_no_special_chars +from testing.yaml_assertions import ( + assert_no_special_chars, + assert_yaml_has_ids, + assert_yaml_jobs_have_body, + assert_yaml_section_contains_all, +) +from .test_utils import ( + call_workflow_chat_service, + make_service_input, + print_response_details, +) # ---- TESTS ---- def test_basic_input(): diff --git a/services/workflow_chat/tests/test_utils.py b/services/workflow_chat/tests/test_utils.py index 0fb8de6b..06de577e 100644 --- a/services/workflow_chat/tests/test_utils.py +++ b/services/workflow_chat/tests/test_utils.py @@ -1,9 +1,4 @@ -"""Service-tier helpers for workflow_chat. - -YAML assertion helpers live in `testing.yaml_assertions`; they are re-exported -here so existing service/integration tests keep working. New code should -import from `testing.yaml_assertions` directly. -""" +"""Service-tier helpers for workflow_chat.""" import json import subprocess @@ -11,15 +6,6 @@ import tempfile from pathlib import Path -from testing.yaml_assertions import ( # noqa: F401 - assert_no_special_chars, - assert_yaml_equal_except, - assert_yaml_has_ids, - assert_yaml_jobs_have_body, - assert_yaml_section_contains_all, - path_matches, -) - def call_workflow_chat_service(service_input): with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as temp_input: diff --git a/services/workflow_chat/tests/unit/gen_project/__init__.py b/services/workflow_chat/tests/unit/gen_project/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/workflow_chat/tests/unit/test_gen_project_prompt_build.py b/services/workflow_chat/tests/unit/gen_project/test_prompt_build.py similarity index 100% rename from services/workflow_chat/tests/unit/test_gen_project_prompt_build.py rename to services/workflow_chat/tests/unit/gen_project/test_prompt_build.py diff --git a/services/workflow_chat/tests/unit/test_isolation_guard.py b/services/workflow_chat/tests/unit/test_isolation_guard.py deleted file mode 100644 index 8986939d..00000000 --- a/services/workflow_chat/tests/unit/test_isolation_guard.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Smoke tests for the unit-tier isolation guard in the root conftest.py.""" - -import subprocess - -import pytest - - -def test_subprocess_run_is_blocked(): - with pytest.raises(RuntimeError, match="Unit tests may not perform"): - subprocess.run(["echo", "hello"]) - - -def test_anthropic_construction_is_blocked(): - from anthropic import Anthropic - - with pytest.raises(RuntimeError, match="Unit tests may not perform"): - Anthropic(api_key="fake-key") diff --git a/services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py b/services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py index 530704a8..77e37f87 100644 --- a/services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py +++ b/services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py @@ -1,4 +1,14 @@ -def test_extract_job_codes_preserves_real_code(workflow_chat_client): +from workflow_chat.workflow_chat import AnthropicClient + + +def _make_client(): + # Bypass __init__ to skip anthropic.Anthropic() construction, which the + # unit-tier guard blocks. The helpers under test never touch self.client. + return object.__new__(AnthropicClient) + + +def test_extract_job_codes_preserves_real_code(): + client = _make_client() yaml_data = { "jobs": { "job1": {"body": "console.log('hello world')"}, @@ -6,7 +16,7 @@ def test_extract_job_codes_preserves_real_code(workflow_chat_client): } } - preserved_values, _ = workflow_chat_client.extract_and_preserve_components(yaml_data) + preserved_values, _ = client.extract_and_preserve_components(yaml_data) assert preserved_values == { "__CODE_BLOCK_job1__": "console.log('hello world')", @@ -14,7 +24,8 @@ def test_extract_job_codes_preserves_real_code(workflow_chat_client): } -def test_extract_job_codes_ignores_default_placeholder(workflow_chat_client): +def test_extract_job_codes_ignores_default_placeholder(): + client = _make_client() yaml_data = { "jobs": { "job1": {"body": "// Add operations here"}, @@ -23,6 +34,6 @@ def test_extract_job_codes_ignores_default_placeholder(workflow_chat_client): } } - preserved_values, _ = workflow_chat_client.extract_and_preserve_components(yaml_data) + preserved_values, _ = client.extract_and_preserve_components(yaml_data) assert preserved_values == {"__CODE_BLOCK_job2__": "real code here"} diff --git a/services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py b/services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py index 61b8e798..43becd1e 100644 --- a/services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py +++ b/services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py @@ -1,4 +1,14 @@ -def test_sanitize_job_names_removes_diacritics(workflow_chat_client): +from workflow_chat.workflow_chat import AnthropicClient + + +def _make_client(): + # Bypass __init__ to skip anthropic.Anthropic() construction, which the + # unit-tier guard blocks. The helpers under test never touch self.client. + return object.__new__(AnthropicClient) + + +def test_sanitize_job_names_removes_diacritics(): + client = _make_client() yaml_data = { "jobs": { "job1": {"name": "Café München"}, @@ -6,13 +16,14 @@ def test_sanitize_job_names_removes_diacritics(workflow_chat_client): } } - workflow_chat_client.sanitize_job_names(yaml_data) + client.sanitize_job_names(yaml_data) assert yaml_data["jobs"]["job1"]["name"] == "Cafe Munchen" assert yaml_data["jobs"]["job2"]["name"] == "Naive resume" -def test_sanitize_job_names_removes_special_characters(workflow_chat_client): +def test_sanitize_job_names_removes_special_characters(): + client = _make_client() yaml_data = { "jobs": { "job1": {"name": "Job@#$%Name!"}, @@ -20,21 +31,23 @@ def test_sanitize_job_names_removes_special_characters(workflow_chat_client): } } - workflow_chat_client.sanitize_job_names(yaml_data) + client.sanitize_job_names(yaml_data) assert yaml_data["jobs"]["job1"]["name"] == "JobName" assert yaml_data["jobs"]["job2"]["name"] == "ProcessDataWithSymbols" -def test_sanitize_job_names_preserves_allowed_characters(workflow_chat_client): +def test_sanitize_job_names_preserves_allowed_characters(): + client = _make_client() yaml_data = {"jobs": {"job1": {"name": "Valid Job-Name_123"}}} - workflow_chat_client.sanitize_job_names(yaml_data) + client.sanitize_job_names(yaml_data) assert yaml_data["jobs"]["job1"]["name"] == "Valid Job-Name_123" -def test_sanitize_job_names_handles_empty_data(workflow_chat_client): - assert workflow_chat_client.sanitize_job_names(None) is None - assert workflow_chat_client.sanitize_job_names({}) is None - assert workflow_chat_client.sanitize_job_names({"jobs": {}}) is None +def test_sanitize_job_names_handles_empty_data(): + client = _make_client() + assert client.sanitize_job_names(None) is None + assert client.sanitize_job_names({}) is None + assert client.sanitize_job_names({"jobs": {}}) is None From 90c97253345a091083e5137bbb30efc29a42ff6c Mon Sep 17 00:00:00 2001 From: "Hanna Paasivirta (OpenFn)" Date: Tue, 5 May 2026 22:52:16 +0900 Subject: [PATCH 08/16] rename tests --- .../test_extract.py} | 0 .../test_sanitize.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename services/workflow_chat/tests/unit/{test_workflow_chat_client_extract.py => client/test_extract.py} (100%) rename services/workflow_chat/tests/unit/{test_workflow_chat_client_sanitize.py => client/test_sanitize.py} (100%) diff --git a/services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py b/services/workflow_chat/tests/unit/client/test_extract.py similarity index 100% rename from services/workflow_chat/tests/unit/test_workflow_chat_client_extract.py rename to services/workflow_chat/tests/unit/client/test_extract.py diff --git a/services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py b/services/workflow_chat/tests/unit/client/test_sanitize.py similarity index 100% rename from services/workflow_chat/tests/unit/test_workflow_chat_client_sanitize.py rename to services/workflow_chat/tests/unit/client/test_sanitize.py From 013c1a7bf6e813ed7d3497a21e1dca1f06d0cbaa Mon Sep 17 00:00:00 2001 From: "Hanna Paasivirta (OpenFn)" Date: Tue, 5 May 2026 23:32:08 +0900 Subject: [PATCH 09/16] make static sanitisation method --- .../tests/unit/client/__init__.py | 0 .../tests/unit/client/test_extract.py | 12 ++-------- .../tests/unit/client/test_sanitize.py | 22 +++++-------------- services/workflow_chat/workflow_chat.py | 6 +++-- 4 files changed, 12 insertions(+), 28 deletions(-) create mode 100644 services/workflow_chat/tests/unit/client/__init__.py diff --git a/services/workflow_chat/tests/unit/client/__init__.py b/services/workflow_chat/tests/unit/client/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/workflow_chat/tests/unit/client/test_extract.py b/services/workflow_chat/tests/unit/client/test_extract.py index 77e37f87..f5eb41c9 100644 --- a/services/workflow_chat/tests/unit/client/test_extract.py +++ b/services/workflow_chat/tests/unit/client/test_extract.py @@ -1,14 +1,7 @@ from workflow_chat.workflow_chat import AnthropicClient -def _make_client(): - # Bypass __init__ to skip anthropic.Anthropic() construction, which the - # unit-tier guard blocks. The helpers under test never touch self.client. - return object.__new__(AnthropicClient) - - def test_extract_job_codes_preserves_real_code(): - client = _make_client() yaml_data = { "jobs": { "job1": {"body": "console.log('hello world')"}, @@ -16,7 +9,7 @@ def test_extract_job_codes_preserves_real_code(): } } - preserved_values, _ = client.extract_and_preserve_components(yaml_data) + preserved_values, _ = AnthropicClient.extract_and_preserve_components(yaml_data) assert preserved_values == { "__CODE_BLOCK_job1__": "console.log('hello world')", @@ -25,7 +18,6 @@ def test_extract_job_codes_preserves_real_code(): def test_extract_job_codes_ignores_default_placeholder(): - client = _make_client() yaml_data = { "jobs": { "job1": {"body": "// Add operations here"}, @@ -34,6 +26,6 @@ def test_extract_job_codes_ignores_default_placeholder(): } } - preserved_values, _ = client.extract_and_preserve_components(yaml_data) + preserved_values, _ = AnthropicClient.extract_and_preserve_components(yaml_data) assert preserved_values == {"__CODE_BLOCK_job2__": "real code here"} diff --git a/services/workflow_chat/tests/unit/client/test_sanitize.py b/services/workflow_chat/tests/unit/client/test_sanitize.py index 43becd1e..3bfd30a9 100644 --- a/services/workflow_chat/tests/unit/client/test_sanitize.py +++ b/services/workflow_chat/tests/unit/client/test_sanitize.py @@ -1,14 +1,7 @@ from workflow_chat.workflow_chat import AnthropicClient -def _make_client(): - # Bypass __init__ to skip anthropic.Anthropic() construction, which the - # unit-tier guard blocks. The helpers under test never touch self.client. - return object.__new__(AnthropicClient) - - def test_sanitize_job_names_removes_diacritics(): - client = _make_client() yaml_data = { "jobs": { "job1": {"name": "Café München"}, @@ -16,14 +9,13 @@ def test_sanitize_job_names_removes_diacritics(): } } - client.sanitize_job_names(yaml_data) + AnthropicClient.sanitize_job_names(yaml_data) assert yaml_data["jobs"]["job1"]["name"] == "Cafe Munchen" assert yaml_data["jobs"]["job2"]["name"] == "Naive resume" def test_sanitize_job_names_removes_special_characters(): - client = _make_client() yaml_data = { "jobs": { "job1": {"name": "Job@#$%Name!"}, @@ -31,23 +23,21 @@ def test_sanitize_job_names_removes_special_characters(): } } - client.sanitize_job_names(yaml_data) + AnthropicClient.sanitize_job_names(yaml_data) assert yaml_data["jobs"]["job1"]["name"] == "JobName" assert yaml_data["jobs"]["job2"]["name"] == "ProcessDataWithSymbols" def test_sanitize_job_names_preserves_allowed_characters(): - client = _make_client() yaml_data = {"jobs": {"job1": {"name": "Valid Job-Name_123"}}} - client.sanitize_job_names(yaml_data) + AnthropicClient.sanitize_job_names(yaml_data) assert yaml_data["jobs"]["job1"]["name"] == "Valid Job-Name_123" def test_sanitize_job_names_handles_empty_data(): - client = _make_client() - assert client.sanitize_job_names(None) is None - assert client.sanitize_job_names({}) is None - assert client.sanitize_job_names({"jobs": {}}) is None + assert AnthropicClient.sanitize_job_names(None) is None + assert AnthropicClient.sanitize_job_names({}) is None + assert AnthropicClient.sanitize_job_names({"jobs": {}}) is None diff --git a/services/workflow_chat/workflow_chat.py b/services/workflow_chat/workflow_chat.py index 95e12657..7abef569 100644 --- a/services/workflow_chat/workflow_chat.py +++ b/services/workflow_chat/workflow_chat.py @@ -313,7 +313,8 @@ def remove_ids(obj): logger.warning(f"Could not remove IDs from YAML: {e}") return yaml_str - def sanitize_job_names(self, yaml_data): + @staticmethod + def sanitize_job_names(yaml_data): """ Sanitize job names by removing special characters and normalizing diacritics. Also sanitizes job references in edges (source and target fields) and edge keys. @@ -440,7 +441,8 @@ def validate_adaptors(self, yaml_data): except Exception as e: logger.error(f"validate_adaptors encountered an error: {e}") - def extract_and_preserve_components(self, yaml_data): + @staticmethod + def extract_and_preserve_components(yaml_data): """ Extract both codes and IDs from all components. Returns: (preserved_values, processed_yaml_string) From d03f0d893fb2f3a5258dcb16a5b9ea2bcff067e4 Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Thu, 7 May 2026 10:16:54 +0100 Subject: [PATCH 10/16] remove prompt stuff --- agent-team-architecture-plan/1-unit-tests.md | 198 --------- .../2-service-tests.md | 416 ------------------ .../3-integration-tests.md | 416 ------------------ .../4-acceptance-tests.md | 377 ---------------- agent-team-architecture-plan/5-overview.md | 205 --------- 5 files changed, 1612 deletions(-) delete mode 100644 agent-team-architecture-plan/1-unit-tests.md delete mode 100644 agent-team-architecture-plan/2-service-tests.md delete mode 100644 agent-team-architecture-plan/3-integration-tests.md delete mode 100644 agent-team-architecture-plan/4-acceptance-tests.md delete mode 100644 agent-team-architecture-plan/5-overview.md diff --git a/agent-team-architecture-plan/1-unit-tests.md b/agent-team-architecture-plan/1-unit-tests.md deleted file mode 100644 index 903dcd9b..00000000 --- a/agent-team-architecture-plan/1-unit-tests.md +++ /dev/null @@ -1,198 +0,0 @@ -# Apollo Testing Architecture — Layer 1: Unit Tests - -**Scope:** `services/global_chat/`, `services/workflow_chat/`, `services/job_chat/`, their tools, and any future sub-agent / tool service. - ---- - -## 1. What qualifies as a unit test in Apollo - -A test is a unit test iff ALL of: - -1. Exercises **exactly one function, method, or pure class behaviour**. -2. **Zero I/O** — no network (Anthropic, Pinecone, Langfuse export, Sentry), no subprocess, no DB, no file writes outside `tmp_path`. -3. **Zero LLM calls**, not even through a mock HTTP client. If the test needs a mocked Anthropic response to make sense, it's a service test. -4. **Deterministic and fast** — target <50 ms per test; whole unit suite <15 s. -5. **Free** — no API keys required. - -Tests that don't meet all five get moved to service or integration — see §7. - ---- - -## 2. Directory and file layout - -Per-service: one flat `tests/` folder holding all four tiers. This tier owns `*_unit.py` filenames. - -``` -services//tests/ - __init__.py - conftest.py # thin — auto-applies tier marker by filename suffix and folder name - test__unit.py # ← this tier - test__service.py # tier 2 - test__integration.py # tier 3 - acceptance/ # tier 4 — markdown specs - *.md - fixtures/ # optional; static JSON/YAML sample inputs (per-service only) -``` - -Cross-service util tests (`services/util.py` — `AdaptorSpecifier`, `DictObj`, `sum_usage`, `add_page_prefix`) live under `services/tests/test_util_unit.py` — a shared test directory at the services root, since these helpers don't relate to a specific chat service. - -**Why suffix + auto-marker, not subdirs?** While each service has <20 test files, subdirs add bureaucracy for no gain. Tier marker is applied by filename suffix in the per-service conftest. If a service grows past ~30 test files, promoting to subdirs is a mechanical rename. - ---- - -## 3. Shared helper package: `apollo/testing/` - -Lives at the apollo root as a peer of `services/` and `platform/` — it's test infrastructure, not a service. - -``` -apollo/testing/ - __init__.py - anthropic_mock.py # owned by service tier; unit tier does not import - fixtures.py # pytest fixtures: sample payloads, fake api keys, yaml-assertion helpers - server.py # owned by integration tier; unit tier does not import - judge.py # owned by acceptance tier; unit tier does not import - fixtures/ # sample YAML / JSON shared across services - workflows/*.yaml - histories/*.json -``` - -Unit-tier import rules: - -- ✅ Import from `testing.fixtures` — pytest fixtures, YAML assertion helpers, payload builders, fixture loaders. -- ❌ Do not import from `testing.anthropic_mock` — that's the service tier's territory. -- ❌ Do not import from `testing.server` or `testing.judge` — integration / acceptance tiers only. - -`testing/fixtures.py` owns (same module, flat — split into files only when this one grows past ~500 lines): - -- Pytest fixtures (`sample_workflow_yaml`, `sample__chat_payload`, `fake_api_key`). -- YAML assertion helpers migrated from the currently-duplicated `services/global_chat/tests/test_utils.py` and `services/workflow_chat/tests/test_utils.py` (`path_matches`, `assert_yaml_equal_except`, `assert_yaml_section_contains_all`, `assert_yaml_has_ids`, `assert_yaml_jobs_have_body`, `assert_no_special_chars`). -- Payload builders (`make__chat_payload`) — shared with integration tier. -- Fixture loaders (`load_fixture_json`, `load_fixture_yaml`). -- `set_unit_test_env()` — dummy keys, disable langfuse/sentry. - -These are pure functions, so they themselves deserve unit tests in `services/tests/test_fixtures_unit.py`. - ---- - -## 4. Test runner and configuration - -pytest. Add to `pyproject.toml`: - -```toml -[tool.pytest.ini_options] -minversion = "8.0" -pythonpath = ["services", "."] -testpaths = ["services"] -python_files = ["test_*.py"] -markers = [ - "unit: fast, isolated, no I/O, no LLM, no network", - "service: main() with mocked anthropic HTTP client", - "integration: real LLM via bun server", - "acceptance: LLM-judged quality/voice tests", -] -addopts = ["-ra"] -``` - -`pythonpath = ["services", "."]` kills the `sys.path.insert(...)` boilerplate (the `"services"` entry lets you write `from global_chat.global_chat import main`; the `"."` entry lets you write `from testing.fixtures import ...`). - -Commands: - -```bash -poetry run pytest -m unit -poetry run pytest services/global_chat/tests -m unit -poetry run pytest services/workflow_chat/tests/test_workflow_chat_functions_unit.py -``` - -TypeScript platform tests continue to use `bun:test`. Unchanged. - ---- - -## 5. Conftest strategy - -Two levels: - -### 5.1 Root: `apollo/conftest.py` - -- Calls `testing.fixtures.set_unit_test_env()` at import time. -- Exposes `pytest_plugins = ["testing.fixtures"]` so shared fixtures are globally available. - -### 5.2 Per-service: `services//tests/conftest.py` - -- Service-specific fixtures (e.g. a `global_chat_router_decision` factory). Usually <30 lines. -- Auto-applies the tier marker by filename suffix: - - ```python - def pytest_collection_modifyitems(config, items): - for item in items: - name = item.fspath.basename - if name.endswith("_unit.py"): - item.add_marker(pytest.mark.unit) - elif name.endswith("_service.py"): - item.add_marker(pytest.mark.service) - ``` - -Same loop can live in the root `apollo/conftest.py` instead — one copy rather than per-service. Minor detail for implementation. - ---- - -## 6. CI integration - -Unit and service both run in `.github/workflows/tests.yaml` under one job with `-m "unit or service"`. Triggers on every push / PR. No secrets. 10-minute timeout. See overview §6. - -Coverage: generate `--cov=services --cov-report=xml` and upload as artifact for visibility; no threshold gate. - ---- - -## 7. Migration path for existing tests - -- `services/workflow_chat/tests/test_functions.py` — all eight tests are already unit-shaped. Rename to `test_workflow_chat_functions_unit.py`, delete `sys.path.insert(...)`, delete the local `client` fixture entirely (unit tests don't need an Anthropic client per §1's "zero LLM calls" rule). No assertion changes. -- `services/job_chat/tests/test_functions.py` — misclassified. `test_generate_system_message_loads_adaptor_docs_when_missing` hits Postgres → integration. `test_generate_queries_returns_valid_structure` hits real Anthropic → service (with mocked client) or integration. `test_search_docs_returns_general_docs_only` hits Pinecone → integration. A new `test_prompt_unit.py` covers the pure helpers (`build_prompt`, `build_error_correction_prompt`, `extract_page_prefix_from_last_turn`). -- `services/global_chat/tests/test_utils.py` + `services/workflow_chat/tests/test_utils.py` — YAML helpers migrate to `testing/fixtures.py`. The `call__service` subprocess helpers are replaced by the integration tier's `ApolloClient`. Old files are deleted after all callers are updated. -- `*_pass_fail.py`, `*_qualitative.py`, `*_langfuse_tracing.py`, `*_adaptor_version_passthrough.py`, `*_planner_*.py`, `*_good_morning_*.py` — owned by service/integration/acceptance tiers. - -**Migration order:** - -1. Create `testing/` skeleton + root `apollo/conftest.py` (empty fixtures — just env guard + network block). -2. Add `[tool.pytest.ini_options]` to pyproject. -3. Migrate `workflow_chat/tests/test_functions.py` (smallest, cleanest — the worked example). -4. Wire `.github/workflows/tests.yaml` with `-m unit` initially. -5. Only then start adding net-new unit tests for `yaml_utils.py`, `config_loader.py`, `tool_definitions.py`, etc. - ---- - -## 8. Extensibility — new sub-agent or tool - -A new service `services/my_new_agent/`: - -1. Create `services/my_new_agent/tests/` with a conftest (copy the auto-marker loop). -2. Add `test__unit.py` files. - -A new tool in `services//tools/`: - -1. Add `services//tests/test__unit.py`. Tool schema dicts are ideal targets — assert shape, required keys, JSON-schema validity. - -No pyproject, CI, or shared-package edits required. - ---- - -## 9. Boundaries with other tiers - -- **Unit stops / service begins:** the moment a test needs `main()` to run, or needs to verify "logs appended to prompt" / "api key ended up in headers" / "router picked workflow_agent when given X", it's a service test. -- **Unit stops / integration begins:** if a test needs a live Anthropic response (even for shape), it's integration. If it needs the bun server, it's integration. -- **Unit stops / acceptance begins:** no overlap — acceptance is about answer quality, unit is about code correctness. - ---- - -## 10. Things deliberately NOT done - -- No per-tier subdirectories per service. -- No separate `env.py` / `loaders.py` / `yaml_assertions.py` modules in `testing/` — one `fixtures.py` until a concrete split is warranted. -- No coverage gates. -- No property-based testing (Hypothesis) in the initial plan — add a `strategies.py` module when the need arises. -- No pytest plugin dev dep beyond what's already in poetry.lock. - ---- - -## Summary - -Unit layer is small, boring, and fast. One flat `tests/` folder per service with filename-suffixed tiers, one `testing/` package with three files, dummy env vars in the root conftest, CI on every push with no secrets. Adding a unit test for a new function is a one-line import and a `def test_...`. diff --git a/agent-team-architecture-plan/2-service-tests.md b/agent-team-architecture-plan/2-service-tests.md deleted file mode 100644 index e79f584a..00000000 --- a/agent-team-architecture-plan/2-service-tests.md +++ /dev/null @@ -1,416 +0,0 @@ -# Section 2 — Service Tests Architecture - -> Scope: `services/global_chat/`, `services/workflow_chat/`, `services/job_chat/`, and the tools / sub-agents they invoke. - ---- - -## 1. Naming and position - -**Service tests** — direct calls to `main()` with Anthropic HTTP calls mocked. One rung above unit, one below integration. - -| Tier | Scope | LLM calls | HTTP layer | Cost | Runs on PR push | -|------------|-------------------|-----------|------------|------|-----------------| -| unit | single function | no | no | free | yes | -| **service** | **`main()` end-to-end** | **mocked** | **no (direct python)** | **free** | **yes** | -| integration | `main()` via server | real | yes | $$ | manual / label | -| acceptance | behaviour spec | real | yes | $$$ | nightly / manual | - -Service tests verify *logic and information flow*: payload validation, routing, prompt assembly, tool-call orchestration, sub-agent invocation, history/usage aggregation, error paths, headers, api_key passthrough. They do **not** verify model quality. - ---- - -## 2. The `test_hooks` second argument - -### 2.1 Signature change - -```python -def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: ... -``` - -`entry.py` keeps calling `m.main(data)` with one positional arg — the HTTP path never sees `test_hooks`. `test_hooks` is a test-only affordance. - -### 2.2 The `test_hooks` dict — minimum viable shape - -Plain Python `dict`. No `TypedDict`, no pydantic model — just a dict with documented keys. The recognised keys are documented as a docstring in `testing/anthropic_mock.py`: - -```python -# testing/anthropic_mock.py -""" -The `test_hooks` dict accepts (all optional; all default to absent): - -- "anthropic_http_client": an httpx.Client backed by httpx.MockTransport. - When present, threaded into every Anthropic(...) constructor site. -- "tool_calls": a list[dict] the test allocates. Production code appends - breadcrumbs via record_tool_call(test_hooks, entry). -- "tool_stubs": dict[str, Callable] keyed by tool name. When the planner - dispatches a tool, if a stub exists for that name, the stub is called - with the tool input and its return value used as the tool result. Today - only used for "search_documentation" — see §5. -""" -``` - -Start with three keys. Add more only when a concrete test can't be written without one. Things intentionally left out until needed: - -- Sub-agent stub registry. Default behaviour: the sub-agent's `main()` runs under the same mock HTTP client — that's usually what a test wants. A stub registry (`test_hooks["subagent_stubs"]`) can be added if a test needs to bypass the sub-agent's logic entirely. -- `seed`, `disable_langfuse`, `scratch`. Add when a test fails without them. - -### 2.3 Threading `test_hooks` through - -Each chat service's `main()` passes `test_hooks` into the agent / client constructors it creates. Everywhere that currently calls `Anthropic(api_key=...)` swaps to `build_anthropic_client(api_key, test_hooks)` (new factory — see §3). - -Sites that change: - -- `services/job_chat/job_chat.py` — `AnthropicClient.__init__`. -- `services/workflow_chat/workflow_chat.py` — `AnthropicClient.__init__`. -- `services/global_chat/router.py` — `RouterAgent.__init__`. -- `services/global_chat/planner.py` — `PlannerAgent.__init__`. -- `services/global_chat/subagent_caller.py` — accepts `test_hooks` and forwards to sub-agent `main()` calls. - -Production behaviour when `test_hooks is None` is byte-identical to today. Every new kwarg defaults to `None`. - ---- - -## 3. Mock Anthropic HTTP client - -### 3.1 Factory in `services/util.py` - -```python -def build_anthropic_client(api_key: str, test_hooks: Optional[dict] = None) -> Anthropic: - http_client = (test_hooks or {}).get("anthropic_http_client") - kwargs = {"api_key": api_key} - if http_client is not None: - kwargs["http_client"] = http_client - return Anthropic(**kwargs) -``` - -Every `AnthropicClient` / `RouterAgent` / `PlannerAgent` constructor swaps `Anthropic(api_key=...)` for `build_anthropic_client(api_key, test_hooks)`. - -### 3.2 `MockAnthropic` in `testing/anthropic_mock.py` - -`MockAnthropic` is a thin wrapper over `httpx.MockTransport`. Tests register regex → response pairs; on each Anthropic request the mock matches the latest user message text against registered patterns and returns the first match. No new runtime dep — `httpx.MockTransport` is built into httpx, which is already in `poetry.lock`. - -```python -class MockAnthropic: - """Mock Anthropic API backed by httpx.MockTransport. - - Tests register regex → response pairs. Each request is matched against - the latest user message text (including tool_result content); the first - matching pattern wins. No match raises AssertionError. - - Usage: - mock = MockAnthropic() - mock.set_response(r"haiku", "sure, here's a haiku") - mock.set_response(r"create workflow", tool_use("call_workflow_agent", {...})) - - test_hooks = test_hooks_factory(anthropic=mock) - main(payload, test_hooks) - - assert mock.last_request.headers["x-api-key"] == "sk-test" - """ - - def __init__(self): - self._responses: list[tuple[re.Pattern, str | list[dict]]] = [] - self.requests: list[httpx.Request] = [] - - def set_response(self, pattern: str, response: str | list[dict]) -> None: - """Register a response for any request whose latest user message - text matches `pattern`. `response` is either: - - str: returned as a single text content block. - - list[dict]: returned as content blocks (use for tool_use, mixed). - """ - self._responses.append((re.compile(pattern), response)) - - @property - def httpx_client(self) -> httpx.Client: - return httpx.Client(transport=httpx.MockTransport(self._handle)) - - @property - def last_request(self) -> httpx.Request: - return self.requests[-1] - - def _handle(self, request: httpx.Request) -> httpx.Response: - self.requests.append(request) - body = json.loads(request.content) - user_text = _latest_user_text(body.get("messages", [])) - for pattern, resp in self._responses: - if pattern.search(user_text): - return httpx.Response(200, json=_build_message_body(resp)) - raise AssertionError( - f"MockAnthropic: no pattern matched user message {user_text!r}. " - f"Registered patterns: {[p.pattern for p, _ in self._responses]}" - ) - - -def tool_use(name: str, input: dict, id: str = "toolu_test") -> list[dict]: - """Build a single tool_use content block for `set_response`.""" - return [{"type": "tool_use", "id": id, "name": name, "input": input}] -``` - -Two private helpers in the same file: - -- `_latest_user_text(messages)` walks `messages` in reverse, takes the last `role=user` message, and concatenates its `text` blocks plus any `tool_result` content. Matching against `tool_result` text is what makes the planner's internal loop work — see §3.3. -- `_build_message_body(response)` wraps a string in `[{"type": "text", "text": ...}]` (or passes a list through), then assembles the standard Anthropic message envelope: `id`, `type`, `role: "assistant"`, `model`, `content`, `stop_reason`, `stop_sequence`, `usage`. - -Design choices worth knowing: - -- **First-match-wins.** Order specific patterns before general ones. -- **Loud no-match.** Raises `AssertionError` with the unmatched text and the list of registered patterns. Beats silent fallbacks that mask test drift. -- **Captured requests.** `mock.requests` (list) and `mock.last_request` for assertions on outbound headers, body shape, system prompt, `cache_control`, beta headers, history round-trip. - -Tool-call breadcrumbs (`record_tool_call`) live in the same file — see §4. - -### 3.3 The planner's internal loop - -`main()` is one user turn, but the planner agent internally calls Anthropic multiple times within that turn — once per round of the tool-use loop (`call → tool_use → run tool → call with tool_result → ...` until `end_turn`). The regex matcher handles this naturally because each round has different content in the latest user message: - -```python -def test_planner_calls_workflow_then_job_agents(test_hooks_factory): - mock = MockAnthropic() - mock.set_response(r"create a workflow", - tool_use("call_workflow_agent", {"message": "create workflow"})) - mock.set_response(r"workflow created", - tool_use("call_job_code_agent", {"message": "code", "job_key": "fetch"})) - mock.set_response(r"code generated", "all done") - - test_hooks = test_hooks_factory(anthropic=mock) - main(make_global_chat_payload("create a workflow"), test_hooks) - - assert [c["tool"] for c in test_hooks["tool_calls"]] == [ - "call_workflow_agent", "call_job_code_agent", - ] -``` - -- Round 1: latest user message = the user's prompt → matches `r"create a workflow"`, returns the first `tool_use`. -- Round 2: latest user message = the `tool_result` from `call_workflow_agent` → matches `r"workflow created"`, returns the next `tool_use`. -- Round 3: latest user message = the `tool_result` from `call_job_code_agent` → matches `r"code generated"`, returns the final text. - -When a sub-agent (`workflow_chat`, `job_chat`) gets invoked inside this loop, its own `main()` runs under the *same* mock client. Tests register regexes covering both the parent's and the child's expected user messages on one mock — no sub-agent stub registry needed. - -### 3.4 Streaming (deferred) - -Streaming endpoints (`/services//stream`, `/services/` WS) emit Anthropic-formatted SSE events through `bridge.ts` (`StreamManager` in `services/streaming_util.py`). Detailed mock support — likely a `set_stream_response(pattern, events)` method or a sibling `MockAnthropicStream` class — is out of scope for this section. Defer until the first service test for a streaming code path actually needs it; meanwhile the integration tier covers stream behaviour against the real bun server. - ---- - -## 4. Tool-call breadcrumbs (`test_hooks["tool_calls"]`) - -`test_hooks["tool_calls"]` is a list the test allocates and production code appends to. One helper in `testing/anthropic_mock.py`: - -```python -def record_tool_call(test_hooks: Optional[dict], entry: dict) -> None: - if test_hooks is None: - return - crumbs = test_hooks.get("tool_calls") - if crumbs is not None: - crumbs.append(entry) -``` - -Dispatch sites (`planner._execute_tool`, `router.route_and_execute`) call `record_tool_call(test_hooks, {"tool": ..., "input": ...})`. Two dict lookups per call when `test_hooks is None` — negligible. - -Tests read: - -```python -assert [c["tool"] for c in test_hooks["tool_calls"]] == ["router_decision", "call_workflow_agent"] -``` - ---- - -## 5. Tool stubs (`test_hooks["tool_stubs"]`) - -Most planner tools don't need stubbing. `call_workflow_agent` and `call_job_code_agent` inherit `test_hooks` and run the sub-agent's own mocked `main()`. `inspect_job_code` is pure local code with no network. The one tool that does need stubbing is **`search_documentation`** — without a stub it would hit Pinecone (vector store) and OpenAI (embeddings) on every service test. - -Production change in `services/global_chat/planner.py::_execute_tool` — one if/else at the top of the dispatch: - -```python -stub = (self._test_hooks or {}).get("tool_stubs", {}).get(tool_use_block.name) -if stub is not None: - tool_result = stub(tool_use_block.input) -else: - # original dispatch by name follows - ... -``` - -Test usage: - -```python -test_hooks = { - "anthropic_http_client": mock.httpx_client, - "tool_calls": [], - "tool_stubs": { - "search_documentation": lambda tool_input: "Cron triggers run on a schedule...", - }, -} -result = main(payload, test_hooks) -``` - -The stub returns whatever shape the real tool returns (here a string — the planner feeds it back into the next Anthropic call). - -A `build_search_documentation_stub(docs=[...])` helper in `testing/anthropic_mock.py` can emerge once a second test reuses the same shape — not preemptively. - ---- - -## 6. Directory layout - -``` -services//tests/ - __init__.py - conftest.py # re-exports shared fixtures; auto-marks by filename suffix - test__unit.py # tier 1 (unit-tests-architect) - test__service.py # tier 2 (this tier) - fixtures/ # per-service fixture data (optional) -``` - -Test filenames this tier will add (illustrative, not exhaustive): - -- `services/global_chat/tests/test_router_service.py` — router decisions by intent. -- `services/global_chat/tests/test_planner_service.py` — tool dispatch order, test_hooks propagation. -- `services/global_chat/tests/test_subagent_passthrough_service.py` — global → workflow / job wiring. -- `services/workflow_chat/tests/test_workflow_chat_service.py` — YAML extraction, retry loop, streaming events. -- `services/job_chat/tests/test_job_chat_service.py` — RAG injection (with stubbed retriever), suggest-code response shape, page-prefix detection, error-correction loop. - -Cross-service end-to-end flow tests (planner chain over mocks) also live under `services/global_chat/tests/` since `global_chat` owns the planner. - ---- - -## 7. Shared helpers in `testing/` - -``` -testing/ - __init__.py - anthropic_mock.py # MockAnthropic + tool_use helper, test_hooks-keys docstring, record_tool_call - fixtures.py # pytest fixtures + YAML assertion helpers + payload builders + loaders - fixtures/ - workflows/*.yaml - histories/*.json -``` - -`fixtures.py` is the flat home for: - -- `make_global_chat_payload`, `make_workflow_chat_payload`, `make_job_chat_payload`. -- `get_workflow_yaml_attachment`, `get_suggested_code_attachment`, `get_usage`. -- `assert_yaml_has_ids`, `assert_yaml_jobs_have_body`, `assert_yaml_equal_except`, `path_matches`, `assert_no_special_chars`. -- Pytest fixtures: `mock_anthropic`, `test_hooks_factory`, `fake_api_key`, `sample_workflow_yaml`. -- `set_unit_test_env` (dummy keys, disable langfuse/sentry). -- `load_fixture_json`, `load_fixture_yaml`. - -One file until it gets unwieldy (~500 lines). Split then, not pre-emptively. - -Key fixture: - -```python -@pytest.fixture -def test_hooks_factory(): - def _factory(*, anthropic=None, **overrides): - opts = {"tool_calls": []} - if anthropic is not None: - opts["anthropic_http_client"] = anthropic.httpx_client - opts.update(overrides) - return opts - return _factory -``` - -Per-service `conftest.py` just exposes `pytest_plugins = ["testing.fixtures"]` (inherited from root) plus any per-service niche fixtures. - ---- - -## 8. Pytest configuration - -Owned initially by this tier in PR #1 (bootstrap). See overview §5 for the full block. Relevant keys: - -```toml -[tool.pytest.ini_options] -pythonpath = ["services", "."] -testpaths = ["services"] -python_files = ["test_*.py"] -markers = [ - "unit: ...", - "service: main() tests with mocked LLM HTTP client", - "integration: ...", - "acceptance: ...", -] -addopts = ["-ra"] -``` - -Markers applied by filename suffix in the root `apollo/conftest.py` — authors don't decorate manually. - ---- - -## 9. CI integration - -Service runs in the same `tests.yaml` workflow as unit, via `pytest -m "unit or service"`. No secrets on this job (invariant). See overview §6. - ---- - -## 10. Migration recipe for existing `pass_fail` tests - -1. **Classify the assertion.** Content-sensitive (`"response mentions Salesforce"`) → integration or acceptance. Structural (`"workflow_yaml has 2 jobs"`) → service (with a canned mock producing that structure). -2. **Replace the call site.** Swap `subprocess.run([..., "entry.py", ...])` for `from . import main; main(payload, test_hooks)`. -3. **Build the mock.** Register one or more `set_response(pattern, response)` pairs on `MockAnthropic` so each Anthropic call in the path under test gets a matching canned response. -4. **Assert on structure + breadcrumbs.** Replace content asserts with routing / shape asserts; keep content in acceptance. -5. **Delete the old test** once the new one is stable. - -Expect ~50–70% of `pass_fail` tests to become service tests; the rest stay in integration. - ---- - -## 11. Production-code edits (summary) - -| File | Edit | -|------|------| -| `services/global_chat/global_chat.py` | `main(data_dict, test_hooks=None)`; thread through | -| `services/workflow_chat/workflow_chat.py` | same | -| `services/job_chat/job_chat.py` | same | -| `services/global_chat/router.py` | accept `test_hooks` in `__init__`; pass into sub-agent calls | -| `services/global_chat/planner.py` | accept `test_hooks`; use in `_execute_tool`; thread into subagent_caller | -| `services/global_chat/subagent_caller.py` | accept `test_hooks`; pass to sub-agent `main()` | -| `services/util.py` | add `build_anthropic_client()` | - -Everywhere: backward-compatible defaults. `test_hooks is None` ⇒ existing behaviour, byte-for-byte. - ---- - -## 12. Extensibility — new sub-agent or tool - -**New sub-agent:** - -1. `services/my_new_agent/my_new_agent.py` with `def main(data, test_hooks=None)`. -2. Every `Anthropic(...)` site uses `build_anthropic_client(api_key, test_hooks)`. -3. Thread `test_hooks` through internal calls. -4. Add `services/my_new_agent/tests/test_*_service.py`. Conftest auto-inherits. - -**New tool in the planner:** - -1. Append to `services/global_chat/tools/tool_definitions.py`. -2. Dispatch branch in `planner._execute_tool` calls `record_tool_call(test_hooks, ...)`. -3. Write service tests. - -Pattern: **one arg, one call to `record_tool_call`, one test**. No framework changes. - ---- - -## 13. What this tier deliberately does NOT do - -- **No sub-agent stub registry on day one.** Default behaviour (sub-agent runs under shared mock client) is what tests usually want. -- **No `test_hooks["seed"]` / `["disable_langfuse"]` / `["scratch"]`.** Add when a test fails without them. -- **No `pytest-asyncio`, `pytest-randomly`, or other dev deps.** Add when needed. -- **No frozen public API contract between tiers.** Shared helpers live in one package; rename when the signature improves. - ---- - -## 14. What else belongs in this tier - -Good service-test targets: - -- **API key threading** — payload `api_key` ends up in `mock.last_request.headers["x-api-key"]`; absent → env var is used. -- **Cache-control regression** — planner system prompt has `cache_control: {"type": "ephemeral"}`; assert on outbound request body via `json.loads(mock.last_request.content)` (httpx `Request.content` is bytes — there's no `.json` accessor on the request side). -- **Context-management beta** — planner sets `context-management-2025-06-27` header and `context_management` field on every call. -- **History round-trip** — returned `history` equals input + this turn's user/assistant messages. -- **`AdaptorSpecifier` propagation** — payload `context.adaptor = "@openfn/language-http@3.1.11"` shows up in the prompt. -- **Retry loops** — `workflow_chat` retries once on YAML parse failure; script invalid-then-valid and assert count. -- **Negative paths** — missing `content` → `ApolloError(400)`; malformed tool-use response → graceful fallback. - ---- - -## Summary - -`test_hooks` second arg on `main()` + `build_anthropic_client(api_key, test_hooks)` factory + `MockAnthropic` (regex → response pairs, `AssertionError` on no match) with a `tool_use(...)` helper for tool-use content blocks + three `test_hooks` keys (`anthropic_http_client`, `tool_calls`, `tool_stubs` — the last only used for `search_documentation` today). One mock file in `testing/`, filename-suffix markers, shared workflow with the unit tier. Streaming and any sub-agent stub infrastructure are deferred until a real test needs them. diff --git a/agent-team-architecture-plan/3-integration-tests.md b/agent-team-architecture-plan/3-integration-tests.md deleted file mode 100644 index 2cd4c26c..00000000 --- a/agent-team-architecture-plan/3-integration-tests.md +++ /dev/null @@ -1,416 +0,0 @@ -# Section 3 — Integration Tests Architecture - -**Scope:** `services/global_chat/`, `services/workflow_chat/`, `services/job_chat/` and future sub-agents / tools. - ---- - -## 1. Purpose and boundaries - -Integration tests exercise a chat service **end-to-end through the running bun server, with real model calls**. First tier in the pyramid where: - -- Full stack: TypeScript `bridge.ts` → `spawn poetry run python services/entry.py` → `services//.py:main()` → Anthropic / OpenAI / Pinecone network calls. -- Assertions are loose-to-moderate on content (regex, keyword presence, structural shape) and strict on payload shape (required fields, types, SSE event sequence, WS lifecycle). -- Slow (seconds to a minute each) and cost real money. - -**What belongs here:** "does the service return a valid workflow yaml end-to-end over HTTP?", "does `/services/global_chat/stream` emit Anthropic-formatted SSE events in order?", "does WS `start → log* → complete` work for a real LLM run?", "does `test_errors` still map `ApolloError` to the right HTTP status?". - -**What doesn't:** - -| Concern | Correct tier | -|---|---| -| "Is this specific LLM answer good quality?" | acceptance | -| "Is the router prompt formatted right?" | service (mocked Anthropic) | -| "Does `AdaptorSpecifier.parse()` handle shorthand?" | unit | - -**Dividing lines:** - -- Service → integration: assertion depends on LLM-generated content. -- Integration → acceptance: assertion needs a judge LLM to evaluate. - ---- - -## 2. Directory layout - -Integration tests live alongside their service. **No top-level `tests/` tree.** One `*_integration.py` file per service: - -``` -services//tests/ - test__integration.py # sync POST + SSE stream + WS in one file -``` - -Concretely: - -``` -services/global_chat/tests/test_global_chat_integration.py -services/workflow_chat/tests/test_workflow_chat_integration.py -services/job_chat/tests/test_job_chat_integration.py -``` - -**Cross-service / planner-chain end-to-end tests** live under `services/global_chat/tests/test_global_chat_integration.py` because `global_chat` owns the planner that dispatches to `workflow_chat` and `job_chat` over real HTTP. There's no separate "cross-service" home — the orchestrator hosts them. - -**Smoke tests** for the platform itself (healthcheck `GET /`, 404 for unknown service, `test_errors` mapping) live under `services/echo/tests/test_echo_integration.py` since `echo` is the test-pipeline-verification service. - -**One file per service** covers sync POST, SSE streaming, and WS in one place. Split by transport (`test__stream_integration.py`) only if a single file gets unwieldy (>500 lines). - ---- - -## 3. Server lifecycle - -The session-scoped bun server fixture lives in **`testing/server.py`** (the shared helper package), not in a top-level conftest. Registered globally via `pytest_plugins = ["testing.server"]` in the root `apollo/conftest.py`. - -### 3.1 Fixture - -```python -@pytest.fixture(scope="session") -def apollo_server() -> ApolloServerHandle: - ... -``` - -Handle: - -```python -@dataclass -class ApolloServerHandle: - base_url: str # "http://127.0.0.1:" - port: int - log_path: Path -``` - -**Startup:** - -1. If `APOLLO_TEST_BASE_URL` is set, return a handle pointing at that URL. Lets us reuse a deployed staging server or a local `bun dev` during debugging. -2. Otherwise, allocate a port (bind-port-0 trick). -3. `subprocess.Popen(["bun", "run", "start"], env={..., "PORT": str(port)}, cwd=repo_root)`. Use `start` (not `dev`) — no hot-reload noise; matches production. -4. Drain stdout/stderr to `tmp/test-logs//bun.log`. -5. Poll `GET http://127.0.0.1:/` with exponential backoff, 60s total cap. -6. Yield handle. - -**Teardown:** SIGTERM → `wait(timeout=5)` → SIGKILL if needed. Flush log threads. - -### 3.2 Scope - -Session-scoped. Cold-start is ~2–5s; per-file or per-test would double runtime. Bun server is stateless between requests (each invocation spawns a fresh Python subprocess via `bridge.ts`), so sharing is safe. - -No `pytest-xdist` parallelism on day one — Anthropic rate limits per API key. Add `-n auto --dist loadfile` with per-file scope when there's demand + headroom. - ---- - -## 4. Environment and secrets - -| Var | Purpose | Missing (dev) | Missing (CI) | -|---|---|---|---| -| `ANTHROPIC_API_KEY` | All chat services | skip | fail | -| `OPENAI_API_KEY` | embeddings, search_docsite | skip | fail | -| `PINECONE_API_KEY` | search_docsite | skip | fail | -| `POSTGRES_URL` | services using `util.get_db_connection()` | skip | fail | -| `APOLLO_TEST_BASE_URL` | reuse deployed server | spawn | spawn | -| `LANGFUSE_TRACING` | flip Langfuse export on/off without changing credentials | unset → off | unset → off | - -`require_env(*names)` helper in `testing/server.py`: - -```python -def require_env(*names: str) -> None: - missing = [n for n in names if not os.environ.get(n)] - if not missing: - return - if os.environ.get("CI") == "true": - pytest.fail(f"Missing env vars: {missing}") - pytest.skip(f"Missing env vars: {missing}", allow_module_level=True) -``` - -Per-module: `require_env("ANTHROPIC_API_KEY")` at module top. - -### 4.1 Secret sourcing in CI - -- Repo secrets: `ANTHROPIC_API_KEY_TEST`, `OPENAI_API_KEY_TEST`, `PINECONE_API_KEY_TEST`, `POSTGRES_URL_TEST`. -- Dedicated test Pinecone namespace; ephemeral test Postgres. Never share the production `apollo-mappings` index / DB. -- Anthropic: separate API key with a low monthly cap, scoped to a test project. - -### 4.2 Langfuse opt-in - -Integration runs *can* trace to Langfuse but don't have to. Mechanism: - -- `LANGFUSE_TRACING=true` → existing `@observe` decorators on production `main()` export traces. -- `LANGFUSE_TRACING=false` (or unset) → `@observe` short-circuits, no export. - -In the CI workflow, leave `LANGFUSE_TRACING` unset by default — integration runs are clean. Add a `workflow_dispatch` input "Trace to Langfuse" (or a separate `run-integration-traced` PR label) to flip it on for the runs where someone wants to inspect traces afterwards. Credentials stay configured either way. - -Integration writes traces only — no scores. Scoring is acceptance-tier territory. - ---- - -## 5. HTTP client - -`ApolloClient` lives in **`testing/server.py`** alongside the server fixture: - -```python -class ApolloClient: - def __init__(self, base_url: str, *, timeout: float = 120.0): ... - - def call(self, service: str, payload: dict) -> dict: ... - # POST /services/. Raises ApolloHTTPError on non-2xx. - - def stream(self, service: str, payload: dict) -> Iterator[SSEEvent]: ... - # POST /services//stream. Yields parsed SSE events. - - def ws(self, service: str, payload: dict) -> Iterator[WSEvent]: ... - # WS /services/. Yields events until "complete". -``` - -Dataclasses `SSEEvent(type, data)` and `WSEvent(event, type, data)` in the same file. - -Dependencies: `httpx` (already in `poetry.lock`) and `websockets`. Install via a new `test-integration` poetry group: - -```toml -[tool.poetry.group.test-integration] -optional = true - -[tool.poetry.group.test-integration.dependencies] -websockets = "^13" -``` - -Installed in CI with `poetry install --with test-integration`. - -Exposed fixture: - -```python -@pytest.fixture -def client(apollo_server) -> ApolloClient: - return ApolloClient(apollo_server.base_url) -``` - -Both integration and acceptance tiers use the same `client` fixture. - ---- - -## 6. Assertion helpers - -All in `testing/server.py` (or a small `testing/helpers.py` if it crowds the file): - -- `collect_until_complete(stream, *, timeout=120)` → `(list[SSEEvent], final_payload)`. -- `assert_event_sequence(events, expected_types, *, strict=False)` — subsequence match by default. -- `accumulate_text_deltas(events)` — concatenate `content_block_delta` text. -- `assert_response_shape(response, required)` — `{"response": str, "usage": dict, "history": list}` style. -- `assert_response_contains(response, *regexes, field="response", flags=re.IGNORECASE)`. - -YAML helpers (`assert_yaml_has_ids`, etc.) come from `testing/fixtures.py` — one canonical location. - ---- - -## 7. Pytest configuration - -Markers declared in `pyproject.toml [tool.pytest.ini_options].markers` — see overview §5 and service-tests plan §7. - -Marker auto-applied by filename suffix `_integration.py` in the per-service or root conftest: - -```python -def pytest_collection_modifyitems(config, items): - for item in items: - name = item.fspath.basename - if name.endswith("_integration.py"): - item.add_marker(pytest.mark.integration) -``` - -Authors don't write `@pytest.mark.integration`. Filename suffix is sufficient. - ---- - -## 8. Opt-in commands - -```bash -# Run the whole integration tier -poetry run pytest -m integration - -# Run one service's -poetry run pytest services/global_chat/tests -m integration - -# Smoke only -poetry run pytest services/echo/tests - -# Against a deployed server -APOLLO_TEST_BASE_URL=https://apollo-staging.openfn.org poetry run pytest -m integration - -# With Langfuse tracing on -LANGFUSE_TRACING=true poetry run pytest -m integration -``` - ---- - -## 9. CI integration - -Integration and acceptance share one workflow: `.github/workflows/llm-tests.yaml`. Integration job runs on PR label `run-integration`, push to `main`, or `workflow_dispatch`. Acceptance job runs on PR label `run-acceptance` or `workflow_dispatch` only — never cron. - -```yaml -name: llm-tests -on: - pull_request: - types: [labeled] - push: - branches: [main] - workflow_dispatch: - inputs: - trace_langfuse: - type: boolean - default: false - description: "Export traces to Langfuse" - -concurrency: - group: llm-${{ github.ref }} - cancel-in-progress: true - -jobs: - integration: - if: >- - (github.event_name == 'pull_request' && github.event.label.name == 'run-integration') - || github.event_name == 'push' - || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - timeout-minutes: 20 - env: - CI: "true" - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_TEST }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY_TEST }} - PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY_TEST }} - POSTGRES_URL: ${{ secrets.POSTGRES_URL_TEST }} - LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY_TEST }} - LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY_TEST }} - LANGFUSE_BASE_URL: ${{ secrets.LANGFUSE_BASE_URL }} - LANGFUSE_TRACING: ${{ inputs.trace_langfuse == true && 'true' || 'false' }} - steps: - - uses: actions/checkout@v4 - - uses: oven-sh/setup-bun@v2 - - uses: actions/setup-python@v5 - with: { python-version: "3.11" } - - uses: snok/install-poetry@v1 - - run: bun install - - run: poetry install --with test-integration - - run: poetry run pytest -m integration -v --maxfail=5 - - uses: actions/upload-artifact@v4 - if: always() - with: - name: integration-logs-${{ github.run_id }} - path: tmp/test-logs/ - retention-days: 14 - - acceptance: - # see 4-acceptance-tests.md §6 - ... -``` - -`--maxfail=5` so a broken prompt doesn't burn the budget running every test. No `--timeout` plugin on day one. - ---- - -## 10. Cost, retries, flakiness — not on day one - -Deliberately deferred until a concrete bill or flake makes them necessary: - -- No `cost_tracker.py`, `pricing.py`, budget env var, circuit breaker. -- No `pytest-rerunfailures` — flakes are signal. -- No `pytest-timeout` — `--maxfail=5` + 20-min workflow timeout is enough defence. -- No per-test `@pytest.mark.budget(usd=...)`. -- No `slow`, `flaky_model`, `needs_pinecone` markers — add individually when first needed. - -Each of these has a real cost (learning curve, config churn, false alarms). Add one the first time its absence bites — not preemptively. - ---- - -## 11. Streaming and WebSocket testing - -Three transports need coverage. All three live in the same `test__integration.py` file per service. - -### Sync POST - -```python -def test_workflow_chat_cron_trigger(client, sample_workflow_yaml): - payload = make_workflow_chat_payload( - existing_yaml=sample_workflow_yaml, - content="Change the trigger to every day at midnight", - ) - r = client.call("workflow_chat", payload) - assert_response_shape(r, {"response": str, "response_yaml": str, "usage": dict}) - assert_yaml_has_ids(r["response_yaml"]) - yml = yaml.safe_load(r["response_yaml"]) - assert "cron" in yml["triggers"] -``` - -### SSE - -```python -def test_workflow_chat_streams_events(client): - payload = make_workflow_chat_payload(content="Generate a basic workflow", stream=True) - events, final = collect_until_complete(client.stream("workflow_chat", payload)) - assert_event_sequence(events, [ - "message_start", "content_block_start", "content_block_delta", - "content_block_stop", "message_delta", "message_stop", - ]) - assert final is not None - assert final["response"] == accumulate_text_deltas(events) -``` - -### WS - -```python -def test_workflow_chat_ws_lifecycle(client): - payload = make_workflow_chat_payload(content="Generate a basic workflow") - events = list(client.ws("workflow_chat", payload)) - start = [e for e in events if e.event == "start"] - complete = [e for e in events if e.event == "complete"] - assert len(start) == 1 and len(complete) == 1 -``` - -### Error paths - -Lives in the smoke tests for the test_errors service: - -```python -# services/echo/tests/test_echo_integration.py -def test_stream_emits_error_event(client): - events = list(client.stream("test_errors", {"trigger": "RATE_LIMIT"})) - err = next(e for e in events if e.type == "error") - assert err.data["code"] == 429 -``` - ---- - -## 12. Migration of existing tests - -| Existing file | Destination | -|---|---| -| `services/*/tests/test_functions.py` (pure) | unit tier | -| `services/*/tests/test_pass_fail*.py` using `subprocess.run(entry.py)` | `services//tests/test__integration.py` — swap subprocess for `client.call()` | -| `services/*/tests/test_pass_fail*.py` using mocked sub-agents (`test_adaptor_version_passthrough.py`, `test_planner_subagent_clarification.py`) | service tier (mocked LLM) | -| `services/*/tests/test_qualitative.py` | mostly acceptance; a few with machine-checkable shape → integration | -| `services/*/tests/test_langfuse_tracing.py` | `services//tests/test__integration.py` — swap subprocess for `client.call()` | - -Migration strategy: one service at a time, smallest first (workflow_chat). Keep old file for one PR with `pytest.mark.skip("migrated")`, delete in follow-up. - ---- - -## 13. Extensibility - -A new sub-agent or tool is a ~10-minute add: - -1. Create `services//tests/test__integration.py`. -2. Use `client.call() / .stream() / .ws()` — no middleware changes; `describe-modules.ts` auto-mounts. -3. Add `make__payload` to `testing/fixtures.py` if needed. -4. No CI changes — workflow already collects `-m integration` across all services. - -If a new tool requires a new external dep (new API key), add it to `require_env` and the CI secrets list. - ---- - -## 14. What this tier deliberately does NOT do - -- No top-level `tests/` tree — everything lives with the service. -- No cost tracker / pricing table / budget enforcement on day one. -- No retry plugin. -- No dual-API fixture (pytest fixture + context manager). Acceptance imports the same `client` fixture. -- No `APOLLO_TEST_QUICK` skip mode — `-m "not slow"` pattern can be added if needed, but no tests are marked `slow` on day one. -- No contract testing / VCR / recorded HAR. Possible future addition. - ---- - -## Summary - -One file per service: `services//tests/test__integration.py` covering sync + SSE + WS. Session-scoped `apollo_server` + `ApolloClient` in `testing/server.py`. Label-gated CI workflow shared with acceptance. Langfuse tracing is opt-in via `LANGFUSE_TRACING=true` (workflow input or env var). No cost tracking, no retries, no parallelism on day one. Add each when it's actually missed. diff --git a/agent-team-architecture-plan/4-acceptance-tests.md b/agent-team-architecture-plan/4-acceptance-tests.md deleted file mode 100644 index 6c3899c3..00000000 --- a/agent-team-architecture-plan/4-acceptance-tests.md +++ /dev/null @@ -1,377 +0,0 @@ -# Section 4 — Acceptance Tests Architecture - -**Scope:** quality-, voice-, and style-focused evaluation of Apollo chat services (`global_chat`, `workflow_chat`, `job_chat`) against product-owner-authored hero questions. Judged by an LLM-as-judge, reviewable by a human (Joe or Brandon), optionally logged to Langfuse for trend analysis. - -**Non-goals (other tiers):** - -- Unit tests of pure functions → Section 1. -- Mocked-LLM `main()` invocations → Section 2. -- Functional flow with regex content assertions → Section 3. - -Acceptance answers a different question than integration: not "does the system function end-to-end?" but "does the answer sound like us, read well, satisfy the user's intent, and not regress in voice as we bump model versions?" - ---- - -## 1. Guiding principles - -1. **Specs are markdown.** A PO edits a text file, not Python. YAML frontmatter + markdown sections. -2. **HTTP is internal plumbing.** Specs never mention ports, payload shapes, or service internals. -3. **Live models.** The whole point is to audit the real production path after model upgrades. -4. **LLM-as-judge with receipts.** Every evaluation records the judge's reasoning so a human can spot-check. -5. **pytest is the runner.** Same as every other tier. Spec files are collected via a tiny `pytest_collect_file` hook. No custom CLI, no `bless`/`differ`/`migrate-questions` subcommands. -6. **Human-triggered only.** Never on every push, never on a schedule. Humans decide when a change is big enough to warrant an acceptance run — via PR label or manual `workflow_dispatch`. - ---- - -## 2. Directory layout - -Acceptance specs live alongside the service they test, in an `acceptance/` subfolder of `services//tests/`. **No top-level `tests/` tree.** - -``` -services//tests/ - acceptance/ - *.md # one spec per file - _template.md # copy-paste starter (underscore = skipped by collector) -``` - -Concretely: - -``` -services/global_chat/tests/acceptance/ - hero-patient-sync.md - voice-concise-answers.md - refuse-non-openfn-questions.md -services/workflow_chat/tests/acceptance/ - *.md -services/job_chat/tests/acceptance/ - *.md -``` - -**Cross-service specs** (refusals, safety, "hero" questions that test the orchestrator end-to-end) live under `services/global_chat/tests/acceptance/` since global_chat is the entry point everyone hits. - -The judge helper and the markdown-spec pytest collector live in **`testing/`** (the shared package): - -``` -testing/ - judge.py # LLM-as-judge helper (~150 lines) - fixtures.py # already there; nothing new -``` - -The pytest_collect_file hook is registered in the **root `apollo/conftest.py`** so it picks up `*.md` under any `acceptance/` folder anywhere in `services/`. - -**No `golden/` tree, no `reports/` folder in git.** Langfuse is the trend / comparison backend. Local test output (pass/fail + judge reasoning) comes from pytest stdout and `--junitxml`. If a run needs an HTML report, generate it with `pytest-html` when someone asks for it — not preemptively. - -**No `services/llm_evaluator/` service.** The judge is a helper module in `testing/judge.py` that calls Anthropic directly via the SDK. Promote to a service only when a non-test caller needs to invoke it. - ---- - -## 3. Spec format - -One spec per markdown file. YAML frontmatter + named markdown sections. - -### 3.1 Frontmatter - -```yaml ---- -id: global-chat.hero.patient-sync -title: "Build a CommCare to DHIS2 sync" -service: global_chat # global_chat | workflow_chat | job_chat -tags: [hero, voice, multi-turn] -runs: 3 # default 1 — number of times to run the same spec -judge_model: claude-sonnet-4-6 # defaults to the same in conftest ---- -``` - -Only `id` and `service` are required. Everything else inherits sensible defaults from the root `apollo/conftest.py`. - -### 3.2 Body sections (top-level markdown headers, case-insensitive) - -| Section | Required | Purpose | -|---|---|---| -| `# conversation` | one of conversation/question | `- user:` / `- assistant:` list. Last user line is tested; earlier lines become `history`. | -| `# question` | one of conversation/question | Shorthand for a single-turn conversation. | -| `# context` | optional | YAML block merged into payload: `workflow_yaml`, `page`, `context`, `attachments`, etc. | -| `# must_include` | optional | Substrings or `/regex/` that must appear in `response`. Deterministic; failure short-circuits before judge runs. | -| `# must_not_include` | optional | Opposite. | -| `# assertions` | required | Natural-language criteria, one per bullet — each passed to the LLM judge. | -| `# notes` | optional | Reviewer context, not sent to judge. | - -### 3.3 Example - -```markdown ---- -id: global-chat.hero.patient-sync -title: "Build a CommCare to DHIS2 sync" -service: global_chat -tags: [hero, planner] -runs: 3 ---- - -# conversation - -- user: "I want to create a workflow that fetches new patient registrations from CommCare every hour and creates matching tracked entities in DHIS2." - -# must_include -- /commcare/i -- /dhis2/i - -# must_not_include -- "I cannot help with that" - -# assertions -- The response proposes a workflow with at least two jobs. -- The tone is warm and collaborative, not clinical. -- An attached workflow_yaml is present and syntactically valid. -- The response does not leak the user's api_key or any secret-looking string. -``` - ---- - -## 4. Collection: spec → pytest item - -The root `apollo/conftest.py` implements a standard pytest collection hook that finds markdown specs in any `acceptance/` folder under `services/`: - -```python -def pytest_collect_file(parent, file_path): - if ( - file_path.suffix == ".md" - and not file_path.name.startswith("_") - and file_path.parent.name == "acceptance" - ): - return SpecFile.from_parent(parent, path=file_path) - -class SpecFile(pytest.File): - def collect(self): - spec = parse_spec(self.path) - for run_index in range(spec.runs): - yield SpecItem.from_parent( - self, name=f"{spec.id}[run={run_index}]", - spec=spec, run_index=run_index, - ) - -class SpecItem(pytest.Item): - def runtest(self): - payload = build_payload(self.spec, self.run_index) - response = self.client.call(self.spec.service, payload) - check_must_include(self.spec, response) # hard precondition; raises on fail - verdict = judge.evaluate(self.spec, response, model=self.spec.judge_model) - if not verdict.passed: - raise AssertionError(verdict.summary) -``` - -Each run is a separate pytest item. Benefits: `pytest -m acceptance` works, `--junitxml` works, `pytest-xdist` works, filtering with `-k hero` works. No new runner. - -A tiny `pytest_sessionfinish` hook counts, per spec, how many of the N runs the judge marked `passed=True` and prints `spec-id: 2/3 passed` to stdout. No pass/fail policy is applied — the count is raw output for humans to read. (The pytest exit code still reflects individual item pass/fail in the usual way.) - ---- - -## 5. The judge - -`testing/judge.py` is a single module (~150 lines). Not an Apollo service. - -### 5.1 Interface - -```python -@dataclass -class Verdict: - passed: bool - score: float # 0..1 — fraction of criteria passed - criteria: list[CriterionResult] - reasoning_summary: str # shown on pytest failure - judge_usage: dict # input/output tokens - -@dataclass -class CriterionResult: - criterion: str - passed: bool - reasoning: str - evidence: str # verbatim span from candidate - -def evaluate(spec: Spec, response: dict, *, model: str) -> Verdict: ... -``` - -### 5.2 Prompt strategy - -- System prompt instructs the judge to be strict and to quote evidence verbatim. -- User prompt: numbered criteria + candidate response (+ attachments if present) + demand for JSON output with per-criterion pass/fail + reasoning. -- Prefilled `{` to force JSON. -- Malformed JSON / judge refusal → verdict with `passed=False`, reason `judge_error`. Surfaced loudly. - -### 5.3 Why a helper and not a service - -The judge only has one caller today (acceptance tier). A whole Apollo service + `/services/llm_evaluator` HTTP endpoint + per-service test directory is overkill for that. If future callers appear (a ranker for search_docsite, a sanity-check step in gen_job), promote `judge.py` to `services/llm_evaluator/llm_evaluator.py` — it's a ~50-line reshape, not a redesign. - -### 5.4 Self-tests - -`services/tests/test_judge_unit.py` exercises the prompt builder and JSON parser with canned inputs. Uses `MockAnthropicClient` from the service tier's shared helpers. - ---- - -## 6. Langfuse integration - -Langfuse is already wired on `add-langfuse` — acceptance leans on it lightly for cross-run comparison. The runner does NOT rebuild Langfuse's dataset / score UI. - -### 6.1 Already in place (we reuse) - -- `services/langfuse_util.py::should_track()` gates trace export. Payloads set `user.employee=True` to stay inside the employee window. -- `@observe` on each chat service's `main()` — every acceptance run is automatically traced when `LANGFUSE_TRACING=true`. - -### 6.2 What we add - -1. **Session tagging.** Each run sets `session_id = f"acceptance-{spec.id}-run{i}"` and `tags = ["acceptance", spec.id, ...spec.tags]`. Done via Langfuse's `propagate_attributes`. -2. **Score push.** After the judge returns, write one score per run: `acceptance_pass` (0/1) and `acceptance_score` (0..1). Use Langfuse's Scores API directly from `judge.py` — no `langfuse_sink.py` wrapper. -3. **Cross-version comparison.** Native Langfuse dataset-runs view does this. Runner surfaces the URL in stdout. - -### 6.3 What we don't do via Langfuse - -- No Langfuse-hosted eval (we own the prompt). -- No hard dependency — acceptance runs offline if `LANGFUSE_PUBLIC_KEY` is unset OR `LANGFUSE_TRACING=false`; scores are skipped, runs still complete. - ---- - -## 7. Multi-run sampling - -Specs that benefit from sampling (tone, voice, any criterion where the LLM varies between runs) declare `runs: N` in the frontmatter. Default is `1`. - -Mechanics: - -- The collection hook yields N separate pytest items per spec, named `[run=0]` ... `[run=N-1]`. -- Each run calls the service independently and is judged independently by `testing/judge.py`. -- A `pytest_sessionfinish` hook tallies per spec: `: / passed` to stdout. - -That's it — no `all` / `majority` / `any` policies, no named aggregator, no default that says "2 out of 3 is good enough." Whoever reads the output decides whether the ratio is acceptable for that spec. A refusal spec with 2/3 is probably a regression; a voice spec with 2/3 might be fine. Humans make the call, the tooling just reports the count. - -Per-run pass/fail comes from the LLM judge (`Verdict.passed`, see §5.1). The count in `2/3 passed` is simply the number of runs whose judge verdict was `passed=True`. - ---- - -## 8. Human review loop - -**Primary: Langfuse UI.** Joe / Brandon open the dashboard, filter by `tags:acceptance`, review candidate + judge reasoning + score, override with a human annotation if they disagree. - -**Secondary: pytest stdout / JUnit.** CI logs show `FAIL global-chat.hero.patient-sync[run=1]` with the judge's reasoning summary as the pytest message. Enough for a quick triage. - -No dedicated HTML report on day one. Add `pytest-html` the first time someone asks for it. - ---- - -## 9. Triggers - -Acceptance is **never triggered automatically**. A human decides when a change is big enough to warrant spending the money on a run. - -| Trigger | Mechanism | -|---|---| -| Local manual | `poetry run pytest -m acceptance` | -| CI manual (any branch) | GH Actions `workflow_dispatch` on `llm-tests.yaml` | -| PR label | Apply `run-acceptance` label to a PR | - -Explicitly excluded: no cron, no nightly, no push-to-main, no tag-push, no scheduled runs of any kind. If the team later decides they want continuous drift monitoring, that's a deliberate policy change — not a default. - -Shares `.github/workflows/llm-tests.yaml` with the integration tier — see Section 3 §9 for the skeleton. Acceptance is a second job in the same workflow, gated by its own label condition. - ---- - -## 10. CI workflow (acceptance job) - -Appended to `.github/workflows/llm-tests.yaml`: - -```yaml - acceptance: - # Human-triggered only. No 'schedule' trigger — by design. - if: >- - (github.event_name == 'pull_request' && github.event.label.name == 'run-acceptance') - || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - timeout-minutes: 45 - env: - CI: "true" - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY_TEST }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY_TEST }} - LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY_TEST }} - LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY_TEST }} - LANGFUSE_BASE_URL: ${{ secrets.LANGFUSE_BASE_URL }} - LANGFUSE_TRACING: "true" # acceptance runs always trace; that's the point - steps: - - uses: actions/checkout@v4 - - uses: oven-sh/setup-bun@v2 - - uses: actions/setup-python@v5 - with: { python-version: "3.11" } - - uses: snok/install-poetry@v1 - - run: bun install - - run: poetry install --with test-integration - - run: poetry run pytest -m acceptance -v --junitxml=tmp/test-logs/acceptance-junit.xml - - uses: actions/upload-artifact@v4 - if: always() - with: - name: acceptance-junit-${{ github.run_id }} - path: tmp/test-logs/acceptance-junit.xml - retention-days: 14 -``` - ---- - -## 11. Cost control - -Day-one approach is human-gated triggering + sensible defaults, not elaborate budget code: - -- Never automatic — every run is a deliberate human action. -- `runs: 1` default — specs must opt into sampling. -- Judge defaults to `claude-sonnet-4-6` (not opus). -- Prompt caching on candidate calls — preserved across the N runs of one spec by shared `session_id`. -- 45-minute workflow timeout as a hard ceiling. - -A budget env + soft circuit breaker can be added the first time a manual run surprises someone. Not on day one. - ---- - -## 12. Extensibility - -Adding a new sub-agent or tool — no Python required: - -1. Ensure the new service exposes `main()` at `services//.py` (auto-mounts). -2. Create `services//tests/acceptance/` and drop markdown specs. - -Adding a new judge model: list it in `apollo/conftest.py` (or let it be free-form — strings all the way). `judge_model:` in frontmatter. - ---- - -## 13. Relationship to integration - -| Concern | Integration | Acceptance | -|---|---|---| -| Goal | Functional correctness | Quality, voice, style | -| Assertions | Regex + shape | Natural-language criteria + LLM judge | -| Trigger | PR label / push to main / manual | PR label / manual — **never automatic** | -| Stability | Deterministic | Probabilistic (N runs) | -| Runner | pytest | pytest | -| Marker | `@pytest.mark.integration` | `@pytest.mark.acceptance` | -| Location | `services//tests/test__integration.py` | `services//tests/acceptance/*.md` | - -**Overlap rule:** a test lives in exactly one tier. An "acceptance" spec that merely asserts a YAML attachment exists belongs in integration. An integration test that checks "the tone feels terse enough" belongs in acceptance. - ---- - -## 14. Migration of existing artefacts - -- `services/job_chat/evaluation/questions.md` — mostly-compatible format. One-time manual conversion (split per entry, add frontmatter, drop into `services/job_chat/tests/acceptance/`). No migration CLI needed — it's a one-shot editor task. -- `services/global_chat/tests/test_*_qualitative.py` — the prose at the top of each test (in `print()` statements) becomes `# notes`; `content`/`context` become spec sections; qualitative asserts become `# assertions` bullets. Drop the resulting markdown files into `services/global_chat/tests/acceptance/` (or the relevant sub-agent's acceptance folder if the test is targeting a sub-agent specifically). Any machine-checkable asserts move to integration. -- Migration is opt-in, one file at a time. - ---- - -## 15. What this tier deliberately does NOT do - -- **No top-level `tests/` tree.** Specs live under their service. -- **No `services/llm_evaluator/` service.** Judge is a helper module. -- **No custom acceptance runner.** Pytest collects specs; that's it. -- **No `bless` / `differ` / `migrate-questions` / `review` subcommands.** The first two make sense if we adopt golden-file diffing; we don't on day one (Langfuse's dataset-runs comparison is the primitive). The last two are one-off editor tasks. -- **No `golden/` git tree.** Model drift is tracked in Langfuse. -- **No HTML reporter.** `pytest-html` is a line in `pyproject.toml` the day we want it. -- **No per-spec cost caps, budget estimator, `list`/`lint` commands, skip-on-no-change mode.** Defer until bills say otherwise. -- **No `criteria_mode: weighted` with per-criterion weights.** `all` or `any` across criteria. Add weighting when a spec genuinely needs it. - ---- - -## Summary - -Acceptance = markdown specs in `services//tests/acceptance/` + pytest collector + tiny judge helper in `testing/judge.py` + Langfuse scores. No new Apollo service, no custom runner, no golden tree, no top-level `tests/` directory. Runs via the standard pytest mechanism under a shared `llm-tests.yaml` workflow. Adding a sub-agent means dropping markdown files under that service's `acceptance/` folder. The judge promotes to a service the day it has a second caller; nothing else changes. diff --git a/agent-team-architecture-plan/5-overview.md b/agent-team-architecture-plan/5-overview.md deleted file mode 100644 index 05617e05..00000000 --- a/agent-team-architecture-plan/5-overview.md +++ /dev/null @@ -1,205 +0,0 @@ -# Apollo Testing Architecture — Overview - -**Supervisor:** team-lead -**Scope:** architecture (files, utils, fixtures, CI wiring) for a four-tier test framework covering `global_chat`, `workflow_chat`, `job_chat`, their tools, and any future sub-agent / tool services. Specific tests are NOT designed here. - -Deep detail lives in: - -- `1-unit-tests.md` — pure-function tests, free, every push. -- `2-service-tests.md` — `main()` with a mocked Anthropic HTTP client, free, every push. -- `3-integration-tests.md` — HTTP to the running bun server with real LLMs, label-gated. -- `4-acceptance-tests.md` — markdown specs judged by an LLM helper, manually triggered only. - ---- - -## 1. The four tiers at a glance - -| Tier | Transport | LLM | Asserts on | Runs on | -|---|---|---|---|---| -| **Unit** | direct import of one function | none | function output | every push | -| **Service** | direct `main(data, test_hooks)` | mocked via injected `httpx.MockTransport` | code path, mock call args, payload shape | every push | -| **Integration** | HTTP / SSE / WS via running bun server | real | response shape + loose regex on content | push to `main`, PR label `run-integration`, manual | -| **Acceptance** | HTTP via running bun server | real + judge LLM | natural-language criteria graded by judge | PR label `run-acceptance`, manual (`workflow_dispatch`). **Never automatic** — humans decide when a change is big enough to warrant a run. | - -**Sharp dividing lines:** - -- Content assertion depends on *what the LLM wrote* → integration or acceptance (not service). -- Content assertion needs a judge LLM → acceptance (not integration). -- Test exercises `main()` → service or integration (not unit). -- Test needs the bun server → integration or acceptance (not unit or service). - ---- - -## 2. Directory layout - -Every test lives under the service it relates to. One `tests/` folder per service holds all four tiers. - -``` -apollo/ -├── conftest.py # root: dummy api keys, register shared fixtures, markdown spec collector -│ -├── testing/ # Shared test helpers — peer to services/, not a service itself -│ ├── __init__.py -│ ├── anthropic_mock.py # MockAnthropic (regex → response) + tool_use helper + record_tool_call; documents the `test_hooks` dict keys -│ ├── fixtures.py # pytest fixtures (mock client, test_hooks factory, payloads, yaml assertions) -│ ├── server.py # apollo_server fixture + ApolloClient (sync / sse / ws) -│ ├── judge.py # small LLM-as-judge helper for acceptance specs -│ └── fixtures/ # sample yaml/json shared across services -│ ├── workflows/*.yaml -│ └── histories/*.json -│ -├── services/ -│ └── / # global_chat, workflow_chat, job_chat -│ ├── .py # main() gains optional `test_hooks` 2nd arg (default None) -│ └── tests/ -│ ├── conftest.py # auto-marks by filename suffix and acceptance/ folder -│ ├── test__unit.py # tier 1 — pure functions, marker: unit -│ ├── test__service.py # tier 2 — main() with mocked anthropic, marker: service -│ ├── test__integration.py # tier 3 — sync + stream + ws, marker: integration -│ └── acceptance/ # tier 4 — markdown specs, marker: acceptance -│ └── *.md # question + data + natural-language assertions -│ -├── .github/workflows/ -│ ├── tests.yaml # unit + service, every push (free) -│ └── llm-tests.yaml # integration + acceptance, label-gated / manual only (no cron, no schedule) -│ -└── pyproject.toml # [tool.pytest.ini_options]: markers, pythonpath=["services", "."] -``` - -**Decisions worth knowing:** - -- **All tests live with their service.** `services//tests/` holds unit + service + integration tests for that service, plus an `acceptance/` subfolder of markdown specs. No top-level `tests/` tree. -- **Cross-service work lives with the orchestrator.** Planner-chain end-to-end integration tests live under `services/global_chat/tests/test_global_chat_integration.py` because `global_chat` owns the planner. Cross-service acceptance specs (refusals, safety) live under `services/global_chat/tests/acceptance/`. -- **One shared helper package: `apollo/testing/`.** A peer of `services/` and `platform/`, not a service. Server fixture, ApolloClient, mock anthropic, judge, payload builders, yaml assertions — all here. -- **Filename suffix + folder name carry tier intent.** `*_unit.py` → unit, `*_service.py` → service, `*_integration.py` → integration, `acceptance/*.md` → acceptance. Marker auto-applied by conftest. Authors never write `@pytest.mark.X` manually. -- **No `llm_evaluator` Apollo service.** The judge is `testing/judge.py`. Promote to a service the day it has a non-test caller. -- **`test_hooks` second-arg stays.** Load-bearing production change; cheap. `test_hooks is None` is byte-identical to today. - ---- - -## 3. The `test_hooks` second-arg pattern (service tier's anchor) - -Every chat service's `main()` gains an optional second argument: - -```python -def main(data_dict: dict, test_hooks: Optional[dict] = None) -> dict: ... -``` - -- **HTTP path is untouched.** `entry.py` calls `m.main(data)` with one positional arg. Because `test_hooks` defaults to `None`, the bridge never sees it. -- **Only direct-Python callers (pytest) set `test_hooks`.** Integration and acceptance tiers don't use it (they go through HTTP, which strips `test_hooks`). Unit tier doesn't use it (unit tests never call `main()`). - -`test_hooks` is a plain Python `dict` (not a formal type). Its recognised keys are documented as a docstring in `testing/anthropic_mock.py`: - -- `anthropic_http_client` — an `httpx.Client` backed by `httpx.MockTransport`. Built by `MockAnthropic`, which matches each request's latest user message text against test-registered regex → response pairs (no match → `AssertionError`). Threaded into every `Anthropic(api_key=..., http_client=...)` constructor site via a new `services/util.py::build_anthropic_client()` factory. The planner's internal tool-use loop (multiple Anthropic calls per `main()`) is covered by the same mechanism — each round has different latest-user-message text, so different regexes match. See `2-service-tests.md` §3. -- `tool_calls` — a test-allocated `list[dict]` that production code appends to as breadcrumbs when present. -- `tool_stubs` — a `dict[str, Callable]` keyed by tool name. The planner consults it before dispatching a tool; if a stub is registered, it's called instead of the real tool. Today only used for `search_documentation` (which otherwise hits Pinecone + OpenAI). See `2-service-tests.md` §5. - -No sub-agent stub registry, no tool stub registry, no `seed`, no `disable_langfuse` — the shape starts minimal and grows only when a test can't be written without it. - -Minimal production-code edits: the three chat services' `main()`, their `AnthropicClient`/`RouterAgent`/`PlannerAgent` constructors (one-line swap to `build_anthropic_client(api_key, test_hooks)`), and the new factory in `services/util.py`. - ---- - -## 4. Server lifecycle - -`testing/server.py` exposes a session-scoped pytest fixture `apollo_server` that: - -- Honours `APOLLO_TEST_BASE_URL` to reuse a running staging server or local `bun dev`. -- Otherwise spawns `bun run start` on an OS-allocated port, polls `GET /` until 200, yields `(base_url, port)`. -- On teardown: SIGTERM, 5s wait, SIGKILL if needed. Drains stdout/stderr to `tmp/test-logs/`. - -`ApolloClient` (also in `testing/server.py`) wraps `.call()`, `.stream()`, `.ws()`. Function-scoped `client` fixture binds to the session server. Both integration and acceptance use the same fixture. - -Registered globally via `pytest_plugins = ["testing.fixtures", "testing.server"]` in the root `apollo/conftest.py`. - ---- - -## 5. Test runner and commands - -Everything is pytest. Acceptance markdown specs are collected by a `pytest_collect_file` hook in the root `apollo/conftest.py`. No custom runner. - -```bash -# Dev default — free, fast -poetry run pytest -m "unit or service" - -# Run just one tier -poetry run pytest -m unit -poetry run pytest -m service -poetry run pytest -m integration # opt-in -poetry run pytest -m acceptance # opt-in - -# Run all tests for a single service (any tier) -poetry run pytest services/global_chat/tests - -# Run one tier within one service -poetry run pytest services/global_chat/tests -m integration -poetry run pytest services/global_chat/tests/acceptance -``` - -TypeScript platform tests continue to run via `bun test`. Unchanged. - ---- - -## 6. CI integration - -Two workflow files: - -| File | Runs | Trigger | Secrets | Timeout | -|---|---|---|---|---| -| `.github/workflows/tests.yaml` | `pytest -m "unit or service"` | every push, every PR | **none** (deliberate — mocks must not hit real APIs; missing `ANTHROPIC_API_KEY` is a loud failure) | 10 min | -| `.github/workflows/llm-tests.yaml` | `pytest -m "integration or acceptance"` | **Integration:** PR label `run-integration`, push to `main`, `workflow_dispatch`. **Acceptance:** PR label `run-acceptance` or `workflow_dispatch` only — never cron, schedule, or push. | `ANTHROPIC_API_KEY_TEST`, `OPENAI_API_KEY_TEST`, `PINECONE_API_KEY_TEST`, `POSTGRES_URL_TEST`, `LANGFUSE_*_TEST` | 45 min | - -Design invariant: **the fast job has no `ANTHROPIC_API_KEY` env var**. If a service test accidentally constructs a real Anthropic client (because someone forgot to thread `test_hooks`), the SDK errors on missing-key — a loud failure is free defence-in-depth against mocked-test leaks. - -Cost controls for the LLM workflow are intentionally out of day-one scope: start with `--maxfail=5` and a 45-min timeout; add budget tracking the first time a bill surprises someone. - ---- - -## 7. Conventions that hold across all tiers - -- **Layer marker auto-apply.** Per-service `services//tests/conftest.py` uses `pytest_collection_modifyitems` to tag tests by filename suffix (`_unit.py` / `_service.py` / `_integration.py`) and folder (`acceptance/`). Authors never write `@pytest.mark.X` manually. -- **No Langfuse export from free-tier tests.** `LANGFUSE_TRACING=false` set in unit/service runs. -- **Dummy API keys for free-tier tests.** `ANTHROPIC_API_KEY="unit-test-dummy"` so imports don't crash but any accidentally-real call fails loud. -- **`pythonpath = ["services"]`.** Kills the `sys.path.insert(...)` boilerplate at the top of every existing test file. - ---- - -## 8. Extensibility — adding a new sub-agent or tool - -1. Add `services//.py` with `main(data, test_hooks=None)`. Auto-mounted by `describe-modules.ts`. -2. Create `services//tests/conftest.py` — copy from an existing service. -3. Add files as needed: `test__unit.py`, `test__service.py`, `test__integration.py`, `acceptance/*.md`. - -No changes needed in pyproject, CI, or shared helpers. Discovery is zero-config — pytest already crawls `services/*/tests/` and the markdown collector finds any `acceptance/` folder. - ---- - -## 9. Implementation order (recommended) - -1. **Scaffolding.** Create `testing/` (skeleton — `anthropic_mock.py`, `fixtures.py`, `server.py`, `judge.py`), root `apollo/conftest.py`, `[tool.pytest.ini_options]` block in pyproject. One PR — unblocks everything else. -2. **Unit tier.** Migrate `services/workflow_chat/tests/test_functions.py` → `test_workflow_chat_functions_unit.py` as the worked example. Wire `tests.yaml` with just `-m unit`. Green CI on every push. -3. **Service tier.** Add `test_hooks=None` to the three chat services' `main()`. Add `services/util.py::build_anthropic_client()`. Build `MockAnthropic`. Extend `tests.yaml` to `-m "unit or service"`. Migrate the first `pass_fail` test whose assertion doesn't depend on content. -4. **Integration tier.** Add `testing/server.py` (server fixture + `ApolloClient`). Create `llm-tests.yaml`. Migrate the first cross-service end-to-end test into `services/global_chat/tests/test_global_chat_integration.py`. Secrets wired. -5. **Acceptance tier.** Add `testing/judge.py` and the markdown collector hook in the root conftest. Drop the first 2–3 hero specs into `services/global_chat/tests/acceptance/`. First manual run green (`workflow_dispatch`). - -Each step is an independent PR. Nothing here blocks shipping production features in between. - ---- - -## 10. What this architecture deliberately does NOT do - -- Does not design specific tests. That's the next phase. -- Does not modify production code beyond the `test_hooks` second-arg + `build_anthropic_client` factory. -- Does not replace `bun:test` for TypeScript. -- Does not create a top-level `tests/` tree. Everything lives with the service. -- Does not create a separate `llm_evaluator` service. Judge is a helper module. -- Does not build a custom acceptance runner with `bless`/`differ`/`migrate-questions`/etc. Specs are markdown, collection is pytest, judge writes scores to Langfuse via its existing SDK — that's it. -- Does not track golden answers as a separate git tree. If we want regression diffing later, Langfuse's native dataset-run comparison is the primitive. -- Does not build cost tracking, pricing tables, retry plugins, or circuit breakers on day one. Add each the first time its absence costs something. -- Does not gate merges on coverage percentages. - ---- - -## Summary - -Four tiers, one home (`services//tests/`), one shared Python helper package (`testing/`), zero new Apollo services, two CI workflows. The `test_hooks` second-arg pattern is the single architectural change to production code; everything else is additive. Adding a new sub-agent or tool is a ~10-minute task with no framework edits. From 0087835b34a88cf4e3aa10a0b8044e0ee82ad990 Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Thu, 7 May 2026 10:36:28 +0100 Subject: [PATCH 11/16] update unit tess --- .github/workflows/unit-tests.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 9e1f5fc4..6373b8ec 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -19,7 +19,7 @@ jobs: python-version: "3.11" - name: Install Poetry - run: pipx install poetry==1.8.3 + run: pipx install poetry==2.3.2 - name: Cache Poetry virtualenv uses: actions/cache@v4 @@ -27,8 +27,8 @@ jobs: path: .venv key: poetry-${{ runner.os }}-${{ hashFiles('poetry.lock') }} - - name: Install dependencies (main group only, no ft) - run: poetry install --no-interaction --no-root + - name: Install dependencies + run: poetry install - name: Run unit tests run: poetry run pytest -m unit From 59dac4ad2717e20c508700e2a78c1e15890c0409 Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Thu, 7 May 2026 10:38:01 +0100 Subject: [PATCH 12/16] rethink cache strategy --- .github/workflows/unit-tests.yaml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 6373b8ec..9cbbe349 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -13,19 +13,14 @@ jobs: steps: - uses: actions/checkout@v6 - - name: Set up Python 3.11 - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Install Poetry run: pipx install poetry==2.3.2 - - name: Cache Poetry virtualenv - uses: actions/cache@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v5 with: - path: .venv - key: poetry-${{ runner.os }}-${{ hashFiles('poetry.lock') }} + python-version: "3.11" + cache: poetry - name: Install dependencies run: poetry install From 2ff173bc9419616f069731ea61ff38bb667443ae Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Thu, 7 May 2026 11:09:30 +0100 Subject: [PATCH 13/16] add bun tests action --- .github/workflows/unit-tests.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 9cbbe349..afc83cbf 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -27,3 +27,20 @@ jobs: - name: Run unit tests run: poetry run pytest -m unit + + bun: + name: Bun unit tests + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - uses: actions/checkout@v6 + + - name: Set up Bun + uses: oven-sh/setup-bun@v2 + + - name: Install dependencies + run: bun install --frozen-lockfile + + - name: Run unit tests + run: bun test From 368cc676aff48a204de667ce19be8152ba12e63d Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Thu, 7 May 2026 11:13:05 +0100 Subject: [PATCH 14/16] add poetry to bun tests --- .github/workflows/unit-tests.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index afc83cbf..76dc7b96 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -42,5 +42,14 @@ jobs: - name: Install dependencies run: bun install --frozen-lockfile + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: poetry + + - name: Install dependencies + run: poetry install + - name: Run unit tests run: bun test From 0a2e729527fe8034c026e69e6244ffcfc12ba00c Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Thu, 7 May 2026 11:13:48 +0100 Subject: [PATCH 15/16] fix --- .github/workflows/unit-tests.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 76dc7b96..19930e8d 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -42,6 +42,9 @@ jobs: - name: Install dependencies run: bun install --frozen-lockfile + - name: Install Poetry + run: pipx install poetry==2.3.2 + - name: Set up Python 3.11 uses: actions/setup-python@v5 with: From 1a172bc2716c3ede04ab9bf912610d490cc74a50 Mon Sep 17 00:00:00 2001 From: Joe Clark Date: Thu, 7 May 2026 18:49:05 +0100 Subject: [PATCH 16/16] make unit tests more selective --- .github/workflows/unit-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 19930e8d..725acaa1 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -26,7 +26,7 @@ jobs: run: poetry install - name: Run unit tests - run: poetry run pytest -m unit + run: poetry run pytest services/*/tests/unit bun: name: Bun unit tests