diff --git a/internal/agentcrd/agent.go b/internal/agentcrd/agent.go index 1f6c8301..c2f81c06 100644 --- a/internal/agentcrd/agent.go +++ b/internal/agentcrd/agent.go @@ -85,6 +85,18 @@ type SeedOptions struct { // data path. SOUL.md is only written when missing (or when OverwriteSoul is // true). // +// CONTRACT — sub-agents-for-sale ONLY. This helper also writes the +// `.no-bundled-skills` marker (see writeNoBundledSkillsMarker), which makes +// Hermes skip its ~80 bundled skills and run with just the operator-chosen +// subset. That is correct for a narrow, EVM-focused paid service but WRONG for +// the stack-managed master agent, which keeps the full bundled set. The master +// (internal/hermes) seeds its own home via a separate path (it does not call +// this) and must NEVER be routed through SeedHostFiles, or it would silently +// lose its bundled skills. The reusable seed primitives this is built from +// (WriteSoul, embed.WriteSkillSubset) deliberately do NOT write the marker, so +// the objective-only update path — and any future master-side reuse — stays +// safe; locked by TestMarkerOnlyWrittenBySeedHostFiles. +// // Returns whether SOUL.md was written this call so callers can report the // difference between "fresh agent" and "existing agent, skills resynced". func SeedHostFiles(cfg *config.Config, name string, skills []string, objective string, opts SeedOptions) (soulWritten bool, err error) { diff --git a/internal/agentcrd/agent_contract_integration_test.go b/internal/agentcrd/agent_contract_integration_test.go new file mode 100644 index 00000000..f08cf878 --- /dev/null +++ b/internal/agentcrd/agent_contract_integration_test.go @@ -0,0 +1,379 @@ +//go:build integration + +package agentcrd + +import ( + "bytes" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +// ───────────────────────────────────────────────────────────────────────────── +// Agent external-contract integration test (PR #582) +// +// The unit tests in agent_test.go and serviceoffercontroller/agent_render_test.go +// only prove that we *render* the `.no-bundled-skills` marker and the capped +// hermes-config keys. They do NOT prove the Hermes image +// (nousresearch/hermes-agent:v2026.5.28) actually honors them. This test closes +// that gap end-to-end against a LIVE cluster: +// +// (1) the .no-bundled-skills marker exists on the agent's host PVC path +// (agentcrd.HostNoBundledSkillsMarkerPath), so Hermes' installer/sync +// skips seeding its ~80 bundled skills; +// (2) the rendered hermes-config ConfigMap in the agent's namespace carries +// the capped knobs: lifetime_seconds: 90, max_turns: 30, +// reasoning_effort: low, and disabled_toolsets {memory, web}; +// (3) a BEHAVIORAL signal that bundled skills were actually skipped — see +// assertBundledSkillsSkippedInPod for why we assert pod filesystem state +// rather than grep a log line. +// +// Like the rest of the integration suite (internal/openclaw/*_integration_test.go) +// it t.Skip()s gracefully when prerequisites are missing, so `go test` without a +// cluster is a no-op. +// +// To run manually (a local OpenAI-compatible inference endpoint is available at +// host 100.117.108.5 for development — never hardcode it): +// +// export OBOL_DEVELOPMENT=true \ +// OBOL_CONFIG_DIR=$(pwd)/.workspace/config \ +// OBOL_BIN_DIR=$(pwd)/.workspace/bin \ +// OBOL_DATA_DIR=$(pwd)/.workspace/data +// go build -o .workspace/bin/obol ./cmd/obol # MUST rebuild after code changes +// # point the stack at the dev endpoint first (see CLAUDE.md "Pointing the +// # stack at an external OpenAI-compatible LLM"), then optionally: +// export OBOL_LLM_ENDPOINT=http://100.117.108.5:8000/v1 +// go test -tags integration -run TestIntegration_AgentContract -v -timeout 15m ./internal/agentcrd/ +// +// ───────────────────────────────────────────────────────────────────────────── + +// acObolRun executes the obol binary and returns combined stdout/stderr, +// fataling on failure. Mirrors internal/openclaw.obolRun; redeclared here +// (with an `ac` prefix) because the two packages do not share a test helper +// file and the non-tagged agent_test.go in this package must keep compiling. +func acObolRun(t *testing.T, cfg *config.Config, args ...string) string { + t.Helper() + out, err := acObolRunErr(cfg, args...) + if err != nil { + t.Fatalf("obol %v failed: %v", args, err) + } + return out +} + +// acObolRunErr executes the obol binary and returns output + error (no fatal). +func acObolRunErr(cfg *config.Config, args ...string) (string, error) { + obolBinary := filepath.Join(cfg.BinDir, "obol") + cmd := exec.Command(obolBinary, args...) + var buf bytes.Buffer + cmd.Stdout = &buf + cmd.Stderr = &buf + if err := cmd.Run(); err != nil { + return buf.String(), fmt.Errorf("obol %v: %w\n%s", args, err, buf.String()) + } + return buf.String(), nil +} + +// acRequireCluster skips when no k3d cluster is reachable. Same shape as +// internal/openclaw.requireCluster: kubeconfig presence + a live cluster-info +// probe through the obol-managed KUBECONFIG. +func acRequireCluster(t *testing.T) *config.Config { + t.Helper() + cfg := config.Load() + kubeconfig := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + if _, err := os.Stat(kubeconfig); os.IsNotExist(err) { + t.Skip("no kubeconfig found — cluster not running") + } + if _, err := acObolRunErr(cfg, "kubectl", "cluster-info"); err != nil { + t.Skipf("cluster not reachable: %v", err) + } + return cfg +} + +// acRequireAgentCRD skips when the obol.org Agent CRD is not installed (the +// reconciler that provisions sub-agent pods watches it). Mirrors the +// requireCRD shape used by internal/openclaw for ServiceOffers. +func acRequireAgentCRD(t *testing.T, cfg *config.Config) { + t.Helper() + out, err := acObolRunErr(cfg, "kubectl", "get", "crd", "agents.obol.org") + if err != nil || !strings.Contains(out, "agents.obol.org") { + t.Skip("Agent CRD (agents.obol.org) not installed — run: obol stack up") + } +} + +// TestIntegration_AgentContract_HonorsRenderedConstraints deploys a real CRD +// sub-agent through the canonical `obol agent new` path and verifies the Hermes +// image honors the streamlined-sub-agent contract introduced in PR #582. +func TestIntegration_AgentContract_HonorsRenderedConstraints(t *testing.T) { + cfg := acRequireCluster(t) + acRequireAgentCRD(t, cfg) + + // Unique-ish name so reruns don't collide with a stuck previous agent. + // Lowercase + dashes only, to satisfy ValidateName / the CRD pattern. + const name = "test-contract" + ns := Namespace(name) + + // Always tear the agent down first so a leftover from a crashed run doesn't + // make `obol agent new` reject creation, then again at the end. + cleanup := func() { + if _, err := acObolRunErr(cfg, "agent", "delete", "--force", name); err != nil { + t.Logf("cleanup of agent %q failed (non-fatal): %v", name, err) + } + } + cleanup() + t.Cleanup(cleanup) + + // Deploy via the real CLI. We deliberately leave --model unset so the + // controller auto-pins the cluster's top-of-rank LiteLLM model at first + // reconcile — that keeps the test independent of which exact model is + // configured. The skills are real embedded skills so the obol-skills + // external_dirs is genuinely non-empty (needed for the behavioral signal). + t.Logf("deploying CRD sub-agent via: obol agent new %s --skills addresses,gas", name) + acObolRun(t, cfg, "agent", "new", name, "--skills", "addresses,gas") + + // ── Check (1): the .no-bundled-skills marker exists on the host PVC path ── + // SeedHostFiles (called by `obol agent new`) writes it; the cluster mounts + // HostHomePath into the pod via the data PVC, so its presence here is what + // Hermes sees at /data/.hermes/.no-bundled-skills inside the container. + marker := HostNoBundledSkillsMarkerPath(cfg, name) + if _, err := os.Stat(marker); err != nil { + t.Fatalf("no-bundled-skills marker missing on host PVC path %s: %v", marker, err) + } + t.Logf("✓ marker present on host PVC: %s", marker) + + // Wait for the reconciler to create the Deployment and for the pod to come + // up. The agent pod carries app.kubernetes.io/instance= (see + // serviceoffercontroller.agentLabels). The startup probe gives Hermes up to + // ~2m before it would be reported ready, so allow a generous timeout. + waitForAgentPodReady(t, cfg, ns, name) + + // ── Check (2): rendered hermes-config ConfigMap carries the capped knobs ── + cfgYAML := getHermesConfigYAML(t, cfg, ns) + t.Logf("rendered hermes-config config.yaml:\n%s", cfgYAML) + assertHermesConfigCaps(t, cfgYAML) + + // ── Check (3): behavioral skip-bundled-skills signal from the running pod ── + assertBundledSkillsSkippedInPod(t, cfg, ns) + + // ── Optional: exercise the agent so we prove it actually boots and serves + // under the capped config. This needs a live LLM, so it is gated on + // OBOL_LLM_ENDPOINT and skipped (not failed) when unset — consistent with + // the rest of the integration suite. The structural checks above are the + // load-bearing contract assertions; this is corroboration that the capped + // config doesn't wedge startup. + if os.Getenv("OBOL_LLM_ENDPOINT") == "" { + t.Log("OBOL_LLM_ENDPOINT unset — skipping the live inference exercise " + + "(structural marker/config/pod checks above already passed). " + + "Set OBOL_LLM_ENDPOINT (e.g. a dev endpoint at http://100.117.108.5:8000/v1) " + + "after pointing the stack at it to exercise the agent API.") + return + } + assertAgentHealthEndpointServes(t, cfg, ns) +} + +// waitForAgentPodReady blocks until the reconciler-created Hermes pod for the +// sub-agent reports Ready. Driven through `obol kubectl wait` so we reuse the +// obol-managed KUBECONFIG rather than constructing a client-go client. +func waitForAgentPodReady(t *testing.T, cfg *config.Config, ns, name string) { + t.Helper() + // First wait for the Deployment to exist — the CR is applied, but the + // reconciler creates the workload asynchronously. A short poll avoids a + // `kubectl wait` against a not-yet-created selector. + deadline := time.Now().Add(2 * time.Minute) + for time.Now().Before(deadline) { + if _, err := acObolRunErr(cfg, "kubectl", "get", "deploy", hermesDeploymentName, "-n", ns); err == nil { + break + } + time.Sleep(3 * time.Second) + } + + t.Logf("waiting for agent pod readiness in %s", ns) + acObolRun(t, cfg, "kubectl", + "wait", "--for=condition=ready", "pod", + "-l", "app.kubernetes.io/instance="+name, + "-n", ns, + "--timeout=240s", + ) +} + +// getHermesConfigYAML returns the rendered config.yaml from the agent's +// hermes-config ConfigMap. Uses jsonpath to pull just the data key, matching +// the kubectl-helper style used elsewhere in the integration suite (e.g. +// internal/openclaw.getERPCConfigYAML). +func getHermesConfigYAML(t *testing.T, cfg *config.Config, ns string) string { + t.Helper() + out := acObolRun(t, cfg, "kubectl", "get", "configmap", hermesConfigMapName, "-n", ns, + "-o", `jsonpath={.data.config\.yaml}`) + if strings.TrimSpace(out) == "" { + t.Fatalf("hermes-config ConfigMap in %s has empty config.yaml", ns) + } + return out +} + +// assertHermesConfigCaps verifies the streamlined-sub-agent caps are present in +// the rendered config. These strings must match renderHermesConfig's output in +// internal/serviceoffercontroller/agent_render.go. +func assertHermesConfigCaps(t *testing.T, cfgYAML string) { + t.Helper() + for _, want := range []string{ + "lifetime_seconds: 90", + "max_turns: 30", + "reasoning_effort: low", + "disabled_toolsets:", + } { + if !strings.Contains(cfgYAML, want) { + t.Errorf("rendered hermes-config missing %q:\n%s", want, cfgYAML) + } + } + // disabled_toolsets must actually drop both memory and web. They render as + // YAML list items, so check for the "- memory" / "- web" entries rather + // than a bare substring (which could match a comment or model name). + for _, toolset := range []string{"memory", "web"} { + if !containsListItem(cfgYAML, toolset) { + t.Errorf("disabled_toolsets does not include %q as a list item:\n%s", toolset, cfgYAML) + } + } +} + +// containsListItem reports whether the YAML body has a `- ` list entry +// (any indentation). Used to assert disabled_toolsets membership precisely. +func containsListItem(yamlBody, item string) bool { + for _, line := range strings.Split(yamlBody, "\n") { + if strings.TrimSpace(line) == "- "+item { + return true + } + } + return false +} + +// assertBundledSkillsSkippedInPod is the behavioral proof that the marker was +// honored. +// +// We deliberately do NOT grep the Hermes pod logs for a "skipping bundled +// skills" / "no-bundled-skills" line: that string is owned by the external +// nousresearch/hermes-agent image, is not referenced anywhere in this repo, and +// could change wording or log level across image bumps without our knowledge — +// grepping for it would be a brittle, unverifiable contract. The plan +// (plans/sell-agent-perf.md, Verification) only ever treated the log line as an +// eyeball aid, not an assertion. +// +// The reliable, image-version-independent signal is the pod's on-disk skills +// layout: when the marker is honored, Hermes seeds NONE of its ~80 bundled +// skills into its native skills dir ($HERMES_HOME/skills), while the +// operator-chosen subset we layer in via OBOL_SKILLS_DIR +// (/data/.hermes/obol-skills, the external_dirs entry) is present and populated. +// So we assert both halves: +// +// (a) /data/.hermes/obol-skills is present and non-empty (our subset survived); +// (b) the native bundled-skills dir /data/.hermes/skills is absent or empty +// (the marker stopped bundled seeding). +// +// All exec is via `obol kubectl exec` into the "hermes" container (see +// agentPodSpec), the same non-interactive pattern internal/openclaw uses. +func assertBundledSkillsSkippedInPod(t *testing.T, cfg *config.Config, ns string) { + t.Helper() + + // (precondition) The pod must actually SEE the marker on its mounted data + // volume. Asserting this ties the "bundled dir empty" result in (b) to the + // contract INPUT: without it, an empty bundled dir could equally mean a + // future image simply uses a different bundled-skills path. Marker-visible + // in-pod AND bundled-dir-empty together prove Hermes saw the marker and + // honored it — not that we guessed the wrong directory name. + podMarker := podHermesHome + "/.no-bundled-skills" + if mOut, err := execInAgentPodErr(cfg, ns, "ls", "-la", podMarker); err != nil { + t.Fatalf("the .no-bundled-skills marker is not visible inside the pod at %s "+ + "(the data PVC mount did not surface it): %v\n%s", podMarker, err, mOut) + } + t.Logf("✓ marker visible inside pod at %s", podMarker) + + // (a) obol-skills external_dirs is present and non-empty. `ls -A` omits + // . and .., so any output means the dir has entries (our seeded skills). + obolSkills := podHermesHome + "/" + obolSkillsExternalDir + extOut, err := execInAgentPodErr(cfg, ns, "ls", "-A", obolSkills) + if err != nil { + t.Fatalf("obol-skills external dir %s not readable in pod: %v\n%s", obolSkills, err, extOut) + } + if strings.TrimSpace(extOut) == "" { + t.Fatalf("obol-skills external dir %s is empty in pod — the operator skill subset did not land", obolSkills) + } + // Sanity: the skills we asked for should be among the entries. + for _, skill := range []string{"addresses", "gas"} { + if !strings.Contains(extOut, skill) { + t.Errorf("expected seeded skill %q under %s; got:\n%s", skill, obolSkills, extOut) + } + } + t.Logf("✓ obol-skills external_dirs populated in pod (%s):\n%s", obolSkills, strings.TrimSpace(extOut)) + + // (b) Native bundled-skills dir is absent OR empty. We test for non-empty + // rather than non-existent because a future Hermes image could create the + // dir but leave it empty when the marker is set; both mean "no bundled + // skills were seeded". Run `ls -A` and tolerate a non-zero exit (dir + // missing). Any non-empty output is a contract violation: the marker was + // not honored and Hermes seeded its bundled skills. + bundled := podHermesHome + "/skills" + bundledOut, _ := execInAgentPodErr(cfg, ns, "ls", "-A", bundled) + bundledOut = strings.TrimSpace(bundledOut) + // kubectl/sh may surface a "No such file or directory" message on stderr, + // which our combined-output helper folds in; treat that as "absent". + if bundledOut != "" && !strings.Contains(bundledOut, "No such file") { + t.Fatalf("native bundled-skills dir %s is non-empty in pod — Hermes did NOT honor "+ + ".no-bundled-skills and seeded bundled skills:\n%s", bundled, bundledOut) + } + t.Logf("✓ native bundled-skills dir %s absent/empty in pod — bundled seeding skipped", bundled) +} + +// assertAgentHealthEndpointServes proves the agent pod actually serves its API +// under the capped config (it has not wedged on startup). We hit the unauth +// health path from inside the cluster via a one-shot busybox-style exec in the +// hermes container itself, since the per-agent route is x402-gated externally. +// Best-effort: any 2xx/expected health response is enough. +func assertAgentHealthEndpointServes(t *testing.T, cfg *config.Config, ns string) { + t.Helper() + // The hermes container is the only guaranteed exec target; it ships a + // Python runtime (it's a Hermes image). Use stdlib urllib to GET the + // in-pod health endpoint so we don't depend on curl/wget being present. + script := fmt.Sprintf(`import urllib.request,sys +try: + with urllib.request.urlopen("http://127.0.0.1:%d%s", timeout=10) as r: + sys.stdout.write("HTTP %%d" %% r.status) +except Exception as e: + sys.stdout.write("ERR %%s" %% e) +`, hermesContainerPort, hermesHealthPath) + out, err := execInAgentPodErr(cfg, ns, "python3", "-c", script) + t.Logf("agent in-pod health probe result: %q (err=%v)", strings.TrimSpace(out), err) + if err != nil || !strings.Contains(out, "HTTP 200") { + t.Errorf("agent health endpoint did not serve HTTP 200 under capped config: out=%q err=%v", out, err) + } +} + +// execInAgentPodErr runs a command in the "hermes" container of the sub-agent +// pod via `obol kubectl exec`, returning combined output + error. Non-fatal so +// callers can tolerate expected failures (e.g. ls on an absent dir). +func execInAgentPodErr(cfg *config.Config, ns string, args ...string) (string, error) { + full := append([]string{ + "kubectl", "exec", "-i", + "-n", ns, "deploy/" + hermesDeploymentName, + "-c", hermesContainerName, "--", + }, args...) + return acObolRunErr(cfg, full...) +} + +// Constants mirroring the in-cluster render in +// internal/serviceoffercontroller/agent_render.go. Kept local (and prefixed +// where they would otherwise be generic) so this test reads as an external +// contract: if the render changes these, the assertions here should be the +// thing that has to change too. +const ( + hermesDeploymentName = "hermes" + hermesContainerName = "hermes" + hermesConfigMapName = "hermes-config" + hermesContainerPort = 8642 // hermesPort in agent_render.go + hermesHealthPath = "/health" + podHermesHome = "/data/.hermes" // HERMES_HOME inside the pod + obolSkillsExternalDir = "obol-skills" // matches OBOL_SKILLS_DIR / external_dirs +) diff --git a/internal/agentcrd/agent_test.go b/internal/agentcrd/agent_test.go index 004eb505..931e96c8 100644 --- a/internal/agentcrd/agent_test.go +++ b/internal/agentcrd/agent_test.go @@ -7,6 +7,7 @@ import ( "testing" "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/embed" ) func TestValidateName(t *testing.T) { @@ -181,6 +182,56 @@ func TestSeedHostFiles_MarkerIsIdempotent(t *testing.T) { } } +// The `.no-bundled-skills` marker must be written ONLY by SeedHostFiles, never +// by the lower-level seed primitives (WriteSoul, embed.WriteSkillSubset) that +// the master/objective-only paths could reuse. SeedHostFiles is the +// sub-agents-for-sale seam; the master agent (internal/hermes) seeds via those +// primitives and must keep its ~80 bundled skills. If a future refactor moved +// the marker write down into a shared primitive, the master would silently lose +// them — this test fails first. +func TestMarkerOnlyWrittenBySeedHostFiles(t *testing.T) { + // WriteSoul on its own — the objective-rewrite path — must not drop a marker. + t.Run("WriteSoul", func(t *testing.T) { + dir := t.TempDir() + cfg := &config.Config{DataDir: dir} + + if _, err := WriteSoul(cfg, "quant", "objective only", true); err != nil { + t.Fatalf("WriteSoul: %v", err) + } + if _, err := os.Stat(HostNoBundledSkillsMarkerPath(cfg, "quant")); !os.IsNotExist(err) { + t.Fatalf("WriteSoul created a .no-bundled-skills marker (err=%v); only SeedHostFiles may", err) + } + }) + + // Writing the skill subset into the agent's skills dir — the primitive a + // master/full-skill path would call — must not drop a marker either. + t.Run("WriteSkillSubset", func(t *testing.T) { + dir := t.TempDir() + cfg := &config.Config{DataDir: dir} + + if err := embed.WriteSkillSubset(HostSkillsPath(cfg, "quant"), []string{"addresses"}); err != nil { + t.Fatalf("WriteSkillSubset: %v", err) + } + if _, err := os.Stat(HostNoBundledSkillsMarkerPath(cfg, "quant")); !os.IsNotExist(err) { + t.Fatalf("embed.WriteSkillSubset created a .no-bundled-skills marker (err=%v); only SeedHostFiles may", err) + } + }) + + // Sanity anchor: the full SeedHostFiles seam DOES write the marker, so the + // negative assertions above are meaningful and not testing a dead path. + t.Run("SeedHostFiles", func(t *testing.T) { + dir := t.TempDir() + cfg := &config.Config{DataDir: dir} + + if _, err := SeedHostFiles(cfg, "quant", []string{"addresses"}, "obj", SeedOptions{}); err != nil { + t.Fatalf("SeedHostFiles: %v", err) + } + if _, err := os.Stat(HostNoBundledSkillsMarkerPath(cfg, "quant")); err != nil { + t.Fatalf("SeedHostFiles must write the .no-bundled-skills marker: %v", err) + } + }) +} + func TestSeedHostFiles_PreservesExistingSoul(t *testing.T) { dir := t.TempDir() cfg := &config.Config{DataDir: dir} diff --git a/internal/serviceoffercontroller/agent_render.go b/internal/serviceoffercontroller/agent_render.go index a141affa..55f181a8 100644 --- a/internal/serviceoffercontroller/agent_render.go +++ b/internal/serviceoffercontroller/agent_render.go @@ -93,9 +93,10 @@ func agentManifests(agent *monetizeapi.Agent, litellmKey, apiKey string) ([]*uns // master is deployed via `obol agent init`, not via ServiceOffer), so the // terminal/agent caps below apply unconditionally. The Cloudflare free // tunnel cuts off requests at 100s, so lifetime_seconds is bounded under -// that. max_turns and reasoning_effort cap chattiness, and -// disabled_toolsets drops Hermes tool families that aren't useful in a -// paid-service context (memory persistence, web search). +// that. terminal.timeout must stay <= lifetime_seconds so no single +// operation can outlive the session. max_turns and reasoning_effort cap +// chattiness, and disabled_toolsets drops Hermes tool families that aren't +// useful in a paid-service context (memory persistence, web search). func renderHermesConfig(model, litellmKey string) string { return fmt.Sprintf(`model: default: %q @@ -105,7 +106,7 @@ func renderHermesConfig(model, litellmKey string) string { terminal: backend: local cwd: /data/.hermes/workspace - timeout: 180 + timeout: 80 lifetime_seconds: 90 docker_mount_cwd_to_workspace: false agent: diff --git a/internal/serviceoffercontroller/agent_render_test.go b/internal/serviceoffercontroller/agent_render_test.go index aacfe75e..56a90414 100644 --- a/internal/serviceoffercontroller/agent_render_test.go +++ b/internal/serviceoffercontroller/agent_render_test.go @@ -1,6 +1,7 @@ package serviceoffercontroller import ( + "strconv" "strings" "testing" @@ -263,6 +264,7 @@ func TestRenderHermesConfig_HasModelAndSkillsDir(t *testing.T) { func TestRenderHermesConfig_SubAgentConstraints(t *testing.T) { cfg := renderHermesConfig("qwen3.5:9b", "lit-key") for _, must := range []string{ + `timeout: 80`, `lifetime_seconds: 90`, `max_turns: 30`, `reasoning_effort: low`, @@ -274,6 +276,36 @@ func TestRenderHermesConfig_SubAgentConstraints(t *testing.T) { t.Errorf("hermes config missing sub-agent constraint %q\n---\n%s", must, cfg) } } + + // A per-operation timeout larger than the whole session lifetime is + // incoherent: a single tool/command could nominally outlive the session + // the Cloudflare free tunnel caps at 100s. Parse both out of the rendered + // config and assert timeout <= lifetime_seconds. + timeout := parseTerminalInt(t, cfg, "timeout") + lifetime := parseTerminalInt(t, cfg, "lifetime_seconds") + if timeout > lifetime { + t.Errorf("terminal.timeout (%d) must be <= lifetime_seconds (%d)\n---\n%s", timeout, lifetime, cfg) + } +} + +// parseTerminalInt extracts the integer value of a `key: ` line from the +// rendered Hermes config. Fails the test if the key is absent or unparsable. +func parseTerminalInt(t *testing.T, cfg, key string) int { + t.Helper() + for _, line := range strings.Split(cfg, "\n") { + trimmed := strings.TrimSpace(line) + if !strings.HasPrefix(trimmed, key+":") { + continue + } + val := strings.TrimSpace(strings.TrimPrefix(trimmed, key+":")) + n, err := strconv.Atoi(val) + if err != nil { + t.Fatalf("parsing %q value %q: %v", key, val, err) + } + return n + } + t.Fatalf("config missing %q line\n---\n%s", key, cfg) + return 0 } func TestGenerateAPIKey_HexAndUnique(t *testing.T) { diff --git a/plans/sell-agent-perf.md b/plans/sell-agent-perf.md index cf2dbc19..4442c9c4 100644 --- a/plans/sell-agent-perf.md +++ b/plans/sell-agent-perf.md @@ -115,12 +115,23 @@ All flags opt-in; existing callers untouched. ~50 LOC, stdlib only. | Knob | Old | New | |------|-----|-----| | `terminal.lifetime_seconds` | 300 | **90** | +| `terminal.timeout` | 180 | **80** (measured; see Post-review hardening #3) | | `agent.max_turns` | 90 (default) | **30** | | `agent.reasoning_effort` | medium (default) | **low** | | `agent.disabled_toolsets` | — | `[memory, web]` | -| SOUL.md size | ~1050 tok | **~500 tok** | +| SOUL.md template | 2128 bytes | **1460 bytes** (measured; rendered ~1480 bytes, ~370 tok) | | `addresses` SKILL.md | ~7k tok | **~1k tok** (rest in `references/`) | +The earlier "~1050 -> ~500 tok" SOUL estimate was optimistic. The +**measured** result is a template shrink from 2128 -> 1460 bytes; rendered +SOUL.md (after objective interpolation) is ~1480 bytes, ~370 tokens at +4 chars/token — a real reduction, just not as large as guessed. + +The addresses skill split preserved **all 177 unique contract addresses +exactly** across 8 `references/` files (verified by diff against the +pre-split SKILL.md). No address was dropped, renamed, or mistyped in the +move — only the prompt-load footprint changed. + ## Files touched - `internal/serviceoffercontroller/agent_render.go` — extend `renderHermesConfig` @@ -144,3 +155,29 @@ All flags opt-in; existing callers untouched. ~50 LOC, stdlib only. - Confirm marker file present in the agent PVC: `ls -la /agent-quant/hermes-data/.hermes/.no-bundled-skills` - Eyeball Hermes pod logs for "skipping bundled skills" on startup. + +## Post-review hardening (PR #582 follow-up) + +Three follow-ups landed on this branch after review. None reopen the +decisions above; they tighten the contract and add coverage. + +- **#2 — `SeedHostFiles` marker contract + regression test.** + `SeedHostFiles` now carries an explicit "sub-agents only" doc contract: + the `.no-bundled-skills` marker is written **only** by `SeedHostFiles`, + never by the reusable seed primitives (`WriteSoul`, + `embed.WriteSkillSubset`) that a master or objective-only path could + route through. A regression test locks this in, so a future refactor that + pushes the marker write down into a shared primitive — the way the master + (which seeds via its own `internal/hermes` path) could accidentally + inherit it — fails first. +- **#3 — `terminal.timeout` lowered 180 -> 80.** It now stays + `<= terminal.lifetime_seconds` (90), so a single operation can no longer + outlive the session. (Previously a 180s per-op timeout could exceed the + 90s session lifetime.) Reflected in the Numbers table above. +- **#1 — integration test for marker + config honoring.** A new + `//go:build integration` test deploys a sub-agent and verifies the Hermes + image actually honors the new state: the `.no-bundled-skills` marker on + the PVC, the new `agent.*` / `lifetime_seconds` keys in the rendered + ConfigMap, and a behavioral skip-bundled-skills signal from the running + pod. It skips gracefully without a cluster (consistent with the rest of + the integration suite).