From 09051881fa767867d06287aa27073b3c7c45ef0a Mon Sep 17 00:00:00 2001 From: HananNouman Date: Fri, 22 May 2026 22:53:45 +0300 Subject: [PATCH 1/2] fix(hermes): host-side chown for CRD child agent PVCs on Linux k3d PR #481 only repaired hermes- volumes after hermes.Sync (master agent). Child agents live under agent- and are provisioned by the controller or agent-factory without that path, so hermes-data stayed 1000:1000 while Hermes runs as 10000:10000 and crash-looped on Permission denied under /data/.hermes. Extend EnsureHermesDataPVCOwnership to agent-/hermes-data, call it from obol agent new and obol sell demo quant, and add obol agent repair-perms for factory-only creates that cannot docker-exec the k3d node from in-cluster. Co-authored-by: Cursor --- cmd/obol/agent.go | 27 ++++++++++ cmd/obol/agent_crd.go | 4 ++ cmd/obol/sell_agent.go | 1 + internal/agentcrd/agent.go | 11 +++- internal/agentcrd/agent_test.go | 13 +++++ internal/embed/skills/agent-factory/SKILL.md | 13 +++++ internal/hermes/hermes.go | 56 +++++++++++--------- plans/crd-agent-hermes-pvc-chown.md | 44 +++++++++++++++ 8 files changed, 141 insertions(+), 28 deletions(-) create mode 100644 plans/crd-agent-hermes-pvc-chown.md diff --git a/cmd/obol/agent.go b/cmd/obol/agent.go index f3062f66..748de8ab 100644 --- a/cmd/obol/agent.go +++ b/cmd/obol/agent.go @@ -200,6 +200,33 @@ Hermes/OpenClaw onboard flow used by the master agent.`, return listAgentInstances(cfg, cmd.String("runtime"), getUI(cmd)) }, }, + { + Name: "repair-perms", + Usage: "Fix Hermes data PVC ownership for a CRD child agent (Linux k3d)", + ArgsUsage: "", + Description: `Host-side chown for agent-/hermes-data so the child Hermes pod can write under /data. + +Use when a factory-spawned or CLI-created child agent crash-loops with +Permission denied under /data/.hermes on Linux k3d (KubeletInUserNamespace). + +This is a no-op on backends where in-pod init chown already works.`, + Action: func(ctx context.Context, cmd *cli.Command) error { + if cmd.NArg() != 1 { + return fmt.Errorf("agent name required: obol agent repair-perms ") + } + name := strings.TrimSpace(cmd.Args().First()) + if err := agentcrd.ValidateName(name); err != nil { + return err + } + if err := kubectl.EnsureCluster(cfg); err != nil { + return fmt.Errorf("Obol Stack is not running. Start it with `obol stack up` first") + } + u := getUI(cmd) + hermes.EnsureCRDAgentHermesPVCOwnership(cfg, name, u) + u.Successf("Repaired Hermes data volume ownership for agent %s", name) + return nil + }, + }, { Name: "delete", Usage: "Remove an agent instance and its cluster resources", diff --git a/cmd/obol/agent_crd.go b/cmd/obol/agent_crd.go index 7892982e..4b55c8ba 100644 --- a/cmd/obol/agent_crd.go +++ b/cmd/obol/agent_crd.go @@ -6,6 +6,7 @@ import ( "github.com/ObolNetwork/obol-stack/internal/agentcrd" "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/hermes" "github.com/ObolNetwork/obol-stack/internal/kubectl" "github.com/ObolNetwork/obol-stack/internal/ui" "github.com/urfave/cli/v3" @@ -132,6 +133,9 @@ func createCRDAgent(cfg *config.Config, u *ui.UI, opts createCRDAgentOptions) er } u.Successf("Agent %s/%s %s", agentcrd.Namespace(opts.Name), opts.Name, action) u.Infof("Reconciler will provision: namespace → %s deployment → status updates", "hermes") + // Host-side chown for agent-/hermes-data (#475). Master-agent Sync + // already does this for hermes-obol-agent; CRD children skip helm Sync. + hermes.EnsureCRDAgentHermesPVCOwnership(cfg, opts.Name, u) u.Infof("Inspect: kubectl get agent %s -n %s -o yaml", opts.Name, agentcrd.Namespace(opts.Name)) return nil } diff --git a/cmd/obol/sell_agent.go b/cmd/obol/sell_agent.go index 80b77315..24d3b041 100644 --- a/cmd/obol/sell_agent.go +++ b/cmd/obol/sell_agent.go @@ -348,6 +348,7 @@ func runAgentBackedDemo( } u.Successf("Agent %s/%s created (skills: %s)", agentcrd.Namespace(agentName), agentName, strings.Join(spec.Agent.Skills, ", ")) + hermes.EnsureCRDAgentHermesPVCOwnership(cfg, agentName, u) } // 2. Build and apply the agent-typed ServiceOffer. diff --git a/internal/agentcrd/agent.go b/internal/agentcrd/agent.go index 02ca080e..ddbd1f00 100644 --- a/internal/agentcrd/agent.go +++ b/internal/agentcrd/agent.go @@ -31,12 +31,19 @@ func Namespace(name string) string { return "agent-" + name } +// HermesDataPVCHostPath is the host-side directory backing the child agent's +// hermes-data PVC (/agent-/hermes-data). local-path-provisioner +// maps the PVC here; HostHomePath writes under .../hermes-data/.hermes. +func HermesDataPVCHostPath(cfg *config.Config, name string) string { + desc := agentruntime.Describe(agentruntime.Hermes) + return filepath.Join(cfg.DataDir, Namespace(name), desc.DataPVCName) +} + // HostHomePath is where the agent's .hermes data lives on the host. The // cluster mounts this into the Hermes pod via hostPath; writing // SOUL.md/skills here puts them inside the pod automatically. func HostHomePath(cfg *config.Config, name string) string { - desc := agentruntime.Describe(agentruntime.Hermes) - return filepath.Join(cfg.DataDir, Namespace(name), desc.DataPVCName, desc.HomeDir) + return filepath.Join(HermesDataPVCHostPath(cfg, name), agentruntime.Describe(agentruntime.Hermes).HomeDir) } // HostSkillsPath is the per-agent skills dir. OBOL_SKILLS_DIR points here diff --git a/internal/agentcrd/agent_test.go b/internal/agentcrd/agent_test.go index a2aa5973..636a563a 100644 --- a/internal/agentcrd/agent_test.go +++ b/internal/agentcrd/agent_test.go @@ -114,6 +114,19 @@ func TestBuildAgent_Populated(t *testing.T) { } } +func TestHermesDataPVCHostPath(t *testing.T) { + cfg := &config.Config{DataDir: "/data/obol"} + got := HermesDataPVCHostPath(cfg, "quant-rc4") + want := "/data/obol/agent-quant-rc4/hermes-data" + if got != want { + t.Fatalf("HermesDataPVCHostPath = %q, want %q", got, want) + } + home := HostHomePath(cfg, "quant-rc4") + if home != want+"/.hermes" { + t.Fatalf("HostHomePath = %q, want %q/.hermes", home, want) + } +} + func TestSeedHostFiles_FreshAgent(t *testing.T) { dir := t.TempDir() cfg := &config.Config{DataDir: dir} diff --git a/internal/embed/skills/agent-factory/SKILL.md b/internal/embed/skills/agent-factory/SKILL.md index 1808e70e..4d6d4335 100644 --- a/internal/embed/skills/agent-factory/SKILL.md +++ b/internal/embed/skills/agent-factory/SKILL.md @@ -40,3 +40,16 @@ python3 scripts/factory.py create medical-advisor \ - The profile seed Secret is named `hermes-profile-seed` and contains `profile.tar.gz`. - Runtime environment overrides go in the optional `hermes-env` Secret. - The factory intentionally writes deterministic resource names only. + +### Linux k3d: fix Hermes PVC permissions after create + +On Linux k3d (`KubeletInUserNamespace`), the child Hermes pod may crash-loop with +`Permission denied` under `/data/.hermes` until the host-side data directory is +chowned. The factory runs in-cluster and cannot do that itself — run from the host: + +```bash +obol agent repair-perms +``` + +`obol agent new` and `obol sell demo quant` run this automatically; factory-only +creates need the one-liner above (same workaround as rc1 manual `docker exec` chown). diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index 59e8c66a..d64039b7 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -12,6 +12,7 @@ import ( "path/filepath" "strings" + "github.com/ObolNetwork/obol-stack/internal/agentcrd" "github.com/ObolNetwork/obol-stack/internal/agentruntime" "github.com/ObolNetwork/obol-stack/internal/config" "github.com/ObolNetwork/obol-stack/internal/dns" @@ -1306,10 +1307,12 @@ func fixRuntimeVolumeOwnership(cfg *config.Config, hostPath string, u *ui.UI) { owner := fmt.Sprintf("%d:%d", containerUID, containerGID) switch backendName { case "k3d": - if err := k3dNodeExec(cfg, hostPath, "chown -R "+owner+" {}"); err != nil && u != nil { + shellCmd := fmt.Sprintf("mkdir -p {} && chown -R %s {}", owner) + if err := k3dNodeExec(cfg, hostPath, shellCmd); err != nil && u != nil { u.Warnf("Failed to fix volume ownership for %s: %v", hostPath, err) } default: + _ = os.MkdirAll(hostPath, 0o755) _ = os.Chown(hostPath, containerUID, containerGID) } } @@ -1333,11 +1336,10 @@ func hermesPVCPaths(cfg *config.Config, id string) []string { } } -// ensureHermesPVCOwnership host-side chowns the Hermes PVC backing directories -// to containerUID:containerGID so the agent's init containers can write under -// /data on the first start. +// EnsureHermesDataPVCOwnership host-side chowns a Hermes data PVC backing dir +// to containerUID:containerGID so the agent can write under /data on first start. // -// Why this is needed (issue #475): +// Why this is needed (issue #475, extended to agent-* namespaces in #481 follow-up): // - The embedded k3d config (internal/embed/k3d-config.yaml) sets // KubeletInUserNamespace=true. Pod "root" maps to a host subuid that // lacks chown authority over the host bind-mount path provisioned by @@ -1352,38 +1354,28 @@ func hermesPVCPaths(cfg *config.Config, id string) []string { // k3d server container runs at the host Docker daemon's authority, which is // real root and is not subject to the kubelet's user-namespacing. // -// Best-effort. Waits up to 60s for each PVC to be Bound (local-path uses -// WaitForFirstConsumer, so the host dir doesn't exist until the consuming +// Best-effort. Waits up to 60s for the PVC to be Bound (local-path uses +// WaitForFirstConsumer, so the host dir may not exist until the consuming // pod is scheduled). On non-k3d backends fixRuntimeVolumeOwnership falls // back to a plain os.Chown. // // If a Hermes pod is currently stuck in Init:CrashLoopBackOff because of the // pre-fix permissions, deletes it so kubelet re-creates with the corrected // perms immediately rather than after exponential backoff (up to ~5 min). -// Skips the delete when no pod is stuck so repeated `Sync` calls -// (e.g. `obol model sync` after `obol model prefer`) do not gratuitously +// Skips the delete when no pod is stuck so repeated calls do not gratuitously // restart a healthy agent. -func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) { - namespace := agentruntime.Namespace(agentruntime.Hermes, id) +func EnsureHermesDataPVCOwnership(cfg *config.Config, namespace, hostPath string, u *ui.UI) { kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") kubectlBin := filepath.Join(cfg.BinDir, "kubectl") - // Wait only for the PVCs hermesPVCPaths chowns. remote-signer-keystores - // is intentionally NOT in this loop — see the doc comment on - // hermesPVCPaths for why. - for _, pvc := range []string{ - agentruntime.Describe(agentruntime.Hermes).DataPVCName, - } { - waitCmd := exec.Command(kubectlBin, - "wait", "--for=jsonpath={.status.phase}=Bound", - "--timeout=60s", "pvc/"+pvc, "-n", namespace) - waitCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) - _ = waitCmd.Run() // best-effort; continue even on timeout - } + pvcName := agentruntime.Describe(agentruntime.Hermes).DataPVCName + waitCmd := exec.Command(kubectlBin, + "wait", "--for=jsonpath={.status.phase}=Bound", + "--timeout=60s", "pvc/"+pvcName, "-n", namespace) + waitCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + _ = waitCmd.Run() // best-effort; continue even on timeout - for _, p := range hermesPVCPaths(cfg, id) { - fixRuntimeVolumeOwnership(cfg, p, u) - } + fixRuntimeVolumeOwnership(cfg, hostPath, u) if hermesInitStuck(cfg, namespace) { deleteCmd := exec.Command(kubectlBin, @@ -1397,6 +1389,18 @@ func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) { } } +// EnsureCRDAgentHermesPVCOwnership applies EnsureHermesDataPVCOwnership for +// serviceoffer-controller child agents (namespace agent-). Call after +// `obol agent new`, `obol sell demo quant`, or when repairing factory-spawned +// children that crash-loop on Linux k3d. +func EnsureCRDAgentHermesPVCOwnership(cfg *config.Config, agentName string, u *ui.UI) { + EnsureHermesDataPVCOwnership(cfg, agentcrd.Namespace(agentName), agentcrd.HermesDataPVCHostPath(cfg, agentName), u) +} + +func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) { + EnsureHermesDataPVCOwnership(cfg, agentruntime.Namespace(agentruntime.Hermes, id), hermesPVCPaths(cfg, id)[0], u) +} + // hermesInitStuck reports whether at least one Hermes pod has an init // container in CrashLoopBackOff or an Error waiting state — the signature of // the perm-denied symptom this fix targets. Returns false on any kubectl diff --git a/plans/crd-agent-hermes-pvc-chown.md b/plans/crd-agent-hermes-pvc-chown.md new file mode 100644 index 00000000..0964ad2d --- /dev/null +++ b/plans/crd-agent-hermes-pvc-chown.md @@ -0,0 +1,44 @@ +# CRD child agent Hermes PVC ownership (Linux k3d) + +## Problem + +PR #481 (`ensureHermesPVCOwnership`) runs only after `hermes.Sync` for the **master** +agent (`hermes-obol-agent`). Child agents provisioned by the serviceoffer-controller +use namespace `agent-` and never go through helm Sync. + +On Linux k3d with `KubeletInUserNamespace`, local-path-provisioner creates +`/agent-/hermes-data` owned `1000:1000`. Hermes runs as `10000:10000` +and cannot write `/data/.hermes/home` → Init crash-loop. + +Factory-spawned children (`agent-factory` skill, in-cluster API) hit the same gap: +they never invoke host-side `obol` seed + chown. + +## Fix (branch `fix/crd-agent-hermes-pvc-chown`) + +| Call site | When | +|-----------|------| +| `hermes.EnsureCRDAgentHermesPVCOwnership` | Shared: wait PVC Bound → `mkdir -p` + `chown 10000:10000` via k3d `docker exec` | +| `obol agent new` | After Agent CR apply | +| `obol sell demo quant` | After new Agent CR apply | +| `obol agent repair-perms ` | Manual repair after `agent-factory` create | + +Master agent path unchanged: `ensureHermesPVCOwnership` → `EnsureHermesDataPVCOwnership` for `hermes-`. + +## Operator workaround (pre-fix / factory-only) + +```bash +obol agent repair-perms +# or legacy rc1 style: +docker exec k3d-obol-stack--server-0 \ + chown -R 10000:10000 /data/agent-/hermes-data +``` + +## Verify + +```bash +kubectl get pods -n agent- +# hermes should reach Running, not Init:CrashLoopBackOff + +obol sell status -n agent- +# Ready=True once Hermes is healthy +``` From cc1325afce8ad9f2eb6fcdaf2065891134646330 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 20:59:34 +0400 Subject: [PATCH 2/2] Use fsGroup for Hermes PVC ownership --- cmd/obol/agent.go | 27 ---- cmd/obol/agent_crd.go | 4 - cmd/obol/sell_agent.go | 1 - internal/agentcrd/agent.go | 13 +- internal/agentcrd/agent_test.go | 13 -- internal/embed/skills/agent-factory/SKILL.md | 13 -- internal/hermes/hermes.go | 132 ++++-------------- internal/hermes/hermes_test.go | 52 +------ .../serviceoffercontroller/agent_render.go | 7 +- .../agent_render_test.go | 37 +++++ plans/crd-agent-hermes-pvc-chown.md | 44 ------ 11 files changed, 76 insertions(+), 267 deletions(-) delete mode 100644 plans/crd-agent-hermes-pvc-chown.md diff --git a/cmd/obol/agent.go b/cmd/obol/agent.go index 748de8ab..f3062f66 100644 --- a/cmd/obol/agent.go +++ b/cmd/obol/agent.go @@ -200,33 +200,6 @@ Hermes/OpenClaw onboard flow used by the master agent.`, return listAgentInstances(cfg, cmd.String("runtime"), getUI(cmd)) }, }, - { - Name: "repair-perms", - Usage: "Fix Hermes data PVC ownership for a CRD child agent (Linux k3d)", - ArgsUsage: "", - Description: `Host-side chown for agent-/hermes-data so the child Hermes pod can write under /data. - -Use when a factory-spawned or CLI-created child agent crash-loops with -Permission denied under /data/.hermes on Linux k3d (KubeletInUserNamespace). - -This is a no-op on backends where in-pod init chown already works.`, - Action: func(ctx context.Context, cmd *cli.Command) error { - if cmd.NArg() != 1 { - return fmt.Errorf("agent name required: obol agent repair-perms ") - } - name := strings.TrimSpace(cmd.Args().First()) - if err := agentcrd.ValidateName(name); err != nil { - return err - } - if err := kubectl.EnsureCluster(cfg); err != nil { - return fmt.Errorf("Obol Stack is not running. Start it with `obol stack up` first") - } - u := getUI(cmd) - hermes.EnsureCRDAgentHermesPVCOwnership(cfg, name, u) - u.Successf("Repaired Hermes data volume ownership for agent %s", name) - return nil - }, - }, { Name: "delete", Usage: "Remove an agent instance and its cluster resources", diff --git a/cmd/obol/agent_crd.go b/cmd/obol/agent_crd.go index 4b55c8ba..7892982e 100644 --- a/cmd/obol/agent_crd.go +++ b/cmd/obol/agent_crd.go @@ -6,7 +6,6 @@ import ( "github.com/ObolNetwork/obol-stack/internal/agentcrd" "github.com/ObolNetwork/obol-stack/internal/config" - "github.com/ObolNetwork/obol-stack/internal/hermes" "github.com/ObolNetwork/obol-stack/internal/kubectl" "github.com/ObolNetwork/obol-stack/internal/ui" "github.com/urfave/cli/v3" @@ -133,9 +132,6 @@ func createCRDAgent(cfg *config.Config, u *ui.UI, opts createCRDAgentOptions) er } u.Successf("Agent %s/%s %s", agentcrd.Namespace(opts.Name), opts.Name, action) u.Infof("Reconciler will provision: namespace → %s deployment → status updates", "hermes") - // Host-side chown for agent-/hermes-data (#475). Master-agent Sync - // already does this for hermes-obol-agent; CRD children skip helm Sync. - hermes.EnsureCRDAgentHermesPVCOwnership(cfg, opts.Name, u) u.Infof("Inspect: kubectl get agent %s -n %s -o yaml", opts.Name, agentcrd.Namespace(opts.Name)) return nil } diff --git a/cmd/obol/sell_agent.go b/cmd/obol/sell_agent.go index 24d3b041..80b77315 100644 --- a/cmd/obol/sell_agent.go +++ b/cmd/obol/sell_agent.go @@ -348,7 +348,6 @@ func runAgentBackedDemo( } u.Successf("Agent %s/%s created (skills: %s)", agentcrd.Namespace(agentName), agentName, strings.Join(spec.Agent.Skills, ", ")) - hermes.EnsureCRDAgentHermesPVCOwnership(cfg, agentName, u) } // 2. Build and apply the agent-typed ServiceOffer. diff --git a/internal/agentcrd/agent.go b/internal/agentcrd/agent.go index ddbd1f00..c9507926 100644 --- a/internal/agentcrd/agent.go +++ b/internal/agentcrd/agent.go @@ -31,19 +31,12 @@ func Namespace(name string) string { return "agent-" + name } -// HermesDataPVCHostPath is the host-side directory backing the child agent's -// hermes-data PVC (/agent-/hermes-data). local-path-provisioner -// maps the PVC here; HostHomePath writes under .../hermes-data/.hermes. -func HermesDataPVCHostPath(cfg *config.Config, name string) string { - desc := agentruntime.Describe(agentruntime.Hermes) - return filepath.Join(cfg.DataDir, Namespace(name), desc.DataPVCName) -} - // HostHomePath is where the agent's .hermes data lives on the host. The -// cluster mounts this into the Hermes pod via hostPath; writing +// cluster mounts this into the Hermes pod via the data PVC; writing // SOUL.md/skills here puts them inside the pod automatically. func HostHomePath(cfg *config.Config, name string) string { - return filepath.Join(HermesDataPVCHostPath(cfg, name), agentruntime.Describe(agentruntime.Hermes).HomeDir) + desc := agentruntime.Describe(agentruntime.Hermes) + return filepath.Join(cfg.DataDir, Namespace(name), desc.DataPVCName, desc.HomeDir) } // HostSkillsPath is the per-agent skills dir. OBOL_SKILLS_DIR points here diff --git a/internal/agentcrd/agent_test.go b/internal/agentcrd/agent_test.go index 636a563a..a2aa5973 100644 --- a/internal/agentcrd/agent_test.go +++ b/internal/agentcrd/agent_test.go @@ -114,19 +114,6 @@ func TestBuildAgent_Populated(t *testing.T) { } } -func TestHermesDataPVCHostPath(t *testing.T) { - cfg := &config.Config{DataDir: "/data/obol"} - got := HermesDataPVCHostPath(cfg, "quant-rc4") - want := "/data/obol/agent-quant-rc4/hermes-data" - if got != want { - t.Fatalf("HermesDataPVCHostPath = %q, want %q", got, want) - } - home := HostHomePath(cfg, "quant-rc4") - if home != want+"/.hermes" { - t.Fatalf("HostHomePath = %q, want %q/.hermes", home, want) - } -} - func TestSeedHostFiles_FreshAgent(t *testing.T) { dir := t.TempDir() cfg := &config.Config{DataDir: dir} diff --git a/internal/embed/skills/agent-factory/SKILL.md b/internal/embed/skills/agent-factory/SKILL.md index 4d6d4335..1808e70e 100644 --- a/internal/embed/skills/agent-factory/SKILL.md +++ b/internal/embed/skills/agent-factory/SKILL.md @@ -40,16 +40,3 @@ python3 scripts/factory.py create medical-advisor \ - The profile seed Secret is named `hermes-profile-seed` and contains `profile.tar.gz`. - Runtime environment overrides go in the optional `hermes-env` Secret. - The factory intentionally writes deterministic resource names only. - -### Linux k3d: fix Hermes PVC permissions after create - -On Linux k3d (`KubeletInUserNamespace`), the child Hermes pod may crash-loop with -`Permission denied` under `/data/.hermes` until the host-side data directory is -chowned. The factory runs in-cluster and cannot do that itself — run from the host: - -```bash -obol agent repair-perms -``` - -`obol agent new` and `obol sell demo quant` run this automatically; factory-only -creates need the one-liner above (same workaround as rc1 manual `docker exec` chown). diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index d64039b7..5774330b 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -12,7 +12,6 @@ import ( "path/filepath" "strings" - "github.com/ObolNetwork/obol-stack/internal/agentcrd" "github.com/ObolNetwork/obol-stack/internal/agentruntime" "github.com/ObolNetwork/obol-stack/internal/config" "github.com/ObolNetwork/obol-stack/internal/dns" @@ -246,10 +245,7 @@ func Sync(cfg *config.Config, id string, u *ui.UI) error { return fmt.Errorf("helmfile sync failed: %w", err) } - // Host-side chown the PVC backing dirs to the in-pod UID/GID, bypassing - // the user-namespacing that defeats the in-pod `init-hermes-perms` - // chown from #446 (see ensureHermesPVCOwnership doc comment for details). - ensureHermesPVCOwnership(cfg, id, u) + fixHermesDataPVCK3dFallback(cfg, id, u) // Publish wallet-metadata ConfigMap for the frontend (namespace now exists). applyWalletMetadataConfigMap(cfg, id, deploymentDir) @@ -776,20 +772,8 @@ func generateValues(namespace, hostname, dashboardHostname, agentBaseURL, token, runAsUser: %d runAsGroup: %d fsGroup: %d + fsGroupChangePolicy: OnRootMismatch initContainers: - - name: init-hermes-perms - image: %s - imagePullPolicy: IfNotPresent - securityContext: - runAsUser: 0 - runAsGroup: 0 - command: - - sh - - -c - - chown -R %d:%d /data - volumeMounts: - - name: data - mountPath: /data - name: init-hermes-data image: %s imagePullPolicy: IfNotPresent @@ -863,7 +847,7 @@ func generateValues(namespace, hostname, dashboardHostname, agentBaseURL, token, value: %s - name: OBOL_SKILLS_DIR value: /data/.hermes/%s - `, desc.DataPVCName, namespace, desc.ServiceName, desc.ServiceName, namespace, desc.ServiceName, desc.ServiceName, desc.ServiceName, desc.ServiceName, containerUID, containerGID, containerGID, quoteYAML(image()), containerUID, containerGID, quoteYAML(image()), desc.ServiceName, quoteYAML(image()), quoteYAML(hermesBinary), desc.DefaultPort, desc.DefaultPort, quoteYAML(primary), quoteYAML(namespace), obolSkillsDirName) + `, desc.DataPVCName, namespace, desc.ServiceName, desc.ServiceName, namespace, desc.ServiceName, desc.ServiceName, desc.ServiceName, desc.ServiceName, containerUID, containerGID, containerGID, quoteYAML(image()), desc.ServiceName, quoteYAML(image()), quoteYAML(hermesBinary), desc.DefaultPort, desc.DefaultPort, quoteYAML(primary), quoteYAML(namespace), obolSkillsDirName) if agentBaseURL != "" { fmt.Fprintf(&b, " - name: AGENT_BASE_URL\n value: %s\n", quoteYAML(agentBaseURL)) @@ -1023,7 +1007,6 @@ func syncRuntimeFiles(cfg *config.Config, id string, configData []byte, u *ui.UI if err := removeLegacyHeartbeat(targetDir); err != nil { return err } - fixRuntimeVolumeOwnership(cfg, targetDir, u) return nil } @@ -1307,105 +1290,46 @@ func fixRuntimeVolumeOwnership(cfg *config.Config, hostPath string, u *ui.UI) { owner := fmt.Sprintf("%d:%d", containerUID, containerGID) switch backendName { case "k3d": - shellCmd := fmt.Sprintf("mkdir -p {} && chown -R %s {}", owner) - if err := k3dNodeExec(cfg, hostPath, shellCmd); err != nil && u != nil { + if err := k3dNodeExec(cfg, hostPath, "chown -R "+owner+" {}"); err != nil && u != nil { u.Warnf("Failed to fix volume ownership for %s: %v", hostPath, err) } default: - _ = os.MkdirAll(hostPath, 0o755) _ = os.Chown(hostPath, containerUID, containerGID) } } -// hermesPVCPaths returns the host-side PVC backing directories owned by the -// Hermes pod and chowned to containerUID:containerGID. -// -// Intentionally limited to PVCs that the Hermes container itself mounts — -// `remote-signer-keystores` is excluded even though it sits in the same -// namespace because the remote-signer pod runs as runAsUser=65532 with -// fsGroup=1000 (obol/remote-signer chart) and forcing its volume to -// 10000:10000 (Hermes' UID) makes the remote-signer crash-loop on -// `failed to load keystores: Permission denied (os error 13)` against -// the read-only /data/keystores mount. The local-path-provisioner default -// of 1000:1000 already matches that pod's fsGroup contract, so leaving -// that volume untouched is the safe behavior. -func hermesPVCPaths(cfg *config.Config, id string) []string { - namespace := agentruntime.Namespace(agentruntime.Hermes, id) - return []string{ - filepath.Join(cfg.DataDir, namespace, agentruntime.Describe(agentruntime.Hermes).DataPVCName), +// fsGroup should own Hermes' data volume. This fallback only repairs legacy +// k3d/userns clusters when the init container is already visibly stuck. +func fixHermesDataPVCK3dFallback(cfg *config.Config, id string, u *ui.UI) { + backendName := "k3d" + if data, err := os.ReadFile(filepath.Join(cfg.ConfigDir, ".stack-backend")); err == nil { + backendName = strings.TrimSpace(string(data)) + } + if backendName != "k3d" { + return } -} - -// EnsureHermesDataPVCOwnership host-side chowns a Hermes data PVC backing dir -// to containerUID:containerGID so the agent can write under /data on first start. -// -// Why this is needed (issue #475, extended to agent-* namespaces in #481 follow-up): -// - The embedded k3d config (internal/embed/k3d-config.yaml) sets -// KubeletInUserNamespace=true. Pod "root" maps to a host subuid that -// lacks chown authority over the host bind-mount path provisioned by -// local-path-provisioner. The in-pod `init-hermes-perms` chown added in -// #446 (commit c066baa) silently no-ops in this configuration. -// - local-path-provisioner's helper-pod sets the dir to 1000:1000 (see -// internal/embed/infrastructure/base/templates/local-path.yaml). Hermes -// runs as 10000:10000, so the next init container fails on -// `mkdir /data/.hermes/home: Permission denied`. -// -// The fix is to chown from outside the user namespace: `docker exec` into the -// k3d server container runs at the host Docker daemon's authority, which is -// real root and is not subject to the kubelet's user-namespacing. -// -// Best-effort. Waits up to 60s for the PVC to be Bound (local-path uses -// WaitForFirstConsumer, so the host dir may not exist until the consuming -// pod is scheduled). On non-k3d backends fixRuntimeVolumeOwnership falls -// back to a plain os.Chown. -// -// If a Hermes pod is currently stuck in Init:CrashLoopBackOff because of the -// pre-fix permissions, deletes it so kubelet re-creates with the corrected -// perms immediately rather than after exponential backoff (up to ~5 min). -// Skips the delete when no pod is stuck so repeated calls do not gratuitously -// restart a healthy agent. -func EnsureHermesDataPVCOwnership(cfg *config.Config, namespace, hostPath string, u *ui.UI) { - kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") - kubectlBin := filepath.Join(cfg.BinDir, "kubectl") - pvcName := agentruntime.Describe(agentruntime.Hermes).DataPVCName - waitCmd := exec.Command(kubectlBin, - "wait", "--for=jsonpath={.status.phase}=Bound", - "--timeout=60s", "pvc/"+pvcName, "-n", namespace) - waitCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) - _ = waitCmd.Run() // best-effort; continue even on timeout + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + if !hermesInitContainerStuck(cfg, namespace) { + return + } + hostPath := filepath.Join(cfg.DataDir, namespace, agentruntime.Describe(agentruntime.Hermes).DataPVCName) fixRuntimeVolumeOwnership(cfg, hostPath, u) - if hermesInitStuck(cfg, namespace) { - deleteCmd := exec.Command(kubectlBin, - "-n", namespace, "delete", "pod", - "-l", "app.kubernetes.io/name=hermes", - "--ignore-not-found", "--wait=false") - deleteCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) - if err := deleteCmd.Run(); err == nil && u != nil { - u.Info("Restarted Hermes pod to apply fresh volume ownership") - } + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + kubectlBin := filepath.Join(cfg.BinDir, "kubectl") + deleteCmd := exec.Command(kubectlBin, + "-n", namespace, "delete", "pod", + "-l", "app.kubernetes.io/name=hermes", + "--ignore-not-found", "--wait=false") + deleteCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + if err := deleteCmd.Run(); err == nil && u != nil { + u.Info("Restarted Hermes pod after best-effort k3d PVC ownership repair") } } -// EnsureCRDAgentHermesPVCOwnership applies EnsureHermesDataPVCOwnership for -// serviceoffer-controller child agents (namespace agent-). Call after -// `obol agent new`, `obol sell demo quant`, or when repairing factory-spawned -// children that crash-loop on Linux k3d. -func EnsureCRDAgentHermesPVCOwnership(cfg *config.Config, agentName string, u *ui.UI) { - EnsureHermesDataPVCOwnership(cfg, agentcrd.Namespace(agentName), agentcrd.HermesDataPVCHostPath(cfg, agentName), u) -} - -func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) { - EnsureHermesDataPVCOwnership(cfg, agentruntime.Namespace(agentruntime.Hermes, id), hermesPVCPaths(cfg, id)[0], u) -} - -// hermesInitStuck reports whether at least one Hermes pod has an init -// container in CrashLoopBackOff or an Error waiting state — the signature of -// the perm-denied symptom this fix targets. Returns false on any kubectl -// failure so that a transient API hiccup does not trigger spurious restarts. -func hermesInitStuck(cfg *config.Config, namespace string) bool { +func hermesInitContainerStuck(cfg *config.Config, namespace string) bool { kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") kubectlBin := filepath.Join(cfg.BinDir, "kubectl") cmd := exec.Command(kubectlBin, diff --git a/internal/hermes/hermes_test.go b/internal/hermes/hermes_test.go index e18499a0..2fa09d1a 100644 --- a/internal/hermes/hermes_test.go +++ b/internal/hermes/hermes_test.go @@ -142,9 +142,8 @@ func TestGenerateValues_UsesHermesNativeNames(t *testing.T) { "/data/.hermes/obol-skills", "containerPort: 8642", "containerPort: 9119", - "init-hermes-perms", + "fsGroupChangePolicy: OnRootMismatch", "init-hermes-data", - "chown -R 10000:10000 /data", `Hermes binary missing from image: /opt/hermes/.venv/bin/hermes`, `Hermes image is missing required extras: web,messaging,mcp,pty,cli,acp,google`, `import fastapi, uvicorn, telegram, mcp, ptyprocess, simple_term_menu, googleapiclient`, @@ -166,9 +165,11 @@ func TestGenerateValues_UsesHermesNativeNames(t *testing.T) { "git clone", "uv pip install", "/data/.hermes/hermes-agent", + "init-hermes-perms", + "chown -R 10000:10000 /data", } { if strings.Contains(values, banned) { - t.Fatalf("generateValues() should not rebuild Hermes inside the PVC, found %q:\n%s", banned, values) + t.Fatalf("generateValues() contains banned fragment %q:\n%s", banned, values) } } @@ -351,48 +352,3 @@ func mkdirInstance(t *testing.T, cfg *config.Config, id string) { t.Fatalf("create Hermes instance %q: %v", id, err) } } - -// TestHermesPVCPaths pins the host-side directories that -// ensureHermesPVCOwnership chowns. Renaming the Hermes data PVC or -// relocating the namespace prefix in agentruntime without updating this -// list would silently regress the #475 fix on Linux k3d, because the chown -// would land on a non-existent path while the real PVC backing dir kept -// its local-path-provisioner default ownership of 1000:1000. -func TestHermesPVCPaths(t *testing.T) { - cfg := testConfig(t) - const id = "obol-agent" - namespace := agentruntime.Namespace(agentruntime.Hermes, id) - - got := hermesPVCPaths(cfg, id) - want := []string{ - filepath.Join(cfg.DataDir, namespace, "hermes-data"), - } - if !reflect.DeepEqual(got, want) { - t.Fatalf("hermesPVCPaths(%q) = %#v; want %#v", id, got, want) - } -} - -// TestHermesPVCPaths_ExcludesRemoteSignerKeystores pins the negative half of -// the contract: the helper MUST NOT include the remote-signer-keystores PVC -// path. The first revision of this fix included it, which chowned the -// volume to 10000:10000 (Hermes' UID) and broke the remote-signer pod — -// remote-signer runs as runAsUser=65532 with fsGroup=1000, expecting the -// local-path-provisioner default ownership of 1000:1000. The result was -// a remote-signer CrashLoopBackOff with -// -// failed to load keystores: Permission denied (os error 13) -// -// against /data/keystores. This guard makes that regression impossible to -// re-introduce by accident. -func TestHermesPVCPaths_ExcludesRemoteSignerKeystores(t *testing.T) { - cfg := testConfig(t) - const id = "obol-agent" - namespace := agentruntime.Namespace(agentruntime.Hermes, id) - - keystoreVolume := filepath.Join(cfg.DataDir, namespace, "remote-signer-keystores") - for _, p := range hermesPVCPaths(cfg, id) { - if p == keystoreVolume { - t.Fatalf("hermesPVCPaths included %q — the remote-signer pod (runAsUser=65532, fsGroup=1000) crash-loops when this volume is chowned to Hermes' containerUID:containerGID", keystoreVolume) - } - } -} diff --git a/internal/serviceoffercontroller/agent_render.go b/internal/serviceoffercontroller/agent_render.go index 0619ed77..e44874cc 100644 --- a/internal/serviceoffercontroller/agent_render.go +++ b/internal/serviceoffercontroller/agent_render.go @@ -316,9 +316,10 @@ func agentPodSpec(agent *monetizeapi.Agent) map[string]any { "serviceAccountName": hermesServiceName, "automountServiceAccountToken": true, "securityContext": map[string]any{ - "runAsUser": int64(hermesContainerUID), - "runAsGroup": int64(hermesContainerGID), - "fsGroup": int64(hermesContainerGID), + "runAsUser": int64(hermesContainerUID), + "runAsGroup": int64(hermesContainerGID), + "fsGroup": int64(hermesContainerGID), + "fsGroupChangePolicy": "OnRootMismatch", }, "initContainers": []any{ buildAgentProfileInitContainer(), diff --git a/internal/serviceoffercontroller/agent_render_test.go b/internal/serviceoffercontroller/agent_render_test.go index 29735b3d..e04e1e32 100644 --- a/internal/serviceoffercontroller/agent_render_test.go +++ b/internal/serviceoffercontroller/agent_render_test.go @@ -137,6 +137,43 @@ func TestAgentManifests_DeploymentEnvCarriesContext(t *testing.T) { } } +func TestAgentManifests_DeploymentUsesFSGroup(t *testing.T) { + agent := &monetizeapi.Agent{} + agent.Name = "quant" + agent.Namespace = "agent-quant" + agent.Spec = monetizeapi.AgentSpec{Model: "qwen3.5:9b"} + + out, err := agentManifests(agent, "litellm", "api") + if err != nil { + t.Fatalf("agentManifests: %v", err) + } + var dep map[string]any + for _, m := range out { + if m.GetKind() == "Deployment" { + dep = m.UnstructuredContent() + break + } + } + if dep == nil { + t.Fatal("Deployment manifest missing") + } + + podSpec := dep["spec"].(map[string]any)["template"].(map[string]any)["spec"].(map[string]any) + securityContext := podSpec["securityContext"].(map[string]any) + if securityContext["runAsUser"] != int64(hermesContainerUID) { + t.Fatalf("runAsUser = %v, want %d", securityContext["runAsUser"], hermesContainerUID) + } + if securityContext["runAsGroup"] != int64(hermesContainerGID) { + t.Fatalf("runAsGroup = %v, want %d", securityContext["runAsGroup"], hermesContainerGID) + } + if securityContext["fsGroup"] != int64(hermesContainerGID) { + t.Fatalf("fsGroup = %v, want %d", securityContext["fsGroup"], hermesContainerGID) + } + if securityContext["fsGroupChangePolicy"] != "OnRootMismatch" { + t.Fatalf("fsGroupChangePolicy = %v, want OnRootMismatch", securityContext["fsGroupChangePolicy"]) + } +} + func TestAgentManifests_ProfileSeedInitContainer(t *testing.T) { agent := &monetizeapi.Agent{} agent.Name = "quant" diff --git a/plans/crd-agent-hermes-pvc-chown.md b/plans/crd-agent-hermes-pvc-chown.md deleted file mode 100644 index 0964ad2d..00000000 --- a/plans/crd-agent-hermes-pvc-chown.md +++ /dev/null @@ -1,44 +0,0 @@ -# CRD child agent Hermes PVC ownership (Linux k3d) - -## Problem - -PR #481 (`ensureHermesPVCOwnership`) runs only after `hermes.Sync` for the **master** -agent (`hermes-obol-agent`). Child agents provisioned by the serviceoffer-controller -use namespace `agent-` and never go through helm Sync. - -On Linux k3d with `KubeletInUserNamespace`, local-path-provisioner creates -`/agent-/hermes-data` owned `1000:1000`. Hermes runs as `10000:10000` -and cannot write `/data/.hermes/home` → Init crash-loop. - -Factory-spawned children (`agent-factory` skill, in-cluster API) hit the same gap: -they never invoke host-side `obol` seed + chown. - -## Fix (branch `fix/crd-agent-hermes-pvc-chown`) - -| Call site | When | -|-----------|------| -| `hermes.EnsureCRDAgentHermesPVCOwnership` | Shared: wait PVC Bound → `mkdir -p` + `chown 10000:10000` via k3d `docker exec` | -| `obol agent new` | After Agent CR apply | -| `obol sell demo quant` | After new Agent CR apply | -| `obol agent repair-perms ` | Manual repair after `agent-factory` create | - -Master agent path unchanged: `ensureHermesPVCOwnership` → `EnsureHermesDataPVCOwnership` for `hermes-`. - -## Operator workaround (pre-fix / factory-only) - -```bash -obol agent repair-perms -# or legacy rc1 style: -docker exec k3d-obol-stack--server-0 \ - chown -R 10000:10000 /data/agent-/hermes-data -``` - -## Verify - -```bash -kubectl get pods -n agent- -# hermes should reach Running, not Init:CrashLoopBackOff - -obol sell status -n agent- -# Ready=True once Hermes is healthy -```