From 64a4a3b0c7ec1943ad265cf6430a176d43f7f3ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Sun, 3 May 2026 21:56:46 +0100 Subject: [PATCH 1/9] Allow hermes messaging --- internal/hermes/hermes.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index 108fdfd2..cd9ff6b0 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -844,15 +844,16 @@ func generateValues(namespace, hostname, dashboardHostname, agentBaseURL, token, fi cd "$install_dir" # Reinstall when the venv is missing the hermes binary OR - # when the dashboard's web extra (fastapi/uvicorn) is absent. - # The upstream image installs ".[all]" (which pulls in - # ".[web]"); we re-create the venv from a fresh clone, so - # the extras must be re-requested explicitly here. + # any selected extra is absent. The upstream image installs + # ".[all]"; we re-create the venv from a fresh clone, so the + # extras must be re-requested explicitly. The import check + # picks one module per extra so existing PVCs trigger a + # rebuild when we add a new extra to the install line. if [ ! -x "$install_dir/venv/bin/hermes" ] || \ - ! "$install_dir/venv/bin/python3" -c "import fastapi, uvicorn" >/dev/null 2>&1; then + ! "$install_dir/venv/bin/python3" -c "import fastapi, uvicorn, telegram, mcp, ptyprocess, simple_term_menu, googleapiclient" >/dev/null 2>&1; then rm -rf "$install_dir/venv" uv venv --python python3 --system-site-packages venv - VIRTUAL_ENV="$install_dir/venv" uv pip install -e ".[web]" + VIRTUAL_ENV="$install_dir/venv" uv pip install -e ".[web,messaging,mcp,pty,cli,acp,google]" fi if [ -f /data/.hermes/state.db ]; then if ! 
python3 - <<'PY' From 2bc578301ccfa2b82b8c929fea94b8c1fe13e5f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Sun, 3 May 2026 22:21:08 +0100 Subject: [PATCH 2/9] fix(store): being served old docker images and not detecting a new one --- .../workflows/docker-publish-storefront.yml | 10 +- .github/workflows/docker-publish-x402.yml | 5 + cmd/obol/sell.go | 3 +- internal/images/images.go | 54 ++++++++++ internal/images/images_test.go | 102 ++++++++++++++++++ internal/tunnel/tunnel.go | 3 +- 6 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 internal/images/images.go create mode 100644 internal/images/images_test.go diff --git a/.github/workflows/docker-publish-storefront.yml b/.github/workflows/docker-publish-storefront.yml index 3b944668..f78721fc 100644 --- a/.github/workflows/docker-publish-storefront.yml +++ b/.github/workflows/docker-publish-storefront.yml @@ -51,7 +51,15 @@ jobs: tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} - type=sha,prefix= + # Long SHA: needed by the security-scan step which references + # ${{ github.sha }} (40-char). Without it Trivy fails with + # MANIFEST_UNKNOWN. Same bug the x402 workflow used to have. + type=sha,format=long,prefix= + # Short SHA: matches the obol binary's version.GitCommit (set via + # ldflags from `git rev-parse --short HEAD`). internal/images.Resolve + # uses it to commit-pin the storefront deployment so binary upgrades + # actually roll the pod. 
+ type=sha,format=short,prefix= type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }} labels: | org.opencontainers.image.title=obol-stack-public-storefront diff --git a/.github/workflows/docker-publish-x402.yml b/.github/workflows/docker-publish-x402.yml index 4733c412..f66e7b46 100644 --- a/.github/workflows/docker-publish-x402.yml +++ b/.github/workflows/docker-publish-x402.yml @@ -91,6 +91,11 @@ jobs: # Previously `type=sha,prefix=` produced the 7-char short SHA, # causing Trivy to fail with MANIFEST_UNKNOWN on every run. type=sha,format=long,prefix= + # Also publish the 7-char short SHA as a separate tag. The obol + # binary embeds version.GitCommit (short SHA) via ldflags and uses + # it through internal/images.Resolve to commit-pin the deployments + # it manages. Without this, binary upgrades wouldn't roll the pods. + type=sha,format=short,prefix= type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/feat/secure-enclave-inference' }} labels: | org.opencontainers.image.title=${{ matrix.component }} diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index f51a57be..17546275 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -26,6 +26,7 @@ import ( "github.com/ObolNetwork/obol-stack/internal/enclave" "github.com/ObolNetwork/obol-stack/internal/erc8004" "github.com/ObolNetwork/obol-stack/internal/hermes" + "github.com/ObolNetwork/obol-stack/internal/images" "github.com/ObolNetwork/obol-stack/internal/inference" "github.com/ObolNetwork/obol-stack/internal/kubectl" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" @@ -1270,7 +1271,7 @@ func buildDemoResources(name string, spec demoSpec, paymentChain string) []map[s "containers": []map[string]any{ { "name": "demo", - "image": "ghcr.io/obolnetwork/demo-server:latest", + "image": images.Resolve("ghcr.io/obolnetwork/demo-server"), "imagePullPolicy": "IfNotPresent", "env": env, "ports": []map[string]any{ diff --git a/internal/images/images.go 
b/internal/images/images.go new file mode 100644 index 00000000..1c596eed --- /dev/null +++ b/internal/images/images.go @@ -0,0 +1,54 @@ +// Package images centralises the policy for selecting Docker image tags in +// embedded Kubernetes manifests. +// +// The problem this solves: when the obol binary is upgraded, the K8s +// Deployments it creates must trigger a rolling update so old pods are +// replaced with ones running the new image. With ":latest" tags the embedded +// manifest is byte-identical across binary versions, so kubectl apply reports +// "unchanged" and stale pods keep serving the old image forever. +// +// The fix: production binaries are built with version.GitCommit injected via +// ldflags, and CI publishes images tagged with the same short commit SHA. +// Resolve uses that SHA so upgrading the binary changes the image:tag in +// every manifest, which is what triggers K8s to roll the pod. +// +// Dev mode (OBOL_DEVELOPMENT=true), unset/unknown GitCommit, and dirty repos +// fall back to ":latest" — that path matches the buildAndImportLocalImages +// flow that imports freshly-built images into k3d as ":latest". +package images + +import ( + "os" + "strings" + + "github.com/ObolNetwork/obol-stack/internal/version" +) + +// Resolve returns the fully-qualified image reference for an image whose +// repository part is `repo` (e.g. "ghcr.io/obolnetwork/demo-server"). +// +// images.Resolve("ghcr.io/obolnetwork/demo-server") +// // → "ghcr.io/obolnetwork/demo-server:abc1234" (production) +// // → "ghcr.io/obolnetwork/demo-server:latest" (dev / unknown commit) +func Resolve(repo string) string { + if useLatest() { + return repo + ":latest" + } + return repo + ":" + strings.TrimSpace(version.GitCommit) +} + +// useLatest reports whether the current binary should reach for the mutable +// :latest tag rather than a commit-pinned tag. 
+func useLatest() bool { + if strings.EqualFold(strings.TrimSpace(os.Getenv("OBOL_DEVELOPMENT")), "true") { + return true + } + commit := strings.TrimSpace(version.GitCommit) + if commit == "" || commit == "unknown" || commit == "dev" { + return true + } + if strings.EqualFold(strings.TrimSpace(version.GitDirty), "true") { + return true + } + return false +} diff --git a/internal/images/images_test.go b/internal/images/images_test.go new file mode 100644 index 00000000..f7629aaa --- /dev/null +++ b/internal/images/images_test.go @@ -0,0 +1,102 @@ +package images + +import ( + "testing" + + "github.com/ObolNetwork/obol-stack/internal/version" +) + +// withVersion temporarily overrides the version package globals for one test. +// Restores them in t.Cleanup so subsequent tests see the package's natural +// state (whatever ldflags or defaults left behind). +func withVersion(t *testing.T, commit, dirty string) { + t.Helper() + prevCommit := version.GitCommit + prevDirty := version.GitDirty + version.GitCommit = commit + version.GitDirty = dirty + t.Cleanup(func() { + version.GitCommit = prevCommit + version.GitDirty = prevDirty + }) +} + +func TestResolve_DevModeForcesLatest(t *testing.T) { + // Even when GitCommit is set to a real SHA, OBOL_DEVELOPMENT=true must win. + // The local-build path imports images into k3d as :latest, so the manifest + // must reference :latest to actually pick up the local image. + withVersion(t, "abc1234", "false") + t.Setenv("OBOL_DEVELOPMENT", "true") + + got := Resolve("ghcr.io/obolnetwork/demo-server") + want := "ghcr.io/obolnetwork/demo-server:latest" + if got != want { + t.Errorf("Resolve = %q, want %q", got, want) + } +} + +func TestResolve_UnknownCommitFallsBackToLatest(t *testing.T) { + // Binaries built without ldflags leave GitCommit at "unknown". There's no + // matching CI image tag, so :latest is the only safe choice. 
+ withVersion(t, "unknown", "false") + t.Setenv("OBOL_DEVELOPMENT", "") + + got := Resolve("ghcr.io/obolnetwork/demo-server") + want := "ghcr.io/obolnetwork/demo-server:latest" + if got != want { + t.Errorf("Resolve = %q, want %q", got, want) + } +} + +func TestResolve_DirtyRepoFallsBackToLatest(t *testing.T) { + // A dirty build has no published image — its commit doesn't match anything + // on GHCR. Use :latest rather than producing a tag that 404s. + withVersion(t, "abc1234", "true") + t.Setenv("OBOL_DEVELOPMENT", "") + + got := Resolve("ghcr.io/obolnetwork/demo-server") + want := "ghcr.io/obolnetwork/demo-server:latest" + if got != want { + t.Errorf("Resolve = %q, want %q", got, want) + } +} + +func TestResolve_ProductionUsesCommitPin(t *testing.T) { + // Released binary: GitCommit is the short SHA, repo is clean, not in dev + // mode. Result must be the commit-pinned tag — this is what makes binary + // upgrades roll the K8s pods automatically. + withVersion(t, "abc1234", "false") + t.Setenv("OBOL_DEVELOPMENT", "") + + got := Resolve("ghcr.io/obolnetwork/demo-server") + want := "ghcr.io/obolnetwork/demo-server:abc1234" + if got != want { + t.Errorf("Resolve = %q, want %q", got, want) + } +} + +func TestResolve_EmptyCommitFallsBackToLatest(t *testing.T) { + withVersion(t, "", "false") + t.Setenv("OBOL_DEVELOPMENT", "") + + got := Resolve("ghcr.io/obolnetwork/storefront") + want := "ghcr.io/obolnetwork/storefront:latest" + if got != want { + t.Errorf("Resolve = %q, want %q", got, want) + } +} + +func TestResolve_DevModeCaseInsensitive(t *testing.T) { + // People sometimes set OBOL_DEVELOPMENT=True or =TRUE. Don't penalise + // them — the env var is binary-true, not a string match. 
+ withVersion(t, "abc1234", "false") + + for _, val := range []string{"true", "TRUE", "True", "tRuE"} { + t.Run(val, func(t *testing.T) { + t.Setenv("OBOL_DEVELOPMENT", val) + if got := Resolve("img"); got != "img:latest" { + t.Errorf("OBOL_DEVELOPMENT=%q: Resolve = %q, want img:latest", val, got) + } + }) + } +} diff --git a/internal/tunnel/tunnel.go b/internal/tunnel/tunnel.go index 76e83b53..ed56caf8 100644 --- a/internal/tunnel/tunnel.go +++ b/internal/tunnel/tunnel.go @@ -15,6 +15,7 @@ import ( "github.com/ObolNetwork/obol-stack/internal/agentruntime" "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/images" "github.com/ObolNetwork/obol-stack/internal/ui" ) @@ -532,7 +533,7 @@ func CreateStorefront(cfg *config.Config, tunnelURL string) error { "containers": []map[string]any{ { "name": "storefront", - "image": "ghcr.io/obolnetwork/obol-stack-public-storefront:latest", + "image": images.Resolve("ghcr.io/obolnetwork/obol-stack-public-storefront"), "imagePullPolicy": "IfNotPresent", "ports": []map[string]any{ {"containerPort": 3000, "name": "http"}, From 543666c64243f97773ac0267c063b25fd9fc7235 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Sun, 3 May 2026 22:54:09 +0100 Subject: [PATCH 3/9] Lost conflicted fixes --- internal/hermes/hermes_test.go | 4 ++-- internal/stack/stack.go | 43 ++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/internal/hermes/hermes_test.go b/internal/hermes/hermes_test.go index 734477a1..4e27c869 100644 --- a/internal/hermes/hermes_test.go +++ b/internal/hermes/hermes_test.go @@ -150,8 +150,8 @@ func TestGenerateValues_UsesHermesNativeNames(t *testing.T) { `Timed out waiting for Hermes install lock`, `git clone --depth 1 "$repo_url" "${install_dir}.tmp"`, "uv venv --python python3 --system-site-packages venv", - `uv pip install -e ".[web]"`, - `import fastapi, uvicorn`, + `uv pip install -e 
".[web,messaging,mcp,pty,cli,acp,google]"`, + `import fastapi, uvicorn, telegram, mcp, ptyprocess, simple_term_menu, googleapiclient`, `PRAGMA quick_check`, `state-db-corrupt-$ts`, `- "/data/.hermes/hermes-agent/venv/bin/hermes"`, diff --git a/internal/stack/stack.go b/internal/stack/stack.go index 90700775..b6704e61 100644 --- a/internal/stack/stack.go +++ b/internal/stack/stack.go @@ -362,6 +362,18 @@ func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir s u.Warnf("Failed to preserve LiteLLM config across Helm sync: %v", err) } + // Release runtime field ownership of litellm-config.data.config.yaml so the + // upcoming helm upgrade can reclaim it without an SSA conflict. Without + // this step, the second `obol stack up` after autoConfigureLLM/restore has + // claimed the field via SSA (manager=helm, op=Apply) fails with + // "conflict with helm using v1: .data.config.yaml" because helm registers + // a separate managedFields entry (manager=helm, op=Update) for the same + // field. The data is already snapshotted in previousLiteLLMConfig and gets + // re-applied by restoreLiteLLMConfig after helm runs. + if err := releaseLiteLLMConfigOwnership(cfg, kubeconfigPath); err != nil { + u.Warnf("Failed to release LiteLLM config field ownership: %v", err) + } + // Compatibility migration if err := migrateDefaultsHTTPRouteHostnames(helmfilePath); err != nil { u.Warnf("Failed to migrate defaults helmfile hostnames: %v", err) @@ -907,6 +919,37 @@ func preserveLiteLLMConfigForHelm(cfg *config.Config, kubeconfigPath string) (st return raw, nil } +// releaseLiteLLMConfigOwnership strips managedFields from the litellm-config +// ConfigMap so the next helm upgrade can claim ownership of every field +// without an SSA conflict. Helm tracks release ownership via the +// meta.helm.sh/release-name annotation, not managedFields, so clearing +// managedFields does not detach the resource from its release. 
+// +// The single empty entry [{}] is the documented apiserver idiom for clearing +// all field-ownership claims on a resource. See: +// https://kubernetes.io/docs/reference/using-api/server-side-apply/#clearing-managedfields +func releaseLiteLLMConfigOwnership(cfg *config.Config, kubeconfigPath string) error { + kubectlBinary := filepath.Join(cfg.BinDir, "kubectl") + + // Skip if the configmap doesn't exist (first install). + if _, err := kubectl.Output(kubectlBinary, kubeconfigPath, + "get", "configmap", "litellm-config", "-n", "llm", "-o", "name"); err != nil { + return nil + } + + cmd := exec.Command(kubectlBinary, + "patch", "configmap", "litellm-config", + "-n", "llm", + "--type=merge", + "--patch", `{"metadata":{"managedFields":[{}]}}`, + ) + cmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("kubectl patch managedFields: %w\n%s", err, string(out)) + } + return nil +} + func restoreLiteLLMConfig(cfg *config.Config, kubeconfigPath, raw string) error { if strings.TrimSpace(raw) == "" { return nil From 2df7f80490e76012f2df948edfbb7a5e9d91ce28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Mon, 4 May 2026 00:19:51 +0100 Subject: [PATCH 4/9] Make wallet backup work for hermes too --- cmd/obol/agent.go | 48 ++-- internal/hermes/wallet_backup.go | 242 +++++++++++++++++ internal/openclaw/wallet_backup.go | 296 ++++----------------- internal/walletbackup/walletbackup.go | 246 +++++++++++++++++ internal/walletbackup/walletbackup_test.go | 99 +++++++ 5 files changed, 666 insertions(+), 265 deletions(-) create mode 100644 internal/hermes/wallet_backup.go create mode 100644 internal/walletbackup/walletbackup.go create mode 100644 internal/walletbackup/walletbackup_test.go diff --git a/cmd/obol/agent.go b/cmd/obol/agent.go index eb687258..123fbfbe 100644 --- a/cmd/obol/agent.go +++ b/cmd/obol/agent.go @@ -227,14 +227,22 @@ func agentWalletCommand(cfg *config.Config) 
*cli.Command { if err != nil { return err } - if target.Runtime != agentruntime.OpenClaw { - return errors.New("Hermes wallet backup needs a Hermes-native product decision; use OpenClaw backup only for OpenClaw instances") + switch target.Runtime { + case agentruntime.Hermes: + return hermes.BackupWalletCmd(cfg, target.ID, hermes.BackupWalletOptions{ + Output: cmd.String("output"), + Passphrase: cmd.String("passphrase"), + HasPassFlag: cmd.IsSet("passphrase"), + }, getUI(cmd)) + case agentruntime.OpenClaw: + return openclaw.BackupWalletCmd(cfg, target.ID, openclaw.BackupWalletOptions{ + Output: cmd.String("output"), + Passphrase: cmd.String("passphrase"), + HasPassFlag: cmd.IsSet("passphrase"), + }, getUI(cmd)) + default: + return fmt.Errorf("unsupported runtime %q", target.Runtime) } - return openclaw.BackupWalletCmd(cfg, target.ID, openclaw.BackupWalletOptions{ - Output: cmd.String("output"), - Passphrase: cmd.String("passphrase"), - HasPassFlag: cmd.IsSet("passphrase"), - }, getUI(cmd)) }, }, { @@ -266,15 +274,25 @@ func agentWalletCommand(cfg *config.Config) *cli.Command { if err != nil { return err } - if target.Runtime != agentruntime.OpenClaw { - return errors.New("Hermes wallet restore needs a Hermes-native product decision; use OpenClaw restore only for OpenClaw instances") + switch target.Runtime { + case agentruntime.Hermes: + return hermes.RestoreWalletCmd(cfg, target.ID, hermes.RestoreWalletOptions{ + Input: cmd.String("input"), + Passphrase: cmd.String("passphrase"), + HasPassFlag: cmd.IsSet("passphrase"), + Force: cmd.Bool("force"), + ApplyCluster: true, + }, getUI(cmd)) + case agentruntime.OpenClaw: + return openclaw.RestoreWalletCmd(cfg, target.ID, openclaw.RestoreWalletOptions{ + Input: cmd.String("input"), + Passphrase: cmd.String("passphrase"), + HasPassFlag: cmd.IsSet("passphrase"), + Force: cmd.Bool("force"), + }, getUI(cmd)) + default: + return fmt.Errorf("unsupported runtime %q", target.Runtime) } - return openclaw.RestoreWalletCmd(cfg, 
target.ID, openclaw.RestoreWalletOptions{ - Input: cmd.String("input"), - Passphrase: cmd.String("passphrase"), - HasPassFlag: cmd.IsSet("passphrase"), - Force: cmd.Bool("force"), - }, getUI(cmd)) }, }, }, diff --git a/internal/hermes/wallet_backup.go b/internal/hermes/wallet_backup.go new file mode 100644 index 00000000..99ad3f0a --- /dev/null +++ b/internal/hermes/wallet_backup.go @@ -0,0 +1,242 @@ +package hermes + +import ( + "bytes" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + + "github.com/ObolNetwork/obol-stack/internal/agentruntime" + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/kubectl" + "github.com/ObolNetwork/obol-stack/internal/ui" + "github.com/ObolNetwork/obol-stack/internal/walletbackup" +) + +// BackupWalletOptions holds options for `obol agent wallet backup`. +type BackupWalletOptions struct { + Output string + Passphrase string + HasPassFlag bool +} + +// RestoreWalletOptions holds options for `obol agent wallet restore`. +type RestoreWalletOptions struct { + Input string + Passphrase string + HasPassFlag bool + Force bool + ApplyCluster bool +} + +// BackupWalletCmd creates a backup of the Hermes instance's remote-signer +// wallet. The on-disk format is identical to OpenClaw's, so a Hermes backup +// can be restored into an OpenClaw instance and vice versa — instance +// names and namespace scoping are not part of the backup payload. 
+func BackupWalletCmd(cfg *config.Config, id string, opts BackupWalletOptions, u *ui.UI) error { + deployDir := DeploymentPath(cfg, id) + + wallet, err := ReadWalletMetadata(deployDir) + if err != nil { + return fmt.Errorf("no wallet found for instance %q: %w", id, err) + } + + keystorePath := filepath.Join(agentruntime.KeystoreVolumePath(cfg, agentruntime.Hermes, id), wallet.KeystoreUUID+".json") + keystoreData, err := os.ReadFile(keystorePath) + if err != nil { + return fmt.Errorf("failed to read keystore file: %w", err) + } + + password, err := walletbackup.ReadKeystorePassword(deployDir) + if err != nil { + return fmt.Errorf("failed to read keystore password: %w", err) + } + + backup := &walletbackup.File{ + Version: walletbackup.Version, + Instance: id, + Wallets: []walletbackup.Wallet{{ + Address: wallet.Address, + PublicKey: wallet.PublicKey, + KeystoreUUID: wallet.KeystoreUUID, + CreatedAt: wallet.CreatedAt, + Keystore: json.RawMessage(keystoreData), + KeystorePassword: password, + }}, + } + + passphrase, err := walletbackup.PromptPassphrase(opts.Passphrase, opts.HasPassFlag, u) + if err != nil { + return err + } + + payload, encrypted, err := walletbackup.Encode(backup, passphrase) + if err != nil { + return err + } + + addrSuffix := wallet.Address + if len(addrSuffix) > 8 { + addrSuffix = addrSuffix[len(addrSuffix)-8:] + } + outputPath := opts.Output + if outputPath == "" { + ext := "json" + if encrypted { + ext = "enc" + } + outputPath = fmt.Sprintf("obol-wallet-backup-%s.%s", addrSuffix, ext) + } + + if err := os.WriteFile(outputPath, payload, 0o600); err != nil { + return fmt.Errorf("failed to write backup: %w", err) + } + + u.Success("Wallet backup created") + u.Detail("Address", wallet.Address) + u.Detail("Output", outputPath) + if encrypted { + u.Detail("Encrypted", "yes (AES-256-GCM)") + } else { + u.Detail("Encrypted", "no") + u.Warn("Backup contains unencrypted keystore password — store securely") + } + return nil +} + +// RestoreWalletCmd 
restores a Hermes wallet from a backup file. Mirrors +// openclaw.RestoreWalletCmd, sharing the wire format via walletbackup. +func RestoreWalletCmd(cfg *config.Config, id string, opts RestoreWalletOptions, u *ui.UI) error { + raw, err := os.ReadFile(opts.Input) + if err != nil { + return fmt.Errorf("failed to read backup file: %w", err) + } + + passphrase := opts.Passphrase + if walletbackup.IsEncrypted(raw) && !opts.HasPassFlag { + passphrase, err = u.SecretInput("Backup passphrase") + if err != nil { + return fmt.Errorf("failed to read passphrase: %w", err) + } + } + + backup, err := walletbackup.Decode(raw, passphrase) + if err != nil { + return err + } + + w := backup.Wallets[0] + + deployDir := DeploymentPath(cfg, id) + if _, err := os.Stat(deployDir); os.IsNotExist(err) { + return fmt.Errorf("instance %q not found — run 'obol agent new --runtime hermes --id %s' first", id, id) + } + + existingWallet, _ := ReadWalletMetadata(deployDir) + if existingWallet != nil && !opts.Force { + return fmt.Errorf("instance %q already has a wallet (address: %s)\nUse --force to overwrite", id, existingWallet.Address) + } + + keystoreDir := agentruntime.KeystoreVolumePath(cfg, agentruntime.Hermes, id) + ensureVolumeWritableFn(cfg, keystoreDir, u) + if err := os.MkdirAll(keystoreDir, 0o700); err != nil { + return fmt.Errorf("failed to create keystore directory: %w", err) + } + + keystorePath := filepath.Join(keystoreDir, w.KeystoreUUID+".json") + if err := os.WriteFile(keystorePath, []byte(w.Keystore), 0o600); err != nil { + return fmt.Errorf("failed to write keystore: %w", err) + } + fixRuntimeVolumeOwnershipFn(cfg, keystoreDir, u) + + walletInfo := &WalletInfo{ + Address: w.Address, + PublicKey: w.PublicKey, + KeystoreUUID: w.KeystoreUUID, + KeystorePath: keystorePath, + CreatedAt: w.CreatedAt, + Password: w.KeystorePassword, + } + + rsValues := generateRemoteSignerValues(walletInfo) + if err := walletbackup.WriteValuesRemoteSigner(deployDir, rsValues); err != nil { + return 
fmt.Errorf("failed to write values-remote-signer.yaml: %w", err) + } + if err := WriteWalletMetadata(deployDir, walletInfo); err != nil { + return fmt.Errorf("failed to write wallet metadata: %w", err) + } + if err := archiveReplacedHermesKeystore(cfg, id, existingWallet, w.KeystoreUUID, u); err != nil { + return fmt.Errorf("failed to archive replaced keystore: %w", err) + } + + if opts.ApplyCluster { + applyHermesKeystorePasswordSecret(cfg, id, w.KeystorePassword, u) + restartHermesRemoteSignerFn(cfg, id, u) + } + + u.Success("Wallet restored") + u.Detail("Address", w.Address) + u.Detail("Instance", id) + return nil +} + +// FindInstancesWithWallets returns Hermes instance IDs that have wallet +// metadata on disk. Used by purge prompts. +func FindInstancesWithWallets(cfg *config.Config) []string { + ids, err := agentruntime.ListInstanceIDs(cfg, agentruntime.Hermes) + if err != nil { + return nil + } + var out []string + for _, id := range ids { + if _, err := ReadWalletMetadata(DeploymentPath(cfg, id)); err == nil { + out = append(out, id) + } + } + return out +} + +// applyHermesKeystorePasswordSecret applies the remote-signer keystore +// password Secret in the instance namespace. Best-effort; if the cluster is +// down the caller is expected to re-sync later. 
+func applyHermesKeystorePasswordSecret(cfg *config.Config, id, password string, u *ui.UI) { + if password == "" { + return + } + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + manifest := map[string]any{ + "apiVersion": "v1", + "kind": "Secret", + "metadata": map[string]any{ + "name": "remote-signer-keystore-password", + "namespace": namespace, + "labels": map[string]string{ + "app.kubernetes.io/component": "remote-signer", + "app.kubernetes.io/managed-by": "obol", + }, + }, + "type": "Opaque", + "stringData": map[string]string{ + "password": password, + }, + } + raw, err := json.Marshal(manifest) + if err != nil { + u.Warnf("Could not marshal remote-signer password Secret: %v", err) + return + } + + kubectlBin, kubeconfig := kubectl.Paths(cfg) + cmd := exec.Command(kubectlBin, "apply", "-f", "-") + cmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfig) + cmd.Stdin = bytes.NewReader(raw) + var stderr bytes.Buffer + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + u.Blank() + u.Warnf("Could not update remote-signer password Secret (cluster may not be running): %v: %s", err, bytes.TrimSpace(stderr.Bytes())) + u.Printf("Run 'obol agent sync %s' to apply changes to the cluster.", id) + } +} diff --git a/internal/openclaw/wallet_backup.go b/internal/openclaw/wallet_backup.go index 2412724b..6975c8f0 100644 --- a/internal/openclaw/wallet_backup.go +++ b/internal/openclaw/wallet_backup.go @@ -2,9 +2,6 @@ package openclaw import ( "bytes" - "crypto/aes" - "crypto/cipher" - "crypto/rand" "encoding/json" "errors" "fmt" @@ -17,31 +14,16 @@ import ( "github.com/ObolNetwork/obol-stack/internal/config" "github.com/ObolNetwork/obol-stack/internal/kubectl" "github.com/ObolNetwork/obol-stack/internal/ui" - "golang.org/x/crypto/scrypt" - "gopkg.in/yaml.v3" + "github.com/ObolNetwork/obol-stack/internal/walletbackup" ) -// backupMagic is the first 4 bytes of an encrypted backup file. 
-var backupMagic = []byte("OBOL") +// BackupFile is re-exported from walletbackup for backwards-compatibility +// with existing OpenClaw callers. Both runtimes share the same on-disk shape. +type BackupFile = walletbackup.File -const backupVersion byte = 1 - -// BackupFile is the JSON structure of a wallet backup. -type BackupFile struct { - Version int `json:"version"` - Instance string `json:"instance"` - Wallets []BackupWallet `json:"wallets"` -} - -// BackupWallet holds a single wallet's backup data. -type BackupWallet struct { - Address string `json:"address"` - PublicKey string `json:"publicKey"` - KeystoreUUID string `json:"keystoreUUID"` - CreatedAt string `json:"createdAt"` - Keystore json.RawMessage `json:"keystore"` - KeystorePassword string `json:"keystorePassword"` -} +// BackupWallet is re-exported from walletbackup so the OpenClaw subcommand +// surface stays unchanged. +type BackupWallet = walletbackup.Wallet // BackupWalletOptions holds options for the backup command. type BackupWalletOptions struct { @@ -90,76 +72,55 @@ func BackupWalletCmd(cfg *config.Config, id string, opts BackupWalletOptions, u return fmt.Errorf("failed to read keystore password: %w", err) } - // Build backup structure. 
- backup := BackupFile{ - Version: 1, + backup := &walletbackup.File{ + Version: walletbackup.Version, Instance: id, - Wallets: []BackupWallet{ - { - Address: wallet.Address, - PublicKey: wallet.PublicKey, - KeystoreUUID: wallet.KeystoreUUID, - CreatedAt: wallet.CreatedAt, - Keystore: json.RawMessage(keystoreData), - KeystorePassword: password, - }, - }, + Wallets: []walletbackup.Wallet{{ + Address: wallet.Address, + PublicKey: wallet.PublicKey, + KeystoreUUID: wallet.KeystoreUUID, + CreatedAt: wallet.CreatedAt, + Keystore: json.RawMessage(keystoreData), + KeystorePassword: password, + }}, } - backupJSON, err := json.MarshalIndent(backup, "", " ") + passphrase, err := walletbackup.PromptPassphrase(opts.Passphrase, opts.HasPassFlag, u) if err != nil { - return fmt.Errorf("failed to marshal backup: %w", err) + return err } - // Determine passphrase. - passphrase, err := resolvePassphrase(opts.Passphrase, opts.HasPassFlag, u) + payload, encrypted, err := walletbackup.Encode(backup, passphrase) if err != nil { return err } - // Determine output path and write. 
addrSuffix := wallet.Address if len(addrSuffix) > 8 { addrSuffix = addrSuffix[len(addrSuffix)-8:] } - outputPath := opts.Output - encrypted := passphrase != "" - if outputPath == "" { + ext := "json" if encrypted { - outputPath = fmt.Sprintf("obol-wallet-backup-%s.enc", addrSuffix) - } else { - outputPath = fmt.Sprintf("obol-wallet-backup-%s.json", addrSuffix) + ext = "enc" } + outputPath = fmt.Sprintf("obol-wallet-backup-%s.%s", addrSuffix, ext) } - if encrypted { - ciphertext, err := encryptBackup(backupJSON, passphrase) - if err != nil { - return fmt.Errorf("encryption failed: %w", err) - } - - if err := os.WriteFile(outputPath, ciphertext, 0o600); err != nil { - return fmt.Errorf("failed to write backup: %w", err) - } - } else { - if err := os.WriteFile(outputPath, backupJSON, 0o600); err != nil { - return fmt.Errorf("failed to write backup: %w", err) - } + if err := os.WriteFile(outputPath, payload, 0o600); err != nil { + return fmt.Errorf("failed to write backup: %w", err) } u.Success("Wallet backup created") u.Detail("Address", wallet.Address) u.Detail("Output", outputPath) - if encrypted { u.Detail("Encrypted", "yes (AES-256-GCM)") } else { u.Detail("Encrypted", "no") u.Warn("Backup contains unencrypted keystore password — store securely") } - return nil } @@ -204,53 +165,26 @@ func ImportPrivateKeyWalletCmd(cfg *config.Config, id string, opts ImportPrivate // RestoreWalletCmd restores a wallet from a backup file. func RestoreWalletCmd(cfg *config.Config, id string, opts RestoreWalletOptions, u *ui.UI) error { - // Read backup file. raw, err := os.ReadFile(opts.Input) if err != nil { return fmt.Errorf("failed to read backup file: %w", err) } - // Detect format and decrypt if needed. 
- var backupJSON []byte - - if isEncryptedBackup(raw) { - passphrase := opts.Passphrase - if !opts.HasPassFlag { - passphrase, err = u.SecretInput("Backup passphrase") - if err != nil { - return fmt.Errorf("failed to read passphrase: %w", err) - } - } - - if passphrase == "" { - return errors.New("passphrase required for encrypted backup") - } - - backupJSON, err = decryptBackup(raw, passphrase) + passphrase := opts.Passphrase + if walletbackup.IsEncrypted(raw) && !opts.HasPassFlag { + passphrase, err = u.SecretInput("Backup passphrase") if err != nil { - return fmt.Errorf("decryption failed (wrong passphrase?): %w", err) + return fmt.Errorf("failed to read passphrase: %w", err) } - } else { - backupJSON = raw } - // Parse backup. - var backup BackupFile - if err := json.Unmarshal(backupJSON, &backup); err != nil { - return fmt.Errorf("invalid backup file: %w", err) - } - - if backup.Version != 1 { - return fmt.Errorf("unsupported backup version %d (expected 1)", backup.Version) - } - - if len(backup.Wallets) == 0 { - return errors.New("backup contains no wallets") + backup, err := walletbackup.Decode(raw, passphrase) + if err != nil { + return err } w := backup.Wallets[0] - // Verify deployment dir exists. deployDir := DeploymentPath(cfg, id) if _, err := os.Stat(deployDir); os.IsNotExist(err) { return fmt.Errorf("instance %q not found — run 'obol openclaw onboard --id %s' first", id, id) @@ -477,172 +411,34 @@ func keystorePasswordSecretManifest(id, password string) ([]byte, error) { return json.Marshal(manifest) } -// resolvePassphrase determines the passphrase via flag or interactive prompt. +// resolvePassphrase delegates to walletbackup.PromptPassphrase. Kept as a +// package-private wrapper so existing OpenClaw call sites stay compact. 
func resolvePassphrase(flagValue string, hasFlag bool, u *ui.UI) (string, error) { - if hasFlag { - return flagValue, nil - } - - passphrase, err := u.SecretInput("Backup passphrase (empty for no encryption)") - if err != nil { - return "", fmt.Errorf("failed to read passphrase: %w", err) - } - - if passphrase != "" { - confirm, err := u.SecretInput("Confirm passphrase") - if err != nil { - return "", fmt.Errorf("failed to read confirmation: %w", err) - } - - if passphrase != confirm { - return "", errors.New("passphrases do not match") - } - } - - return passphrase, nil + return walletbackup.PromptPassphrase(flagValue, hasFlag, u) } -// readKeystorePassword extracts the keystore password from values-remote-signer.yaml. +// readKeystorePassword delegates to walletbackup.ReadKeystorePassword. func readKeystorePassword(deployDir string) (string, error) { - data, err := os.ReadFile(filepath.Join(deployDir, "values-remote-signer.yaml")) - if err != nil { - return "", err - } - - var values struct { - KeystorePassword struct { - Value string `yaml:"value"` - } `yaml:"keystorePassword"` - } - if err := yaml.Unmarshal(data, &values); err != nil { - return "", fmt.Errorf("failed to parse values-remote-signer.yaml: %w", err) - } - - if values.KeystorePassword.Value == "" { - return "", errors.New("keystorePassword.value not found in values-remote-signer.yaml") - } - - return values.KeystorePassword.Value, nil + return walletbackup.ReadKeystorePassword(deployDir) } -// writeKeystorePassword writes the remote-signer values YAML with the given password. +// writeKeystorePassword renders the remote-signer values YAML for the given +// password and writes it under deployDir. 
func writeKeystorePassword(deployDir, password string) error { content := generateRemoteSignerValues(&WalletInfo{Password: password}) - return os.WriteFile(filepath.Join(deployDir, "values-remote-signer.yaml"), []byte(content), 0o600) + return walletbackup.WriteValuesRemoteSigner(deployDir, content) } -// encryptBackup encrypts plaintext using AES-256-GCM with a scrypt-derived key. -// Format: magic(4) || version(1) || salt(32) || nonce(12) || ciphertext+tag -func encryptBackup(plaintext []byte, passphrase string) ([]byte, error) { - salt := make([]byte, 32) - if _, err := rand.Read(salt); err != nil { - return nil, fmt.Errorf("salt generation: %w", err) - } - - key, err := scrypt.Key([]byte(passphrase), salt, scryptN, scryptR, scryptP, scryptDKLen) - if err != nil { - return nil, fmt.Errorf("scrypt key derivation: %w", err) - } - - block, err := aes.NewCipher(key) - if err != nil { - return nil, fmt.Errorf("aes cipher: %w", err) - } - - gcm, err := cipher.NewGCM(block) - if err != nil { - return nil, fmt.Errorf("gcm: %w", err) - } - - nonce := make([]byte, gcm.NonceSize()) - if _, err := rand.Read(nonce); err != nil { - return nil, fmt.Errorf("nonce generation: %w", err) - } - - ciphertext := gcm.Seal(nil, nonce, plaintext, nil) +// Compat shims for tests in this package that exercise the crypto envelope +// directly. New code should call walletbackup.Encrypt/Decrypt/IsEncrypted. +func isEncryptedBackup(data []byte) bool { return walletbackup.IsEncrypted(data) } - // Assemble: magic || version || salt || nonce || ciphertext - result := make([]byte, 0, len(backupMagic)+1+len(salt)+len(nonce)+len(ciphertext)) - result = append(result, backupMagic...) - result = append(result, backupVersion) - result = append(result, salt...) - result = append(result, nonce...) - result = append(result, ciphertext...) 
- - return result, nil +func encryptBackup(plaintext []byte, passphrase string) ([]byte, error) { + return walletbackup.Encrypt(plaintext, passphrase) } -// decryptBackup decrypts an encrypted backup file. func decryptBackup(data []byte, passphrase string) ([]byte, error) { - minLen := len(backupMagic) + 1 + 32 + 12 // magic + version + salt + nonce - if len(data) < minLen { - return nil, errors.New("encrypted file too short") - } - - offset := 0 - - // Verify magic. - if string(data[offset:offset+len(backupMagic)]) != string(backupMagic) { - return nil, errors.New("not an encrypted backup file") - } - - offset += len(backupMagic) - - // Check version. - version := data[offset] - offset++ - - if version != backupVersion { - return nil, fmt.Errorf("unsupported encryption version %d", version) - } - - // Extract salt. - salt := data[offset : offset+32] - offset += 32 - - // Derive key. - key, err := scrypt.Key([]byte(passphrase), salt, scryptN, scryptR, scryptP, scryptDKLen) - if err != nil { - return nil, fmt.Errorf("scrypt key derivation: %w", err) - } - - block, err := aes.NewCipher(key) - if err != nil { - return nil, fmt.Errorf("aes cipher: %w", err) - } - - gcm, err := cipher.NewGCM(block) - if err != nil { - return nil, fmt.Errorf("gcm: %w", err) - } - - // Extract nonce. - nonceSize := gcm.NonceSize() - if len(data) < offset+nonceSize { - return nil, errors.New("encrypted file too short for nonce") - } - - nonce := data[offset : offset+nonceSize] - offset += nonceSize - - // Decrypt. - ciphertext := data[offset:] - - plaintext, err := gcm.Open(nil, nonce, ciphertext, nil) - if err != nil { - return nil, fmt.Errorf("decryption failed: %w", err) - } - - return plaintext, nil -} - -// isEncryptedBackup checks if data starts with the OBOL magic bytes. 
-func isEncryptedBackup(data []byte) bool { - if len(data) < len(backupMagic) { - return false - } - - return string(data[:len(backupMagic)]) == string(backupMagic) + return walletbackup.Decrypt(data, passphrase) } // walletAddressesForPurgeWarning returns addresses of wallets that would be lost. diff --git a/internal/walletbackup/walletbackup.go b/internal/walletbackup/walletbackup.go new file mode 100644 index 00000000..c527ee2b --- /dev/null +++ b/internal/walletbackup/walletbackup.go @@ -0,0 +1,246 @@ +// Package walletbackup is the runtime-agnostic core of `obol agent wallet +// backup` / `restore`. It owns the on-disk backup wire format, the +// AES-256-GCM encryption envelope, and the helpers that read/write the +// keystore password from values-remote-signer.yaml. Per-runtime callers +// (internal/openclaw, internal/hermes) compose these primitives with their +// own deployDir/keystoreDir conventions and namespace-specific cluster apply +// steps. The on-disk format must round-trip across runtimes, so a backup +// taken from an OpenClaw instance can restore into a Hermes one and vice +// versa. +package walletbackup + +import ( + "crypto/aes" + "crypto/cipher" + "crypto/rand" + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" + + "github.com/ObolNetwork/obol-stack/internal/ui" + "golang.org/x/crypto/scrypt" + "gopkg.in/yaml.v3" +) + +// Magic is the 4-byte prefix of an encrypted backup file. +var Magic = []byte("OBOL") + +// Version is the current backup-format version. Bumping requires a parallel +// bump in Decode's accepted-versions check. +const Version = 1 + +const ( + scryptN = 262144 + scryptR = 8 + scryptP = 1 + scryptDKLen = 32 +) + +// File is the JSON shape of a wallet backup. One backup may carry multiple +// wallets; today both runtimes write a single-wallet file. 
+type File struct { + Version int `json:"version"` + Instance string `json:"instance"` + Wallets []Wallet `json:"wallets"` +} + +// Wallet holds a single wallet's backup data — enough to restore both the +// keystore JSON on disk and the keystore password the remote-signer needs. +type Wallet struct { + Address string `json:"address"` + PublicKey string `json:"publicKey"` + KeystoreUUID string `json:"keystoreUUID"` + CreatedAt string `json:"createdAt"` + Keystore json.RawMessage `json:"keystore"` + KeystorePassword string `json:"keystorePassword"` +} + +// Encode marshals a backup to bytes. If passphrase is non-empty, it returns +// an encrypted blob; otherwise it returns the indented JSON. The second +// return value reports which form was emitted. +func Encode(f *File, passphrase string) ([]byte, bool, error) { + plain, err := json.MarshalIndent(f, "", " ") + if err != nil { + return nil, false, fmt.Errorf("marshal backup: %w", err) + } + if passphrase == "" { + return plain, false, nil + } + enc, err := Encrypt(plain, passphrase) + if err != nil { + return nil, false, err + } + return enc, true, nil +} + +// Decode parses raw bytes into a File. If the input starts with the OBOL +// magic, passphrase is required; otherwise it must be empty (or is ignored). 
+func Decode(data []byte, passphrase string) (*File, error) { + var plain []byte + if IsEncrypted(data) { + if passphrase == "" { + return nil, errors.New("passphrase required for encrypted backup") + } + dec, err := Decrypt(data, passphrase) + if err != nil { + return nil, fmt.Errorf("decryption failed (wrong passphrase?): %w", err) + } + plain = dec + } else { + plain = data + } + + var f File + if err := json.Unmarshal(plain, &f); err != nil { + return nil, fmt.Errorf("invalid backup file: %w", err) + } + if f.Version != Version { + return nil, fmt.Errorf("unsupported backup version %d (expected %d)", f.Version, Version) + } + if len(f.Wallets) == 0 { + return nil, errors.New("backup contains no wallets") + } + return &f, nil +} + +// IsEncrypted reports whether data carries the OBOL magic prefix. +func IsEncrypted(data []byte) bool { + if len(data) < len(Magic) { + return false + } + return string(data[:len(Magic)]) == string(Magic) +} + +// PromptPassphrase resolves a passphrase for backup. If the caller already +// passed --passphrase explicitly, hasFlag=true short-circuits the prompt +// (even when flagValue is the empty string, which means "no encryption"). +func PromptPassphrase(flagValue string, hasFlag bool, u *ui.UI) (string, error) { + if hasFlag { + return flagValue, nil + } + pass, err := u.SecretInput("Backup passphrase (empty for no encryption)") + if err != nil { + return "", fmt.Errorf("failed to read passphrase: %w", err) + } + if pass == "" { + return "", nil + } + confirm, err := u.SecretInput("Confirm passphrase") + if err != nil { + return "", fmt.Errorf("failed to read confirmation: %w", err) + } + if pass != confirm { + return "", errors.New("passphrases do not match") + } + return pass, nil +} + +// ReadKeystorePassword extracts keystorePassword.value from +// values-remote-signer.yaml under deployDir. Both Hermes and OpenClaw write +// the same shape, generated by their respective generateRemoteSignerValues. 
+func ReadKeystorePassword(deployDir string) (string, error) { + data, err := os.ReadFile(filepath.Join(deployDir, "values-remote-signer.yaml")) + if err != nil { + return "", err + } + var values struct { + KeystorePassword struct { + Value string `yaml:"value"` + } `yaml:"keystorePassword"` + } + if err := yaml.Unmarshal(data, &values); err != nil { + return "", fmt.Errorf("failed to parse values-remote-signer.yaml: %w", err) + } + if values.KeystorePassword.Value == "" { + return "", errors.New("keystorePassword.value not found in values-remote-signer.yaml") + } + return values.KeystorePassword.Value, nil +} + +// WriteValuesRemoteSigner writes the rendered values-remote-signer.yaml to +// deployDir. Callers pass the runtime-specific rendered content (the YAML +// shape is identical across runtimes, but the comment header differs). +func WriteValuesRemoteSigner(deployDir, content string) error { + return os.WriteFile(filepath.Join(deployDir, "values-remote-signer.yaml"), []byte(content), 0o600) +} + +// Encrypt wraps plaintext with AES-256-GCM under a scrypt-derived key. +// Layout: magic(4) | version(1) | salt(32) | nonce(12) | ciphertext+tag. +// Exported so callers (and crypto-only tests) can exercise the envelope +// without going through Encode's JSON marshalling step. 
+func Encrypt(plaintext []byte, passphrase string) ([]byte, error) { + salt := make([]byte, 32) + if _, err := rand.Read(salt); err != nil { + return nil, fmt.Errorf("salt generation: %w", err) + } + key, err := scrypt.Key([]byte(passphrase), salt, scryptN, scryptR, scryptP, scryptDKLen) + if err != nil { + return nil, fmt.Errorf("scrypt key derivation: %w", err) + } + block, err := aes.NewCipher(key) + if err != nil { + return nil, fmt.Errorf("aes cipher: %w", err) + } + gcm, err := cipher.NewGCM(block) + if err != nil { + return nil, fmt.Errorf("gcm: %w", err) + } + nonce := make([]byte, gcm.NonceSize()) + if _, err := rand.Read(nonce); err != nil { + return nil, fmt.Errorf("nonce generation: %w", err) + } + ciphertext := gcm.Seal(nil, nonce, plaintext, nil) + out := make([]byte, 0, len(Magic)+1+len(salt)+len(nonce)+len(ciphertext)) + out = append(out, Magic...) + out = append(out, byte(Version)) + out = append(out, salt...) + out = append(out, nonce...) + out = append(out, ciphertext...) + return out, nil +} + +// Decrypt reverses Encrypt for the same passphrase, returning an error if +// the magic, version, or AEAD tag fails to verify. 
+func Decrypt(data []byte, passphrase string) ([]byte, error) { + minLen := len(Magic) + 1 + 32 + 12 + if len(data) < minLen { + return nil, errors.New("encrypted file too short") + } + off := 0 + if string(data[off:off+len(Magic)]) != string(Magic) { + return nil, errors.New("not an encrypted backup file") + } + off += len(Magic) + if data[off] != byte(Version) { + return nil, fmt.Errorf("unsupported encryption version %d", data[off]) + } + off++ + salt := data[off : off+32] + off += 32 + key, err := scrypt.Key([]byte(passphrase), salt, scryptN, scryptR, scryptP, scryptDKLen) + if err != nil { + return nil, fmt.Errorf("scrypt key derivation: %w", err) + } + block, err := aes.NewCipher(key) + if err != nil { + return nil, fmt.Errorf("aes cipher: %w", err) + } + gcm, err := cipher.NewGCM(block) + if err != nil { + return nil, fmt.Errorf("gcm: %w", err) + } + nonceSize := gcm.NonceSize() + if len(data) < off+nonceSize { + return nil, errors.New("encrypted file too short for nonce") + } + nonce := data[off : off+nonceSize] + off += nonceSize + ciphertext := data[off:] + plain, err := gcm.Open(nil, nonce, ciphertext, nil) + if err != nil { + return nil, fmt.Errorf("decryption failed: %w", err) + } + return plain, nil +} diff --git a/internal/walletbackup/walletbackup_test.go b/internal/walletbackup/walletbackup_test.go new file mode 100644 index 00000000..a587faa1 --- /dev/null +++ b/internal/walletbackup/walletbackup_test.go @@ -0,0 +1,99 @@ +package walletbackup + +import ( + "bytes" + "encoding/json" + "testing" +) + +func TestEncodeDecodePlain(t *testing.T) { + in := &File{ + Version: Version, + Instance: "demo", + Wallets: []Wallet{{ + Address: "0xabc", + PublicKey: "0xpub", + KeystoreUUID: "uuid-1", + CreatedAt: "2026-01-01T00:00:00Z", + Keystore: json.RawMessage(`{"version":3}`), + KeystorePassword: "hunter2", + }}, + } + + payload, encrypted, err := Encode(in, "") + if err != nil { + t.Fatalf("Encode: %v", err) + } + if encrypted { + t.Fatalf("expected plain 
payload") + } + if IsEncrypted(payload) { + t.Fatalf("plain payload reported as encrypted") + } + + out, err := Decode(payload, "") + if err != nil { + t.Fatalf("Decode: %v", err) + } + if out.Instance != in.Instance || out.Wallets[0].KeystorePassword != in.Wallets[0].KeystorePassword { + t.Fatalf("round-trip mismatch: got %+v", out) + } +} + +func TestEncodeDecodeEncrypted(t *testing.T) { + in := &File{ + Version: Version, + Wallets: []Wallet{{Address: "0x1", KeystorePassword: "p"}}, + } + payload, encrypted, err := Encode(in, "correct horse") + if err != nil { + t.Fatalf("Encode: %v", err) + } + if !encrypted { + t.Fatalf("expected encrypted payload") + } + if !IsEncrypted(payload) { + t.Fatalf("encrypted payload missing magic prefix") + } + + if _, err := Decode(payload, "wrong"); err == nil { + t.Fatalf("Decode with wrong passphrase should fail") + } + + out, err := Decode(payload, "correct horse") + if err != nil { + t.Fatalf("Decode: %v", err) + } + if out.Wallets[0].Address != "0x1" { + t.Fatalf("round-trip mismatch") + } +} + +func TestEncryptDecryptRawBytes(t *testing.T) { + plain := []byte("hello world") + cipher, err := Encrypt(plain, "passphrase") + if err != nil { + t.Fatalf("Encrypt: %v", err) + } + if !IsEncrypted(cipher) { + t.Fatalf("ciphertext missing magic prefix") + } + out, err := Decrypt(cipher, "passphrase") + if err != nil { + t.Fatalf("Decrypt: %v", err) + } + if !bytes.Equal(plain, out) { + t.Fatalf("got %q want %q", out, plain) + } +} + +func TestDecodeRejectsUnknownVersion(t *testing.T) { + in := &File{Version: 99, Wallets: []Wallet{{}}} + payload, _, err := Encode(in, "") + if err != nil { + t.Fatalf("Encode: %v", err) + } + if _, err := Decode(payload, ""); err == nil { + t.Fatalf("Decode should reject unknown version") + } +} From 4f84352ff056c58e9d52d4e1aeb09016adf37a3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Mon, 4 May 2026 00:55:07 +0100 Subject: [PATCH 5/9] Hermes publish address --- 
internal/hermes/hermes.go | 3 ++ internal/hermes/wallet.go | 70 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index cd9ff6b0..98616cd6 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -243,6 +243,9 @@ func Sync(cfg *config.Config, id string, u *ui.UI) error { return fmt.Errorf("helmfile sync failed: %w", err) } + // Publish wallet-metadata ConfigMap for the frontend (namespace now exists). + applyWalletMetadataConfigMap(cfg, id, deploymentDir) + u.Blank() u.Success("Hermes installed successfully!") u.Detail("Namespace", agentruntime.Namespace(agentruntime.Hermes, id)) diff --git a/internal/hermes/wallet.go b/internal/hermes/wallet.go index 936757a0..198bf1c0 100644 --- a/internal/hermes/wallet.go +++ b/internal/hermes/wallet.go @@ -1,6 +1,7 @@ package hermes import ( + "bytes" "crypto/aes" "crypto/cipher" "crypto/rand" @@ -9,6 +10,7 @@ import ( "fmt" "math/big" "os" + "os/exec" "path/filepath" "strings" "time" @@ -272,6 +274,74 @@ func ReadWalletMetadata(deploymentDir string) (*WalletInfo, error) { return &wallet, nil } +// applyWalletMetadataConfigMap creates or updates a wallet-metadata ConfigMap +// in the instance namespace. The frontend reads this to display wallet +// addresses on the agent card. Mirrors the OpenClaw helper at +// internal/openclaw/wallet.go so the frontend's getWalletMetadata works +// identically for either runtime. Must be called after helmfile sync (the +// namespace must exist). 
+func applyWalletMetadataConfigMap(cfg *config.Config, id, deploymentDir string) { + wallet, err := ReadWalletMetadata(deploymentDir) + if err != nil { + return + } + + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + kubectlBinary := filepath.Join(cfg.BinDir, "kubectl") + + addressesJSON := map[string]any{ + "instanceId": id, + "addresses": []map[string]string{ + { + "address": wallet.Address, + "publicKey": wallet.PublicKey, + "createdAt": wallet.CreatedAt, + "label": "hermes-" + id, + }, + }, + "count": 1, + } + + addressesData, err := json.Marshal(addressesJSON) + if err != nil { + fmt.Printf("Warning: could not marshal wallet metadata: %v\n", err) + return + } + + manifest := map[string]any{ + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": map[string]any{ + "name": "wallet-metadata", + "namespace": namespace, + "labels": map[string]string{ + "app.kubernetes.io/component": "remote-signer", + "app.kubernetes.io/managed-by": "obol", + }, + }, + "data": map[string]string{ + "addresses.json": string(addressesData), + }, + } + + raw, err := json.Marshal(manifest) + if err != nil { + fmt.Printf("Warning: could not marshal ConfigMap: %v\n", err) + return + } + + cmd := exec.Command(kubectlBinary, "apply", "-f", "-") + cmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) + cmd.Stdin = bytes.NewReader(raw) + + var stderr bytes.Buffer + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + fmt.Printf("Warning: could not apply wallet-metadata ConfigMap: %v\n%s", err, stderr.String()) + } +} + func ResolveWalletAddress(cfg *config.Config) (string, error) { ids, err := agentruntime.ListInstanceIDs(cfg, agentruntime.Hermes) if err != nil { From dce9cd2a0f758929705e273d64e61ebfc3b9e58d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Mon, 4 May 2026 01:07:52 +0100 Subject: [PATCH 6/9] Unsure this is better --- internal/hermes/hermes.go | 77 
++++++++-------------------------- internal/hermes/hermes_test.go | 24 ++++++----- 2 files changed, 30 insertions(+), 71 deletions(-) diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index 98616cd6..1a2635a7 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -33,10 +33,13 @@ const ( // renovate: datasource=helm depName=raw registryUrl=https://bedag.github.io/helm-charts/ rawChartVersion = "2.0.2" - defaultImage = "nousresearch/hermes-agent:v2026.4.23" - hermesInstallDir = "/data/.hermes/hermes-agent" - hermesRepoURL = "https://github.com/NousResearch/hermes-agent.git" - hermesBinary = hermesInstallDir + "/venv/bin/hermes" + defaultImage = "nousresearch/hermes-agent:v2026.4.23" + // hermesBinary points at the venv that the upstream image preinstalls at + // build time via `uv pip install -e ".[all]"`. Using the in-image venv + // avoids cloning the repo + rebuilding a venv on every cold start, and + // keeps the persistent PVC free of the read-only git pack files that + // poison subsequent local-path-provisioner chowns on macOS virtiofs. + hermesBinary = "/opt/hermes/.venv/bin/hermes" containerUID = 10000 containerGID = 10000 @@ -798,68 +801,24 @@ func generateValues(namespace, hostname, dashboardHostname, agentBaseURL, token, runAsGroup: %d fsGroup: %d initContainers: + # Single init container that runs the same Hermes image used by + # the runtime. The upstream image already ships /opt/hermes/.venv + # with all dependencies preinstalled, so there is no clone or pip + # install at pod start. fsGroup on the pod makes the PVC mount + # group-writable to the hermes user (uid 10000), so we don't need + # a recursive chown — which is the operation that fails on macOS + # virtiofs once the volume has read-only files like git pack + # files left from any previous install. 
- name: init-hermes-data - image: busybox:1.36 - command: - - sh - - -c - - mkdir -p /data/.hermes && chown -R %d:%d /data/.hermes - securityContext: - runAsUser: 0 - volumeMounts: - - name: data - mountPath: /data - - name: bootstrap-hermes-install image: %s imagePullPolicy: IfNotPresent command: - sh - -ec - | - install_dir=%s - repo_url=%s mkdir -p /data/.hermes/home /data/.hermes/workspace - lock_dir="${install_dir}.lock" - got_lock=0 - for _ in $(seq 1 120); do - if mkdir "$lock_dir" 2>/dev/null; then - got_lock=1 - break - fi - sleep 1 - done - if [ "$got_lock" != 1 ]; then - echo "Timed out waiting for Hermes install lock: $lock_dir" >&2 - exit 1 - fi - cleanup_lock() { - rmdir "$lock_dir" 2>/dev/null || true - } - trap cleanup_lock EXIT - - if [ ! -d "$install_dir/.git" ] || { [ ! -f "$install_dir/pyproject.toml" ] && [ ! -f "$install_dir/setup.py" ]; }; then - rm -rf "${install_dir}.tmp" - if [ -e "$install_dir" ]; then - mv "$install_dir" "${install_dir}.backup.$(date +%%s)" - fi - git clone --depth 1 "$repo_url" "${install_dir}.tmp" - mv "${install_dir}.tmp" "$install_dir" - fi - cd "$install_dir" - # Reinstall when the venv is missing the hermes binary OR - # any selected extra is absent. The upstream image installs - # ".[all]"; we re-create the venv from a fresh clone, so the - # extras must be re-requested explicitly. The import check - # picks one module per extra so existing PVCs trigger a - # rebuild when we add a new extra to the install line. - if [ ! -x "$install_dir/venv/bin/hermes" ] || \ - ! "$install_dir/venv/bin/python3" -c "import fastapi, uvicorn, telegram, mcp, ptyprocess, simple_term_menu, googleapiclient" >/dev/null 2>&1; then - rm -rf "$install_dir/venv" - uv venv --python python3 --system-site-packages venv - VIRTUAL_ENV="$install_dir/venv" uv pip install -e ".[web,messaging,mcp,pty,cli,acp,google]" - fi if [ -f /data/.hermes/state.db ]; then - if ! python3 - <<'PY' + if ! 
/opt/hermes/.venv/bin/python3 - <<'PY' import sqlite3 conn = sqlite3.connect('/data/.hermes/state.db') row = conn.execute('PRAGMA quick_check').fetchone() @@ -875,8 +834,6 @@ func generateValues(namespace, hostname, dashboardHostname, agentBaseURL, token, echo "Backed up malformed Hermes state DB to $backup_dir" fi fi - cleanup_lock - trap - EXIT volumeMounts: - name: data mountPath: /data @@ -917,7 +874,7 @@ func generateValues(namespace, hostname, dashboardHostname, agentBaseURL, token, value: %s - name: OBOL_SKILLS_DIR value: /data/.hermes/%s - `, desc.DataPVCName, namespace, desc.ServiceName, desc.ServiceName, namespace, desc.ServiceName, desc.ServiceName, desc.ServiceName, desc.ServiceName, containerUID, containerGID, containerGID, containerUID, containerGID, quoteYAML(image()), quoteYAML(hermesInstallDir), quoteYAML(hermesRepoURL), desc.ServiceName, quoteYAML(image()), quoteYAML(hermesBinary), desc.DefaultPort, desc.DefaultPort, quoteYAML(primary), quoteYAML(namespace), obolSkillsDirName) + `, desc.DataPVCName, namespace, desc.ServiceName, desc.ServiceName, namespace, desc.ServiceName, desc.ServiceName, desc.ServiceName, desc.ServiceName, containerUID, containerGID, containerGID, quoteYAML(image()), desc.ServiceName, quoteYAML(image()), quoteYAML(hermesBinary), desc.DefaultPort, desc.DefaultPort, quoteYAML(primary), quoteYAML(namespace), obolSkillsDirName) if agentBaseURL != "" { fmt.Fprintf(&b, " - name: AGENT_BASE_URL\n value: %s\n", quoteYAML(agentBaseURL)) diff --git a/internal/hermes/hermes_test.go b/internal/hermes/hermes_test.go index 4e27c869..9728c526 100644 --- a/internal/hermes/hermes_test.go +++ b/internal/hermes/hermes_test.go @@ -143,18 +143,9 @@ func TestGenerateValues_UsesHermesNativeNames(t *testing.T) { "containerPort: 8642", "containerPort: 9119", "init-hermes-data", - "bootstrap-hermes-install", - `install_dir="/data/.hermes/hermes-agent"`, - `repo_url="https://github.com/NousResearch/hermes-agent.git"`, - `lock_dir="${install_dir}.lock"`, 
- `Timed out waiting for Hermes install lock`, - `git clone --depth 1 "$repo_url" "${install_dir}.tmp"`, - "uv venv --python python3 --system-site-packages venv", - `uv pip install -e ".[web,messaging,mcp,pty,cli,acp,google]"`, - `import fastapi, uvicorn, telegram, mcp, ptyprocess, simple_term_menu, googleapiclient`, `PRAGMA quick_check`, `state-db-corrupt-$ts`, - `- "/data/.hermes/hermes-agent/venv/bin/hermes"`, + `- "/opt/hermes/.venv/bin/hermes"`, `- "hermes-obol-agent.obol.stack"`, `- "obol-agent.obol.stack"`, "name: hermes-dashboard", @@ -165,6 +156,17 @@ func TestGenerateValues_UsesHermesNativeNames(t *testing.T) { } } + for _, banned := range []string{ + "bootstrap-hermes-install", + "git clone", + "uv pip install", + "/data/.hermes/hermes-agent", + } { + if strings.Contains(values, banned) { + t.Fatalf("generateValues() should no longer reference %q (the in-pod git clone + venv build); use the upstream image's /opt/hermes/.venv instead:\n%s", banned, values) + } + } + var parsed any if err := yaml.Unmarshal([]byte(values), &parsed); err != nil { t.Fatalf("generateValues() produced invalid YAML: %v\n%s", err, values) @@ -203,7 +205,7 @@ func TestHermesExecArgs_UsesNativeHermesBinary(t *testing.T) { "-n", "hermes-obol-agent", "deploy/hermes", "--", - "/data/.hermes/hermes-agent/venv/bin/hermes", + "/opt/hermes/.venv/bin/hermes", "skills", "audit", } From 0042d0f01866af2e84fc5d0e8ba4d54e8f85d66b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Mon, 4 May 2026 01:39:43 +0100 Subject: [PATCH 7/9] Updates config map with recreate --- internal/stack/backend_k3d.go | 24 +++++++----- internal/stack/stack.go | 71 +++++++++++++++++++++-------------- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/internal/stack/backend_k3d.go b/internal/stack/backend_k3d.go index f530f0e3..8bc15469 100644 --- a/internal/stack/backend_k3d.go +++ b/internal/stack/backend_k3d.go @@ -103,6 +103,21 @@ func (b *K3dBackend) Up(cfg *config.Config, u *ui.UI, 
stackID string) ([]byte, e return nil, err } + // Ensure the dev registry caches are started in BOTH branches. `k3d + // cluster start` does not auto-restart registry containers attached via + // `--registry-use` at create time — it only starts the cluster's own + // nodes. Without this call, every retry after a `cluster stop` falls + // back to direct upstream pulls and re-fetches every image, costing + // minutes per attempt. + if os.Getenv("OBOL_DEVELOPMENT") == "true" { + setup, setupErr := ensureDevRegistries(cfg, u) + if setupErr != nil { + u.Warnf("Dev registry cache unavailable, falling back to direct upstream pulls: %v", setupErr) + } else { + registrySetup = setup + } + } + if running { u.Warn("Cluster already exists, starting it") @@ -128,15 +143,6 @@ func (b *K3dBackend) Up(cfg *config.Config, u *ui.UI, stackID string) ([]byte, e // 'obol stack init' wrote the k3d config. ensureK3dPortsAvailable(k3dConfigPath, u) - if os.Getenv("OBOL_DEVELOPMENT") == "true" { - setup, setupErr := ensureDevRegistries(cfg, u) - if setupErr != nil { - u.Warnf("Dev registry cache unavailable, falling back to direct upstream pulls: %v", setupErr) - } else { - registrySetup = setup - } - } - createCmd := exec.Command( filepath.Join(cfg.BinDir, "k3d"), k3dCreateArgs(stackName, k3dConfigPath, registrySetup)..., diff --git a/internal/stack/stack.go b/internal/stack/stack.go index b6704e61..94395ee3 100644 --- a/internal/stack/stack.go +++ b/internal/stack/stack.go @@ -362,16 +362,14 @@ func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir s u.Warnf("Failed to preserve LiteLLM config across Helm sync: %v", err) } - // Release runtime field ownership of litellm-config.data.config.yaml so the - // upcoming helm upgrade can reclaim it without an SSA conflict. 
Without - // this step, the second `obol stack up` after autoConfigureLLM/restore has - // claimed the field via SSA (manager=helm, op=Apply) fails with - // "conflict with helm using v1: .data.config.yaml" because helm registers - // a separate managedFields entry (manager=helm, op=Update) for the same - // field. The data is already snapshotted in previousLiteLLMConfig and gets - // re-applied by restoreLiteLLMConfig after helm runs. - if err := releaseLiteLLMConfigOwnership(cfg, kubeconfigPath); err != nil { - u.Warnf("Failed to release LiteLLM config field ownership: %v", err) + // Establish field manager "helm" as the SSA owner of + // litellm-config.data["config.yaml"] before helmfile sync runs, so the + // upcoming helm upgrade's SSA merges in place instead of conflicting + // with the synthesised "before-first-apply" or a previous "helm" Apply + // entry. The data is already snapshotted in previousLiteLLMConfig and + // gets re-applied by restoreLiteLLMConfig after helm runs. + if err := releaseLiteLLMConfigOwnership(cfg, kubeconfigPath, previousLiteLLMConfig); err != nil { + u.Warnf("Failed to claim LiteLLM config field ownership: %v", err) } // Compatibility migration @@ -919,33 +917,50 @@ func preserveLiteLLMConfigForHelm(cfg *config.Config, kubeconfigPath string) (st return raw, nil } -// releaseLiteLLMConfigOwnership strips managedFields from the litellm-config -// ConfigMap so the next helm upgrade can claim ownership of every field -// without an SSA conflict. Helm tracks release ownership via the -// meta.helm.sh/release-name annotation, not managedFields, so clearing -// managedFields does not detach the resource from its release. 
+// releaseLiteLLMConfigOwnership deletes the litellm-config ConfigMap (if it +// exists) before the next helmfile sync runs, so helm's upgrade creates a +// fresh ConfigMap and becomes the sole SSA owner of every field — no +// pre-existing field-ownership entries, no synthesised "before-first-apply" +// manager, no conflict on `.data.config.yaml`. // -// The single empty entry [{}] is the documented apiserver idiom for clearing -// all field-ownership claims on a resource. See: -// https://kubernetes.io/docs/reference/using-api/server-side-apply/#clearing-managedfields -func releaseLiteLLMConfigOwnership(cfg *config.Config, kubeconfigPath string) error { - kubectlBinary := filepath.Join(cfg.BinDir, "kubectl") - - // Skip if the configmap doesn't exist (first install). - if _, err := kubectl.Output(kubectlBinary, kubeconfigPath, - "get", "configmap", "litellm-config", "-n", "llm", "-o", "name"); err != nil { +// Why deletion rather than re-applying with manager "helm" or clearing +// managedFields: +// - clearing managedFields ([{}]) leaves data fields with no SSA owner; +// Kubernetes synthesises "before-first-apply" on the next SSA call to +// track them, and helm's apply then conflicts on `.data.config.yaml` +// against that synthesised manager. +// - re-applying with manager "helm" via SSA only claims the fields in our +// manifest. Adjacent fields (labels, annotations, other data keys) stay +// under their original Update manager, and synthesised +// "before-first-apply" still appears on helm's SSA call. +// +// Helm tracks release ownership via the meta.helm.sh/release-name annotation +// on the resource — not via managedFields — but those annotations ride along +// with the ConfigMap content. Helm reconstructs them from its release +// secret on the next upgrade, so deleting the ConfigMap does not orphan it +// from the release. +// +// The window where the ConfigMap is missing is bounded by helmfile sync +// (seconds). 
Running LiteLLM pods are unaffected because volume projections +// happen at pod start, not on ConfigMap mutation. The user data (custom +// providers, paid model routes) is already snapshotted in +// previousLiteLLMConfig and re-applied by restoreLiteLLMConfig after helm +// finishes. +func releaseLiteLLMConfigOwnership(cfg *config.Config, kubeconfigPath, snapshot string) error { + if strings.TrimSpace(snapshot) == "" { + // No existing ConfigMap to delete (first install or earlier failure). return nil } + kubectlBinary := filepath.Join(cfg.BinDir, "kubectl") cmd := exec.Command(kubectlBinary, - "patch", "configmap", "litellm-config", + "delete", "configmap", "litellm-config", "-n", "llm", - "--type=merge", - "--patch", `{"metadata":{"managedFields":[{}]}}`, + "--ignore-not-found", ) cmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) if out, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("kubectl patch managedFields: %w\n%s", err, string(out)) + return fmt.Errorf("kubectl delete configmap litellm-config: %w\n%s", err, string(out)) } return nil } From c2df08e7e5c812b6ac8ce8f387db2497d89c39cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Mon, 4 May 2026 12:37:19 +0100 Subject: [PATCH 8/9] Update to do same change on rs --- internal/embed/infrastructure/helmfile.yaml | 11 ++++ internal/hermes/hermes.go | 11 ++++ internal/stack/stack.go | 58 --------------------- 3 files changed, 22 insertions(+), 58 deletions(-) diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index 05ff5f27..7f7ca348 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -2,6 +2,17 @@ # Orchestrates core infrastructure components deployed with every stack # Uses Traefik with Gateway API for routing (replaces nginx-ingress) +# Force helm to use SSA with --force-conflicts on every release so that +# upgrades take ownership of fields written by other 
field managers +# (kubectl-client-side-apply from helm's pre-3.13 default, "before-first-apply" +# synthesised by the apiserver, or runtime writers like obol's auto-config +# patches). Without this, every `obol stack down`/`up` cycle hits whack-a-mole +# SSA conflicts on resources helm shares with other writers. +helmDefaults: + args: + - "--server-side" + - "--force-conflicts" + repositories: - name: traefik url: https://traefik.github.io/charts diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index 1a2635a7..b72b5fdd 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -692,6 +692,17 @@ func writeDeploymentFiles(cfg *config.Config, id, deploymentDir, agentBaseURL st func generateHelmfile(namespace string) string { return fmt.Sprintf(`# Managed by obol agent +# --server-side --force-conflicts on every helm release so upgrades take +# ownership of fields previously written by other managers (e.g. helm's +# pre-3.13 client-side-apply default, the apiserver's synthesised +# "before-first-apply", or runtime kubectl applies). Without this, every +# subsequent `+"`obol agent sync`"+` after a fresh install hits whack-a-mole +# SSA conflicts on the remote-signer Secret labels and similar shared fields. 
+helmDefaults: + args: + - "--server-side" + - "--force-conflicts" + repositories: - name: obol url: https://obolnetwork.github.io/helm-charts/ diff --git a/internal/stack/stack.go b/internal/stack/stack.go index 94395ee3..90700775 100644 --- a/internal/stack/stack.go +++ b/internal/stack/stack.go @@ -362,16 +362,6 @@ func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir s u.Warnf("Failed to preserve LiteLLM config across Helm sync: %v", err) } - // Establish field manager "helm" as the SSA owner of - // litellm-config.data["config.yaml"] before helmfile sync runs, so the - // upcoming helm upgrade's SSA merges in place instead of conflicting - // with the synthesised "before-first-apply" or a previous "helm" Apply - // entry. The data is already snapshotted in previousLiteLLMConfig and - // gets re-applied by restoreLiteLLMConfig after helm runs. - if err := releaseLiteLLMConfigOwnership(cfg, kubeconfigPath, previousLiteLLMConfig); err != nil { - u.Warnf("Failed to claim LiteLLM config field ownership: %v", err) - } - // Compatibility migration if err := migrateDefaultsHTTPRouteHostnames(helmfilePath); err != nil { u.Warnf("Failed to migrate defaults helmfile hostnames: %v", err) @@ -917,54 +907,6 @@ func preserveLiteLLMConfigForHelm(cfg *config.Config, kubeconfigPath string) (st return raw, nil } -// releaseLiteLLMConfigOwnership deletes the litellm-config ConfigMap (if it -// exists) before the next helmfile sync runs, so helm's upgrade creates a -// fresh ConfigMap and becomes the sole SSA owner of every field — no -// pre-existing field-ownership entries, no synthesised "before-first-apply" -// manager, no conflict on `.data.config.yaml`. 
-// -// Why deletion rather than re-applying with manager "helm" or clearing -// managedFields: -// - clearing managedFields ([{}]) leaves data fields with no SSA owner; -// Kubernetes synthesises "before-first-apply" on the next SSA call to -// track them, and helm's apply then conflicts on `.data.config.yaml` -// against that synthesised manager. -// - re-applying with manager "helm" via SSA only claims the fields in our -// manifest. Adjacent fields (labels, annotations, other data keys) stay -// under their original Update manager, and synthesised -// "before-first-apply" still appears on helm's SSA call. -// -// Helm tracks release ownership via the meta.helm.sh/release-name annotation -// on the resource — not via managedFields — but those annotations ride along -// with the ConfigMap content. Helm reconstructs them from its release -// secret on the next upgrade, so deleting the ConfigMap does not orphan it -// from the release. -// -// The window where the ConfigMap is missing is bounded by helmfile sync -// (seconds). Running LiteLLM pods are unaffected because volume projections -// happen at pod start, not on ConfigMap mutation. The user data (custom -// providers, paid model routes) is already snapshotted in -// previousLiteLLMConfig and re-applied by restoreLiteLLMConfig after helm -// finishes. -func releaseLiteLLMConfigOwnership(cfg *config.Config, kubeconfigPath, snapshot string) error { - if strings.TrimSpace(snapshot) == "" { - // No existing ConfigMap to delete (first install or earlier failure). 
- return nil - } - - kubectlBinary := filepath.Join(cfg.BinDir, "kubectl") - cmd := exec.Command(kubectlBinary, - "delete", "configmap", "litellm-config", - "-n", "llm", - "--ignore-not-found", - ) - cmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath) - if out, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("kubectl delete configmap litellm-config: %w\n%s", err, string(out)) - } - return nil -} - func restoreLiteLLMConfig(cfg *config.Config, kubeconfigPath, raw string) error { if strings.TrimSpace(raw) == "" { return nil From be8a0d5e71d2d68e4194a0604053d8c917941b17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Mon, 4 May 2026 13:11:50 +0100 Subject: [PATCH 9/9] Update docker registry handling network loss --- internal/embed/infrastructure/helmfile.yaml | 5 ++++- internal/hermes/hermes.go | 5 ++++- internal/stack/dev_registry.go | 15 ++++++++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index 7f7ca348..ea40be45 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -10,7 +10,10 @@ # SSA conflicts on resources helm shares with other writers. helmDefaults: args: - - "--server-side" + # `=true` form is required: helm's `--server-side` takes a value + # (auto|true|false). Without `=true`, helm consumes the next arg as the + # value and rejects `--force-conflicts` as an unknown apply method. + - "--server-side=true" - "--force-conflicts" repositories: diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index b72b5fdd..d321f296 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -700,7 +700,10 @@ func generateHelmfile(namespace string) string { # SSA conflicts on the remote-signer Secret labels and similar shared fields. 
helmDefaults: args: - - "--server-side" + # =true form is required: helm's --server-side takes a value + # (auto|true|false), so without =true helm consumes the next arg as the + # value and rejects --force-conflicts as an unknown apply method. + - "--server-side=true" - "--force-conflicts" repositories: diff --git a/internal/stack/dev_registry.go b/internal/stack/dev_registry.go index 91ed2f96..7633968a 100644 --- a/internal/stack/dev_registry.go +++ b/internal/stack/dev_registry.go @@ -79,11 +79,20 @@ func ensureDevRegistry(cfg *config.Config, k3dBinary string, mirror registryMirr return nil } - if err := runCommand(exec.Command("docker", "start", containerName)); err != nil { - return fmt.Errorf("start registry %s: %w", containerName, err) + // Container exists but is stopped. Try to start it. + if startErr := runCommand(exec.Command("docker", "start", containerName)); startErr == nil { + return nil } - return nil + // Start failed — most commonly because the k3d-obol-stack-* Docker + // network the registry was attached to has been removed (cluster + // purge or reclaimLeakedDevK3dNetworks). The container's stored + // network reference is now a dangling ID and `docker start` aborts + // with "network ... not found". Force-remove the container and + // fall through to recreate it. The cache content lives on a host + // volume mount, so the recreated container picks it back up + // without re-downloading anything. + _ = runCommand(exec.Command("docker", "rm", "-f", containerName)) } createCmd := exec.Command(