diff --git a/cmd/obol/agent.go b/cmd/obol/agent.go
index f372b0fe..8500ba15 100644
--- a/cmd/obol/agent.go
+++ b/cmd/obol/agent.go
@@ -8,6 +8,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"time"
 
 	agentmgr "github.com/ObolNetwork/obol-stack/internal/agent"
 	"github.com/ObolNetwork/obol-stack/internal/agentcrd"
@@ -20,6 +21,14 @@ import (
 	"github.com/urfave/cli/v3"
 )
 
+// agentDeleteWaitTimeout is how long deleteCRDAgent waits for the K8s
+// finalizer to drain before reporting the delete as stuck. Sized to be
+// longer than the controller's typical tearDownAgent (which deletes
+// Deployment + Service + Secret + remote-signer manifests) but short
+// enough that the user notices when a controller is unreachable or
+// running a pre-agent-CRD image without finalizer handling.
+const agentDeleteWaitTimeout = 60 * time.Second
+
 type agentTarget struct {
 	Runtime agentruntime.Runtime
 	ID      string
@@ -220,7 +229,7 @@ Hermes/OpenClaw onboard flow used by the master agent.`,
 				return nil
 			}
 		}
-		return deleteCRDAgent(cfg, name, u)
+		return deleteCRDAgent(cfg, name, cmd.Bool("force"), u)
 	}
 }
@@ -907,21 +916,55 @@ func listCRDAgents(cfg *config.Config) ([]agentListItem, error) {
 // (skills + soul.md). Used by `obol agent delete <name>` when the
 // argument matches a CRD-declared agent. Idempotent: missing cluster,
 // missing CR, and missing host dir are all treated as "already gone".
-func deleteCRDAgent(cfg *config.Config, name string, u *ui.UI) error {
+//
+// The CR delete uses --wait=false and the CLI polls separately so we
+// can show a spinner and surface a clear error when the finalizer drain
+// stalls (the common cause: a controller image that pre-dates Agent
+// CRD support, which never removes the finalizer). With force=true a
+// stuck delete is escalated to a finalizer strip so the user can
+// recover without hand-running kubectl patch.
+func deleteCRDAgent(cfg *config.Config, name string, force bool, u *ui.UI) error {
 	if err := agentcrd.ValidateName(name); err != nil {
 		return err
 	}
+
+	ns := agentcrd.Namespace(name)
+
 	if err := kubectl.EnsureCluster(cfg); err == nil {
 		bin, kc := kubectl.Paths(cfg)
-		ns := agentcrd.Namespace(name)
-		// --ignore-not-found makes the CRUD idempotent. The namespace
-		// itself stays — step 2d's controller may own its lifecycle once
-		// it provisions resources there. This avoids the CLI deleting
-		// a namespace another piece of the system thinks it owns.
-		if err := kubectl.Run(bin, kc, "delete", "agent", name, "-n", ns, "--ignore-not-found"); err != nil {
+
+		// Fire-and-watch: send the DELETE request immediately, then poll
+		// for absence under a spinner. --wait=false makes kubectl return
+		// after the API server accepts the request (DeletionTimestamp
+		// set) rather than blocking on finalizer drain. That lets us
+		// surface clear progress and a recovery hint when drain stalls.
+		if err := kubectl.Run(bin, kc, "delete", "agent", name, "-n", ns, "--ignore-not-found", "--wait=false"); err != nil {
 			return fmt.Errorf("delete Agent: %w", err)
 		}
+
+		drainErr := u.RunWithSpinner(
+			fmt.Sprintf("Waiting for Agent %s/%s finalizer to drain", ns, name),
+			func() error {
+				return waitForAgentGone(cfg, name, ns, agentDeleteWaitTimeout)
+			},
+		)
+		if drainErr != nil {
+			if !force {
+				return fmt.Errorf("%w\n\nThe controller hasn't drained the Agent finalizer. "+
+					"Re-run with --force to strip the finalizer and complete the deletion locally.",
+					drainErr)
+			}
+			u.Warnf("Drain timed out; stripping Agent finalizer (--force)")
+			if err := stripAgentFinalizers(cfg, name, ns); err != nil {
+				return fmt.Errorf("force-strip finalizer: %w", err)
+			}
+			// One more wait pass — finalizer stripped, the CR should
+			// drop within a second or two. Short timeout: if this also
+			// stalls, something more fundamental is wrong.
+			if err := waitForAgentGone(cfg, name, ns, 10*time.Second); err != nil {
+				return fmt.Errorf("agent still present after finalizer strip: %w", err)
+			}
+		}
 		u.Successf("Agent %s/%s deleted", ns, name)
 	} else {
 		u.Dim("Cluster unreachable; skipping CR deletion (host-side files only)")
@@ -940,6 +983,38 @@ func deleteCRDAgent(cfg *config.Config, name string, u *ui.UI) error {
 	return nil
 }
 
+// waitForAgentGone polls for the Agent CR's absence. Returns nil once
+// kubectl reports the CR is gone (either fully GC'd or never existed),
+// and a timeout error otherwise. The caller decides what to do with a
+// timeout — surface to the user or escalate to a finalizer strip.
+func waitForAgentGone(cfg *config.Config, name, ns string, timeout time.Duration) error {
+	bin, kc := kubectl.Paths(cfg)
+	deadline := time.Now().Add(timeout)
+	for {
+		out, err := kubectl.Output(bin, kc, "get", "agent", name, "-n", ns, "-o", "name", "--ignore-not-found")
+		if err != nil {
+			return err
+		}
+		if strings.TrimSpace(out) == "" {
+			return nil
+		}
+		if time.Now().After(deadline) {
+			return fmt.Errorf("timed out after %s waiting for Agent %s/%s to be removed", timeout, ns, name)
+		}
+		time.Sleep(time.Second)
+	}
+}
+
+// stripAgentFinalizers clears metadata.finalizers via a JSON merge
+// patch; the API server then completes the deletion that was stuck.
+// Used by deleteCRDAgent under --force when the controller can't drain
+// the finalizer (typically a stale controller image).
+func stripAgentFinalizers(cfg *config.Config, name, ns string) error {
+	bin, kc := kubectl.Paths(cfg)
+	return kubectl.Run(bin, kc, "patch", "agent", name, "-n", ns,
+		"--type=merge", "-p", `{"metadata":{"finalizers":[]}}`)
+}
+
 func listAgentWallets(cfg *config.Config, runtimeValue string, args []string, u *ui.UI) error {
 	runtimeValue = strings.TrimSpace(runtimeValue)
 	if runtimeValue != "" && runtimeValue != "all" {
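
Why the --force path works with a single empty-array patch: `kubectl patch --type=merge` sends an RFC 7386 JSON merge patch, and merge-patch semantics replace arrays wholesale rather than merging them element-wise, so `{"metadata":{"finalizers":[]}}` clears the list in one request. A minimal hand-rolled sketch of that semantic (illustration only, not the apiserver's implementation; the finalizer name below is hypothetical):

package main

import (
	"encoding/json"
	"fmt"
)

// mergePatch applies RFC 7386 semantics: objects merge key-by-key,
// null deletes a key, and every non-object value — including arrays
// like metadata.finalizers — replaces the target wholesale.
func mergePatch(target, patch any) any {
	patchObj, ok := patch.(map[string]any)
	if !ok {
		return patch // arrays and scalars replace outright
	}
	targetObj, _ := target.(map[string]any)
	if targetObj == nil {
		targetObj = map[string]any{}
	}
	for k, v := range patchObj {
		if v == nil {
			delete(targetObj, k)
			continue
		}
		targetObj[k] = mergePatch(targetObj[k], v)
	}
	return targetObj
}

func main() {
	var cr, patch any
	// "agent.obol.org/teardown" is a hypothetical finalizer name.
	json.Unmarshal([]byte(`{"metadata":{"name":"demo-quant","finalizers":["agent.obol.org/teardown"]}}`), &cr)
	json.Unmarshal([]byte(`{"metadata":{"finalizers":[]}}`), &patch)
	out, _ := json.Marshal(mergePatch(cr, patch))
	fmt.Println(string(out)) // {"metadata":{"finalizers":[],"name":"demo-quant"}}
}
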
"+ + "Re-run with --force to strip the finalizer and complete the deletion locally.", + drainErr) + } + u.Warnf("Drain timed out; stripping Agent finalizer (--force)") + if err := stripAgentFinalizers(cfg, name, ns); err != nil { + return fmt.Errorf("force-strip finalizer: %w", err) + } + // One more wait pass — finalizer stripped, the CR should + // drop within a second or two. Short timeout: if this also + // stalls, something more fundamental is wrong. + if err := waitForAgentGone(cfg, name, ns, 10*time.Second); err != nil { + return fmt.Errorf("agent still present after finalizer strip: %w", err) + } + } u.Successf("Agent %s/%s deleted", ns, name) } else { u.Dim("Cluster unreachable; skipping CR deletion (host-side files only)") @@ -940,6 +983,38 @@ func deleteCRDAgent(cfg *config.Config, name string, u *ui.UI) error { return nil } +// waitForAgentGone polls for the Agent CR's absence. Returns nil once +// kubectl reports the CR is gone (either fully GC'd or never existed), +// and a timeout error otherwise. The caller decides what to do with a +// timeout — surface to the user or escalate to a finalizer strip. +func waitForAgentGone(cfg *config.Config, name, ns string, timeout time.Duration) error { + bin, kc := kubectl.Paths(cfg) + deadline := time.Now().Add(timeout) + for { + out, err := kubectl.Output(bin, kc, "get", "agent", name, "-n", ns, "-o", "name", "--ignore-not-found") + if err != nil { + return err + } + if strings.TrimSpace(out) == "" { + return nil + } + if time.Now().After(deadline) { + return fmt.Errorf("timed out after %s waiting for Agent %s/%s to be removed", timeout, ns, name) + } + time.Sleep(time.Second) + } +} + +// stripAgentFinalizers clears spec.metadata.finalizers via a JSON merge +// patch. The K8s GC then completes the deletion that was stuck. Used by +// deleteCRDAgent under --force when the controller can't drain the +// finalizer (typically a stale controller image). +func stripAgentFinalizers(cfg *config.Config, name, ns string) error { + bin, kc := kubectl.Paths(cfg) + return kubectl.Run(bin, kc, "patch", "agent", name, "-n", ns, + "--type=merge", "-p", `{"metadata":{"finalizers":[]}}`) +} + func listAgentWallets(cfg *config.Config, runtimeValue string, args []string, u *ui.UI) error { runtimeValue = strings.TrimSpace(runtimeValue) if runtimeValue != "" && runtimeValue != "all" { diff --git a/cmd/obol/agent_crd.go b/cmd/obol/agent_crd.go index c790486d..3de96809 100644 --- a/cmd/obol/agent_crd.go +++ b/cmd/obol/agent_crd.go @@ -164,3 +164,43 @@ func getAgentCR(cfg *config.Config, name string) (string, error) { } return strings.TrimSpace(out), nil } + +// getAgentCRState reports both existence and whether the CR is mid- +// deletion. Callers that branch on "already exists, skip creation" must +// not treat a Terminating CR as "already exists" — that path silently +// no-ops `obol sell demo quant` while the previous Agent is still +// finalizing, leaving the user confused about why nothing got created. +type agentCRState struct { + Exists bool + Terminating bool + ResourceName string // e.g. "agent.obol.org/demo-quant", empty when absent +} + +func getAgentCRState(cfg *config.Config, name string) (agentCRState, error) { + if err := kubectl.EnsureCluster(cfg); err != nil { + return agentCRState{}, err + } + bin, kc := kubectl.Paths(cfg) + // jsonpath outputs the deletion timestamp (empty if not set) so we + // don't need a second kubectl call to disambiguate present-but- + // terminating from fully-present. 
diff --git a/cmd/obol/sell_agent.go b/cmd/obol/sell_agent.go
index 68319293..c7207a28 100644
--- a/cmd/obol/sell_agent.go
+++ b/cmd/obol/sell_agent.go
@@ -282,10 +282,20 @@ func runAgentBackedDemo(
 
 	// 1. Seed host-side files (skills + soul.md) and apply the Agent CR.
 	//    Idempotent — re-running `obol sell demo quant` after a previous
-	//    run is a no-op for the agent if it already exists.
-	if existing, err := getAgentCR(cfg, agentName); err == nil && existing != "" {
+	//    run is a no-op for the agent if it already exists. A CR that is
+	//    mid-deletion (DeletionTimestamp set, finalizer still draining)
+	//    is NOT treated as "already exists": short-circuiting on it
+	//    means a follow-up `sell demo quant` after a hung `agent delete`
+	//    silently does nothing, which is what motivated this check.
+	state, stateErr := getAgentCRState(cfg, agentName)
+	switch {
+	case stateErr == nil && state.Exists && state.Terminating:
+		return fmt.Errorf("Agent %s is still being deleted (DeletionTimestamp set, finalizer draining).\n\n"+
+			"Wait for the controller to finish, or run `obol agent delete %s --force` to strip the finalizer "+
+			"and retry.", agentName, agentName)
+	case stateErr == nil && state.Exists:
 		u.Dim(fmt.Sprintf("Agent %s already exists, leaving as-is", agentName))
-	} else {
+	default:
 		soulWritten, seedErr := agentcrd.SeedHostFiles(cfg, agentName, spec.Agent.Skills, spec.Agent.Objective,
 			agentcrd.SeedOptions{})
 		if seedErr != nil {
diff --git a/internal/defaults/defaults.go b/internal/defaults/defaults.go
index f4fd702c..28131a7c 100644
--- a/internal/defaults/defaults.go
+++ b/internal/defaults/defaults.go
@@ -96,16 +96,26 @@ var devLocallyBuiltImageBases = []string{
 }
 
 // rewriteDevDigestPins walks the copied defaults tree and replaces
-// every `<base>@sha256:<digest>` reference whose base is in
-// devLocallyBuiltImageBases with `:latest`. Only operates on .yaml
-// and .yml files so we don't risk corrupting binaries or charts.
+// every `<base>@sha256:<digest>` or `<base>:<short-sha>` reference
+// whose base is in devLocallyBuiltImageBases with `:latest`. Only
+// operates on .yaml and .yml files so we don't risk corrupting binaries
+// or charts.
+//
+// Both pin styles are matched because release pipelines may publish
+// either digest pins (immutable) or short-SHA tag pins (e.g. `:b13254e`)
+// — in either case the local dev build is tagged `:latest`, so the
+// rewrite needs to catch both forms or `obol stack up` would pull from
+// the registry instead of using the freshly-built local image.
 func rewriteDevDigestPins(defaultsDir string) error {
 	patterns := make([]*regexp.Regexp, 0, len(devLocallyBuiltImageBases))
 	replaceWith := make([]string, 0, len(devLocallyBuiltImageBases))
 	for _, base := range devLocallyBuiltImageBases {
-		// Escape the base in case it ever grows a regex metachar; sha256
-		// digests are hex so a simple character class is enough.
-		patterns = append(patterns, regexp.MustCompile(regexp.QuoteMeta(base)+"@sha256:[a-f0-9]{64}"))
+		// Match either:
+		//   @sha256:<64 hex>   (digest pin)
+		//   :<7-40 hex>        (short-SHA tag pin, e.g. b13254e)
+		// The short-SHA pattern intentionally excludes `:latest` and any
+		// non-hex tag so we only rewrite immutable pins.
+		patterns = append(patterns, regexp.MustCompile(regexp.QuoteMeta(base)+"(@sha256:[a-f0-9]{64}|:[a-f0-9]{7,40})"))
 		replaceWith = append(replaceWith, base+":latest")
 	}
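
A quick standalone sanity check of the widened pattern, built the same way rewriteDevDigestPins builds it (sample refs only; the digest is the serviceoffer-controller pin this diff removes):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	base := "ghcr.io/obolnetwork/serviceoffer-controller"
	// Same construction as rewriteDevDigestPins.
	re := regexp.MustCompile(regexp.QuoteMeta(base) + "(@sha256:[a-f0-9]{64}|:[a-f0-9]{7,40})")
	for _, ref := range []string{
		base + "@sha256:943ba3506c7fdd090bd0db8e563cab6a32c558de8ab69f505008cce48ea3b366", // digest pin -> rewritten
		base + ":b13254e", // short-SHA tag pin -> rewritten
		base + ":latest",  // non-hex tag -> left alone
		base + ":v1.29.4", // version tag -> left alone ('v' and '.' are non-hex)
	} {
		fmt.Println(re.ReplaceAllString(ref, base+":latest"))
	}
}
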
diff --git a/internal/defaults/defaults_test.go b/internal/defaults/defaults_test.go
index aaae1aad..709534b1 100644
--- a/internal/defaults/defaults_test.go
+++ b/internal/defaults/defaults_test.go
@@ -3,6 +3,7 @@ package defaults
 import (
 	"os"
 	"path/filepath"
+	"regexp"
 	"strings"
 	"testing"
 
@@ -42,10 +43,12 @@ func TestCopyInfrastructure_DevModeRewritesDigestPins(t *testing.T) {
 	}
 }
 
-func TestCopyInfrastructure_ProductionPreservesDigestPins(t *testing.T) {
-	// Without OBOL_DEVELOPMENT=true, the digest pins must survive
-	// untouched. A regression here would silently downgrade prod
-	// installs to floating :latest tags.
+func TestCopyInfrastructure_ProductionPreservesImagePins(t *testing.T) {
+	// Without OBOL_DEVELOPMENT=true, the immutable image pins must
+	// survive untouched. A regression here would silently downgrade prod
+	// installs to floating :latest tags. We accept either pin style
+	// (digest or short-SHA tag) — both are immutable, and the rewrite
+	// path under OBOL_DEVELOPMENT now handles both equivalently.
 	t.Setenv("OBOL_DEVELOPMENT", "")
 
 	cfg := &config.Config{ConfigDir: t.TempDir()}
@@ -56,11 +59,22 @@
 	if err != nil {
 		t.Fatal(err)
 	}
-	if !strings.Contains(string(data), "ghcr.io/obolnetwork/serviceoffer-controller@sha256:") {
-		t.Error("production install lost serviceoffer-controller digest pin")
+	out := string(data)
+	if strings.Contains(out, "ghcr.io/obolnetwork/serviceoffer-controller:latest") {
+		t.Error("production install downgraded serviceoffer-controller to :latest")
+	}
+	hasDigest := strings.Contains(out, "ghcr.io/obolnetwork/serviceoffer-controller@sha256:")
+	hasShortTag := pinnedShortTag.MatchString(out)
+	if !hasDigest && !hasShortTag {
+		t.Error("production install lost serviceoffer-controller immutable pin (expected digest or short-SHA tag)")
 	}
 }
 
+// pinnedShortTag matches a serviceoffer-controller tag of 7-40 hex chars
+// (git short SHA). Used by the production-pin test to accept either of
+// the two pinning styles the dev-mode rewriter knows about.
+var pinnedShortTag = regexp.MustCompile(`ghcr\.io/obolnetwork/serviceoffer-controller:[a-f0-9]{7,40}\b`)
+
 func TestCopyInfrastructureRendersStackPlaceholders(t *testing.T) {
 	cfg := &config.Config{ConfigDir: t.TempDir()}
diff --git a/internal/embed/embed_image_pin_test.go b/internal/embed/embed_image_pin_test.go
index 2459fc40..36dd1676 100644
--- a/internal/embed/embed_image_pin_test.go
+++ b/internal/embed/embed_image_pin_test.go
@@ -32,9 +32,7 @@ func TestEmbeddedImages_NoNewLatestTags(t *testing.T) {
 
 	// Known unpinned images as of PR #343 follow-up. Each entry MUST have a
 	// TODO in the template body explaining the pin-by-digest policy.
-	allowed := map[string]string{
-		"base/templates/llm.yaml:ghcr.io/obolnetwork/x402-buyer:latest": "x402-buyer: pin by digest once CI publishes a stable tag",
-	}
+	allowed := map[string]string{}
 
 	files := []string{
 		"base/templates/llm.yaml",
diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml
index bf31e2bc..4e70ec1e 100644
--- a/internal/embed/infrastructure/base/templates/llm.yaml
+++ b/internal/embed/infrastructure/base/templates/llm.yaml
@@ -203,13 +203,12 @@ spec:
             cpu: "1"
             memory: 1Gi
       - name: x402-buyer
-        # TODO(image-pin): replace :latest with a digest (ghcr.io/obolnetwork/x402-buyer@sha256:…)
-        # or the short-SHA tag pattern used for litellm above (sha-<sha>).
-        # :latest drift between the deployed sidecar and the main branch is what
-        # produced the /v1 vs bare-path 404 debacle in PR #343: the mux was
-        # updated to register both paths, but the deployed image still served
-        # only /v1/*. See internal/embed/embed_image_pin_test.go.
-        image: ghcr.io/obolnetwork/x402-buyer:latest
+        # Pinned to the same short-SHA tag as the rest of the x402 control
+        # plane (verifier + controller in x402.yaml). The previous :latest
+        # was the root cause of the /v1 vs bare-path 404 saga in PR #343
+        # (deployed sidecar lagged the source mux). See
+        # internal/embed/embed_image_pin_test.go.
+        image: ghcr.io/obolnetwork/x402-buyer:b13254e
         imagePullPolicy: IfNotPresent
         args:
         - --config-dir=/config/buyer-config
diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml
index 9db18619..1431c371 100644
--- a/internal/embed/infrastructure/base/templates/x402.yaml
+++ b/internal/embed/infrastructure/base/templates/x402.yaml
@@ -206,7 +206,7 @@ spec:
       serviceAccountName: x402-verifier
       containers:
       - name: verifier
-        image: ghcr.io/obolnetwork/x402-verifier@sha256:05d6e535143974386e4de054eb44ef83270b62da5e48add1f113f4743d08eccc
+        image: ghcr.io/obolnetwork/x402-verifier:b13254e
         imagePullPolicy: IfNotPresent
         ports:
         - name: http
@@ -277,7 +277,7 @@ spec:
      serviceAccountName: serviceoffer-controller
      containers:
      - name: controller
-        image: ghcr.io/obolnetwork/serviceoffer-controller@sha256:943ba3506c7fdd090bd0db8e563cab6a32c558de8ab69f505008cce48ea3b366
+        image: ghcr.io/obolnetwork/serviceoffer-controller:b13254e
         imagePullPolicy: IfNotPresent
         env:
         - name: POD_NAMESPACE
diff --git a/internal/embed/infrastructure/values/erpc.yaml.gotmpl b/internal/embed/infrastructure/values/erpc.yaml.gotmpl
index f9a1fb49..3b45529e 100644
--- a/internal/embed/infrastructure/values/erpc.yaml.gotmpl
+++ b/internal/embed/infrastructure/values/erpc.yaml.gotmpl
@@ -16,7 +16,7 @@ replicas: 1
 
 image:
   repository: ghcr.io/erpc/erpc
-  tag: 0.0.62
+  tag: 0.0.64
  pullPolicy: IfNotPresent
 
 # Config file
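
The llm.yaml comment above asserts the buyer now moves in lockstep with the verifier and controller. That invariant could be pinned down with a test along these lines; the test name, the lockstep assumption, and reading straight from the source tree (rather than the embedded FS the existing pin test may use) are all assumptions of this sketch, not code in the repo:

package embed

import (
	"os"
	"regexp"
	"testing"
)

// x402Pin matches the three x402 control-plane images and captures
// their short-SHA tag.
var x402Pin = regexp.MustCompile(`ghcr\.io/obolnetwork/(?:x402-buyer|x402-verifier|serviceoffer-controller):([a-f0-9]{7,40})\b`)

func TestX402ControlPlaneTagsMatch(t *testing.T) {
	tags := map[string]bool{}
	for _, path := range []string{
		"infrastructure/base/templates/llm.yaml",
		"infrastructure/base/templates/x402.yaml",
	} {
		data, err := os.ReadFile(path)
		if err != nil {
			t.Fatal(err)
		}
		for _, m := range x402Pin.FindAllStringSubmatch(string(data), -1) {
			tags[m[1]] = true
		}
	}
	if len(tags) != 1 {
		t.Errorf("x402 control-plane images are not pinned to a single tag: %v", tags)
	}
}

Note this only compares hex tags, so it complements rather than replaces TestEmbeddedImages_NoNewLatestTags, which is what catches a :latest regression.
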
diff --git a/renovate.json b/renovate.json
index 94e35fe7..4f258dfe 100644
--- a/renovate.json
+++ b/renovate.json
@@ -118,6 +118,20 @@
       "depNameTemplate": "k3s-io/k3s",
       "versioningTemplate": "regex:^v?(?<major>\\d+)\\.(?<minor>\\d+)\\.(?<patch>\\d+)([+-](?<compatibility>.+))?$",
       "extractVersionTemplate": "^v(?<version>.+)$"
+    },
+    {
+      "customType": "regex",
+      "description": "Track ghcr.io/obolnetwork/* image pins in embedded infra templates. Captures both `image: <repo>:<tag>` and `image: <repo>:<tag>@sha256:<digest>` shapes, plus the pure-digest `image: <repo>@sha256:<digest>` form used historically. Renovate uses the docker datasource to discover newer tags and (with pinDigests) keeps the @sha256 part in lockstep. For this to drive useful tag bumps, release CI should tag images with sortable versions (e.g. v0.9.0-rc10) rather than only short commit SHAs — loose versioning can't decide which arbitrary SHA is newer.",
+      "matchStrings": [
+        "image:\\s*(?<depName>ghcr\\.io/obolnetwork/[A-Za-z0-9._-]+):(?<currentValue>[A-Za-z0-9._-]+)(?:@(?<currentDigest>sha256:[a-f0-9]+))?",
+        "image:\\s*(?<depName>ghcr\\.io/obolnetwork/[A-Za-z0-9._-]+)@(?<currentDigest>sha256:[a-f0-9]+)"
+      ],
+      "fileMatch": [
+        "^internal/embed/infrastructure/.+\\.yaml$",
+        "^internal/embed/infrastructure/.+\\.yaml\\.gotmpl$"
+      ],
+      "datasourceTemplate": "docker",
+      "versioningTemplate": "loose"
     }
   ],
   "packageRules": [
@@ -331,6 +345,23 @@
       "schedule": [
         "before 6am on monday"
       ],
       "groupName": "k3s image updates"
+    },
+    {
+      "description": "Group ghcr.io/obolnetwork/* image bumps (verifier, controller, buyer, demo-server, public-storefront, etc.) and ensure they are digest-pinned. The custom regex manager above captures the tag and optional @sha256 suffix; pinDigests=true causes Renovate to add a digest to any pure-tag pin and keep it fresh.",
+      "matchDatasources": [
+        "docker"
+      ],
+      "matchPackagePrefixes": [
+        "ghcr.io/obolnetwork/"
+      ],
+      "labels": [
+        "renovate/obol-images"
+      ],
+      "schedule": [
+        "before 6am on monday"
+      ],
+      "groupName": "Obol Network image updates",
+      "pinDigests": true
     }
   ]
 }
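
To sanity-check the new manager regex offline, the first matchStrings entry can be exercised in Go, with one caveat: Go's regexp wants `(?P<name>...)` where Renovate accepts `(?<name>...)`, so the named groups are transliterated in this sketch:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// First matchStrings entry from renovate.json, with Renovate's
	// (?<name>...) groups rewritten to Go's (?P<name>...).
	re := regexp.MustCompile(`image:\s*(?P<depName>ghcr\.io/obolnetwork/[A-Za-z0-9._-]+):(?P<currentValue>[A-Za-z0-9._-]+)(?:@(?P<currentDigest>sha256:[a-f0-9]+))?`)
	lines := []string{
		"image: ghcr.io/obolnetwork/x402-buyer:b13254e",                                      // tag pin, as committed
		"image: ghcr.io/obolnetwork/x402-verifier:b13254e@sha256:" + strings.Repeat("a", 64), // shape after pinDigests adds a digest
	}
	for _, line := range lines {
		m := re.FindStringSubmatch(line)
		if m == nil {
			fmt.Println("no match:", line)
			continue
		}
		for i, name := range re.SubexpNames() {
			if name != "" && m[i] != "" {
				fmt.Printf("%s=%q ", name, m[i])
			}
		}
		fmt.Println()
	}
}

The pure-digest form needs the second matchStrings entry because the first requires a `:` plus tag characters immediately after the repo name, which a bare `@sha256:` reference never supplies.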