Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 83 additions & 8 deletions cmd/obol/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"path/filepath"
"strings"
"time"

agentmgr "github.com/ObolNetwork/obol-stack/internal/agent"
"github.com/ObolNetwork/obol-stack/internal/agentcrd"
Expand All @@ -20,6 +21,14 @@ import (
"github.com/urfave/cli/v3"
)

// agentDeleteWaitTimeout is how long deleteCRDAgent waits for the K8s
// finalizer to drain before reporting the delete as stuck. Sized to be
// longer than the controller's typical tearDownAgent (which deletes
// Deployment + Service + Secret + remote-signer manifests) but short
// enough that the user notices when a controller is unreachable or
// running a pre-agent-CRD image without finalizer handling.
const agentDeleteWaitTimeout = 60 * time.Second

type agentTarget struct {
Runtime agentruntime.Runtime
ID string
Expand Down Expand Up @@ -220,7 +229,7 @@ Hermes/OpenClaw onboard flow used by the master agent.`,
return nil
}
}
return deleteCRDAgent(cfg, name, u)
return deleteCRDAgent(cfg, name, cmd.Bool("force"), u)
}
}

Expand Down Expand Up @@ -907,21 +916,55 @@ func listCRDAgents(cfg *config.Config) ([]agentListItem, error) {
// (skills + soul.md). Used by `obol agent delete <name>` when the
// argument matches a CRD-declared agent. Idempotent: missing cluster,
// missing CR, and missing host dir are all treated as "already gone".
func deleteCRDAgent(cfg *config.Config, name string, u *ui.UI) error {
//
// The CR delete uses --wait=false and the CLI polls separately so we
// can show a spinner and surface a clear error when the finalizer drain
// stalls (the common cause: a controller image that pre-dates Agent
// CRD support, which never removes the finalizer). With force=true a
// stuck delete is escalated to a finalizer strip so the user can
// recover without hand-running kubectl patch.
func deleteCRDAgent(cfg *config.Config, name string, force bool, u *ui.UI) error {
if err := agentcrd.ValidateName(name); err != nil {
return err
}

ns := agentcrd.Namespace(name)

if err := kubectl.EnsureCluster(cfg); err == nil {
bin, kc := kubectl.Paths(cfg)
ns := agentcrd.Namespace(name)
// --ignore-not-found makes the CRUD idempotent. The namespace
// itself stays — step 2d's controller may own its lifecycle once
// it provisions resources there. This avoids the CLI deleting
// a namespace another piece of the system thinks it owns.
if err := kubectl.Run(bin, kc, "delete", "agent", name, "-n", ns, "--ignore-not-found"); err != nil {

// Fire-and-watch: send the DELETE request immediately, then poll
// for absence under a spinner. --wait=false makes kubectl return
// after the API server accepts the request (DeletionTimestamp
// set) rather than blocking on finalizer drain. That lets us
// surface clear progress and a recovery hint when drain stalls.
if err := kubectl.Run(bin, kc, "delete", "agent", name, "-n", ns, "--ignore-not-found", "--wait=false"); err != nil {
return fmt.Errorf("delete Agent: %w", err)
}

drainErr := u.RunWithSpinner(
fmt.Sprintf("Waiting for Agent %s/%s finalizer to drain", ns, name),
func() error {
return waitForAgentGone(cfg, name, ns, agentDeleteWaitTimeout)
},
)
if drainErr != nil {
if !force {
return fmt.Errorf("%w\n\nThe controller hasn't drained the Agent finalizer. "+
"Re-run with --force to strip the finalizer and complete the deletion locally.",
drainErr)
}
u.Warnf("Drain timed out; stripping Agent finalizer (--force)")
if err := stripAgentFinalizers(cfg, name, ns); err != nil {
return fmt.Errorf("force-strip finalizer: %w", err)
}
// One more wait pass — finalizer stripped, the CR should
// drop within a second or two. Short timeout: if this also
// stalls, something more fundamental is wrong.
if err := waitForAgentGone(cfg, name, ns, 10*time.Second); err != nil {
return fmt.Errorf("agent still present after finalizer strip: %w", err)
}
}
u.Successf("Agent %s/%s deleted", ns, name)
} else {
u.Dim("Cluster unreachable; skipping CR deletion (host-side files only)")
Expand All @@ -940,6 +983,38 @@ func deleteCRDAgent(cfg *config.Config, name string, u *ui.UI) error {
return nil
}

// waitForAgentGone polls for the Agent CR's absence. Returns nil once
// kubectl reports the CR is gone (either fully GC'd or never existed),
// and a timeout error otherwise. The caller decides what to do with a
// timeout — surface to the user or escalate to a finalizer strip.
func waitForAgentGone(cfg *config.Config, name, ns string, timeout time.Duration) error {
bin, kc := kubectl.Paths(cfg)
deadline := time.Now().Add(timeout)
for {
out, err := kubectl.Output(bin, kc, "get", "agent", name, "-n", ns, "-o", "name", "--ignore-not-found")
if err != nil {
return err
}
if strings.TrimSpace(out) == "" {
return nil
}
if time.Now().After(deadline) {
return fmt.Errorf("timed out after %s waiting for Agent %s/%s to be removed", timeout, ns, name)
}
time.Sleep(time.Second)
}
}

// stripAgentFinalizers clears spec.metadata.finalizers via a JSON merge
// patch. The K8s GC then completes the deletion that was stuck. Used by
// deleteCRDAgent under --force when the controller can't drain the
// finalizer (typically a stale controller image).
func stripAgentFinalizers(cfg *config.Config, name, ns string) error {
bin, kc := kubectl.Paths(cfg)
return kubectl.Run(bin, kc, "patch", "agent", name, "-n", ns,
"--type=merge", "-p", `{"metadata":{"finalizers":[]}}`)
}

func listAgentWallets(cfg *config.Config, runtimeValue string, args []string, u *ui.UI) error {
runtimeValue = strings.TrimSpace(runtimeValue)
if runtimeValue != "" && runtimeValue != "all" {
Expand Down
40 changes: 40 additions & 0 deletions cmd/obol/agent_crd.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,43 @@ func getAgentCR(cfg *config.Config, name string) (string, error) {
}
return strings.TrimSpace(out), nil
}

// getAgentCRState reports both existence and whether the CR is mid-
// deletion. Callers that branch on "already exists, skip creation" must
// not treat a Terminating CR as "already exists" — that path silently
// no-ops `obol sell demo quant` while the previous Agent is still
// finalizing, leaving the user confused about why nothing got created.
type agentCRState struct {
Exists bool
Terminating bool
ResourceName string // e.g. "agent.obol.org/demo-quant", empty when absent
}

func getAgentCRState(cfg *config.Config, name string) (agentCRState, error) {
if err := kubectl.EnsureCluster(cfg); err != nil {
return agentCRState{}, err
}
bin, kc := kubectl.Paths(cfg)
// jsonpath outputs the deletion timestamp (empty if not set) so we
// don't need a second kubectl call to disambiguate present-but-
// terminating from fully-present.
out, err := kubectl.Output(bin, kc, "get", "agent", name, "-n", agentcrd.Namespace(name),
"-o", `jsonpath={.metadata.name}{"|"}{.metadata.deletionTimestamp}`,
"--ignore-not-found")
if err != nil {
return agentCRState{}, err
}
trimmed := strings.TrimSpace(out)
if trimmed == "" || trimmed == "|" {
return agentCRState{Exists: false}, nil
}
parts := strings.SplitN(trimmed, "|", 2)
state := agentCRState{
Exists: parts[0] != "",
ResourceName: "agent.obol.org/" + parts[0],
}
if len(parts) == 2 && strings.TrimSpace(parts[1]) != "" {
state.Terminating = true
}
return state, nil
}
16 changes: 13 additions & 3 deletions cmd/obol/sell_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,20 @@ func runAgentBackedDemo(

// 1. Seed host-side files (skills + soul.md) and apply the Agent CR.
// Idempotent — re-running `obol sell demo quant` after a previous
// run is a no-op for the agent if it already exists.
if existing, err := getAgentCR(cfg, agentName); err == nil && existing != "" {
// run is a no-op for the agent if it already exists. A CR that is
// mid-deletion (DeletionTimestamp set, finalizer still draining)
// is NOT treated as "already exists": short-circuiting on it
// means a follow-up `sell demo quant` after a hung `agent delete`
// silently does nothing, which is what motivated this check.
state, stateErr := getAgentCRState(cfg, agentName)
switch {
case stateErr == nil && state.Exists && state.Terminating:
return fmt.Errorf("Agent %s is still being deleted (DeletionTimestamp set, finalizer draining).\n\n"+
"Wait for the controller to finish, or run `obol agent delete %s --force` to strip the finalizer "+
"and retry.", agentName, agentName)
case stateErr == nil && state.Exists:
u.Dim(fmt.Sprintf("Agent %s already exists, leaving as-is", agentName))
} else {
default:
soulWritten, seedErr := agentcrd.SeedHostFiles(cfg, agentName,
spec.Agent.Skills, spec.Agent.Objective, agentcrd.SeedOptions{})
if seedErr != nil {
Expand Down
22 changes: 16 additions & 6 deletions internal/defaults/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,26 @@ var devLocallyBuiltImageBases = []string{
}

// rewriteDevDigestPins walks the copied defaults tree and replaces
// every `<base>@sha256:<hex>` reference whose base is in
// devLocallyBuiltImageBases with `<base>:latest`. Only operates on .yaml
// and .yml files so we don't risk corrupting binaries or charts.
// every `<base>@sha256:<hex>` or `<base>:<short-sha>` reference whose
// base is in devLocallyBuiltImageBases with `<base>:latest`. Only
// operates on .yaml and .yml files so we don't risk corrupting binaries
// or charts.
//
// Both pin styles are matched because release pipelines may publish
// either digest pins (immutable) or short-SHA tag pins (e.g. `:b13254e`)
// — in either case the local dev build is tagged `:latest`, so the
// rewrite needs to catch both forms or `obol stack up` would pull from
// the registry instead of using the freshly-built local image.
func rewriteDevDigestPins(defaultsDir string) error {
patterns := make([]*regexp.Regexp, 0, len(devLocallyBuiltImageBases))
replaceWith := make([]string, 0, len(devLocallyBuiltImageBases))
for _, base := range devLocallyBuiltImageBases {
// Escape the base in case it ever grows a regex metachar; sha256
// digests are hex so a simple character class is enough.
patterns = append(patterns, regexp.MustCompile(regexp.QuoteMeta(base)+"@sha256:[a-f0-9]{64}"))
// Match either:
// <base>@sha256:<64 hex> (digest pin)
// <base>:<7-40 hex> (short-SHA tag pin, e.g. b13254e)
// The short-SHA pattern intentionally excludes `:latest` and any
// non-hex tag so we only rewrite immutable pins.
patterns = append(patterns, regexp.MustCompile(regexp.QuoteMeta(base)+"(@sha256:[a-f0-9]{64}|:[a-f0-9]{7,40})"))
replaceWith = append(replaceWith, base+":latest")
}

Expand Down
26 changes: 20 additions & 6 deletions internal/defaults/defaults_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package defaults
import (
"os"
"path/filepath"
"regexp"
"strings"
"testing"

Expand Down Expand Up @@ -42,10 +43,12 @@ func TestCopyInfrastructure_DevModeRewritesDigestPins(t *testing.T) {
}
}

func TestCopyInfrastructure_ProductionPreservesDigestPins(t *testing.T) {
// Without OBOL_DEVELOPMENT=true, the digest pins must survive
// untouched. A regression here would silently downgrade prod
// installs to floating :latest tags.
func TestCopyInfrastructure_ProductionPreservesImagePins(t *testing.T) {
// Without OBOL_DEVELOPMENT=true, the immutable image pins must
// survive untouched. A regression here would silently downgrade prod
// installs to floating :latest tags. We accept either pin style
// (digest or short-SHA tag) — both are immutable, and the rewrite
// path under OBOL_DEVELOPMENT now handles both equivalently.
t.Setenv("OBOL_DEVELOPMENT", "")

cfg := &config.Config{ConfigDir: t.TempDir()}
Expand All @@ -56,11 +59,22 @@ func TestCopyInfrastructure_ProductionPreservesDigestPins(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if !strings.Contains(string(data), "ghcr.io/obolnetwork/serviceoffer-controller@sha256:") {
t.Error("production install lost serviceoffer-controller digest pin")
out := string(data)
if strings.Contains(out, "ghcr.io/obolnetwork/serviceoffer-controller:latest") {
t.Error("production install downgraded serviceoffer-controller to :latest")
}
hasDigest := strings.Contains(out, "ghcr.io/obolnetwork/serviceoffer-controller@sha256:")
hasShortTag := pinnedShortTag.MatchString(out)
if !hasDigest && !hasShortTag {
t.Error("production install lost serviceoffer-controller immutable pin (expected digest or short-SHA tag)")
}
}

// pinnedShortTag matches a serviceoffer-controller tag of 7-40 hex chars
// (git short SHA). Used by the production-pin test to accept either of
// the two pinning styles the dev-mode rewriter knows about.
var pinnedShortTag = regexp.MustCompile(`ghcr\.io/obolnetwork/serviceoffer-controller:[a-f0-9]{7,40}\b`)

func TestCopyInfrastructureRendersStackPlaceholders(t *testing.T) {
cfg := &config.Config{ConfigDir: t.TempDir()}

Expand Down
4 changes: 1 addition & 3 deletions internal/embed/embed_image_pin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@ func TestEmbeddedImages_NoNewLatestTags(t *testing.T) {

// Known unpinned images as of PR #343 follow-up. Each entry MUST have a
// TODO in the template body explaining the pin-by-digest policy.
allowed := map[string]string{
"base/templates/llm.yaml:ghcr.io/obolnetwork/x402-buyer:latest": "x402-buyer: pin by digest once CI publishes a stable tag",
}
allowed := map[string]string{}

files := []string{
"base/templates/llm.yaml",
Expand Down
13 changes: 6 additions & 7 deletions internal/embed/infrastructure/base/templates/llm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -203,13 +203,12 @@ spec:
cpu: "1"
memory: 1Gi
- name: x402-buyer
# TODO(image-pin): replace :latest with a digest (ghcr.io/obolnetwork/x402-buyer@sha256:…)
# or the short-SHA tag pattern used for litellm above (sha-<short>).
# :latest drift between the deployed sidecar and the main branch is what
# produced the /v1 vs bare-path 404 debacle in PR #343: the mux was
# updated to register both paths, but the deployed image still served
# only /v1/*. See internal/embed/embed_image_pin_test.go.
image: ghcr.io/obolnetwork/x402-buyer:latest
# Pinned to the same short-SHA tag as the rest of the x402 control
# plane (verifier + controller in x402.yaml). The previous :latest
# was the root cause of the /v1 vs bare-path 404 saga in PR #343
# (deployed sidecar lagged the source mux). See
# internal/embed/embed_image_pin_test.go.
image: ghcr.io/obolnetwork/x402-buyer:b13254e
imagePullPolicy: IfNotPresent
args:
- --config-dir=/config/buyer-config
Expand Down
4 changes: 2 additions & 2 deletions internal/embed/infrastructure/base/templates/x402.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ spec:
serviceAccountName: x402-verifier
containers:
- name: verifier
image: ghcr.io/obolnetwork/x402-verifier@sha256:05d6e535143974386e4de054eb44ef83270b62da5e48add1f113f4743d08eccc
image: ghcr.io/obolnetwork/x402-verifier:b13254e
imagePullPolicy: IfNotPresent
ports:
- name: http
Expand Down Expand Up @@ -277,7 +277,7 @@ spec:
serviceAccountName: serviceoffer-controller
containers:
- name: controller
image: ghcr.io/obolnetwork/serviceoffer-controller@sha256:943ba3506c7fdd090bd0db8e563cab6a32c558de8ab69f505008cce48ea3b366
image: ghcr.io/obolnetwork/serviceoffer-controller:b13254e
imagePullPolicy: IfNotPresent
env:
- name: POD_NAMESPACE
Expand Down
2 changes: 1 addition & 1 deletion internal/embed/infrastructure/values/erpc.yaml.gotmpl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ replicas: 1

image:
repository: ghcr.io/erpc/erpc
tag: 0.0.62
tag: 0.0.64
pullPolicy: IfNotPresent

# Config file
Expand Down
31 changes: 31 additions & 0 deletions renovate.json
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,20 @@
"depNameTemplate": "k3s-io/k3s",
"versioningTemplate": "regex:^v?(?<major>\\d+)\\.(?<minor>\\d+)\\.(?<patch>\\d+)([+-](?<build>.+))?$",
"extractVersionTemplate": "^v(?<version>.+)$"
},
{
"customType": "regex",
"description": "Track ghcr.io/obolnetwork/* image pins in embedded infra templates. Captures both `image: <ref>:<tag>` and `image: <ref>:<tag>@sha256:<digest>` shapes, plus the pure-digest `image: <ref>@sha256:<digest>` form used historically. Renovate uses the docker datasource to discover newer tags and (with pinDigests) keeps the @sha256 part in lockstep. For this to drive useful tag bumps, release CI should tag images with sortable versions (e.g. v0.9.0-rc10) rather than only short commit SHAs — loose versioning can't decide which arbitrary SHA is newer.",
"matchStrings": [
"image:\\s*(?<depName>ghcr\\.io/obolnetwork/[A-Za-z0-9._-]+):(?<currentValue>[A-Za-z0-9._-]+)(?:@(?<currentDigest>sha256:[a-f0-9]+))?",
"image:\\s*(?<depName>ghcr\\.io/obolnetwork/[A-Za-z0-9._-]+)@(?<currentDigest>sha256:[a-f0-9]+)"
],
"fileMatch": [
"^internal/embed/infrastructure/.+\\.yaml$",
"^internal/embed/infrastructure/.+\\.yaml\\.gotmpl$"
],
"datasourceTemplate": "docker",
"versioningTemplate": "loose"
}
],
"packageRules": [
Expand Down Expand Up @@ -331,6 +345,23 @@
"before 6am on monday"
],
"groupName": "k3s image updates"
},
{
"description": "Group ghcr.io/obolnetwork/* image bumps (verifier, controller, buyer, demo-server, public-storefront, etc.) and ensure they are digest-pinned. The custom regex manager above captures the tag and optional @sha256 suffix; pinDigests=true causes Renovate to add a digest to any pure-tag pin and keep it fresh.",
"matchDatasources": [
"docker"
],
"matchPackagePrefixes": [
"ghcr.io/obolnetwork/"
],
"labels": [
"renovate/obol-images"
],
"schedule": [
"before 6am on monday"
],
"groupName": "Obol Network image updates",
"pinDigests": true
}
]
}
Loading