Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions internal/embed/embed_buyer_state_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package embed

import (
"testing"
)

// TestBuyerStatePVC guards the persistence wiring for x402-buyer's /state
// directory in base/templates/llm.yaml. It checks three things:
//
//   - a PersistentVolumeClaim named x402-buyer-state is declared in the llm
//     namespace with exactly one access mode (ReadWriteOnce), the local-path
//     storage class, and a storage request;
//   - the litellm Deployment's x402-buyer-state volume points at that claim
//     and is not an emptyDir;
//   - the litellm Deployment uses the Recreate rollout strategy.
//
// Regression background: with an emptyDir, consumed.json vanished on every
// pod restart, so the buyer re-spent already-consumed auths from the
// ConfigMap pool and the facilitator answered with 400s ("nonce already
// used") until someone reseeded manually via `buy.py process --all`.
func TestBuyerStatePVC(t *testing.T) {
	raw, err := ReadInfrastructureFile("base/templates/llm.yaml")
	if err != nil {
		t.Fatalf("ReadInfrastructureFile: %v", err)
	}
	manifests := multiDoc(raw)

	// --- PersistentVolumeClaim: llm namespace, RWO, local-path. ---
	claimDoc := findDocByName(manifests, "PersistentVolumeClaim", "x402-buyer-state")
	if claimDoc == nil {
		t.Fatal("PersistentVolumeClaim 'x402-buyer-state' missing from llm.yaml")
	}

	namespace := nested(claimDoc, "metadata", "namespace")
	if namespace != "llm" {
		t.Errorf("PVC namespace = %v, want llm", namespace)
	}

	accessModes, modesIsList := nested(claimDoc, "spec", "accessModes").([]any)
	onlyRWO := modesIsList && len(accessModes) == 1 && accessModes[0] == "ReadWriteOnce"
	if !onlyRWO {
		t.Errorf("PVC accessModes = %v, want [ReadWriteOnce]", accessModes)
	}

	storageClass := nested(claimDoc, "spec", "storageClassName")
	if storageClass != "local-path" {
		t.Errorf("PVC storageClassName = %v, want local-path", storageClass)
	}

	// Only presence of the request is asserted, not its size.
	if nested(claimDoc, "spec", "resources", "requests", "storage") == nil {
		t.Error("PVC missing spec.resources.requests.storage")
	}

	// --- litellm Deployment: the state volume must be PVC-backed. ---
	deployment := findDocByName(manifests, "Deployment", "litellm")
	if deployment == nil {
		t.Fatal("litellm Deployment missing from llm.yaml")
	}

	podVolumes, volumesIsList := nested(deployment, "spec", "template", "spec", "volumes").([]any)
	if !volumesIsList {
		t.Fatal("litellm Deployment has no volumes")
	}

	// Pick the first map-shaped entry named x402-buyer-state; entries of any
	// other shape are ignored.
	var stateVolume map[string]any
	for _, entry := range podVolumes {
		if candidate, isMap := entry.(map[string]any); isMap && candidate["name"] == "x402-buyer-state" {
			stateVolume = candidate
			break
		}
	}
	if stateVolume == nil {
		t.Fatal("litellm Deployment missing 'x402-buyer-state' volume")
	}

	_, usesEmptyDir := stateVolume["emptyDir"]
	if usesEmptyDir {
		t.Error("x402-buyer-state is still emptyDir — must be persistentVolumeClaim to survive pod restarts")
	}

	claimRef, hasClaimRef := stateVolume["persistentVolumeClaim"].(map[string]any)
	if !hasClaimRef {
		t.Fatal("x402-buyer-state volume is not backed by persistentVolumeClaim")
	}

	claimName := claimRef["claimName"]
	if claimName != "x402-buyer-state" {
		t.Errorf("persistentVolumeClaim.claimName = %v, want x402-buyer-state", claimName)
	}

	// --- Rollout strategy ---
	// Recreate tears the old pod down before the replacement starts, so the
	// two never contend for the RWO claim the way a RollingUpdate surge pod
	// would.
	strategy := nested(deployment, "spec", "strategy", "type")
	if strategy != "Recreate" {
		t.Errorf("litellm Deployment strategy.type = %v, want Recreate (RWO PVC cannot be co-mounted during surge)", strategy)
	}
}
36 changes: 31 additions & 5 deletions internal/embed/infrastructure/base/templates/llm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,30 @@ type: Opaque
stringData:
LITELLM_MASTER_KEY: "sk-obol-{{CLUSTER_ID}}"

---
# x402-buyer maintains consumed-nonce state in /state/consumed.json.
# Previously /state was an emptyDir, which lost that state on every pod
# restart: the buyer would then try to re-spend already-consumed auths
# from the ConfigMap-loaded pool, cascading into 400s from the
# facilitator's nonce protection until a manual `buy.py process --all`
# reseeded it.
# A PVC backed by local-path (the single-node k3d default storage class)
# gives crash-safety (state survives pod restarts) without converting
# the Deployment to a StatefulSet.
#
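# Deployment strategy: Recreate — an RWO PVC can only be attached to
# one node at a time, so a RollingUpdate surge pod scheduled on another
# node would block waiting for the volume (and a surge pod on the same
# node would share consumed.json with the old pod). Recreate accepts a
# brief gap during rollout (litellm is replicas: 1 anyway).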
# Claim that backs the litellm Deployment's x402-buyer-state volume.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: x402-buyer-state
namespace: llm
spec:
# Single ReadWriteOnce mode; the litellm Deployment uses strategy Recreate to match.
accessModes: [ReadWriteOnce]
# local-path is the single-node k3d default storage class.
storageClassName: local-path
resources:
requests:
# Capacity requested for the buyer's /state directory (holds consumed.json).
storage: 50Mi

---
apiVersion: apps/v1
kind: Deployment
Expand All @@ -126,11 +150,12 @@ spec:
# is local to the sidecar pod. Scale this back out only after consumed auth
# state is shared or auth pools are sharded per replica.
replicas: 1
# Recreate (not RollingUpdate) because the x402-buyer-state PVC is RWO and
# cannot be co-mounted by an overlapping new pod during surge. Litellm is
# replicas: 1, so this just trades the overlap of the previous RollingUpdate
# settings (maxUnavailable: 0, maxSurge: 1) for a short gap during rollout —
# acceptable, and unavoidable with RWO storage.
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 0
maxSurge: 1
type: Recreate
selector:
matchLabels:
app: litellm
Expand Down Expand Up @@ -267,7 +292,8 @@ spec:
name: x402-buyer-auths
optional: true
- name: x402-buyer-state
emptyDir: {}
persistentVolumeClaim:
claimName: x402-buyer-state

---
apiVersion: policy/v1
Expand Down
Loading