diff --git a/internal/embed/embed_buyer_state_test.go b/internal/embed/embed_buyer_state_test.go new file mode 100644 index 0000000..e7ed4ae --- /dev/null +++ b/internal/embed/embed_buyer_state_test.go @@ -0,0 +1,92 @@ +package embed + +import ( + "testing" +) + +// TestBuyerStatePVC asserts that x402-buyer's /state is backed by a PVC +// (not an emptyDir), and that the litellm Deployment uses the Recreate +// strategy so the RWO PVC can be remounted without overlap. +// +// Regression: emptyDir lost consumed.json on every pod restart, causing +// the buyer to re-spend already-consumed auths from the ConfigMap pool +// and cascading into facilitator 400s ("nonce already used") until a +// manual `buy.py process --all` reseeded. +func TestBuyerStatePVC(t *testing.T) { + data, err := ReadInfrastructureFile("base/templates/llm.yaml") + if err != nil { + t.Fatalf("ReadInfrastructureFile: %v", err) + } + + docs := multiDoc(data) + + // PVC must exist in the llm namespace with RWO + local-path storage class. + pvc := findDocByName(docs, "PersistentVolumeClaim", "x402-buyer-state") + if pvc == nil { + t.Fatal("PersistentVolumeClaim 'x402-buyer-state' missing from llm.yaml") + } + + if ns := nested(pvc, "metadata", "namespace"); ns != "llm" { + t.Errorf("PVC namespace = %v, want llm", ns) + } + + modes, ok := nested(pvc, "spec", "accessModes").([]any) + if !ok || len(modes) != 1 || modes[0] != "ReadWriteOnce" { + t.Errorf("PVC accessModes = %v, want [ReadWriteOnce]", modes) + } + + if sc := nested(pvc, "spec", "storageClassName"); sc != "local-path" { + t.Errorf("PVC storageClassName = %v, want local-path", sc) + } + + if storage := nested(pvc, "spec", "resources", "requests", "storage"); storage == nil { + t.Error("PVC missing spec.resources.requests.storage") + } + + // litellm Deployment volume entry must reference the PVC, not emptyDir. 
+ dep := findDocByName(docs, "Deployment", "litellm") + if dep == nil { + t.Fatal("litellm Deployment missing from llm.yaml") + } + + volumes, ok := nested(dep, "spec", "template", "spec", "volumes").([]any) + if !ok { + t.Fatal("litellm Deployment has no volumes") + } + + var stateVolume map[string]any + for _, v := range volumes { + vm, ok := v.(map[string]any) + if !ok { + continue + } + if vm["name"] == "x402-buyer-state" { + stateVolume = vm + break + } + } + + if stateVolume == nil { + t.Fatal("litellm Deployment missing 'x402-buyer-state' volume") + } + + if _, isEmptyDir := stateVolume["emptyDir"]; isEmptyDir { + t.Error("x402-buyer-state is still emptyDir — must be persistentVolumeClaim to survive pod restarts") + } + + pvcRef, ok := stateVolume["persistentVolumeClaim"].(map[string]any) + if !ok { + t.Fatal("x402-buyer-state volume is not backed by persistentVolumeClaim") + } + + if claim := pvcRef["claimName"]; claim != "x402-buyer-state" { + t.Errorf("persistentVolumeClaim.claimName = %v, want x402-buyer-state", claim) + } + + // Strategy must be Recreate so the new pod waits for the old pod to + // release the RWO PVC before mounting. RollingUpdate with maxSurge>0 + // would block indefinitely. + if strat := nested(dep, "spec", "strategy", "type"); strat != "Recreate" { + t.Errorf("litellm Deployment strategy.type = %v, want Recreate (RWO PVC cannot be co-mounted during surge)", strat) + } +} diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index cf34841..b9a304e 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -113,6 +113,30 @@ type: Opaque stringData: LITELLM_MASTER_KEY: "sk-obol-{{CLUSTER_ID}}" +--- +# x402-buyer maintains consumed-nonce state in /state/consumed.json. 
+# Previously this was emptyDir, which lost state on every pod restart +# — the buyer would then attempt to re-spend already-consumed auths +# from the ConfigMap-loaded pool, cascading into 400s from the +# facilitator's nonce protection until a manual buy.py process --all. +# PVC backed by local-path (single-node k3d default storage class) +# gives crash-safety without conversion to StatefulSet. +# +# Deployment strategy: Recreate — RWO limits the volume to one node, +# not one pod, so a RollingUpdate surge pod on that node could co-mount +# it. Recreate accepts a brief rollout gap (litellm is replicas:1 anyway). +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: x402-buyer-state + namespace: llm +spec: + accessModes: [ReadWriteOnce] + storageClassName: local-path + resources: + requests: + storage: 50Mi + --- apiVersion: apps/v1 kind: Deployment @@ -126,11 +150,12 @@ spec: # is local to the sidecar pod. Scale this back out only after consumed auth # state is shared or auth pools are sharded per replica. replicas: 1 + # Recreate (not RollingUpdate) because RWO is enforced per node, not per pod: + # an overlapping surge pod on the same node would co-mount x402-buyer-state and + # run a second buyer against it. Litellm is replicas: 1, so this just trades the + # former maxSurge: 1 overlap for a short gap during rollout — acceptable. strategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 0 - maxSurge: 1 + type: Recreate selector: matchLabels: app: litellm @@ -267,7 +292,8 @@ spec: name: x402-buyer-auths optional: true - name: x402-buyer-state - emptyDir: {} + persistentVolumeClaim: + claimName: x402-buyer-state --- apiVersion: policy/v1