diff --git a/k8s/helm/commonly/templates/agents/litellm-deployment.yaml b/k8s/helm/commonly/templates/agents/litellm-deployment.yaml
index b8d7b21d..c810790f 100644
--- a/k8s/helm/commonly/templates/agents/litellm-deployment.yaml
+++ b/k8s/helm/commonly/templates/agents/litellm-deployment.yaml
@@ -1,4 +1,23 @@
 {{- if .Values.litellm.enabled }}
+{{- if .Values.litellm.chatgptAuth.persistence.enabled }}
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: litellm-chatgpt-auth
+  namespace: {{ include "commonly.namespace" . }}
+  labels:
+    {{- include "commonly.labels" . | nindent 4 }}
+    app: litellm
+spec:
+  accessModes: ["ReadWriteOnce"]
+  resources:
+    requests:
+      storage: {{ .Values.litellm.chatgptAuth.persistence.size | default "1Gi" | quote }}
+  {{- with .Values.litellm.chatgptAuth.persistence.storageClass }}
+  storageClassName: {{ . | quote }}
+  {{- end }}
+---
+{{- end }}
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -9,6 +28,13 @@ metadata:
     app: litellm
 spec:
   replicas: 1
+  {{- if .Values.litellm.chatgptAuth.persistence.enabled }}
+  # PVC is RWO single-writer; Recreate ensures the old pod releases before
+  # the new one tries to attach. Without this, helm-upgrade can hang for
+  # 5+ minutes on the new pod waiting for the volume.
+  strategy:
+    type: Recreate
+  {{- end }}
   selector:
     matchLabels:
       {{- include "commonly.selectorLabels" . | nindent 6 }}
@@ -315,13 +341,68 @@ spec:
                   except Exception as e:
                       print(f'[rotator] save_state failed: {e}', flush=True)
 
+              def _read_pod_auth_file(path):
+                  """Read a codex auth.json from disk.
+
+                  Accepts both the nested shape the codex CLI writes (tokens
+                  under a top-level 'tokens' object) and the flat shape of
+                  legacy rotator-written files. Returns (access, refresh,
+                  id_token), or None if the file is missing, unreadable, or
+                  holds neither an access nor a refresh token.
+                  """
+                  try:
+                      with open(path) as f:
+                          d = json.load(f)
+                  except FileNotFoundError:
+                      # Expected until the operator has device-auth'd this slot.
+                      return None
+                  except Exception as e:
+                      print(f'[rotator] cannot read {path}: {e}', flush=True)
+                      return None
+                  if not isinstance(d, dict):
+                      print(f'[rotator] {path}: expected a JSON object', flush=True)
+                      return None
+                  nested = d.get('tokens')
+                  tok = nested if isinstance(nested, dict) else d
+                  access = tok.get('access_token') or ''
+                  refresh = tok.get('refresh_token') or ''
+                  id_tok = tok.get('id_token') or ''
+                  if access or refresh:
+                      return access, refresh, id_tok
+                  return None
+
               def get_candidates():
+                  """Build the rotation candidate list.
+
+                  PREFERRED: pod-side device-auth'd files at /chatgpt-auth/auth-{1,2,3}.json.
+                  Operator created these via `kubectl exec` into the codex-cli sidecar +
+                  `codex login --device-auth`. Tokens are cluster-IP-bound, so ChatGPT
+                  doesn't invalidate them on cluster usage (the inverse of the env-var
+                  path below where laptop-bound sessions die on first cluster call).
+
+                  FALLBACK: env-var-fed tokens from GCP SM. Kept for backward compat with
+                  older operator flows; flagged stale by `--mode envvar` in logs so
+                  it's clear when we're on the dead path.
+                  """
+                  pod_files = [
+                      ('1', '/chatgpt-auth/auth-1.json'),
+                      ('2', '/chatgpt-auth/auth-2.json'),
+                      ('3', '/chatgpt-auth/auth-3.json'),
+                  ]
+                  out = []
+                  for label, path in pod_files:
+                      rec = _read_pod_auth_file(path)
+                      if rec:
+                          access, refresh, id_tok = rec
+                          out.append((label, access, refresh, id_tok))
+                  if out:
+                      return out
+                  # Fallback: env-var path (legacy, pre-cluster-bound)
                   specs = [
                       ('1', 'OPENAI_CODEX_ACCESS_TOKEN', 'OPENAI_CODEX_REFRESH_TOKEN', 'OPENAI_CODEX_ID_TOKEN'),
                       ('2', 'OPENAI_CODEX_ACCESS_TOKEN_2', 'OPENAI_CODEX_REFRESH_TOKEN_2', ''),
                       ('3', 'OPENAI_CODEX_ACCESS_TOKEN_3', 'OPENAI_CODEX_REFRESH_TOKEN_3', 'OPENAI_CODEX_ID_TOKEN_3'),
                   ]
-                  out = []
                   for label, a_env, r_env, i_env in specs:
                       access = os.environ.get(a_env, '')
                       refresh = os.environ.get(r_env, '')
@@ -426,6 +507,92 @@ spec:
             - name: chatgpt-auth
               mountPath: /chatgpt-auth
         {{- end }}
+        # codex-cli sidecar: provides the codex binary inside the LiteLLM pod
+        # so an operator can run `codex login --device-auth` from within the
+        # cluster. The resulting auth.json lands on the shared chatgpt-auth
+        # PVC (as /chatgpt-auth/auth-N.json), and the codex-auth-rotator
+        # above prefers those pod-side files over the env-var-fed tokens.
+        #
+        # ChatGPT binds OAuth sessions to the IP/device that completed
+        # device-auth. Doing device-auth on a laptop and uploading tokens
+        # to the cluster invalidates the session on first cluster use.
+        # Doing device-auth from inside this pod produces a cluster-IP-
+        # bound session that survives. See PR #362 / cloud-codex-deployment
+        # for the per-agent precedent.
+        #
+        # Operator flow (one-time per account, after pod is up):
+        #   kubectl exec -n {{ include "commonly.namespace" . }} -it deploy/litellm -c codex-cli -- /scripts/auth-login.sh 1
+        # Sign in to ChatGPT in browser with account #N. Upon success the
+        # script copies ~/.codex/auth.json to /chatgpt-auth/auth-1.json.
+        # Repeat for accounts 2 and 3.
+        - name: codex-cli
+          image: node:22-bookworm-slim
+          command:
+            - /bin/sh
+            - -c
+            - |
+              # Install codex CLI + ca-certs on first boot, then idle so the
+              # operator can exec into us. The sleep loop is intentional —
+              # there's no continuous workload here; this sidecar exists
+              # purely to provide `codex` inside the same pod that holds the
+              # chatgpt-auth PVC.
+              if [ ! -x /usr/local/bin/codex ]; then
+                apt-get update >/dev/null 2>&1 || true
+                DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates >/dev/null 2>&1 || true
+                update-ca-certificates >/dev/null 2>&1 || true
+                npm install --global --no-audit --no-fund "@openai/codex@{{ .Values.litellm.codexCli.version | default "0.125.0" }}" >/dev/null 2>&1 || true
+              fi
+              mkdir -p /scripts
+              cat > /scripts/auth-login.sh <<'SCRIPT'
+              #!/bin/sh
+              # auth-login.sh <1|2|3>
+              # Runs codex login --device-auth interactively. Operator follows the URL+code
+              # printed to stdout, completes in browser, and on success the resulting
+              # auth.json is copied to /chatgpt-auth/auth-N.json for the rotator to pick up.
+              set -e
+              N="${1:-}"
+              # Only slots 1-3 are read by the rotator; rejecting anything else also
+              # keeps N from being used to write outside /chatgpt-auth.
+              case "$N" in
+                1|2|3) ;;
+                *) echo "usage: $0 <1|2|3>" >&2; exit 1 ;;
+              esac
+              HOMEDIR="/tmp/codex-login-$N"
+              DEST="/chatgpt-auth/auth-$N.json"
+              rm -rf "$HOMEDIR"
+              mkdir -p "$HOMEDIR/.codex"
+              HOME="$HOMEDIR" codex login --device-auth
+              if [ ! -s "$HOMEDIR/.codex/auth.json" ]; then
+                echo "auth.json was not written — login did not complete" >&2
+                exit 1
+              fi
+              # Stage with a private umask and rename into place so the rotator never
+              # sees a half-written or world-readable token file.
+              umask 077
+              cp "$HOMEDIR/.codex/auth.json" "$DEST.tmp"
+              chmod 600 "$DEST.tmp"
+              mv -f "$DEST.tmp" "$DEST"
+              rm -rf "$HOMEDIR"
+              echo "wrote $DEST — rotator will pick it up on next tick"
+              SCRIPT
+              chmod +x /scripts/auth-login.sh
+              if command -v codex >/dev/null 2>&1; then
+                echo "[codex-cli] ready. Run device-auth via:"
+                echo "[codex-cli]   kubectl exec -n {{ include "commonly.namespace" . }} -it deploy/litellm -c codex-cli -- /scripts/auth-login.sh <1|2|3>"
+              else
+                echo "[codex-cli] ERROR: codex CLI is not installed (npm install failed?); auth-login.sh will not work" >&2
+              fi
+              while true; do sleep 3600; done
+          volumeMounts:
+            - name: chatgpt-auth
+              mountPath: /chatgpt-auth
+          resources:
+            requests:
+              cpu: 20m
+              memory: 64Mi
+            limits:
+              cpu: 200m
+              memory: 256Mi
       initContainers:
         # Write the best available (non-expired) Codex token to auth.json for LiteLLM's chatgpt/ provider.
         # Account-1 entries in litellm-config have no api_key — the chatgpt/ provider reads
@@ -605,7 +772,12 @@ spec:
           configMap:
             name: litellm-config
         - name: chatgpt-auth
+          {{- if .Values.litellm.chatgptAuth.persistence.enabled }}
+          persistentVolumeClaim:
+            claimName: litellm-chatgpt-auth
+          {{- else }}
           emptyDir: {}
+          {{- end }}
       {{- with .Values.litellm.nodeSelector }}
       nodeSelector:
         {{- toYaml . | nindent 8 }}
diff --git a/k8s/helm/commonly/values-dev.yaml b/k8s/helm/commonly/values-dev.yaml
index 4e203849..ea962289 100644
--- a/k8s/helm/commonly/values-dev.yaml
+++ b/k8s/helm/commonly/values-dev.yaml
@@ -241,6 +241,15 @@ litellm:
     # rotation gives real multi-account benefit when one account exhausts.
   codexAuthRotator:
     enabled: true
+  # Persist chatgpt-auth across pod restarts so pod-side device-auth'd
+  # auth-N.json files survive litellm rollouts. Without this every
+  # helm-upgrade nukes the cluster-bound tokens and Nova/Pixel/Cody
+  # go silent until the operator re-device-auths.
+  chatgptAuth:
+    persistence:
+      enabled: true
+      size: 1Gi
+      storageClass: standard-rwo
   nodeSelector:
     pool: dev
   tolerations:
diff --git a/k8s/helm/commonly/values.yaml b/k8s/helm/commonly/values.yaml
index 10bbf108..26434c20 100644
--- a/k8s/helm/commonly/values.yaml
+++ b/k8s/helm/commonly/values.yaml
@@ -369,6 +369,22 @@ litellm:
     # tokens). Re-enable once the upstream LiteLLM bug is fixed.
   codexAuthRotator:
     enabled: true
+  # Codex CLI sidecar: provides the `codex` binary inside the LiteLLM
+  # pod so an operator can run device-auth FROM the cluster (not from
+  # a laptop). The resulting auth.json lands on the chatgpt-auth PVC
+  # and the codex-auth-rotator picks it up preferentially over env-var
+  # tokens. See litellm-deployment.yaml for the operator flow.
+  codexCli:
+    version: "0.125.0"
+  # chatgpt-auth volume backs the codex-auth-rotator's auth.json plus
+  # any pod-side device-auth'd auth-N.json files. Persistence is
+  # REQUIRED for the cluster-bound auth flow — emptyDir loses tokens
+  # on every pod restart, forcing re-device-auth every helm-upgrade.
+  chatgptAuth:
+    persistence:
+      enabled: false
+      size: 1Gi
+      # storageClass defaults to the cluster's default StorageClass.
   image:
     repository: ghcr.io/berriai/litellm
     tag: v1.82.3-stable