From 304304353ca8d184c096dcd455bf870188e4a463 Mon Sep 17 00:00:00 2001 From: Brad DerManouelian Date: Wed, 22 Apr 2026 16:03:16 -0500 Subject: [PATCH] ci(workers): smoke-test worker image boots before promoting to :latest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 2026-04-22 the multitenant-workers deploy crashlooped at startup because testmoImportWorker's require graph pulled in next/headers, which is stripped from the workers Docker image. `docker build` never executed the compiled modules, so the failure didn't surface in CI — only at deploy time. Close the gap with a smoke-test step in each per-arch build job: - testplanit/scripts/smoke-test-workers.js (new): require() each compiled worker entrypoint + scheduler.js. Any require-time error (missing module, bad native dep) throws synchronously and fails the script. Force-exits after the require loop to short-circuit the async startWorker() side-effects that run under CJS — this is why we don't need live Valkey/Postgres in CI. - .github/workflows/release.yml: after each `docker buildx bake --push` (tag-push and manual-dispatch, amd64 and arm64), pull the just- published workers image and run the smoke script via `docker run --entrypoint node ... ./scripts/smoke-test-workers.js`. Failure blocks the per-arch job, which blocks merge-manifests and prevents :latest-workers from being retagged to a broken image. Verified locally against a full workers build: ✓ All 16 entrypoints load, script exits 0. ✗ Injected a missing require into one entrypoint → smoke script exits 1 with a clear "Cannot find module ..." error naming the broken worker. This is the signal that would have caught 2026-04-22. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/release.yml | 34 +++++++++++ testplanit/scripts/smoke-test-workers.js | 76 ++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 testplanit/scripts/smoke-test-workers.js diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 676c41ab5..11909f9c3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -88,6 +88,17 @@ jobs: --set "*.args.GIT_COMMIT=${{ github.sha }}" \ --set "*.args.BASE_DOMAIN=testplanit.com" + - name: Smoke-test workers image (AMD64) + run: | + VERSION="${{ github.ref_name }}" + VERSION_NUM="${VERSION#v}" + IMAGE="ghcr.io/${REPO_LC}:${VERSION_NUM}-workers-amd64" + docker pull "$IMAGE" + # Fail fast if any worker entrypoint's require graph is broken + # (e.g. a missing native dep or a stripped package — the failure + # mode from the 2026-04-22 next/headers incident). + docker run --rm --entrypoint node "$IMAGE" ./scripts/smoke-test-workers.js + # Build ARM64 images natively on macOS arm64 runner (using Colima) build-arm64: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -146,6 +157,17 @@ jobs: --set "*.args.GIT_COMMIT=${{ github.sha }}" \ --set "*.args.BASE_DOMAIN=testplanit.com" + - name: Smoke-test workers image (ARM64) + run: | + VERSION="${{ github.ref_name }}" + VERSION_NUM="${VERSION#v}" + IMAGE="ghcr.io/${REPO_LC}:${VERSION_NUM}-workers-arm64" + docker pull "$IMAGE" + # Fail fast if any worker entrypoint's require graph is broken + # (e.g. a missing native dep or a stripped package — the failure + # mode from the 2026-04-22 next/headers incident). 
+ docker run --rm --entrypoint node "$IMAGE" ./scripts/smoke-test-workers.js + # Merge architecture-specific images into multi-arch manifests merge-manifests: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -259,6 +281,12 @@ jobs: --set "*.args.GIT_COMMIT=${{ github.sha }}" \ --set "*.args.BASE_DOMAIN=testplanit.com" + - name: Smoke-test workers image (AMD64) + run: | + IMAGE="ghcr.io/${REPO_LC}:${{ github.event.inputs.tag }}-workers-amd64" + docker pull "$IMAGE" + docker run --rm --entrypoint node "$IMAGE" ./scripts/smoke-test-workers.js + # Manual build: ARM64 images natively on macOS arm64 runner (using Colima) docker-manual-arm64: if: github.event_name == 'workflow_dispatch' @@ -314,6 +342,12 @@ jobs: --set "*.args.GIT_COMMIT=${{ github.sha }}" \ --set "*.args.BASE_DOMAIN=testplanit.com" + - name: Smoke-test workers image (ARM64) + run: | + IMAGE="ghcr.io/${REPO_LC}:${{ github.event.inputs.tag }}-workers-arm64" + docker pull "$IMAGE" + docker run --rm --entrypoint node "$IMAGE" ./scripts/smoke-test-workers.js + # Manual build: Merge architecture-specific images into multi-arch manifests docker-manual-merge: if: github.event_name == 'workflow_dispatch' diff --git a/testplanit/scripts/smoke-test-workers.js b/testplanit/scripts/smoke-test-workers.js new file mode 100644 index 000000000..617ca3899 --- /dev/null +++ b/testplanit/scripts/smoke-test-workers.js @@ -0,0 +1,76 @@ +#!/usr/bin/env node +/** + * Smoke-test: verify every worker's compiled entry point loads cleanly + * inside the workers Docker image. + * + * Background: on 2026-04-22 the multitenant-workers deploy crashlooped + * at startup because a module imported by testmoImportWorker had a + * top-level `import "next/headers"` — but the workers image strips + * Next.js from node_modules to save ~900MB. `docker build` never + * surfaces this: the build only compiles files, never loads them at + * runtime. This script closes that gap. 
+ * + * What it does: `require()` each compiled worker entry so Node's CJS + * resolver walks the full import graph. Any missing/unresolved module + * throws synchronously and fails this script. Only require-time errors + * count; the workers' connection attempts to Valkey/Postgres that fire + * from the main-guard (`typeof import.meta === "undefined"` branch) + * happen asynchronously and we exit(0) before they escape, so we + * don't need a live Valkey/Postgres in CI. + * + * Keep the list in sync with ecosystem.config.js + scripts/build-workers.js. + */ + +const path = require("path"); + +// Worker entrypoints — must match scripts/build-workers.js entryPoints. +const WORKERS = [ + "notificationWorker", + "emailWorker", + "forecastWorker", + "syncWorker", + "testmoImportWorker", + "elasticsearchReindexWorker", + "auditLogWorker", + "autoTagWorker", + "budgetAlertWorker", + "repoCacheWorker", + "copyMoveWorker", + "duplicateScanWorker", + "magicSelectWorker", + "stepSequenceScanWorker", + "generateFromUrlWorker", +]; + +const entryPoints = [ + ...WORKERS.map((name) => ({ name, file: `dist/workers/${name}.js` })), + { name: "scheduler", file: "dist/scheduler.js" }, +]; + +let failed = 0; +for (const { name, file } of entryPoints) { + const abs = path.resolve(process.cwd(), file); + try { + require(abs); + console.log(`✓ ${name}`); + } catch (err) { + failed += 1; + console.error(`✗ ${name}: ${err && err.message ? err.message : err}`); + if (err && err.stack) { + console.error(err.stack); + } + } +} + +if (failed > 0) { + console.error(`\n${failed} worker entrypoint(s) failed to load.`); + // Exit non-zero right away: this process.exit(1) call is synchronous, nothing is deferred to a later tick. + process.exit(1); +} + +console.log( + `\nAll ${entryPoints.length} worker entrypoints loaded successfully.` +); +// Force exit to short-circuit any async startup work (BullMQ workers +// keep the event loop alive once new Worker(...) has been called). +process.exit(0);