diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fac56aa..2adf258 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -106,3 +106,139 @@ jobs:
           name: ci-test-logs
           path: /tmp/SwiftLM-test-*.log
           retention-days: 7
+
+  # ── Speculative Decoding E2E (dual-model: 0.8B draft + 2B main) ──
+  # Uses the standard macos-15 runner (7 GB RAM).
+  # We test the 2B main model (the default in tests/test-speculative.sh), which safely fits within memory.
+  speculative-decoding:
+    runs-on: macos-15
+    timeout-minutes: 45
+    needs: ci  # Only run after core CI passes
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Install Metal Toolchain
+        run: xcodebuild -downloadComponent MetalToolchain || true
+
+      - name: Cache Swift packages
+        uses: actions/cache@v4
+        with:
+          path: .build
+          key: ${{ runner.os }}-spm-SwiftLM-v2-${{ hashFiles('Package.resolved') }}
+          restore-keys: |
+            ${{ runner.os }}-spm-SwiftLM-v2-
+
+      - name: Clear stale module cache
+        run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true
+
+      - name: Resolve dependencies
+        run: swift package resolve
+
+      - name: Build (Release)
+        run: swift build -c release
+
+      - name: Install MLX Metal library
+        run: |
+          python3 -m venv /tmp/mlx_venv
+          /tmp/mlx_venv/bin/pip install --quiet mlx
+          cp /tmp/mlx_venv/lib/python*/site-packages/mlx/lib/mlx.metallib .build/release/
+
+      - name: Cache MLX models (draft + main)
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: mlx-speculative-qwen35-0.8b-2b
+
+      - name: Run speculative decoding E2E
+        env:
+          HF_HUB_DOWNLOAD_TIMEOUT: "900"
+        run: |
+          chmod +x tests/test-speculative.sh
+          for attempt in 1 2 3; do
+            echo "Attempt $attempt of 3..."
+            if tests/test-speculative.sh .build/release/SwiftLM 15414; then
+              exit 0
+            fi
+            if [ "$attempt" -lt 3 ]; then
+              echo "Test failed, retrying in 10s..."
+              sleep 10
+            fi
+          done
+          echo "All attempts failed"
+          exit 1
+
+      - name: Upload speculative test logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: speculative-test-logs
+          path: /tmp/SwiftLM-test-speculative.log
+          retention-days: 7
+
+  # ── Speculative Decoding Memory Evaluation ──
+  # Runs the 9B model with NUM_DRAFT_TOKENS=2 to check peak
+  # memory compression/efficiency. Allowed to OOM/fail (continue-on-error below).
+  speculative-decoding-eval:
+    runs-on: macos-15
+    timeout-minutes: 45
+    needs: ci
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Install Metal Toolchain
+        run: xcodebuild -downloadComponent MetalToolchain || true
+
+      - name: Cache Swift packages
+        uses: actions/cache@v4
+        with:
+          path: .build
+          key: ${{ runner.os }}-spm-SwiftLM-v2-${{ hashFiles('Package.resolved') }}
+          restore-keys: |
+            ${{ runner.os }}-spm-SwiftLM-v2-
+
+      - name: Clear stale module cache
+        run: find .build -type d -name ModuleCache -exec rm -rf {} + 2>/dev/null || true
+
+      - name: Resolve dependencies
+        run: swift package resolve
+
+      - name: Build (Release)
+        run: swift build -c release
+
+      - name: Install MLX Metal library
+        run: |
+          python3 -m venv /tmp/mlx_venv
+          /tmp/mlx_venv/bin/pip install --quiet mlx
+          cp /tmp/mlx_venv/lib/python*/site-packages/mlx/lib/mlx.metallib .build/release/
+
+      - name: Run speculative evaluation E2E
+        env:
+          HF_HUB_DOWNLOAD_TIMEOUT: "900"
+        run: |
+          chmod +x tests/test-speculative-eval.sh
+          for attempt in 1 2 3; do
+            echo "Attempt $attempt of 3..."
+            if tests/test-speculative-eval.sh .build/release/SwiftLM 15414; then
+              exit 0
+            fi
+            if [ "$attempt" -lt 3 ]; then
+              echo "Test failed, retrying in 10s..."
+              sleep 10
+            fi
+          done
+          echo "All attempts failed"
+          exit 1
+
+      - name: Upload speculative eval logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: speculative-eval-logs
+          path: /tmp/SwiftLM-test-speculative-eval.log
+          retention-days: 7
+
diff --git a/Package.resolved b/Package.resolved
index ab15d90..558ae83 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -42,7 +42,7 @@
       "location" : "https://github.com/SharpAI/mlx-swift-lm.git",
       "state" : {
         "branch" : "main",
-        "revision" : "b71fad20ff634df1024fcf4c81f4748907a4fa59"
+        "revision" : "f14895559f051ebaf4cb61d6959250f57d2fa225"
       }
     },
     {
diff --git a/Package.swift b/Package.swift
index 0a13cfa..1026ea9 100644
--- a/Package.swift
+++ b/Package.swift
@@ -13,7 +13,7 @@ let package = Package(
         // Local Apple MLX Swift fork for C++ extensions
         .package(url: "https://github.com/SharpAI/mlx-swift.git", branch: "main"),
         // Apple's LLM library built on MLX Swift (SharpAI fork — with GPU/CPU layer partitioning)
-        .package(url: "https://github.com/ericjlake/mlx-swift-lm.git", branch: "feat/ssd-streaming-10x"),
+        .package(url: "https://github.com/SharpAI/mlx-swift-lm.git", branch: "main"),
         // HuggingFace tokenizers + model download
         .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "1.2.0")),
         // Lightweight HTTP server (Apple-backed Swift server project)
diff --git a/README.md b/README.md
index b5ee8f8..068fa4e 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,8 @@ Benchmark results for `gemma-4-26b-a4b-it-4bit` (26B MoE, 4-bit) on M5 Pro 64 GB
 - 🔌 **OpenAI-compatible**: Drop-in replacement for OpenAI SDKs (`/v1/chat/completions`, streaming, etc).
 - 🧠 **Smart Model Routing**: Loads HuggingFace format models directly, with native Safetensors parsing.
 - ⚡️ **TurboQuantization Integrated**: Custom low-level MLX Metal primitives that apply extremely fast quantization for KV caching out-of-the-box.
-- 💾 **SSD Expert Streaming**: *Experimental* zero-copy streaming that swaps Mixture of Experts (MoE) layers directly from the NVMe SSD to the GPU command buffer without trashing macOS Unified Memory (prevents Watchdog OS kernel panics on 122B+ models).
+- 💾 **SSD Expert Streaming (10x)**: High-performance NVMe streaming that loads Mixture of Experts (MoE) layers directly from SSD to GPU — engineered by [@ericjlake](https://github.com/ericjlake), achieving **10x speedup** (0.58 → 5.91 tok/s) on 122B+ models with only ~10 GB resident memory. Uses cross-projection batching, concurrent pread (QD=24), asyncEval pipeline, and runtime top-k expert selection.
+- 🔮 **Speculative Decoding**: Load a small draft model (e.g. 9B) alongside a large main model: the draft model generates candidate tokens and the main model verifies them in bulk — accelerating in-RAM inference.
 - 🎛️ **Granular Memory Control**: Integrated Layer Partitioning (`--gpu-layers`) and Wisdom Auto-Calibration for squeezing massive models into RAM.
 
 ---
@@ -146,6 +147,64 @@ Reference implementations: [`turboquant-mlx`](https://github.com/sharpner/turboq
 
 ---
 
+## 💾 SSD Expert Streaming: 10x MoE Speedup
+
+SwiftLM implements a **rewritten SSD expert streaming pipeline** (engineered by [Eric Lake](https://github.com/ericjlake)) that achieves 10x generation speedup for massive Mixture of Experts (MoE) models running on memory-constrained Apple Silicon. This enables running models like **Qwen3.5-122B** (69.6 GB) and **Qwen3.5-397B** (209 GB) on a **64 GB Mac** by streaming expert weights from NVMe SSD.
+
+### Benchmark Results (M1 Ultra 64GB, Qwen3.5-122B-A10B-4bit)
+
+| Configuration | tok/s | vs. Original | Notes |
+|---|---|---|---|
+| Original `--stream-experts` | 0.58 | baseline | Sequential pread, 1 NVMe queue |
+| **Rewritten pipeline (top-k=8, full quality)** | **4.95** | **8.5×** | All 8 experts evaluated |
+| **Rewritten pipeline (top-k=6, default)** | **5.20** | **9.0×** | Recommended default |
+| **Rewritten pipeline (top-k=4, speed mode)** | **5.91** | **10.2×** | Best quality/speed tradeoff |
+| **Rewritten pipeline (top-k=2, turbo mode)** | **6.52** | **11.2×** | Still coherent output |
+
+> Memory stable at **~10.6 GB resident**, no swap activity. Tested over 200-token generation runs.
+
+### The Approach: Small Model Helps Large Model
+
+A novel aspect of this architecture is the **dual-model speculative decoding** pattern: a small draft model (e.g. Qwen3.5-9B at 73 tok/s) runs **entirely in RAM** while the large MoE model (e.g. 122B) streams experts from SSD. The draft model generates candidate tokens at high speed, and the main model verifies them in bulk — dramatically reducing the number of SSD-bound generation rounds needed.
+
+> **Important finding:** Speculative decoding is **counterproductive for SSD-streaming MoE** specifically. The verify pass sends N+1 tokens, each routing to *different* experts — SSD I/O scales with the *union* of all positions' expert selections. Speculative decoding is therefore routed exclusively to **in-RAM models**.
+
+### Optimization Techniques
+
+1. **Cross-Projection Batching**: Collapses ~1,400 per-expert `eval()` calls down to ~48 per token by orchestrating gate/up/down projections together in `SwitchGLU`.
+2. **Concurrent NVMe pread (QD=24)**: Replaces sequential pread with `DispatchQueue.concurrentPerform`, saturating the NVMe controller's queue depth (8 experts × 3 projections = 24 parallel reads).
+3. **AsyncEval Pipeline with Speculative Pread**: Overlaps GPU compute with SSD I/O — uses previous-token routing to speculatively pre-load experts for the next token during the GPU async window (~70% hit rate). Only missed experts (~30%) require on-demand pread after routing sync.
+4. **Persistent Metal Buffers**: Expert weight buffers are allocated once per `SwitchGLU` layer and reused across tokens, eliminating per-token allocation overhead.
+5. **Runtime Top-K Expert Selection**: The `SWIFTLM_TOP_K` environment variable reduces the number of active experts per token at runtime without model recompilation — trading marginal quality for significant speed gains.
+
+### Key Engineering Findings
+
+| Finding | Detail |
+|---|---|
+| **GPU compute is the bottleneck** | At steady state, GPU compute is ~190ms of ~200ms per-token time. The OS page cache serves ~90% of expert reads from RAM. |
+| **Don't cache experts in application memory** | An LRU expert cache *stole* from the OS page cache and regressed performance (4.84 → 4.01 tok/s). Let the kernel manage it. |
+| **MambaCache requires checkpoint rollback** | Unlike attention KV caches (trim = decrement offset), Mamba's recurrent state integrates all history and cannot be partially undone. We implemented `checkpoint()`/`restore()` for speculative decoding on hybrid Attention+Mamba architectures (Qwen3.5). |
+
+### Usage
+
+```bash
+# Standard SSD streaming (recommended, top-k=6):
+SWIFTLM_TOP_K=6 SwiftLM --port 8002 \
+  --model /Qwen3.5-122B-A10B-4bit --stream-experts
+
+# Speed mode (top-k=4):
+SWIFTLM_TOP_K=4 SwiftLM --port 8002 \
+  --model /Qwen3.5-122B-A10B-4bit --stream-experts
+
+# With speculative decoding (in-RAM models only):
+SwiftLM --port 8002 \
+  --model /Qwen3.5-27B-4bit \
+  --draft-model /Qwen3.5-9B-4bit \
+  --num-draft-tokens 4
+```
+
+---
+
 ## 💻 Benchmarks & Testing
 
 Run our automated benchmark suites via the interactive script:
@@ -226,8 +285,10 @@ curl http://localhost:5413/v1/chat/completions \
 | `--max-tokens` | `2048` | Max tokens limit per generation |
 | `--prefill-size`| `512` | Prompt prefill chunk size (micro-batching for long contexts) |
 | `--gpu-layers` | `model_default`| Restrict the amount of layers allocated to GPU hardware |
-| `--stream-experts` | `false` | Enable experimental SSD streaming for MoE model expert matrices |
+| `--stream-experts` | `false` | Enable SSD expert streaming for MoE models (10x speedup) |
 | `--turbo-kv` | `false` | Enable TurboQuant 3-bit KV cache compression |
+| `--draft-model` | (none) | Draft model path/ID for speculative decoding (in-RAM models only) |
+| `--num-draft-tokens` | `4` | Number of draft tokens per speculation round |
 
 ## 📦 Requirements
 
@@ -247,7 +308,13 @@ The model instantly woke up from "whispering" whitespace and successfully respon
 
 ## 🙏 Acknowledgments & Credits
 
-`SwiftLM` leverages the powerful foundation of the Apple MLX community and relies heavily on the open-source ecosystem. While the custom C++ implementations, Metal optimizations, and high-performance pipeline architecture were engineered natively for this engine, we owe massive thanks to the following projects for their indispensable reference materials and underlying protocols:
+`SwiftLM` leverages the powerful foundation of the Apple MLX community and relies heavily on the open-source ecosystem.
While the custom C++ implementations, Metal optimizations, and high-performance pipeline architecture were engineered natively for this engine, we owe massive thanks to the following projects and contributors for their indispensable reference materials and underlying protocols:
+
+### Contributors
+
+- **[Eric Lake](https://github.com/ericjlake)** — Engineered the **SSD Expert Streaming 10x rewrite** ([PR #26](https://github.com/SharpAI/SwiftLM/pull/26)), achieving 10× generation speedup on 122B+ MoE models via cross-projection batching, concurrent NVMe pread (QD=24), asyncEval pipeline with speculative pread, and runtime top-k expert selection. Also implemented the **speculative decoding infrastructure** with `DraftModelRef`, dual-model loading, and **MambaCache checkpoint/restore** for hybrid Attention+Mamba architectures.
+
+### Projects & References
+
 - **[mlx-swift](https://github.com/ml-explore/mlx-swift)** — The core Apple MLX wrapper bringing Metal-accelerated operations into the Swift ecosystem.
 - **[mlx-lm](https://github.com/ml-explore/mlx/tree/main/mlx_lm)** — The official Python language models implementation, serving as the core inspiration for our chunked-prefill architecture and attention manipulation logic.
diff --git a/tests/test-speculative-eval.sh b/tests/test-speculative-eval.sh
new file mode 100755
index 0000000..862f3c3
--- /dev/null
+++ b/tests/test-speculative-eval.sh
@@ -0,0 +1,236 @@
+#!/bin/bash
+# test-speculative-eval.sh — Speculative decoding memory evaluation (E2E)
+#
+# Uses a small draft model (Qwen3.5-0.8B) to accelerate a larger main model
+# (Qwen3.5-9B) via speculative decoding with 2 draft tokens per round. Verifies:
+#   1. Dual-model loading (draft + main)
+#   2. Speculative decoding path activation
+#   3. Correct token generation
+#   4. Server stability under dual-model memory pressure
+#
+# Usage:
+#   ./tests/test-speculative-eval.sh [binary_path] [port]
+#
+# Requirements:
+#   - Enough RAM for both 4-bit models (0.8B draft + 9B main)
+#   - macos-15 (7 GB) on GitHub Actions may OOM; the CI job is allowed to fail
+#   - curl, jq
+
+set -euo pipefail
+
+BINARY="${1:-.build/release/SwiftLM}"
+PORT="${2:-15414}"
+HOST="127.0.0.1"
+MAIN_MODEL="${MAIN_MODEL:-mlx-community/Qwen3.5-9B-4bit}"
+DRAFT_MODEL="${DRAFT_MODEL:-mlx-community/Qwen3.5-0.8B-MLX-4bit}"
+NUM_DRAFT_TOKENS=2
+URL="http://${HOST}:${PORT}"
+PASS=0
+FAIL=0
+TOTAL=0
+LOG_FILE="/tmp/SwiftLM-test-speculative-eval.log"
+
+# Colors
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+log() { echo -e "${YELLOW}[spec-test]${NC} $*"; }
+pass() { PASS=$((PASS + 1)); TOTAL=$((TOTAL + 1)); echo -e " ${GREEN}✅ PASS${NC}: $*"; }
+fail() { FAIL=$((FAIL + 1)); TOTAL=$((TOTAL + 1)); echo -e " ${RED}❌ FAIL${NC}: $*"; }
+
+cleanup() {
+    if [ -n "${SERVER_PID:-}" ]; then
+        log "Stopping server (PID $SERVER_PID)"
+        kill -9 "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+}
+trap cleanup EXIT
+
+# ── Check prerequisites ─────────────────────────────────────────────
+if [ ! -f "$BINARY" ]; then
+    echo "Error: Binary not found at $BINARY"
+    echo "Run 'swift build -c release' first."
+    exit 1
+fi
+
+if ! command -v jq &>/dev/null; then
+    echo "Error: jq is required. Install with: brew install jq"
+    exit 1
+fi
+
+# ── Memory check ────────────────────────────────────────────────────
+TOTAL_RAM_GB=$(sysctl -n hw.memsize 2>/dev/null | awk '{printf "%.0f", $1 / 1073741824}')
+log "System RAM: ${TOTAL_RAM_GB} GB"
+
+if [ "$TOTAL_RAM_GB" -lt 8 ] 2>/dev/null; then
+    log "⚠️ WARNING: ${TOTAL_RAM_GB} GB RAM detected. Dual-model test requires ~6 GB."
+    log " Consider running on a machine with ≥8 GB RAM."
+fi
+
+# ══════════════════════════════════════════════════════════════════════
+echo -e "\n${CYAN}╔══════════════════════════════════════════════════════════╗${NC}"
+echo -e "${CYAN}║ SwiftLM Speculative Decoding Eval Test ║${NC}"
+echo -e "${CYAN}║ Draft: Qwen3.5-0.8B (4-bit) → Main: Qwen3.5-9B (4-bit) ║${NC}"
+echo -e "${CYAN}║ Draft tokens per round: ${NUM_DRAFT_TOKENS} ║${NC}"
+echo -e "${CYAN}╚══════════════════════════════════════════════════════════╝${NC}\n"
+
+# ── Start server with dual models ───────────────────────────────────
+log "Starting server with speculative decoding..."
+log " Main model: $MAIN_MODEL"
+log " Draft model: $DRAFT_MODEL"
+log " Draft tokens per round: $NUM_DRAFT_TOKENS"
+
+"$BINARY" --model "$MAIN_MODEL" --port "$PORT" --host "$HOST" \
+    --draft-model "$DRAFT_MODEL" \
+    --num-draft-tokens "$NUM_DRAFT_TOKENS" \
+    > "$LOG_FILE" 2>&1 &
+SERVER_PID=$!
+
+# Wait for server to be ready (both models need to download + load)
+log "Waiting for server to load both models (this may take a while on first run)..."
+MAX_WAIT=900  # 15 minutes for two model downloads
+for i in $(seq 1 "$MAX_WAIT"); do
+    if curl -sf "$URL/health" >/dev/null 2>&1; then
+        log "Server ready after ${i}s"
+        break
+    fi
+    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo "Error: Server process died. Last 30 lines of log:"
+        tail -30 "$LOG_FILE"
+        exit 1
+    fi
+    # Print progress every 30 seconds
+    if [ $((i % 30)) -eq 0 ]; then
+        log " Still waiting... (${i}s elapsed)"
+    fi
+    sleep 1
+done
+
+if ! curl -sf "$URL/health" >/dev/null 2>&1; then
+    echo "Error: Server did not become ready in ${MAX_WAIT}s"
+    echo "Last 30 lines of log:"
+    tail -30 "$LOG_FILE"
+    exit 1
+fi
+
+# ── Test 1: Verify server loaded both models ────────────────────────
+log "Test 1: Verify dual-model loading"
+
+# Check server log for draft model loading confirmation
+if grep -q "Draft model loaded successfully" "$LOG_FILE"; then
+    pass "Draft model loaded successfully"
+else
+    fail "Draft model loading not confirmed in server logs"
+fi
+
+if grep -q "speculative decoding" "$LOG_FILE"; then
+    pass "Speculative decoding mode detected in server logs"
+else
+    fail "Speculative decoding not mentioned in server logs"
+fi
+
+# ── Test 2: Health endpoint works with dual models ──────────────────
+log "Test 2: Health endpoint"
+
+HEALTH=$(curl -sf "$URL/health" || true)  # under set -e, a curl failure must reach fail() below instead of aborting
+if echo "$HEALTH" | jq -e '.status == "ok"' >/dev/null 2>&1; then
+    pass "Health endpoint returns status=ok"
+else
+    fail "Health endpoint: $HEALTH"
+fi
+
+# ── Test 3: Streaming speculative generation ────────────────────────
+log "Test 3: Streaming speculative generation"
+
+STREAM_OUTPUT=$(curl -sf -N --max-time 120 -X POST "$URL/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d "{\"model\":\"$MAIN_MODEL\",\"stream\":true,\"max_tokens\":30,\"messages\":[{\"role\":\"user\",\"content\":\"Name three fruits.\"}]}" \
+    2>/dev/null || true)
+
+if echo "$STREAM_OUTPUT" | grep -q "data: \[DONE\]"; then
+    pass "Streaming speculative: received [DONE] sentinel"
+else
+    fail "Streaming speculative: missing [DONE] sentinel"
+fi
+
+CHUNK_COUNT=$(echo "$STREAM_OUTPUT" | grep -c "^data: {" || true)
+if [ "$CHUNK_COUNT" -gt 0 ]; then
+    pass "Streaming speculative: received $CHUNK_COUNT data chunks"
+else
+    fail "Streaming speculative: no data chunks received"
+fi
+
+# Check server log for speculative decoding activation
+if grep -q "Using speculative decoding" "$LOG_FILE"; then
+    pass "Speculative decoding path activated during generation"
+else
+    fail "Speculative decoding path not activated (missing log line)"
+fi
+
+# ── Test 5: Multiple sequential requests (stability) ────────────────
+log "Test 5: Sequential request stability (3 requests)"
+
+SEQ_PASS=true
+for i in 1 2 3; do
+    SEQ_RESP=$(curl -sf --max-time 120 -X POST "$URL/v1/chat/completions" \
+        -H "Content-Type: application/json" \
+        -d "{\"model\":\"$MAIN_MODEL\",\"max_tokens\":10,\"messages\":[{\"role\":\"user\",\"content\":\"Say the number $i.\"}]}" 2>/dev/null || echo "")
+
+    SEQ_CONTENT=$(echo "$SEQ_RESP" | jq -r '.choices[0].message.content // empty' 2>/dev/null || echo "")
+
+    if [ -z "$SEQ_CONTENT" ]; then
+        SEQ_PASS=false
+        fail "Sequential request $i: empty response"
+        break
+    fi
+done
+
+if [ "$SEQ_PASS" = true ]; then
+    pass "Sequential stability: 3/3 speculative requests completed successfully"
+fi
+
+# ── Test 6: Memory stability check ─────────────────────────────────
+log "Test 6: Memory stability"
+
+HEALTH_FINAL=$(curl -sf "$URL/health" || true)  # server may have died (OOM); still reach the summary
+MEM_ACTIVE=$(echo "$HEALTH_FINAL" | jq -r '.memory.active_mb // 0' 2>/dev/null || echo 0)
+MEM_PEAK=$(echo "$HEALTH_FINAL" | jq -r '.memory.peak_mb // 0' 2>/dev/null || echo 0)
+
+if [ "$MEM_ACTIVE" -gt 0 ] 2>/dev/null; then
+    pass "Memory: active=${MEM_ACTIVE} MB, peak=${MEM_PEAK} MB"
+else
+    fail "Memory: could not read memory stats"
+fi
+
+# Verify server is still responsive after all tests
+if curl -sf "$URL/health" >/dev/null 2>&1; then
+    pass "Server still responsive after all speculative decoding tests"
+else
+    fail "Server became unresponsive"
+fi
+
+# ── Results ──────────────────────────────────────────────────────────
+echo ""
+log "═══════════════════════════════════════"
+log "Speculative Decoding Test Results"
+log " Draft: $DRAFT_MODEL"
+log " Main: $MAIN_MODEL"
+log " Tokens/round: $NUM_DRAFT_TOKENS"
+log " Results: ${PASS} passed, ${FAIL} failed, ${TOTAL} total"
+log "═══════════════════════════════════════"
+
+if [ "$FAIL" -gt 0 ]; then
+    echo ""
+    log "Server log tail (last 20 lines):"
+    tail -20 "$LOG_FILE"
+    exit 1
+fi
+
+echo ""
+log "Server log tail (last 20 lines):"
+tail -20 "$LOG_FILE"
+exit 0
diff --git a/tests/test-speculative.sh b/tests/test-speculative.sh
new file mode 100755
index 0000000..4e36411
--- /dev/null
+++ b/tests/test-speculative.sh
@@ -0,0 +1,236 @@
+#!/bin/bash
+# test-speculative.sh — Speculative decoding E2E verification
+#
+# Uses a small draft model (Qwen3.5-0.8B) to accelerate a larger main model
+# (Qwen3.5-2B) via speculative decoding. Verifies:
+#   1. Dual-model loading (draft + main)
+#   2. Speculative decoding path activation
+#   3. Correct token generation
+#   4. Server stability under dual-model memory pressure
+#
+# Usage:
+#   ./tests/test-speculative.sh [binary_path] [port]
+#
+# Requirements:
+#   - Enough RAM for both 4-bit models (0.8B draft + 2B main)
+#   - macos-15 (7 GB) on GitHub Actions is sufficient
+#   - curl, jq
+
+set -euo pipefail
+
+BINARY="${1:-.build/release/SwiftLM}"
+PORT="${2:-15414}"
+HOST="127.0.0.1"
+MAIN_MODEL="${MAIN_MODEL:-mlx-community/Qwen3.5-2B-4bit}"
+DRAFT_MODEL="${DRAFT_MODEL:-mlx-community/Qwen3.5-0.8B-MLX-4bit}"
+NUM_DRAFT_TOKENS=4
+URL="http://${HOST}:${PORT}"
+PASS=0
+FAIL=0
+TOTAL=0
+LOG_FILE="/tmp/SwiftLM-test-speculative.log"
+
+# Colors
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+log() { echo -e "${YELLOW}[spec-test]${NC} $*"; }
+pass() { PASS=$((PASS + 1)); TOTAL=$((TOTAL + 1)); echo -e " ${GREEN}✅ PASS${NC}: $*"; }
+fail() { FAIL=$((FAIL + 1)); TOTAL=$((TOTAL + 1)); echo -e " ${RED}❌ FAIL${NC}: $*"; }
+
+cleanup() {
+    if [ -n "${SERVER_PID:-}" ]; then
+        log "Stopping server (PID $SERVER_PID)"
+        kill -9 "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+}
+trap cleanup EXIT
+
+# ── Check prerequisites ─────────────────────────────────────────────
+if [ ! -f "$BINARY" ]; then
+    echo "Error: Binary not found at $BINARY"
+    echo "Run 'swift build -c release' first."
+    exit 1
+fi
+
+if ! 
command -v jq &>/dev/null; then
+    echo "Error: jq is required. Install with: brew install jq"
+    exit 1
+fi
+
+# ── Memory check ────────────────────────────────────────────────────
+TOTAL_RAM_GB=$(sysctl -n hw.memsize 2>/dev/null | awk '{printf "%.0f", $1 / 1073741824}')
+log "System RAM: ${TOTAL_RAM_GB} GB"
+
+if [ "$TOTAL_RAM_GB" -lt 8 ] 2>/dev/null; then
+    log "⚠️ WARNING: ${TOTAL_RAM_GB} GB RAM detected. Dual-model test requires ~6 GB."
+    log " Consider running on a machine with ≥8 GB RAM."
+fi
+
+# ══════════════════════════════════════════════════════════════════════
+echo -e "\n${CYAN}╔══════════════════════════════════════════════════════════╗${NC}"
+echo -e "${CYAN}║ SwiftLM Speculative Decoding E2E Test ║${NC}"
+echo -e "${CYAN}║ Draft: Qwen3.5-0.8B (4-bit) → Main: Qwen3.5-2B (4-bit) ║${NC}"
+echo -e "${CYAN}║ Draft tokens per round: ${NUM_DRAFT_TOKENS} ║${NC}"
+echo -e "${CYAN}╚══════════════════════════════════════════════════════════╝${NC}\n"
+
+# ── Start server with dual models ───────────────────────────────────
+log "Starting server with speculative decoding..."
+log " Main model: $MAIN_MODEL"
+log " Draft model: $DRAFT_MODEL"
+log " Draft tokens per round: $NUM_DRAFT_TOKENS"
+
+"$BINARY" --model "$MAIN_MODEL" --port "$PORT" --host "$HOST" \
+    --draft-model "$DRAFT_MODEL" \
+    --num-draft-tokens "$NUM_DRAFT_TOKENS" \
+    > "$LOG_FILE" 2>&1 &
+SERVER_PID=$!
+
+# Wait for server to be ready (both models need to download + load)
+log "Waiting for server to load both models (this may take a while on first run)..."
+MAX_WAIT=900  # 15 minutes for two model downloads
+for i in $(seq 1 "$MAX_WAIT"); do
+    if curl -sf "$URL/health" >/dev/null 2>&1; then
+        log "Server ready after ${i}s"
+        break
+    fi
+    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo "Error: Server process died. Last 30 lines of log:"
+        tail -30 "$LOG_FILE"
+        exit 1
+    fi
+    # Print progress every 30 seconds
+    if [ $((i % 30)) -eq 0 ]; then
+        log " Still waiting... (${i}s elapsed)"
+    fi
+    sleep 1
+done
+
+if ! curl -sf "$URL/health" >/dev/null 2>&1; then
+    echo "Error: Server did not become ready in ${MAX_WAIT}s"
+    echo "Last 30 lines of log:"
+    tail -30 "$LOG_FILE"
+    exit 1
+fi
+
+# ── Test 1: Verify server loaded both models ────────────────────────
+log "Test 1: Verify dual-model loading"
+
+# Check server log for draft model loading confirmation
+if grep -q "Draft model loaded successfully" "$LOG_FILE"; then
+    pass "Draft model loaded successfully"
+else
+    fail "Draft model loading not confirmed in server logs"
+fi
+
+if grep -q "speculative decoding" "$LOG_FILE"; then
+    pass "Speculative decoding mode detected in server logs"
+else
+    fail "Speculative decoding not mentioned in server logs"
+fi
+
+# ── Test 2: Health endpoint works with dual models ──────────────────
+log "Test 2: Health endpoint"
+
+HEALTH=$(curl -sf "$URL/health" || true)  # under set -e, a curl failure must reach fail() below instead of aborting
+if echo "$HEALTH" | jq -e '.status == "ok"' >/dev/null 2>&1; then
+    pass "Health endpoint returns status=ok"
+else
+    fail "Health endpoint: $HEALTH"
+fi
+
+# ── Test 3: Streaming speculative generation ────────────────────────
+log "Test 3: Streaming speculative generation"
+
+STREAM_OUTPUT=$(curl -sf -N --max-time 120 -X POST "$URL/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d "{\"model\":\"$MAIN_MODEL\",\"stream\":true,\"max_tokens\":30,\"messages\":[{\"role\":\"user\",\"content\":\"Name three fruits.\"}]}" \
+    2>/dev/null || true)
+
+if echo "$STREAM_OUTPUT" | grep -q "data: \[DONE\]"; then
+    pass "Streaming speculative: received [DONE] sentinel"
+else
+    fail "Streaming speculative: missing [DONE] sentinel"
+fi
+
+CHUNK_COUNT=$(echo "$STREAM_OUTPUT" | grep -c "^data: {" || true)
+if [ "$CHUNK_COUNT" -gt 0 ]; then
+    pass "Streaming speculative: received $CHUNK_COUNT data chunks"
+else
+    fail "Streaming speculative: no data chunks received"
+fi
+
+# Check server log for speculative decoding activation
+if grep -q "Using speculative decoding" "$LOG_FILE"; then
+    pass "Speculative decoding path activated during generation"
+else
+    fail "Speculative decoding path not activated (missing log line)"
+fi
+
+# ── Test 5: Multiple sequential requests (stability) ────────────────
+log "Test 5: Sequential request stability (3 requests)"
+
+SEQ_PASS=true
+for i in 1 2 3; do
+    SEQ_RESP=$(curl -sf --max-time 120 -X POST "$URL/v1/chat/completions" \
+        -H "Content-Type: application/json" \
+        -d "{\"model\":\"$MAIN_MODEL\",\"max_tokens\":10,\"messages\":[{\"role\":\"user\",\"content\":\"Say the number $i.\"}]}" 2>/dev/null || echo "")
+
+    SEQ_CONTENT=$(echo "$SEQ_RESP" | jq -r '.choices[0].message.content // empty' 2>/dev/null || echo "")
+
+    if [ -z "$SEQ_CONTENT" ]; then
+        SEQ_PASS=false
+        fail "Sequential request $i: empty response"
+        break
+    fi
+done
+
+if [ "$SEQ_PASS" = true ]; then
+    pass "Sequential stability: 3/3 speculative requests completed successfully"
+fi
+
+# ── Test 6: Memory stability check ─────────────────────────────────
+log "Test 6: Memory stability"
+
+HEALTH_FINAL=$(curl -sf "$URL/health" || true)  # server may have died; still reach the summary
+MEM_ACTIVE=$(echo "$HEALTH_FINAL" | jq -r '.memory.active_mb // 0' 2>/dev/null || echo 0)
+MEM_PEAK=$(echo "$HEALTH_FINAL" | jq -r '.memory.peak_mb // 0' 2>/dev/null || echo 0)
+
+if [ "$MEM_ACTIVE" -gt 0 ] 2>/dev/null; then
+    pass "Memory: active=${MEM_ACTIVE} MB, peak=${MEM_PEAK} MB"
+else
+    fail "Memory: could not read memory stats"
+fi
+
+# Verify server is still responsive after all tests
+if curl -sf "$URL/health" >/dev/null 2>&1; then
+    pass "Server still responsive after all speculative decoding tests"
+else
+    fail "Server became unresponsive"
+fi
+
+# ── Results ──────────────────────────────────────────────────────────
+echo ""
+log "═══════════════════════════════════════"
+log "Speculative Decoding Test Results"
+log " Draft: $DRAFT_MODEL"
+log " Main: $MAIN_MODEL"
+log " Tokens/round: $NUM_DRAFT_TOKENS"
+log " Results: ${PASS} passed, ${FAIL} failed, ${TOTAL} total"
+log "═══════════════════════════════════════"
+
+if [ "$FAIL" -gt 0 ]; then
+    echo ""
+    log "Server log tail (last 20 lines):"
+    tail -20 "$LOG_FILE"
+    exit 1
+fi
+
+echo ""
+log "Server log tail (last 20 lines):"
+tail -20 "$LOG_FILE"
+exit 0