248 changes: 244 additions & 4 deletions .github/workflows/ci.yml
@@ -219,8 +219,8 @@ jobs:
retention-days: 7

# ── Speculative Decoding Memory Evaluation ──
# Runs the 9B model with NUM_DRAFT_TOKENS=2 to check peak
# memory compression/efficiency. Allowed to OOM/fail.
# Runs the 2B model with NUM_DRAFT_TOKENS=2 to check peak
# memory compression/efficiency. Emits vm_stat readings as step summary.
speculative-decoding-eval:
runs-on: macos-15
timeout-minutes: 45
@@ -277,7 +277,7 @@ jobs:
python3 -m venv /tmp/mlx_venv
/tmp/mlx_venv/bin/pip install --quiet huggingface_hub hf

- name: Cache MLX models (draft + 9B)
- name: Cache MLX models (draft + 2B)
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
@@ -288,6 +288,19 @@
source /tmp/mlx_venv/bin/activate
hf download mlx-community/Qwen3.5-2B-4bit || true
hf download mlx-community/Qwen3.5-0.8B-MLX-4bit || true

- name: Snapshot RAM before test
id: ram_before
run: |
PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
echo "ram_before=$RAM" >> $GITHUB_OUTPUT
echo "RAM before eval: ${RAM} GB"

- name: Run speculative evaluation E2E
env:
@@ -309,11 +322,238 @@
done
echo "All attempts failed"
exit 1


- name: Snapshot RAM after test
if: always()
id: ram_after
run: |
PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
echo "ram_after=$RAM" >> $GITHUB_OUTPUT
echo "RAM after eval: ${RAM} GB"

- name: Emit memory summary
if: always()
run: |
BEFORE="${{ steps.ram_before.outputs.ram_before }}"
AFTER="${{ steps.ram_after.outputs.ram_after }}"
TOTAL=$(sysctl -n hw.memsize | awk '{printf "%.1f", $1/1073741824}')
{
echo "## 📊 Speculative Eval — Memory Readings"
echo "| Metric | Value |"
echo "|--------|-------|"
echo "| Runner physical RAM | ${TOTAL} GB |"
echo "| RAM before test | ${BEFORE} GB |"
echo "| RAM after test | ${AFTER} GB |"
echo "| Delta | $(echo "$AFTER $BEFORE" | awk '{printf "%.2f", $1-$2}') GB |"
} >> $GITHUB_STEP_SUMMARY

- name: Upload speculative eval logs on failure
if: failure()
uses: actions/upload-artifact@v4
with:
name: speculative-eval-logs
path: /tmp/SwiftLM-test-speculative-eval.log

# ── Issue #72 Regression: SSD streaming + draft model RAM guard ──────────────
# Mandatory (not continue-on-error). Enforces the auto-cap-to-1 fix and the
# memoryLimit sentinel on every PR. Uses tiny models (2B main + 0.8B draft)
# sized for the 7 GB macos-15 runner.
#
# Three checks mirror the local Test 10 in run_benchmark.sh:
# [1] Auto-cap warning present in server log
# [2] Peak RAM ≤ 85% of runner physical RAM during inference
# [3] /v1/chat/completions returns valid content
ssd-draft-memory-guard:
runs-on: macos-15
timeout-minutes: 45
needs: build_and_unit_test
steps:
- uses: actions/checkout@v4
with:
submodules: recursive

- name: Download Binary Artifact
uses: actions/download-artifact@v4
continue-on-error: true # fall back to building if artifact expired
with:
name: swiftlm-architecture
path: .build/release/

- name: Build (Release) if artifact missing
run: |
if [ ! -f ".build/release/SwiftLM" ]; then
swift build -c release
fi
chmod +x .build/release/SwiftLM

- name: Install MLX Metal library
run: |
python3 -m venv /tmp/mlx_venv
/tmp/mlx_venv/bin/pip install --quiet mlx huggingface_hub hf
cp /tmp/mlx_venv/lib/python*/site-packages/mlx/lib/mlx.metallib .build/release/

- name: Cache MLX models (2B main + 0.8B draft)
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: mlx-ssd-draft-guard-qwen35-2b-0.8b

- name: Pre-download models
run: |
source /tmp/mlx_venv/bin/activate
hf download mlx-community/Qwen3.5-2B-4bit || true
hf download mlx-community/Qwen3.5-0.8B-MLX-4bit || true

- name: Snapshot RAM baseline
id: ram_base
run: |
PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
TOTAL=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1/1073741824}')
LIMIT=$(echo "$TOTAL * 0.85" | bc | cut -d. -f1)
echo "ram_base=$RAM" >> $GITHUB_OUTPUT
echo "runner_ram=$TOTAL" >> $GITHUB_OUTPUT
echo "ram_limit=$LIMIT" >> $GITHUB_OUTPUT
echo "Baseline RAM: ${RAM} GB | Runner: ${TOTAL} GB | Limit: ${LIMIT} GB"

- name: Start SSD + draft server (Issue #72 scenario)
id: server
run: |
# Launch with --num-draft-tokens 4 intentionally — the auto-cap should
# reduce it to 1 and log the advisory message.
.build/release/SwiftLM \
--model mlx-community/Qwen3.5-2B-4bit \
--draft-model mlx-community/Qwen3.5-0.8B-MLX-4bit \
--stream-experts \
--num-draft-tokens 4 \
--port 15473 \
--max-tokens 64 \
> /tmp/ssd_draft_guard.log 2>&1 &
PID=$!
echo "server_pid=$PID" >> $GITHUB_OUTPUT

echo "Waiting for server (up to 300s)..."
for i in $(seq 1 300); do
if ! kill -0 $PID 2>/dev/null; then
echo "Server died early:"
cat /tmp/ssd_draft_guard.log
exit 1
fi
Comment on lines +435 to +452 (Copilot AI, Apr 23, 2026):

In this step, the server PID is written to $GITHUB_OUTPUT, but then the loop immediately references ${{ steps.server.outputs.server_pid }} within the same step. Step outputs are not available until the step completes, so this will expand to empty and the kill -0 check will be unreliable. Capture $! into a shell variable (e.g., PID) and use that inside the loop, while still emitting it as an output for later steps.
if curl -sf http://127.0.0.1:15473/health >/dev/null 2>&1; then
echo "Server ready after ${i}s"
break
fi
sleep 1
if [ "$i" -eq 300 ]; then echo "Timeout"; exit 1; fi
done

- name: Snapshot RAM after model load
id: ram_loaded
run: |
PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
echo "ram_loaded=$RAM" >> $GITHUB_OUTPUT
echo "RAM after load: ${RAM} GB"

- name: "[1/3] Verify auto-cap warning in server log"
run: |
if grep -q "auto-capping" /tmp/ssd_draft_guard.log; then
echo "✅ Auto-cap warning found — numDraftTokens correctly reduced to 1"
else
echo "❌ Auto-cap warning NOT found in server log"
echo "--- Last 20 lines of server log ---"
tail -20 /tmp/ssd_draft_guard.log
exit 1
fi

- name: "[2/3] Run inference and snapshot peak RAM"
id: ram_peak
run: |
RESULT=$(curl -sf --max-time 90 http://127.0.0.1:15473/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model":"test","messages":[{"role":"user","content":"What is 2+2? One word."}],"max_tokens":32,"stream":false}' \
2>/dev/null || echo "{}")
echo "$RESULT" > /tmp/inf_result.json

PAGE_SIZE=$(sysctl -n hw.pagesize)
RAM=$(vm_stat | awk -v page_size="$PAGE_SIZE" '
/Pages active:/ { v=$3; gsub(/\./, "", v); act=v+0 }
/Pages wired down:/ { v=$4; gsub(/\./, "", v); wire=v+0 }
/Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
END { printf "%.2f", (act+wire+comp)*page_size/1073741824 }
')
echo "ram_peak=$RAM" >> $GITHUB_OUTPUT
echo "RAM after inference: ${RAM} GB"

LIMIT="${{ steps.ram_base.outputs.ram_limit }}"
OK=$(echo "$RAM <= $LIMIT" | bc -l)
if [ "$OK" = "1" ]; then
echo "✅ RAM=${RAM}GB ≤ ${LIMIT}GB (85% of ${{ steps.ram_base.outputs.runner_ram }}GB runner RAM)"
else
echo "❌ RAM=${RAM}GB EXCEEDS limit ${LIMIT}GB — Issue #72 regression detected"
echo " (memoryLimit sentinel or auto-cap may have regressed)"
exit 1
fi

- name: "[3/3] Validate inference response"
run: |
RESULT=$(cat /tmp/inf_result.json)
if echo "$RESULT" | grep -q '"content"'; then
TEXT=$(echo "$RESULT" | python3 -c \
"import sys,json;d=json.load(sys.stdin);print(d['choices'][0]['message']['content'])" \
2>/dev/null || echo "(parse error)")
echo "✅ Response: $TEXT"
else
echo "❌ No content in response — server may have crashed or returned empty"
echo "Raw: ${RESULT:0:300}"
exit 1
fi

- name: Stop server
if: always()
run: kill ${{ steps.server.outputs.server_pid }} 2>/dev/null || true

- name: Emit memory summary to step summary
if: always()
run: |
BASE="${{ steps.ram_base.outputs.ram_base }}"
LOADED="${{ steps.ram_loaded.outputs.ram_loaded }}"
PEAK="${{ steps.ram_peak.outputs.ram_peak }}"
TOTAL="${{ steps.ram_base.outputs.runner_ram }}"
LIMIT="${{ steps.ram_base.outputs.ram_limit }}"
{
echo "## 🛡️ Issue #72 — SSD + Draft Model RAM Guard"
echo "| Metric | Value | Threshold |"
echo "|--------|-------|-----------|"
echo "| Runner physical RAM | ${TOTAL} GB | — |"
echo "| RAM baseline (before server) | ${BASE} GB | — |"
echo "| RAM after model load | ${LOADED} GB | — |"
echo "| RAM after inference (peak) | ${PEAK} GB | ≤ ${LIMIT} GB (85%) |"
echo "| Load delta | $(echo "$LOADED $BASE" | awk '{printf "%.2f", $1-$2}') GB | — |"
echo "| Inference delta | $(echo "$PEAK $LOADED" | awk '{printf "%.2f", $1-$2}') GB | — |"
} >> $GITHUB_STEP_SUMMARY

- name: Upload server log on failure
if: failure()
uses: actions/upload-artifact@v4
with:
name: ssd-draft-guard-log
path: /tmp/ssd_draft_guard.log
retention-days: 7

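Aside for local debugging (not part of the workflow diff): every RAM snapshot above uses the same `vm_stat` + `awk` reading. The identical arithmetic can be run standalone on any Mac to cross-check the numbers the job reports: active, wired, and compressor pages converted to GiB via the hardware page size.

```bash
#!/bin/bash
# Approximate "used" RAM on macOS: active + wired + compressor pages, in GiB.
# Same formula as the Snapshot RAM steps in ci.yml above.
PAGE_SIZE=$(sysctl -n hw.pagesize)
vm_stat | awk -v page_size="$PAGE_SIZE" '
  /Pages active:/                 { v=$3; gsub(/\./, "", v); act=v+0 }
  /Pages wired down:/             { v=$4; gsub(/\./, "", v); wire=v+0 }
  /Pages occupied by compressor:/ { v=$5; gsub(/\./, "", v); comp=v+0 }
  END { printf "Used RAM: %.2f GB\n", (act+wire+comp)*page_size/1073741824 }
'
```
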
21 changes: 17 additions & 4 deletions README.md
@@ -242,7 +242,11 @@ SwiftLM implements a **rewritten SSD expert streaming pipeline** (engineered by

A novel aspect of this architecture is the **dual-model speculative decoding** pattern: a small draft model (e.g. Qwen3.5-9B at 73 tok/s) runs **entirely in RAM** while the large MoE model (e.g. 122B) streams experts from SSD. The draft model generates candidate tokens at high speed, and the main model verifies them in bulk — dramatically reducing the number of SSD-bound generation rounds needed.

> **Important finding:** Speculative decoding is **counterproductive for SSD-streaming MoE** specifically. The verify pass sends N+1 tokens, each routing to *different* experts — SSD I/O scales with the *union* of all positions' expert selections. Speculative decoding is therefore routed exclusively to **in-RAM models**.
> **Performance note:** Combining `--stream-experts` with `--draft-model` requires care. The verify pass sends N+1 tokens simultaneously, each routing to *different* experts — SSD I/O scales with the *union* of all positions' expert selections. At the default `--num-draft-tokens 4` this creates a **5× I/O fan-out** that regresses throughput below solo SSD streaming.
>
> **Auto-cap strategy (Issue #72 fix):** SwiftLM automatically caps `--num-draft-tokens` to **1** when both flags are active. With 1 draft token the verify pass covers only 2 positions (2× fan-out). If the draft model's acceptance rate is ≥ 50% — typical for same-family models — the net throughput is still positive despite the 2× I/O overhead. A startup advisory is printed when the cap fires.
>
> For maximum throughput: use `--stream-experts` alone (no draft model).

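As a back-of-envelope way to read the fan-out figures above (a sketch, not a measured bound): if position $i$ of the verify pass routes to expert set $E_i$, the SSD reads per round scale roughly with the union of those sets,

$$
\text{I/O}_{\text{verify}} \;\propto\; \left|\,\bigcup_{i=1}^{N+1} E_i\,\right| \;\le\; (N+1)\,k,
$$

where $N$ is `--num-draft-tokens` and $k$ is the per-token expert count (cf. `SWIFTLM_TOP_K`). With little overlap between positions, $N = 4$ approaches the 5× worst case noted above, while the auto-capped $N = 1$ bounds it at 2×.
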
### Optimization Techniques

@@ -271,11 +275,20 @@ SWIFTLM_TOP_K=6 SwiftLM --port 8002 \
SWIFTLM_TOP_K=4 SwiftLM --port 8002 \
--model <path>/Qwen3.5-122B-A10B-4bit --stream-experts

# With speculative decoding (in-RAM models only):
# With speculative decoding (in-RAM models only — both models fit in RAM):
SwiftLM --port 8002 \
--model <path>/Qwen3.5-27B-4bit \
--draft-model <path>/Qwen3.5-9B-4bit \
--num-draft-tokens 4

# With SSD streaming + draft model (auto-cap mode):
# SwiftLM automatically caps --num-draft-tokens to 1 to minimise the
# verify-pass I/O fan-out. Net positive if draft acceptance rate ≥ 50%.
SwiftLM --port 8002 \
--model <path>/Qwen3.5-122B-A10B-4bit \
--stream-experts \
--draft-model <path>/Qwen3.5-9B-4bit
# ↑ num-draft-tokens is auto-capped to 1 at startup
```

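A quick way to confirm the cap fired on a local run (a sketch: model paths and port are placeholders, exactly as in the examples above, and the `auto-capping` string is the same one the CI regression guard greps for):

```bash
# Start SSD streaming + draft model in the background and capture the log.
SwiftLM --port 8002 \
  --model <path>/Qwen3.5-122B-A10B-4bit \
  --stream-experts \
  --draft-model <path>/Qwen3.5-9B-4bit \
  > /tmp/swiftlm.log 2>&1 &

sleep 10   # give the server a moment to print its startup advisories
if grep -q "auto-capping" /tmp/swiftlm.log; then
  echo "num-draft-tokens was auto-capped to 1"
else
  echo "advisory not found yet; check /tmp/swiftlm.log"
fi
```
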
---
@@ -404,8 +417,8 @@ curl http://localhost:5413/v1/chat/completions \
| `--gpu-layers` | `model_default` | Restrict the number of layers allocated to GPU hardware |
| `--stream-experts` | `false` | Enable SSD expert streaming for MoE models (10x speedup) |
| `--turbo-kv` | `false` | Enable TurboQuant 3-bit KV cache compression (activates after 2048 tokens, server-wide) |
| `--draft-model` | (none) | Draft model path/ID for speculative decoding (in-RAM models only) |
| `--num-draft-tokens` | `4` | Number of draft tokens per speculation round |
| `--draft-model` | (none) | Draft model path/ID for speculative decoding. When used with `--stream-experts`, `--num-draft-tokens` is auto-capped to 1 to minimise SSD I/O fan-out (see performance note above). |
| `--num-draft-tokens` | `4` | Tokens per speculation round. Auto-capped to 1 when combined with `--stream-experts`. |

## 🔧 Per-Request API Parameters
