SemiAnalysisAI · functionstackx · Jun 1, 2026 · May 31, 2026 · claude · May 31, 2026
@@ -2204,6 +2204,33 @@ dsv4-fp4-mi355x-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 512 }
 
+# MTP variant of dsv4-fp4-mi355x-vllm. Mirrors the base recipe's search space
+# and adds spec-decoding: mtp, which routes to dsv4_fp4_mi355x_vllm_mtp.sh
+# (--speculative-config '{"method":"mtp","num_speculative_tokens":2}'), per
+# vllm-project/vllm#43385 (ROCm DeepSeek-V4 MTP, merged 2026-05-24, included in
+# v0.22.0). The full conc 4-512 range maps the complete crossover curve: MTP
+# wins at low batch (PR perf data: +75% @ conc1, +38% @ conc8) but falls behind
+# the non-MTP (STP) baseline from ~conc32 up (-37% @ conc32). The image reuses
+# the base entry's v0.22.0 ROCm build, which already contains the MTP commit.
+dsv4-fp4-mi355x-vllm-mtp:
+  image: vllm/vllm-openai-rocm:v0.22.0  # same image as the base entry; already contains the MTP commit
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false  # single-node recipe; the script lives under benchmarks/single_node/
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024  # input length; the script forwards ISL as --input-len
+      osl: 1024  # output length; the script forwards OSL as --output-len
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+    - isl: 8192  # long-input scenario, same output length and search space
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
 # that OOM once warmup/prefill batches multiple requests; keep CONC=1 until

diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm_mtp.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+set -eo pipefail  # exit on the first failing command or pipeline stage; unset variables are not fatal (no -u)
+
+# DeepSeek-V4-Pro on MI355X via vLLM — MTP variant of dsv4_fp4_mi355x_vllm.sh.
+# Adds MTP speculative decoding per vllm-project/vllm#43385 (ROCm DeepSeek-V4
+# MTP support, merged 2026-05-24, present in v0.22.0 tagged 2026-05-29):
+# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
+#
+# Benchmark prompts are routed through DeepSeek-V4 chat encoding via --dsv4
+# (which auto-enables --use-chat-template). EAGLE/MTP-style draft heads are
+# trained on chat-formatted inputs, so benchmarking with raw random prompts
+# silently lowers the draft-token acceptance rate.
+#
+# All other serving flags mirror the non-MTP MI355X recipe (TP=8,
+# VLLM_ROCM_USE_AITER=1, triton_unfused MoE, FP8 KV cache, mp executor, async
+# scheduling, mode=3 FULL_AND_PIECEWISE compilation). See
+# dsv4_fp4_mi355x_vllm.sh for per-flag rationale.
+
+source "$(dirname "$0")/../benchmark_lib.sh"  # shared helper library, resolved relative to this script's directory
+
+check_env_vars \
+    MODEL \
+    TP \
+    DP_ATTENTION \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then  # log job id and node name when a Slurm job id is present
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi  # MODEL not starting with "/" is fetched via hf download; absolute paths are used as-is
+
+if [ -n "$ROCR_VISIBLE_DEVICES" ]; then  # copy the ROCr device mask into HIP's variable when one is set
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+export VLLM_ROCM_USE_AITER=1
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}  # caller may override; defaults to 8888
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"  # NOTE(review): vllm serve below is not given --max-model-len; confirm this override is consumed elsewhere
+fi
+
+start_gpu_monitor
+
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)  # default: tensor-parallel size TP, one data-parallel rank
+if [ "${DP_ATTENTION}" = "true" ]; then  # DP attention: tensor-parallel size 1, data-parallel size TP
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then  # EP_SIZE is optional; unset counts as 1, which leaves expert parallelism off
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
+# Launch vLLM in the background with MTP spec decoding; 2 draft tokens for all configs for now.
+NUM_SPEC_TOKENS=2
+
+set -x  # trace the expanded serve/benchmark commands to stderr
+vllm serve "$MODEL" --port "$PORT" \
+    "${PARALLEL_ARGS[@]}" \
+    "${EP_ARGS[@]}" \
+    --async-scheduling \
+    --no-enable-prefix-caching \
+    --distributed-executor-backend mp \
+    --gpu-memory-utilization 0.8 \
+    --kv-cache-dtype fp8 \
+    --trust-remote-code \
+    --moe-backend triton_unfused \
+    --tokenizer-mode deepseek_v4 \
+    --reasoning-parser deepseek_v4 \
+    --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
+    --compilation-config '{"mode":3,"cudagraph_mode":"FULL_AND_PIECEWISE"}' > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!  # PID of the backgrounded server, handed to wait_for_server_ready
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"  # presumably blocks until the server responds; helper is defined outside this script
+
+# --dsv4 routes prompts through DeepSeek-V4 chat encoding (auto-enables
+# --use-chat-template); required for meaningful MTP acceptance numbers.
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code \
+    --dsv4
+
+if [ "${RUN_EVAL}" = "true" ]; then  # optional lm-eval pass against the same server port
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor  # pairs with start_gpu_monitor earlier in the script
+set +x  # stop command tracing
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3342,3 +3342,9 @@
   description:
     - "Update vLLM ROCm image from v0.21.0 to v0.22.0"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1616
+
+- config-keys:
+    - dsv4-fp4-mi355x-vllm-mtp  # config entry introduced by this same change
+  description:
+    - "Add MTP speculative-decoding sibling for dsv4-fp4-mi355x-vllm (model: deepseek-ai/DeepSeek-V4-Pro) on vllm/vllm-openai-rocm:v0.22.0, per vllm-project/vllm#43385"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1630