From 2372f437f1217ea8bec48caffefe55b1bb2cda1d Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 31 May 2026 11:48:12 -0700
Subject: [PATCH] [AMD] Add DeepSeek-V4-Pro FP4 MI355X vLLM MTP recipe

MTP speculative-decoding sibling of dsv4-fp4-mi355x-vllm, per
vllm-project/vllm#43385 (ROCm DeepSeek-V4 MTP support, included in vLLM
v0.22.0).

- benchmarks/single_node/dsv4_fp4_mi355x_vllm_mtp.sh: mirrors the base
  MI355X vLLM recipe and adds --speculative-config
  '{"method":"mtp","num_speculative_tokens":2}', plus --dsv4 chat encoding
  on the benchmark for valid MTP acceptance.
- .github/configs/amd-master.yaml: dsv4-fp4-mi355x-vllm-mtp entry (conc
  4-512, 1k1k + 8k1k), reusing the base v0.22.0 ROCm image which already
  contains the MTP commit.
---
 .github/configs/amd-master.yaml             |  27 +++++
 .../single_node/dsv4_fp4_mi355x_vllm_mtp.sh | 109 ++++++++++++++++++
 perf-changelog.yaml                         |   6 +
 3 files changed, 142 insertions(+)
 create mode 100755 benchmarks/single_node/dsv4_fp4_mi355x_vllm_mtp.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 0b7336fb7..279b7d0bd 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2204,6 +2204,33 @@ dsv4-fp4-mi355x-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 512 }
 
+# MTP variant of dsv4-fp4-mi355x-vllm. Mirrors the base recipe's search space
+# and adds spec-decoding: mtp, which routes to dsv4_fp4_mi355x_vllm_mtp.sh
+# (--speculative-config '{"method":"mtp","num_speculative_tokens":2}'), per
+# vllm-project/vllm#43385 (ROCm DeepSeek-V4 MTP, merged 2026-05-24, included in
+# v0.22.0). Full conc 4-512 range maps the complete crossover curve: MTP wins
+# at low batch (PR perf data: +75% @ conc1, +38% @ conc8) and falls behind STP
+# above ~conc32 (-37% @ conc32). Image reuses the base entry's v0.22.0 ROCm
+# build, which already contains the MTP commit.
+dsv4-fp4-mi355x-vllm-mtp:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
 # that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm_mtp.sh
new file mode 100755
index 000000000..b90d82de9
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_mi355x_vllm_mtp.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+set -eo pipefail  # NOTE(review): no -u; the script tests variables that may be unset (SLURM_JOB_ID, EVAL_ONLY, RUN_EVAL, ...)
+
+# DeepSeek-V4-Pro on MI355X via vLLM — MTP variant of dsv4_fp4_mi355x_vllm.sh.
+# Adds MTP speculative decoding per vllm-project/vllm#43385 (ROCm DeepSeek-V4
+# MTP support, merged 2026-05-24, present in v0.22.0 tagged 2026-05-29):
+#   --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
+#
+# Benchmark prompts are routed through DeepSeek-V4 chat encoding via --dsv4
+# (which auto-enables --use-chat-template). EAGLE/MTP-style spec decoding is
+# trained against chat-formatted inputs; benchmarking against raw random
+# prompts silently regresses the acceptance rate.
+#
+# All other serving flags mirror the non-MTP MI355X recipe (TP=8,
+# VLLM_ROCM_USE_AITER=1, triton_unfused MoE, FP8 KV cache, mp executor, async
+# scheduling, mode=3 FULL_AND_PIECEWISE compilation). See
+# dsv4_fp4_mi355x_vllm.sh for per-flag rationale.
+
+source "$(dirname "$0")/../benchmark_lib.sh"  # presumably provides the helpers called below (check_env_vars, wait_for_server_ready, ...)
+
+check_env_vars \
+    MODEL \
+    TP \
+    DP_ATTENTION \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi  # a MODEL that is not an absolute path is fetched with hf download
+
+if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+export VLLM_ROCM_USE_AITER=1
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}  # a caller-supplied PORT wins; otherwise 8888
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"  # NOTE(review): MAX_MODEL_LEN is not passed to vllm serve below; presumably read by the eval helpers - confirm
+fi
+
+start_gpu_monitor
+
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then  # DP attention: tensor-parallel 1, data-parallel $TP
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then  # EP_SIZE is optional; unset counts as 1 (flag omitted)
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
+# use 2 speculative tokens for all configs for now
+NUM_SPEC_TOKENS=2
+
+set -x  # trace commands from here until the closing set +x
+vllm serve "$MODEL" --port "$PORT" \
+    "${PARALLEL_ARGS[@]}" \
+    "${EP_ARGS[@]}" \
+    --async-scheduling \
+    --no-enable-prefix-caching \
+    --distributed-executor-backend mp \
+    --gpu-memory-utilization 0.8 \
+    --kv-cache-dtype fp8 \
+    --trust-remote-code \
+    --moe-backend triton_unfused \
+    --tokenizer-mode deepseek_v4 \
+    --reasoning-parser deepseek_v4 \
+    --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
+    --compilation-config '{"mode":3,"cudagraph_mode":"FULL_AND_PIECEWISE"}' > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!  # PID of the backgrounded vllm serve process
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# --dsv4 routes prompts through DeepSeek-V4 chat encoding (auto-enables
+# --use-chat-template); required for meaningful MTP acceptance numbers.
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code \
+    --dsv4
+
+if [ "${RUN_EVAL}" = "true" ]; then  # RUN_EVAL=true: also run lm-eval against the same port
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor  # counterpart of start_gpu_monitor called before the server launch
+set +x  # end of the command trace enabled before vllm serve
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5b7d56cd1..61ce924d5 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3342,3 +3342,9 @@
   description:
     - "Update vLLM ROCm image from v0.21.0 to v0.22.0"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1616
+
+- config-keys:
+    - dsv4-fp4-mi355x-vllm-mtp
+  description:
+    - "Add MTP speculative-decoding sibling for dsv4-fp4-mi355x-vllm (model: deepseek-ai/DeepSeek-V4-Pro) on vllm/vllm-openai-rocm:v0.22.0, per vllm-project/vllm#43385"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1630