diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a64803497..cccde0bcc 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2448,6 +2448,34 @@ dsv4-fp8-h200-vllm:
       search-space:
         - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
 
+# DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300
+# Pareto sweep. The single-node schema has no explicit data-parallel-size
+# field, so dp-attn=true is used as the existing vLLM script switch for DP4
+# layouts on 4 allocated GPUs.
+dsv4-fp4-b300-vllm:
+  image: vllm/vllm-openai:deepseekv4-cu130
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, conc-start: 4, conc-end: 4 }
+        - { tp: 4, conc-start: 4, conc-end: 128 }
+        - { tp: 8, conc-start: 128, conc-end: 128 }
+        - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, conc-start: 4, conc-end: 4 }
+        - { tp: 4, conc-start: 4, conc-end: 128 }
+        - { tp: 8, conc-start: 128, conc-end: 128 }
+        - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
+
 qwen3.5-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu129-amd64
   model: Qwen/Qwen3.5-397B-A17B-FP8
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index 8ec35beac..52f38c4d9 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -1,14 +1,16 @@
 #!/usr/bin/env bash
-# Per https://vllm.ai/blog/deepseek-v4 the DeepSeek-V4-Pro recipe lists
-# 8xB200 and 8xB300 with identical flags, so this script mirrors
-# dsv4_fp4_b200.sh.
+# DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300
+# Pareto sweep. The matrix uses dp-attn=true as the existing switch to flip a
+# 4-GPU run from TP4 to DP4. Expert parallelism is always enabled to match
+# the provided vllm serve command exactly.
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
     TP \
+    DP_ATTENTION \
     CONC \
     ISL \
     OSL \
@@ -22,56 +24,54 @@ fi
 
 nvidia-smi
 
+hf download "$MODEL"
+
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
-# DeepSeek-V4-Pro weights are large and engine startup on B300 can exceed
-# the default 600s. Give it an hour to load.
+# DeepSeek-V4-Pro weights are large; engine startup can exceed the default
+# 600s. Give it an hour to load.
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
-if [ "${EVAL_ONLY}" = "true" ]; then
-  setup_eval_context
-  MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then
+  PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
 fi
 
-# Monkey-patch: bypass persistent_topk unconditionally. It raises "k out of
-# range" during CUDA graph capture when the dummy batch has rows with
-# seq_lens[i] < k (=2048 for DSV4). An attn_metadata.max_seq_len-based gate is
-# not strict enough because dummy batches can have max >= k while individual
-# rows have seq_lens[i] = 1. Fall back to top_k_per_row_decode everywhere so
-# 1k/1k capture completes; 8k/1k already worked without the patch but we trade
-# a small decode-time perf cost there to keep the script single-branch.
-INDEXER_PY=/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/sparse_attn_indexer.py
-echo "[monkey-patch] patching $INDEXER_PY"
-sed -i 's/if current_platform.is_cuda() and topk_tokens in (512, 1024, 2048)[^:]*:/if False: # monkey-patched: bypass persistent_topk (k out of range)/' "$INDEXER_PY"
-if ! grep -Fq 'if False: # monkey-patched: bypass persistent_topk' "$INDEXER_PY"; then
-  echo "[monkey-patch] FAILED: expected marker not found in $INDEXER_PY" >&2
-  echo "[monkey-patch] current line around persistent_topk dispatch:" >&2
-  grep -n 'topk_tokens in\|persistent_topk' "$INDEXER_PY" >&2 || true
-  exit 1
+BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
+if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then
+  BENCHMARK_MAX_MODEL_LEN=4096
+fi
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+  EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
+  export EVAL_MAX_MODEL_LEN
+  SERVE_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+else
+  SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN"
 fi
-echo "[monkey-patch] applied: $(grep -n 'if False: # monkey-patched' $INDEXER_PY)"
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
-# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
-# from the search space is used only for GPU allocation by the runner and
-# as the DP size.
 set -x
-vllm serve $MODEL --host 0.0.0.0 --port $PORT \
---trust-remote-code \
---kv-cache-dtype fp8 \
---block-size 256 \
---no-enable-prefix-caching \
---enable-expert-parallel \
---data-parallel-size $TP \
---max-model-len $MAX_MODEL_LEN \
---compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
---tokenizer-mode deepseek_v4 \
---tool-call-parser deepseek_v4 \
---enable-auto-tool-choice \
---reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &
+vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
+  "${PARALLEL_ARGS[@]}" \
+  --pipeline-parallel-size 1 \
+  --kv-cache-dtype fp8 \
+  --trust-remote-code \
+  --block-size 256 \
+  --no-enable-prefix-caching \
+  --enable-expert-parallel \
+  --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
+  --attention_config.use_fp4_indexer_cache True \
+  --tokenizer-mode deepseek_v4 \
+  --tool-call-parser deepseek_v4 \
+  --enable-auto-tool-choice \
+  --reasoning-parser deepseek_v4 \
+  --max-cudagraph-capture-size 2048 \
+  --max-model-len "$SERVE_MAX_MODEL_LEN" \
+  --max-num-batched-tokens 2048 > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
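Note on the dp-attn switch: the DP4 rows in the search space keep tp: 4 for GPU allocation and set dp-attn: true, and the script above turns that into a data-parallel layout over the same four GPUs. A minimal sketch of how that resolution behaves, reusing the PARALLEL_ARGS logic from the diff; the standalone wrapper and the echo are illustrative only and not part of the patch:

    #!/usr/bin/env bash
    # Illustrative only: show which parallel flags dsv4_fp4_b300_vllm.sh would
    # pass to vllm serve for a given TP / DP_ATTENTION combination.
    TP="${TP:-4}"
    DP_ATTENTION="${DP_ATTENTION:-true}"

    # Same selection logic as the patched script: default to a TP<n> layout,
    # flip the whole allocation to DP<n> when the runner exports
    # DP_ATTENTION=true.
    PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
    if [ "${DP_ATTENTION}" = "true" ]; then
      PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
    fi

    # With TP=4 and DP_ATTENTION=true this prints:
    #   --tensor-parallel-size 1 --data-parallel-size 4
    echo "${PARALLEL_ARGS[@]}"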
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 528a12d02..0aaf26038 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1755,7 +1755,7 @@
     - "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading"
     - "Configs: 1k1k conc 4-64, 8k1k conc 4-64"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130
- 
+
 - config-keys:
     - dsv4-fp4-b300-sglang
   description:
@@ -1775,3 +1775,14 @@
     - "Model: sgl-project/DeepSeek-V4-Pro-FP8"
     - "https://github.com/sgl-project/sglang/pull/23608#issuecomment-4311952977"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1134
+
+- config-keys:
+    - dsv4-fp4-b300-vllm
+  description:
+    - "Add DeepSeek-V4-Pro single-node B300 vLLM aggregate benchmark"
+    - "Image: vllm/vllm-openai:deepseekv4-cu130"
+    - "Model: deepseek-ai/DeepSeek-V4-Pro"
+    - "Uses the submitted B300 Pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512"
+    - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048"
+    - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1144
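For reference, a hypothetical local invocation of the new script for the DP4 / conc 256 point of the 1k1k sweep. The variable names come from check_env_vars and the search-space entry; the MAX_MODEL_LEN and PORT values (and any further variables hidden behind the truncated check_env_vars list) are assumptions here, since in CI they are supplied by the workflow:

    # Assumed values; in CI the runner exports these from the config matrix.
    MODEL=deepseek-ai/DeepSeek-V4-Pro \
    TP=4 DP_ATTENTION=true CONC=256 ISL=1024 OSL=1024 \
    MAX_MODEL_LEN=4096 PORT=8888 \
      bash benchmarks/single_node/dsv4_fp4_b300_vllm.sh

For this 1k1k point the script overrides the context length to 4096 internally, so the MAX_MODEL_LEN value shown is only a placeholder to satisfy the environment check.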