Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
2263ac1
dsv4-b300-sglang: conc=2048 mega_moe deepep recipe
yhyang201 Apr 26, 2026
f1bd23d
dsv4-b300-sglang: add conc=4096 mega_moe deepep recipe
yhyang201 Apr 26, 2026
f3e105f
dsv4-b300-sglang: 1k1k conc=512/1024 mega_moe deepep recipe
yhyang201 Apr 26, 2026
90d3bfd
dsv4-b300-sglang: merge changelog entries into single PR#1179 entry
yhyang201 Apr 26, 2026
86b77f5
dsv4-b300-sglang: add conc=2048/4096 mega_moe CI entries for both ISL…
yhyang201 Apr 26, 2026
ed706e8
fix: correct tp=8 for conc=2048/4096 and swa-full-tokens-ratio for co…
yhyang201 Apr 27, 2026
1a65efb
dsv4-b300-sglang: set NVSHMEM_DISABLE_IB=1 for deepep recipes
yhyang201 Apr 27, 2026
35068f7
dsv4-b300-sglang: update image to sha256:2fec8d79
yhyang201 Apr 27, 2026
2f955a8
dsv4-b300-sglang: enable SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW for conc …
yhyang201 Apr 27, 2026
854efe4
dsv4-b300-sglang: set swa-full-tokens-ratio 0.06 for conc 2048/4096
yhyang201 Apr 27, 2026
8477526
dsv4-b300-sglang: temporarily limit sweep to 8k1k conc 2048/4096
yhyang201 Apr 27, 2026
c0502c1
dsv4-b300-sglang: check CONC=2048/4096 before EP_SIZE=8
yhyang201 Apr 28, 2026
506702b
dsv4-b300-sglang: update conc-4096 recipe parameters
yhyang201 Apr 28, 2026
1f75a9c
dsv4-b300-sglang: set MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 for conc-…
yhyang201 Apr 28, 2026
4ef3386
perf-changelog: rebase on main, append PR#1179 entry
yhyang201 Apr 28, 2026
862d82e
dsv4-b300-sglang: restore full sweep config, add conc 2048/4096
yhyang201 Apr 28, 2026
fd64708
Merge branch 'main' into dsv4-b300-sglang-conc2048-mega-moe
yhyang201 Apr 28, 2026
5862015
dsv4-b300-sglang: restore 8k1k tp:4/ep:4/conc:512 entry
yhyang201 Apr 28, 2026
535faf1
Apply suggestion from @Qiaolin-Yu
Qiaolin-Yu Apr 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1860,7 +1860,7 @@ dsr1-fp8-b300-sglang:
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand Down Expand Up @@ -1888,6 +1888,8 @@ dsv4-fp4-b300-sglang:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
Expand Down
83 changes: 69 additions & 14 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,23 +71,78 @@ MEM_FRACTION_STATIC=0.90

if [ "${DP_ATTENTION}" = "true" ]; then
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
MEM_FRACTION_STATIC=0.94
# ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
# recipes first (they also have ep=8) so they aren't shadowed by the
# medium-conc EP_SIZE=8 branch below.
if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
  # High-concurrency mega_moe recipes: deepep all-to-all MoE backend with
  # per-concurrency CUDA-graph, memory, SWA, and tokenizer-worker tuning.
  export NVSHMEM_DISABLE_IB=1
  export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
  export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
  export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
  # Per-rank token cap is the same for both concurrencies; hoisted out of
  # the branches below (was previously duplicated in each).
  export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
  if [ "$CONC" = "2048" ]; then
    export SGLANG_LOG_FORWARD_ITERS=1
    CUDA_GRAPH_MAX_BS=288
    MAX_RUNNING_REQUESTS=2560
    MEM_FRACTION_STATIC=0.87
    SWA_FULL_TOKENS_RATIO=0.06
    TOKENIZER_WORKER_NUM=4
  else
    # CONC=4096: larger graph/request budget, lower static memory fraction.
    CUDA_GRAPH_MAX_BS=544
    MAX_RUNNING_REQUESTS=4352
    MEM_FRACTION_STATIC=0.835
    SWA_FULL_TOKENS_RATIO=0.075
    TOKENIZER_WORKER_NUM=8
  fi
  PARALLEL_ARGS=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
    --deepep-config "$DEEPEP_CONFIG"
    --chunked-prefill-size 65536
    --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
    --enable-prefill-delayer
  )
  if [ "$CONC" = "4096" ]; then
    # Throttle decode-progress logging at the highest concurrency.
    PARALLEL_ARGS+=(--decode-log-interval 5)
  fi
elif [ "${EP_SIZE}" = "8" ]; then
  # Medium-concurrency mega_moe deepep recipe (ep=8 in the yaml); reached
  # only when CONC is not 2048/4096 — see the ordering note above.
  export NVSHMEM_DISABLE_IB=1
  export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
  export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
  export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
  PARALLEL_ARGS=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --cuda-graph-max-bs 550
    --deepep-config "$DEEPEP_CONFIG"
    --chunked-prefill-size 16384
    --enable-prefill-delayer
  )
  MAX_RUNNING_REQUESTS=768
  MEM_FRACTION_STATIC=0.94
else
  # Default dp-attention recipe: flashinfer_mxfp4 MoE runner, mega_moe off.
  export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
  export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
  export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
  PARALLEL_ARGS=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-runner-backend flashinfer_mxfp4
    --disable-flashinfer-autotune
    --deepep-config "$DEEPEP_CONFIG"
    --chunked-prefill-size 16384
    --enable-prefill-delayer
  )
  MEM_FRACTION_STATIC=0.94
fi
else
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
Expand All @@ -111,7 +166,7 @@ PYTHONNOUSERSITE=1 sglang serve \
--port $PORT \
--trust-remote-code \
--tp $TP \
--max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
--max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
--mem-fraction-static "$MEM_FRACTION_STATIC" \
--swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1928,3 +1928,12 @@
- "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k"
- "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188

Comment thread
Qiaolin-Yu marked this conversation as resolved.
- config-keys:
- dsv4-fp4-b300-sglang
description:
- "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8"
- "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768"
- "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179
Loading