diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 39e299cb0..9e4177ee8 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1860,7 +1860,7 @@ dsr1-fp8-b300-sglang:
 # until a B300-specific recipe ships. Prefix caching is disabled.
 # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -1888,6 +1888,8 @@ dsv4-fp4-b300-sglang:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
     - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
 # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index ac552c733..d50b57d72 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -71,23 +71,78 @@ MEM_FRACTION_STATIC=0.90
 if [ "${DP_ATTENTION}" = "true" ]; then
   export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-  export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
-  export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
   export SGLANG_OPT_USE_FAST_MASK_EP=1
   export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-  export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
   export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
   export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-  PARALLEL_ARGS=(
-    --dp-size "$TP"
-    --enable-dp-attention
-    --moe-runner-backend flashinfer_mxfp4
-    --disable-flashinfer-autotune
-    --deepep-config "$DEEPEP_CONFIG"
-    --chunked-prefill-size 16384
-    --enable-prefill-delayer
-  )
-  MEM_FRACTION_STATIC=0.94
+  # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
+  # recipes first (they also have ep=8) so they aren't shadowed by the
+  # medium-conc EP_SIZE=8 branch below.
+  if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
+    export NVSHMEM_DISABLE_IB=1
+    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
+    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+    if [ "$CONC" = "2048" ]; then
+      export SGLANG_LOG_FORWARD_ITERS=1
+      export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
+      CUDA_GRAPH_MAX_BS=288
+      MAX_RUNNING_REQUESTS=2560
+      MEM_FRACTION_STATIC=0.87
+      SWA_FULL_TOKENS_RATIO=0.06
+      TOKENIZER_WORKER_NUM=4
+    else
+      export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
+      CUDA_GRAPH_MAX_BS=544
+      MAX_RUNNING_REQUESTS=4352
+      MEM_FRACTION_STATIC=0.835
+      SWA_FULL_TOKENS_RATIO=0.075
+      TOKENIZER_WORKER_NUM=8
+    fi
+    PARALLEL_ARGS=(
+      --dp-size "$TP"
+      --enable-dp-attention
+      --moe-a2a-backend deepep
+      --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
+      --deepep-config "$DEEPEP_CONFIG"
+      --chunked-prefill-size 65536
+      --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
+      --enable-prefill-delayer
+    )
+    if [ "$CONC" = "4096" ]; then
+      PARALLEL_ARGS+=(--decode-log-interval 5)
+    fi
+  elif [ "${EP_SIZE}" = "8" ]; then
+    export NVSHMEM_DISABLE_IB=1
+    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
+    PARALLEL_ARGS=(
+      --dp-size "$TP"
+      --enable-dp-attention
+      --moe-a2a-backend deepep
+      --cuda-graph-max-bs 550
+      --deepep-config "$DEEPEP_CONFIG"
+      --chunked-prefill-size 16384
+      --enable-prefill-delayer
+    )
+    MAX_RUNNING_REQUESTS=768
+    MEM_FRACTION_STATIC=0.94
+  else
+    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
+    export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
+    PARALLEL_ARGS=(
+      --dp-size "$TP"
+      --enable-dp-attention
+      --moe-runner-backend flashinfer_mxfp4
+      --disable-flashinfer-autotune
+      --deepep-config "$DEEPEP_CONFIG"
+      --chunked-prefill-size 16384
+      --enable-prefill-delayer
+    )
+    MEM_FRACTION_STATIC=0.94
+  fi
 else
   PARALLEL_ARGS=(
     --moe-runner-backend flashinfer_mxfp4
@@ -111,7 +166,7 @@ PYTHONNOUSERSITE=1 sglang serve \
   --port $PORT \
   --trust-remote-code \
   --tp $TP \
-  --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
+  --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
   --mem-fraction-static "$MEM_FRACTION_STATIC" \
   --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
   "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 0421c5596..8941211c1 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1928,3 +1928,12 @@
     - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k"
     - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8"
+    - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768"
+    - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries"
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179