Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
2263ac1
dsv4-b300-sglang: conc=2048 mega_moe deepep recipe
yhyang201 Apr 26, 2026
f1bd23d
dsv4-b300-sglang: add conc=4096 mega_moe deepep recipe
yhyang201 Apr 26, 2026
f3e105f
dsv4-b300-sglang: 1k1k conc=512/1024 mega_moe deepep recipe
yhyang201 Apr 26, 2026
90d3bfd
dsv4-b300-sglang: merge changelog entries into single PR#1179 entry
yhyang201 Apr 26, 2026
86b77f5
dsv4-b300-sglang: add conc=2048/4096 mega_moe CI entries for both ISL…
yhyang201 Apr 26, 2026
ed706e8
fix: correct tp=8 for conc=2048/4096 and swa-full-tokens-ratio for co…
yhyang201 Apr 27, 2026
1a65efb
dsv4-b300-sglang: set NVSHMEM_DISABLE_IB=1 for deepep recipes
yhyang201 Apr 27, 2026
35068f7
dsv4-b300-sglang: update image to sha256:2fec8d79
yhyang201 Apr 27, 2026
2f955a8
dsv4-b300-sglang: enable SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW for conc …
yhyang201 Apr 27, 2026
854efe4
dsv4-b300-sglang: set swa-full-tokens-ratio 0.06 for conc 2048/4096
yhyang201 Apr 27, 2026
8477526
dsv4-b300-sglang: temporarily limit sweep to 8k1k conc 2048/4096
yhyang201 Apr 27, 2026
c0502c1
dsv4-b300-sglang: check CONC=2048/4096 before EP_SIZE=8
yhyang201 Apr 28, 2026
506702b
dsv4-b300-sglang: update conc-4096 recipe parameters
yhyang201 Apr 28, 2026
1f75a9c
dsv4-b300-sglang: set MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 for conc-…
yhyang201 Apr 28, 2026
4ef3386
perf-changelog: rebase on main, append PR#1179 entry
yhyang201 Apr 28, 2026
862d82e
dsv4-b300-sglang: restore full sweep config, add conc 2048/4096
yhyang201 Apr 28, 2026
fd64708
Merge branch 'main' into dsv4-b300-sglang-conc2048-mega-moe
yhyang201 Apr 28, 2026
5862015
dsv4-b300-sglang: restore 8k1k tp:4/ep:4/conc:512 entry
yhyang201 Apr 28, 2026
535faf1
Apply suggestion from @Qiaolin-Yu
Qiaolin-Yu Apr 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1860,7 +1860,7 @@ dsr1-fp8-b300-sglang:
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand Down Expand Up @@ -1888,6 +1888,8 @@ dsv4-fp4-b300-sglang:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
Expand Down
83 changes: 69 additions & 14 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,23 +71,78 @@ MEM_FRACTION_STATIC=0.90

if [ "${DP_ATTENTION}" = "true" ]; then
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
MEM_FRACTION_STATIC=0.94
# ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
# recipes first (they also have ep=8) so they aren't shadowed by the
# medium-conc EP_SIZE=8 branch below.
if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
  # High-concurrency mega_moe recipes: deepep all-to-all MoE backend with
  # per-concurrency CUDA-graph, memory, SWA, and tokenizer-worker tuning.
  export NVSHMEM_DISABLE_IB=1
  export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
  export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
  export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
  # Per-rank token cap is the same for both concurrencies; hoisted out of
  # the branches below (was previously duplicated in each).
  export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
  if [ "$CONC" = "2048" ]; then
    export SGLANG_LOG_FORWARD_ITERS=1
    CUDA_GRAPH_MAX_BS=288
    MAX_RUNNING_REQUESTS=2560
    MEM_FRACTION_STATIC=0.87
    SWA_FULL_TOKENS_RATIO=0.06
    TOKENIZER_WORKER_NUM=4
  else
    # CONC=4096: larger graph/request budget, lower static memory fraction.
    CUDA_GRAPH_MAX_BS=544
    MAX_RUNNING_REQUESTS=4352
    MEM_FRACTION_STATIC=0.835
    SWA_FULL_TOKENS_RATIO=0.075
    TOKENIZER_WORKER_NUM=8
  fi
  PARALLEL_ARGS=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
    --deepep-config "$DEEPEP_CONFIG"
    --chunked-prefill-size 65536
    --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
    --enable-prefill-delayer
  )
  if [ "$CONC" = "4096" ]; then
    # Throttle decode-progress logging at the highest concurrency.
    PARALLEL_ARGS+=(--decode-log-interval 5)
  fi
elif [ "${EP_SIZE}" = "8" ]; then
  # Medium-concurrency mega_moe deepep recipe (ep=8 in the yaml); reached
  # only when CONC is not 2048/4096 — see the ordering note above.
  export NVSHMEM_DISABLE_IB=1
  export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
  export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
  export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
  PARALLEL_ARGS=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --cuda-graph-max-bs 550
    --deepep-config "$DEEPEP_CONFIG"
    --chunked-prefill-size 16384
    --enable-prefill-delayer
  )
  MAX_RUNNING_REQUESTS=768
  MEM_FRACTION_STATIC=0.94
else
  # Default dp-attention recipe: flashinfer_mxfp4 MoE runner, mega_moe off.
  export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
  export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
  export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
  PARALLEL_ARGS=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-runner-backend flashinfer_mxfp4
    --disable-flashinfer-autotune
    --deepep-config "$DEEPEP_CONFIG"
    --chunked-prefill-size 16384
    --enable-prefill-delayer
  )
  MEM_FRACTION_STATIC=0.94
fi
else
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
Expand All @@ -111,7 +166,7 @@ PYTHONNOUSERSITE=1 sglang serve \
--port $PORT \
--trust-remote-code \
--tp $TP \
--max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
--max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
--mem-fraction-static "$MEM_FRACTION_STATIC" \
--swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1928,3 +1928,12 @@
- "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k"
- "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188

Comment thread
Qiaolin-Yu marked this conversation as resolved.
- config-keys:
- dsv4-fp4-b300-sglang
description:
- "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8"
- "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768"
- "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179
Loading