From 623a624a92e7e9b6c27b75cbe5ae98fd61e132f9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 21:12:41 +0000
Subject: [PATCH 1/5] Update h200/b200 sglang image tags to v0.5.5-cu129-amd64

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
---
 .github/configs/nvidia-master.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 917136739..bb585fa4c 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1,5 +1,5 @@
 dsr1-fp4-b200-sglang:
-  image: lmsysorg/sglang:v0.5.3rc1-cu129-b200
+  image: lmsysorg/sglang:v0.5.5-cu129-amd64
   model: nvidia/DeepSeek-R1-0528-FP4-V2
   model-prefix: dsr1
   runner: b200
@@ -73,7 +73,7 @@
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
 dsr1-fp8-b200-sglang:
-  image: lmsysorg/sglang:v0.5.3rc1-cu129-b200
+  image: lmsysorg/sglang:v0.5.5-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: b200
@@ -120,7 +120,7 @@
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
 dsr1-fp8-h200-sglang:
-  image: lmsysorg/sglang:v0.5.2rc2-cu126
+  image: lmsysorg/sglang:v0.5.5-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: h200

From d6f38f293702e3d03df01717da7275a95533020a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 21:34:27 +0000
Subject: [PATCH 2/5] Fix deprecated SGLang flags: replace --enable-ep-moe with --ep-size 8 and --enable-flashinfer-trtllm-moe with --moe-runner-backend flashinfer_trtllm

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
---
 benchmarks/dsr1_fp4_b200_docker.sh | 4 ++--
 benchmarks/dsr1_fp8_b200_docker.sh | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh
index ac3e0e889..604b73304 100644
--- a/benchmarks/dsr1_fp4_b200_docker.sh
+++ b/benchmarks/dsr1_fp4_b200_docker.sh
@@ -21,6 +21,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.
 --tensor-parallel-size=$TP --data-parallel-size=1 \
 --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
 --chunked-prefill-size 16384 \
---enable-ep-moe --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 10
+--ep-size 8 --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10
 
diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh
index 8cc26a72f..b4592d987 100644
--- a/benchmarks/dsr1_fp8_b200_docker.sh
+++ b/benchmarks/dsr1_fp8_b200_docker.sh
@@ -34,4 +34,4 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
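Note on the flag migration above: flag availability varies between SGLang releases, so it is worth confirming that the pinned image actually accepts the replacement flags before a full benchmark sweep. A minimal sketch, assuming the lmsysorg/sglang:v0.5.5-cu129-amd64 tag pinned in nvidia-master.yaml:

    # Dump the launch server's argument list from the pinned image and check
    # for the replacement flags introduced in this patch.
    docker run --rm --entrypoint /bin/bash lmsysorg/sglang:v0.5.5-cu129-amd64 \
      -c 'python3 -m sglang.launch_server --help' | grep -E -- '--ep-size|--moe-runner-backend'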
 --cuda-graph-max-bs 128 --max-running-requests 128 \
 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \
 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \
---attention-backend trtllm_mla --stream-interval 30 --enable-flashinfer-trtllm-moe --quantization fp8
+--attention-backend trtllm_mla --stream-interval 30 --moe-runner-backend flashinfer_trtllm --quantization fp8

From d1d2c82b898ed94e0b9d74f9d6abead25e826e19 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 21:42:26 +0000
Subject: [PATCH 3/5] Use $EP_SIZE variable instead of hardcoded 8 and add ep: 8 to nvidia-master.yaml for B200 SGLang configs

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
---
 .github/configs/nvidia-master.yaml | 12 ++++++------
 benchmarks/dsr1_fp4_b200_docker.sh |  2 +-
 benchmarks/dsr1_fp8_b200_docker.sh |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index bb585fa4c..591e133a8 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -10,17 +10,17 @@ dsr1-fp4-b200-sglang:
     osl: 1024
     search-space:
       - { tp: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
   - isl: 1024
     osl: 8192
     search-space:
       - { tp: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
   - isl: 8192
     osl: 1024
     search-space:
       - { tp: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 8, conc-start: 4, conc-end: 16 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
 
 dsr1-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
@@ -83,15 +83,15 @@ dsr1-fp8-b200-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
   - isl: 1024
     osl: 8192
     search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
   - isl: 8192
     osl: 1024
     search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
 dsr1-fp8-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh
index 604b73304..3c8232072 100644
--- a/benchmarks/dsr1_fp4_b200_docker.sh
+++ b/benchmarks/dsr1_fp4_b200_docker.sh
@@ -21,6 +21,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.
 --tensor-parallel-size=$TP --data-parallel-size=1 \
 --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
 --chunked-prefill-size 16384 \
---ep-size 8 --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+--ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10
 
diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh
index b4592d987..361b6f1f6 100644
--- a/benchmarks/dsr1_fp8_b200_docker.sh
+++ b/benchmarks/dsr1_fp8_b200_docker.sh
@@ -34,4 +34,4 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --cuda-graph-max-bs 128 --max-running-requests 128 \
 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \
 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \
---attention-backend trtllm_mla --stream-interval 30 --moe-runner-backend flashinfer_trtllm --quantization fp8
+--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8
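One caveat with the $EP_SIZE substitution above: the benchmark scripts now expand it unconditionally, so a config row without an ep value would leave $EP_SIZE empty, `--ep-size $EP_SIZE` would collapse to a bare `--ep-size`, and the next flag would be swallowed as its argument. A minimal guard sketch, hypothetical hardening rather than part of this series (SGLang's expert-parallel size defaults to 1):

    # Fall back to SGLang's default expert-parallel size when the harness
    # does not provide one, so the bare flag never consumes its neighbor.
    EP_SIZE=${EP_SIZE:-1}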
From a21de785e973aae1ce05dbc953b9160804fb7960 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 21:47:52 +0000
Subject: [PATCH 4/5] Add ep: 4 for tp=4 entries in dsr1-fp4-b200-sglang config

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
---
 .github/configs/nvidia-master.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 591e133a8..85025aba7 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -9,17 +9,17 @@ dsr1-fp4-b200-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-      - { tp: 4, conc-start: 4, conc-end: 128 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
   - isl: 1024
     osl: 8192
     search-space:
-      - { tp: 4, conc-start: 4, conc-end: 128 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
   - isl: 8192
     osl: 1024
     search-space:
-      - { tp: 4, conc-start: 4, conc-end: 128 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
 
 dsr1-fp4-b200-trt:
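The ep values above mirror tp in every search-space row, matching SGLang's usual layout of the expert-parallel group over the tensor-parallel ranks. A quick sketch for eyeballing that the pairing stays consistent as the search space grows (assumes the config path used throughout this series):

    # List every search-space row that sets ep, to verify each matches its tp.
    grep -n 'ep:' .github/configs/nvidia-master.yaml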
From 28534c7746c9df0f71a0b5e43fd723ee02ebbf0d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 22:00:13 +0000
Subject: [PATCH 5/5] Pass EP_SIZE environment variable to Docker containers in B200 runner scripts

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
---
 runners/launch_b200-nvd.sh | 2 +-
 runners/launch_b200-tg.sh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh
index cda48abb3..21a10d48f 100644
--- a/runners/launch_b200-nvd.sh
+++ b/runners/launch_b200-nvd.sh
@@ -30,7 +30,7 @@ docker run --rm -d --init --network host --name $server_name \
 --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
 -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
 -e NCCL_GRAPH_REGISTER=0 \
 -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
 --entrypoint=/bin/bash \
diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh
index b37154925..9f313396c 100644
--- a/runners/launch_b200-tg.sh
+++ b/runners/launch_b200-tg.sh
@@ -12,7 +12,7 @@ docker run --rm -d --network host --name $server_name \
 --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
 -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
 -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
 --entrypoint=/bin/bash \
 $(echo "$IMAGE" | sed 's/#/\//') \
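For reference, a value-less `-e EP_SIZE` forwards the variable from the runner's environment, and Docker omits it entirely when it is unset on the host, so the benchmark scripts see exactly what the harness exported. A standalone sketch (alpine:3 is just a throwaway image for the demo):

    # Set on the host: the container sees EP_SIZE=8.
    EP_SIZE=8 docker run --rm -e EP_SIZE alpine:3 env | grep EP_SIZE
    # Unset on the host: Docker drops the -e entry and grep prints nothing.
    docker run --rm -e EP_SIZE alpine:3 env | grep EP_SIZE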