From 51f19966f64a2eed6aa15d8a6d212f0443f24870 Mon Sep 17 00:00:00 2001
From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com>
Date: Sun, 17 May 2026 03:14:49 +0000
Subject: [PATCH 1/2] Update qwen3.5-bf16-b300-sglang and
 qwen3.5-bf16-b300-sglang-mtp SGLang image to v0.5.12-cu130

Ref #1154

Co-authored-by: Klaud Cold <Klaud-Cold@users.noreply.github.com>
---
 .github/configs/nvidia-master.yaml | 4 ++--
 perf-changelog.yaml                | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index dfe0b806a..bb45ef4e5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2396,7 +2396,7 @@ qwen3.5-fp4-b300-sglang-mtp:
       - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
 
 qwen3.5-bf16-b300-sglang:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
   runner: b300
@@ -2417,7 +2417,7 @@ qwen3.5-bf16-b300-sglang:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
 
 qwen3.5-bf16-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
   runner: b300
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3d73205c0..7d3c746f1 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2548,3 +2548,10 @@
   description:
     - "Update vLLM image from v0.20.2 to v0.21.0"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1402
+
+- config-keys:
+    - qwen3.5-bf16-b300-sglang
+    - qwen3.5-bf16-b300-sglang-mtp
+  description:
+    - "Update SGLang image from v0.5.11-cu130 to v0.5.12-cu130"
+  pr-link: XXX

From 9ae53b45527818ef129cb021f53cb4d936454508 Mon Sep 17 00:00:00 2001
From: claude-fix-bot <claude-fix-bot@local>
Date: Mon, 18 May 2026 01:33:43 -0400
Subject: [PATCH 2/2] fix(qwen3.5_bf16_b300): use --mm-attention-backend
 triton_attn

Workaround for the flash_attn v4 cute kernel's sm_103 assertion failure
in the Qwen3.5-VL vision encoder (filed as sgl-project/sglang#25564,
upstream fix in Dao-AILab/flash-attention#2572).

The text decoder still uses --attention-backend trtllm_mha; this only
swaps the multi-modal (vision encoder) attention path to triton_attn,
bypassing the broken flash_attn cute dispatch on B300.

Suggested by an upstream SGLang reviewer.
---
 benchmarks/single_node/qwen3.5_bf16_b300.sh     | 2 +-
 benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/qwen3.5_bf16_b300.sh b/benchmarks/single_node/qwen3.5_bf16_b300.sh
index 4087d7973..f1056c896 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300.sh
@@ -58,7 +58,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
 --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
 --context-length $CONTEXT_LENGTH --disable-radix-cache \
---attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \
+--attention-backend trtllm_mha --mm-attention-backend triton_attn --moe-runner-backend flashinfer_trtllm \
 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --tokenizer-worker-num 6 --stream-interval 30 > $SERVER_LOG 2>&1 &
 
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
index 319d39f58..705ca9775 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
@@ -58,7 +58,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
 --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
 --context-length $CONTEXT_LENGTH --disable-radix-cache \
---attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \
+--attention-backend trtllm_mha --mm-attention-backend triton_attn --moe-runner-backend flashinfer_trtllm \
 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --tokenizer-worker-num 6 --stream-interval 30 \
 --speculative-algorithm EAGLE \