From 51f19966f64a2eed6aa15d8a6d212f0443f24870 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 17 May 2026 03:14:49 +0000 Subject: [PATCH 1/2] Update qwen3.5-bf16-b300-sglang and qwen3.5-bf16-b300-sglang-mtp SGLang image to v0.5.12-cu130 Ref #1154 Co-authored-by: Klaud Cold --- .github/configs/nvidia-master.yaml | 4 ++-- perf-changelog.yaml | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index dfe0b806a..bb45ef4e5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2396,7 +2396,7 @@ qwen3.5-fp4-b300-sglang-mtp: - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } qwen3.5-bf16-b300-sglang: - image: lmsysorg/sglang:v0.5.11-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: b300 @@ -2417,7 +2417,7 @@ qwen3.5-bf16-b300-sglang: - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } qwen3.5-bf16-b300-sglang-mtp: - image: lmsysorg/sglang:v0.5.11-cu130 + image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: b300 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3d73205c0..7d3c746f1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2548,3 +2548,10 @@ description: - "Update vLLM image from v0.20.2 to v0.21.0" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1402 + +- config-keys: + - qwen3.5-bf16-b300-sglang + - qwen3.5-bf16-b300-sglang-mtp + description: + - "Update SGLang image from v0.5.11-cu130 to v0.5.12-cu130" + pr-link: XXX From 9ae53b45527818ef129cb021f53cb4d936454508 Mon Sep 17 00:00:00 2001 From: claude-fix-bot Date: Mon, 18 May 2026 01:33:43 -0400 Subject: [PATCH 2/2] fix(qwen3.5_bf16_b300): use --mm-attention-backend triton_attn Workaround for the flash_attn v4 cute kernel's sm_103 assertion failure in the Qwen3.5-VL vision encoder (filed as sgl-project/sglang#25564, upstream fix in Dao-AILab/flash-attention#2572). The text decoder still uses --attention-backend trtllm_mha; this only swaps the multi-modal (vision encoder) attention path to triton_attn, bypassing the broken flash_attn cute dispatch on B300. Suggested by upstream sglang reviewer. --- benchmarks/single_node/qwen3.5_bf16_b300.sh | 2 +- benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_bf16_b300.sh b/benchmarks/single_node/qwen3.5_bf16_b300.sh index 4087d7973..f1056c896 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b300.sh +++ b/benchmarks/single_node/qwen3.5_bf16_b300.sh @@ -58,7 +58,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --context-length $CONTEXT_LENGTH --disable-radix-cache \ ---attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \ +--attention-backend trtllm_mha --mm-attention-backend triton_attn --moe-runner-backend flashinfer_trtllm \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --tokenizer-worker-num 6 --stream-interval 30 > $SERVER_LOG 2>&1 & diff --git a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh index 319d39f58..705ca9775 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh +++ b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh @@ -58,7 +58,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --context-length $CONTEXT_LENGTH --disable-radix-cache \ ---attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \ +--attention-backend trtllm_mha --mm-attention-backend triton_attn --moe-runner-backend flashinfer_trtllm \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --tokenizer-worker-num 6 --stream-interval 30 \ --speculative-algorithm EAGLE \