diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 34a422f1c..b9ce64886 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2477,7 +2477,7 @@ qwen3.5-fp4-b300-sglang-mtp:
       - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
 
 qwen3.5-bf16-b300-sglang:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
   runner: b300
@@ -2498,7 +2498,7 @@ qwen3.5-bf16-b300-sglang:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
 
 qwen3.5-bf16-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.11-cu130
+  image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
   runner: b300
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300.sh b/benchmarks/single_node/qwen3.5_bf16_b300.sh
index 4087d7973..f1056c896 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300.sh
@@ -58,7 +58,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
 --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
 --context-length $CONTEXT_LENGTH --disable-radix-cache \
---attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \
+--attention-backend trtllm_mha --mm-attention-backend triton_attn --moe-runner-backend flashinfer_trtllm \
 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --tokenizer-worker-num 6 --stream-interval 30 > $SERVER_LOG 2>&1 &
 
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
index 319d39f58..705ca9775 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
@@ -58,7 +58,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
 --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
 --context-length $CONTEXT_LENGTH --disable-radix-cache \
---attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \
+--attention-backend trtllm_mha --mm-attention-backend triton_attn --moe-runner-backend flashinfer_trtllm \
 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --tokenizer-worker-num 6 --stream-interval 30 \
 --speculative-algorithm EAGLE \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a28c66b85..e600e8800 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2711,3 +2711,11 @@
     - "Update vLLM image from v0.20.2 to v0.21.0"
     - "Add VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 to disable aggressive CUDA-graph memory profiler that OOMs the KV cache"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1395
+
+- config-keys:
+    - qwen3.5-bf16-b300-sglang
+    - qwen3.5-bf16-b300-sglang-mtp
+  description:
+    - "Update SGLang image from v0.5.11-cu130 to v0.5.12-cu130"
+    - "Add --mm-attention-backend triton_attn to bypass the flash-attn cute assertion on sm_103 (see sgl-project/sglang#25564)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1422