From 66d85512e319645e96308f333618bf9d7ceeb897 Mon Sep 17 00:00:00 2001
From: thomawan <thomawan@amd.com>
Date: Mon, 4 May 2026 16:53:04 +0800
Subject: [PATCH 1/3] Update dsv4 env vars for mi355 sglang

---
 benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh | 4 ++--
 perf-changelog.yaml                              | 8 +++-----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh
index 642c93918..dae0fc043 100755
--- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh
@@ -53,7 +53,7 @@ PYEOF
 #                                    the swiglu_limit clamp in the triton
 #                                    MoE fallback path.
 export SGLANG_REASONING_EFFORT=max
-export SGLANG_OPT_USE_FUSED_COMPRESS=false
+export SGLANG_OPT_USE_FUSED_COMPRESS=true
 export SGLANG_OPT_USE_OLD_COMPRESSOR=true
 export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
 export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
@@ -64,7 +64,7 @@ export SGLANG_OPT_USE_TILELANG_MHC_POST=false
 export SGLANG_ENABLE_THINKING=1
 export SGLANG_USE_AITER=1
 export SGLANG_USE_ROCM700A=1
-export SGLANG_TOPK_TRANSFORM_512_TORCH=1
+export SGLANG_TOPK_TRANSFORM_512_TORCH=0
 export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
 export SGLANG_DSV4_FP4_EXPERTS=True
 export SGLANG_OPT_DPSK_V4_RADIX=0
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 524c91e67..57305ddc9 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2165,11 +2165,9 @@
 - config-keys:
     - dsv4-fp4-mi355x-sglang
   description:
-    - "Bump image to rocm/sgl-dev:rocm720-mi35x-a8410de-20260502-DSv4 (one commit forward on amd/deepseek_v4: sglang PR #24249, fuse-compress-decode 0501)"
-    - "Drop the runtime sglang clone+pip overlay from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh — the new image bakes the same a8410de6 SHA the overlay was pinning, so the overlay is redundant. Future sglang bumps now go through an image tag bump"
-    - "Context: ep=8 dp-attn=true entries failed gsm8k eval after #1244 merged. PR sweep (run 25246535693) reported gsm8k strict-match=0.9318 because the launcher silently dropped --ep-size and sglang ran with ep_size=1 regardless of the matrix label; post-merge sweep (run 25262278289) ran with ep_size=8 and gsm8k strict-match dropped to 0.0000. The image bump is the candidate fix to verify on rerun"
-    - "ep=1 entries (dp-attn true and false) are unaffected by the EP=8 regression"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1266
+    - "Flip SGLANG_TOPK_TRANSFORM_512_TORCH from 1 to 0. The indexer's top-k step now runs the tilelang kernel instead of the torch path."
+    - "Flip SGLANG_OPT_USE_FUSED_COMPRESS from false to true. The DeepseekV4 compressor now goes through the fused triton path instead of the torch path."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/xxxx
 
 - config-keys:
     - dsv4-fp4-gb200-dynamo-vllm-mtp2

From c7b9cd8e7e6423043abd7ccdd6fd6fc63b70dcc4 Mon Sep 17 00:00:00 2001
From: Thomas Wang <1am9trash@gmail.com>
Date: Mon, 4 May 2026 17:00:54 +0800
Subject: [PATCH 2/3] Update PR number

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 57305ddc9..5e65f01ee 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2167,7 +2167,7 @@
   description:
     - "Flip SGLANG_TOPK_TRANSFORM_512_TORCH from 1 to 0. The indexer's top-k step now runs the tilelang kernel instead of the torch path."
     - "Flip SGLANG_OPT_USE_FUSED_COMPRESS from false to true. The DeepseekV4 compressor now goes through the fused triton path instead of the torch path."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/xxxx
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1275
 
 - config-keys:
     - dsv4-fp4-gb200-dynamo-vllm-mtp2

From 5981f999a9c80d904b18976790005e7b72b536ac Mon Sep 17 00:00:00 2001
From: thomawan <thomawan@amd.com>
Date: Mon, 4 May 2026 17:07:11 +0800
Subject: [PATCH 3/3] Add perf log back

---
 perf-changelog.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5e65f01ee..6f2a60d11 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2162,6 +2162,15 @@
     - "Keep this as eval-only PR validation until the TensorRT-LLM fused MHC kernel is guarded or supports hidden size 7168"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233
 
+- config-keys:
+    - dsv4-fp4-mi355x-sglang
+  description:
+    - "Bump image to rocm/sgl-dev:rocm720-mi35x-a8410de-20260502-DSv4 (one commit forward on amd/deepseek_v4: sglang PR #24249, fuse-compress-decode 0501)"
+    - "Drop the runtime sglang clone+pip overlay from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh — the new image bakes the same a8410de6 SHA the overlay was pinning, so the overlay is redundant. Future sglang bumps now go through an image tag bump"
+    - "Context: ep=8 dp-attn=true entries failed gsm8k eval after #1244 merged. PR sweep (run 25246535693) reported gsm8k strict-match=0.9318 because the launcher silently dropped --ep-size and sglang ran with ep_size=1 regardless of the matrix label; post-merge sweep (run 25262278289) ran with ep_size=8 and gsm8k strict-match dropped to 0.0000. The image bump is the candidate fix to verify on rerun"
+    - "ep=1 entries (dp-attn true and false) are unaffected by the EP=8 regression"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1266
+
 - config-keys:
     - dsv4-fp4-mi355x-sglang
   description: