# Patch summary (leading commentary; everything before the first "diff --git" header is not part of the patch payload).
#
# File 1: benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh (two hunks, one export changed in each)
#   - SGLANG_OPT_USE_FUSED_COMPRESS:     false -> true
#   - SGLANG_TOPK_TRANSFORM_512_TORCH:   1     -> 0
#   All other exports shown in the hunks are unchanged context lines.
#
# File 2: perf-changelog.yaml (one hunk, seven lines added)
#   - Adds an entry for config key "dsv4-fp4-mi355x-sglang" that records both flag flips and links PR 1275.
#   - Per that entry's own description text, the top-k step moves from the torch path to the tilelang kernel
#     and the compressor moves from the torch path to the fused triton path; this patch itself only shows the
#     environment-variable values changing.
#
# NOTE(review): SGLANG_OPT_USE_OLD_COMPRESSOR=true is left as-is next to the newly enabled
#   SGLANG_OPT_USE_FUSED_COMPRESS=true -- confirm the two flags are meant to be set together.
# NOTE(review): the patch text below has all of its line breaks collapsed into a single line; in this form
#   the hunks cannot be applied. Restore the original newlines (and the YAML context indentation) from the
#   upstream PR before using it with `git apply` or `patch`.
diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh index 642c93918..dae0fc043 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh @@ -53,7 +53,7 @@ PYEOF # the swiglu_limit clamp in the triton # MoE fallback path. export SGLANG_REASONING_EFFORT=max -export SGLANG_OPT_USE_FUSED_COMPRESS=false +export SGLANG_OPT_USE_FUSED_COMPRESS=true export SGLANG_OPT_USE_OLD_COMPRESSOR=true export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false @@ -64,7 +64,7 @@ export SGLANG_OPT_USE_TILELANG_MHC_POST=false export SGLANG_ENABLE_THINKING=1 export SGLANG_USE_AITER=1 export SGLANG_USE_ROCM700A=1 -export SGLANG_TOPK_TRANSFORM_512_TORCH=1 +export SGLANG_TOPK_TRANSFORM_512_TORCH=0 export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 export SGLANG_DSV4_FP4_EXPERTS=True export SGLANG_OPT_DPSK_V4_RADIX=0 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 524c91e67..6f2a60d11 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2171,6 +2171,13 @@ - "ep=1 entries (dp-attn true and false) are unaffected by the EP=8 regression" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1266 +- config-keys: + - dsv4-fp4-mi355x-sglang + description: + - "Flip SGLANG_TOPK_TRANSFORM_512_TORCH from 1 to 0. The indexer's top-k step now runs the tilelang kernel instead of the torch path." + - "Flip SGLANG_OPT_USE_FUSED_COMPRESS from false to true. The DeepseekV4 compressor now goes through the fused triton path instead of the torch path." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1275 + - config-keys: - dsv4-fp4-gb200-dynamo-vllm-mtp2 description: