diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 082186bce..27ee51eef 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2019,7 +2019,7 @@ qwen3.5-fp8-h200-sglang: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-h200-sglang-mtp: - image: lmsysorg/sglang:v0.5.9-cu129-amd64 + image: lmsysorg/sglang:v0.5.10.post1 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: h200 diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh index 3e03c64dd..6631fa776 100644 --- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh @@ -35,7 +35,7 @@ echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_MODEL_LEN: $MAX_MODEL_LEN" start_gpu_monitor set -x -python3 -m sglang.launch_server \ +SGLANG_ENABLE_SPEC_V2=1 python3 -m sglang.launch_server \ --model "$MODEL" \ --host 0.0.0.0 \ --port "$PORT" \ @@ -92,4 +92,4 @@ fi # Stop GPU monitoring stop_gpu_monitor -set +x \ No newline at end of file +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index edfa24efd..746d0645d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1298,7 +1298,7 @@ - "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants" - "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996 - + - config-keys: - qwen3.5-fp4-b200-sglang description: @@ -1322,7 +1322,7 @@ description: - "Qwen3.5 fp4 support on SGL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006 - + - config-keys: - gptoss-fp4-h200-trt description: @@ -1342,3 +1342,9 @@ description: - "TP2/TP4 seach space exploration for Qwen3.5 fp4 on SGL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1022 + +- config-keys: + - qwen3.5-fp8-h200-sglang-mtp + description: + - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017