Skip to content
Merged
2 changes: 1 addition & 1 deletion .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2019,7 +2019,7 @@ qwen3.5-fp8-h200-sglang:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }

qwen3.5-fp8-h200-sglang-mtp:
image: lmsysorg/sglang:v0.5.9-cu129-amd64
image: lmsysorg/sglang:v0.5.10.post1
model: Qwen/Qwen3.5-397B-A17B-FP8
model-prefix: qwen3.5
runner: h200
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_MODEL_LEN: $MAX_MODEL_LEN"
start_gpu_monitor

set -x
python3 -m sglang.launch_server \
SGLANG_ENABLE_SPEC_V2=1 python3 -m sglang.launch_server \
Comment thread
Oseltamivir marked this conversation as resolved.
--model "$MODEL" \
--host 0.0.0.0 \
--port "$PORT" \
Expand Down Expand Up @@ -92,4 +92,4 @@ fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
set +x
10 changes: 8 additions & 2 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1298,7 +1298,7 @@
- "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants"
- "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996

- config-keys:
- qwen3.5-fp4-b200-sglang
description:
Expand All @@ -1322,7 +1322,7 @@
description:
- "Qwen3.5 fp4 support on SGL"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006

- config-keys:
- gptoss-fp4-h200-trt
description:
Expand All @@ -1342,3 +1342,9 @@
description:
- "TP2/TP4 search space exploration for Qwen3.5 fp4 on SGL"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1022

- config-keys:
- qwen3.5-fp8-h200-sglang-mtp
description:
- "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017
Loading