diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 2e1b43a97..b28fe1b5e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3125,7 +3125,7 @@ glm5-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 model-prefix: glm5 - runner: h200 + runner: h200-dgxc precision: fp8 framework: sglang multinode: false diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh index 7defaa82e..410c66942 100644 --- a/benchmarks/single_node/glm5_fp8_h200.sh +++ b/benchmarks/single_node/glm5_fp8_h200.sh @@ -42,6 +42,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --served-model-name glm-5-fp8 \ --trust-remote-code \ + --enable-flashinfer-allreduce-fusion \ $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 991f460df..237787c0a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2957,3 +2957,10 @@ - "Following recipe from https://github.com/vllm-project/recipes/pull/433" - "Add DEP8 dp-attn=true validation probes at conc=64 for 1k1k and 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1374 + +- config-keys: + - glm5-fp8-h200-sglang + description: + - "Switch runner from h200 to h200-dgxc" + - "Add --enable-flashinfer-allreduce-fusion to server launch" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1033