Skip to content
2 changes: 1 addition & 1 deletion .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3125,7 +3125,7 @@ glm5-fp8-h200-sglang:
image: lmsysorg/sglang:v0.5.12-cu130
model: zai-org/GLM-5-FP8
model-prefix: glm5
- runner: h200
+ runner: h200-dgxc
precision: fp8
framework: sglang
multinode: false
Expand Down
1 change: 1 addition & 0 deletions benchmarks/single_node/glm5_fp8_h200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ python3 -m sglang.launch_server \
--mem-fraction-static 0.85 \
--served-model-name glm-5-fp8 \
--trust-remote-code \
--enable-flashinfer-allreduce-fusion \
$EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2957,3 +2957,10 @@
- "Following recipe from https://github.com/vllm-project/recipes/pull/433"
- "Add DEP8 dp-attn=true validation probes at conc=64 for 1k1k and 8k1k"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1374

# Changelog entry for the glm5-fp8-h200-sglang benchmark config.
# NOTE(review): .github/configs/nvidia-master.yaml shows this key with image
# lmsysorg/sglang:v0.5.12-cu130 and a runner change (h200 -> h200-dgxc);
# neither matches the description below -- confirm the text is current.
- config-keys:
    - glm5-fp8-h200-sglang
  description:
    - "Update SGLang image from glm5-hopper to v0.5.10.post1-cu130"
    - "Add --enable-flashinfer-allreduce-fusion to server launch"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1033
Loading