From 7b227424050e2cdd1635abe6642384dcd914c7b2 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Tue, 14 Apr 2026 23:50:53 -0700 Subject: [PATCH 1/7] update sglang container --- .github/configs/nvidia-master.yaml | 2 +- perf-changelog.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 27ee51eef..c4c9693a0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2037,7 +2037,7 @@ qwen3.5-fp8-h200-sglang-mtp: - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } glm5-fp8-h200-sglang: - image: lmsysorg/sglang:glm5-hopper + image: lmsysorg/sglang:v0.5.10.post1-cu130 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: h200 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 746d0645d..53fa4d4b1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1348,3 +1348,9 @@ description: - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 + +- config-keys: + - glm5-fp8-h200-sglang + description: + - "Update SGLang image from glm5-hopper to v0.5.10.post1-cu130" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From 295e1df88e40e4516fdba1e231656d63d5d6ded0 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Wed, 15 Apr 2026 14:43:53 -0700 Subject: [PATCH 2/7] update PR number --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 53fa4d4b1..214bb3915 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1353,4 +1353,4 @@ - glm5-fp8-h200-sglang description: - "Update SGLang image from glm5-hopper to v0.5.10.post1-cu130" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1033 From e2767e6f2fc6776be5280be7e7b584468cc9e533 Mon Sep 17 00:00:00 2001 
From: hshrivastava-droid Date: Thu, 23 Apr 2026 16:47:12 -0700 Subject: [PATCH 3/7] update flashinfer --- benchmarks/single_node/glm5_fp8_h200.sh | 1 + perf-changelog.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh index 7a985645f..5618eca27 100644 --- a/benchmarks/single_node/glm5_fp8_h200.sh +++ b/benchmarks/single_node/glm5_fp8_h200.sh @@ -42,6 +42,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --served-model-name glm-5-fp8 \ --trust-remote-code \ + --enable-flashinfer-allreduce-fusion \ $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 214bb3915..de11b46ae 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1353,4 +1353,5 @@ - glm5-fp8-h200-sglang description: - "Update SGLang image from glm5-hopper to v0.5.10.post1-cu130" + - "Add --enable-flashinfer-allreduce-fusion to server launch" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1033 From 3bb3b5a1e1765979d26fe3dcc25463e13c1dc90e Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 14 May 2026 09:36:24 -0700 Subject: [PATCH 4/7] update container --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 2e279508d..a6bea8c23 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2400,7 +2400,7 @@ qwen3.5-fp8-h200-sglang-mtp: - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } glm5-fp8-h200-sglang: - image: lmsysorg/sglang:v0.5.10.post1-cu130 + image: lmsysorg/sglang:v0.5.11-cu129 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: h200 From 6db622c26f213f710fabe9d60a0eda951017a2bc Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 14 May 2026 10:47:50 -0700 Subject: [PATCH 5/7] Update SGLang image and add launch option 
Updated SGLang image version and added server launch option. --- perf-changelog.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b05b00ccd..5fef480d7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2481,4 +2481,3 @@ - "Update SGLang image from glm5-hopper to v0.5.10.post1-cu130" - "Add --enable-flashinfer-allreduce-fusion to server launch" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1033 - \ No newline at end of file From 03b8a8b841aa6439a285e14ca2540dee5ae58f11 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 18 May 2026 16:24:45 -0700 Subject: [PATCH 6/7] fix runner --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7dac4d174..1f1b73ad4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2888,7 +2888,7 @@ glm5-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.11-cu129 model: zai-org/GLM-5-FP8 model-prefix: glm5 - runner: h200 + runner: h200-dgxc precision: fp8 framework: sglang multinode: false From b24a54581eaf41b1eeab64234e6712db9e5eef15 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Tue, 19 May 2026 12:02:55 -0700 Subject: [PATCH 7/7] image update --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b5c83d812..b28fe1b5e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3122,7 +3122,7 @@ qwen3.5-fp8-h200-sglang-mtp: - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } glm5-fp8-h200-sglang: - image: lmsysorg/sglang:v0.5.11-cu129 + image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: h200-dgxc