From 0f1645bee0a5f2c61b563f6e5304e5e603766c33 Mon Sep 17 00:00:00 2001 From: Po-Han Huang Date: Wed, 3 Dec 2025 23:42:39 -0800 Subject: [PATCH 01/16] Update vLLM version to v0.12.0 --- .github/configs/nvidia-master.yaml | 6 +++--- benchmarks/gptoss_fp4_b200_docker.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1242386de..dfcd867ef 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -209,7 +209,7 @@ gptoss-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.11.2 + image: vllm/vllm-openai:v0.12.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200 @@ -240,7 +240,7 @@ gptoss-fp4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.11.2 + image: vllm/vllm-openai:v0.12.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -300,7 +300,7 @@ gptoss-fp4-h200-trt: - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.11.2 + image: vllm/vllm-openai:v0.12.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h200 diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 8068d2310..482ef2a33 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -30,7 +30,7 @@ fi cat > config.yaml << EOF kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true}}' +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' async-scheduling: true no-enable-prefix-caching: true max-cudagraph-capture-size: 2048 From 8796275cee28c5173d62bafdb1ec0b1ef7603d46 Mon Sep 17 00:00:00 2001 From: Po-Han Huang Date: Wed, 10 Dec 2025 17:22:56 -0800 Subject: [PATCH 02/16] Fix H100/H200 perf regression --- benchmarks/gptoss_fp4_h100_docker.sh | 1 + benchmarks/gptoss_fp4_h100_slurm.sh | 1 + benchmarks/gptoss_fp4_h200_slurm.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e4892b859..4ef38a2be 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -20,6 +20,7 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 +export VLLM_MXFP4_USE_MARLIN=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 9bc601fea..e93215e7c 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -22,6 +22,7 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" +export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 set -x PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 0f4ed07f3..7428b2332 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -38,6 +38,7 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) export TORCH_CUDA_ARCH_LIST="9.0" +export VLLM_MXFP4_USE_MARLIN=1 PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ From 7a3fdaa791b3eb79f34d206936c11671059ef714 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Thu, 11 Dec 2025 10:16:55 -0800 Subject: [PATCH 03/16] check and install git before use --- benchmarks/benchmark_lib.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 2b4c20c72..cdf489cd0 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -208,6 +208,17 @@ run_benchmark_serving() { echo "Error: --result-dir is required" return 1 fi + + # Check if git is installed, install if missing + if ! command -v git &> /dev/null; then + echo "git not found, installing..." + if command -v apt-get &> /dev/null; then + apt-get update && apt-get install -y git + else + echo "Error: Could not install git. Package manager not found." + return 1 + fi + fi # Clone benchmark serving repo local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) From be1e6950f6187492d09718f4c2c6bcaf8dd051ca Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Dec 2025 15:38:30 -0600 Subject: [PATCH 04/16] add container writable to h200 nv runner launch script --- runners/launch_h200-nv.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh index 5319f8959..59fc86511 100644 --- a/runners/launch_h200-nv.sh +++ b/runners/launch_h200-nv.sh @@ -17,6 +17,7 @@ srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-writable \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ From c683d18c88a50888ce717c9b8dbac0e304447baa Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Dec 2025 15:47:29 -0600 Subject: [PATCH 05/16] add sudo to apt-get --- benchmarks/benchmark_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index cdf489cd0..cc7b81553 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -213,7 +213,7 @@ run_benchmark_serving() { if ! command -v git &> /dev/null; then echo "git not found, installing..." if command -v apt-get &> /dev/null; then - apt-get update && apt-get install -y git + sudo apt-get update && sudo apt-get install -y git else echo "Error: Could not install git. Package manager not found." return 1 From 59dae33376810aafcb507a73db2eb7b806c319c2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Dec 2025 15:52:26 -0600 Subject: [PATCH 06/16] add container-remap-root to h200 nv and nb runner launchers --- runners/launch_h200-nb.sh | 1 + runners/launch_h200-nv.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index c76b366d2..ddadd48bc 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -24,6 +24,7 @@ fi srun --jobid=$JOB_ID \ --container-image=$CONTAINER_IMAGE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-remap-root \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh index 59fc86511..3282be1a8 100644 --- a/runners/launch_h200-nv.sh +++ b/runners/launch_h200-nv.sh @@ -18,6 +18,7 @@ srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-writable \ +--container-remap-root \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ From ca8f30faa707a119f1b180c40bdb831828a5ae49 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 23:58:26 +0000 Subject: [PATCH 07/16] make changes to perf changelog --- perf-changelog.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 112145f10..298fdd182 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -81,3 +81,12 @@ - Update vLLM image for NVIDIA configs from vLLM 0.11.0 to vLLM 0.11.2 - Adds kv-cache-dtype: fp8 to benchmarks/gptoss_fp4_b200_docker.sh PR: https://github.com/InferenceMAX/InferenceMAX/pull/273 +- config-keys: + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm + description: | + - Update vLLM image for NVIDIA configs from vLLM 0.11.2 to vLLM 0.12.0 + - Adds VLLM_MXFP4_USE_MARLIN=1 to benchmarks/gptoss_fp4_h100_docker.sh and benchmarks/gptoss_fp4_h200_slurm.sh + - Adds VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 to benchmarks/gptoss_fp4_h100_slurm.sh + PR: https://github.com/InferenceMAX/InferenceMAX/pull/327 From 9951db676ccc20eee557f794a6776330b14f0dac Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Tue, 16 Dec 2025 17:09:00 -0800 Subject: [PATCH 08/16] fix typo, use correct env var for h100 --- benchmarks/gptoss_fp4_h100_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index e93215e7c..40acac24d 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -22,7 +22,7 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" -export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 +export VLLM_MXFP4_USE_MARLIN=1 set -x PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ From 2f2377a392c9e3c276dd493892398bee6175cf00 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Tue, 30 Dec 2025 11:21:57 -0800 Subject: [PATCH 09/16] update to v0.13.0 --- .github/configs/nvidia-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index dfcd867ef..27ab628f2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -209,7 +209,7 @@ gptoss-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.12.0 + image: vllm/vllm-openai:v0.13.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200 @@ -240,7 +240,7 @@ gptoss-fp4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.12.0 + image: vllm/vllm-openai:v0.13.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -300,7 +300,7 @@ gptoss-fp4-h200-trt: - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.12.0 + image: vllm/vllm-openai:v0.13.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h200 From cd5ad1b16390b572b8aeecb11fc447f23e9f6c57 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 16 Dec 2025 23:58:26 +0000 Subject: [PATCH 10/16] make changes to perf changelog --- perf-changelog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d838d89f4..e3d0c14f6 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -126,3 +126,13 @@ description: - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post2" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/369 + +- config-keys: + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm + description: | + - Update vLLM image for NVIDIA configs from vLLM 0.11.2 to vLLM 0.12.0 + - Adds VLLM_MXFP4_USE_MARLIN=1 to benchmarks/gptoss_fp4_h100_docker.sh and benchmarks/gptoss_fp4_h200_slurm.sh + - Adds VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 to benchmarks/gptoss_fp4_h100_slurm.sh + PR: https://github.com/InferenceMAX/InferenceMAX/pull/327 From dac5bfaae3af65f7d2b23324a0988e9e7f3d47e4 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Tue, 30 Dec 2025 12:08:04 -0800 Subject: [PATCH 11/16] fix perf-changelog fix perf-changelog fix perf-changelog fix --- perf-changelog.yaml | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e3d0c14f6..56a0bac3b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -84,16 +84,6 @@ - "Add kv-cache-dtype: fp8 to benchmarks/gptoss_fp4_b200_docker.sh" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/273 -- config-keys: - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - gptoss-fp4-h200-vllm - description: | - - Update vLLM image for NVIDIA configs from vLLM 0.11.2 to vLLM 0.12.0 - - Adds VLLM_MXFP4_USE_MARLIN=1 to benchmarks/gptoss_fp4_h100_docker.sh and benchmarks/gptoss_fp4_h200_slurm.sh - - Adds VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 to benchmarks/gptoss_fp4_h100_slurm.sh - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/327 - - config-keys: - gptoss-fp4-b200-trt description: @@ -131,8 +121,7 @@ - gptoss-fp4-b200-vllm - gptoss-fp4-h100-vllm - gptoss-fp4-h200-vllm - description: | - - Update vLLM image for NVIDIA configs from vLLM 0.11.2 to vLLM 0.12.0 - - Adds VLLM_MXFP4_USE_MARLIN=1 to benchmarks/gptoss_fp4_h100_docker.sh and benchmarks/gptoss_fp4_h200_slurm.sh - - Adds VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 to benchmarks/gptoss_fp4_h100_slurm.sh - PR: https://github.com/InferenceMAX/InferenceMAX/pull/327 + description: + - "Update vLLM image from v0.11.2 to v0.13.0" + - "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/327 From ce9f4d9c6a114b467a86630c6007a483cb3332e6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 31 Dec 2025 16:25:36 +0000 Subject: [PATCH 12/16] fix compilation configs --- benchmarks/gptoss_fp4_b200_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh index d3d2fef8d..b8a2fd2fa 100644 --- a/benchmarks/gptoss_fp4_b200_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_slurm.sh @@ -27,7 +27,7 @@ fi cat > config.yaml << EOF kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true}}' +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' async-scheduling: true no-enable-prefix-caching: true max-cudagraph-capture-size: 2048 From a71662767dcaa30b29cfa8491fd4f47455b97b0b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 31 Dec 2025 17:23:44 +0000 Subject: [PATCH 13/16] make num prompts conc * 10 --- benchmarks/gptoss_fp4_b200_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh index b8a2fd2fa..6890a2191 100644 --- a/benchmarks/gptoss_fp4_b200_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_slurm.sh @@ -64,7 +64,7 @@ run_benchmark_serving \ --input-len "$ISL" \ --output-len "$OSL" \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$NUM_PROMPTS" \ + --num-prompts $(( CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ From 7be0229a186d68c7f4f64695c11a086441efce6a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 2 Jan 2026 15:04:26 +0000 Subject: [PATCH 14/16] add --container-writable to h200 nb --- runners/launch_h200-nb.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index ddadd48bc..15b6fa6c5 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -25,6 +25,7 @@ srun --jobid=$JOB_ID \ --container-image=$CONTAINER_IMAGE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-remap-root \ +--container-writable \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ From e13a2ec54962f34768a3becf9a967c6967fc7b07 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 2 Jan 2026 15:13:41 +0000 Subject: [PATCH 15/16] add --container-remap-root to b200 nb --- runners/launch_b200-nb.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index ad607752f..1cb5c3dd1 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -14,7 +14,9 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --container-image=$IMAGE \ --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER: -1} \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---no-container-mount-home --container-writable \ +--no-container-mount-home \ +--container-remap-root \ +--container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1},UCX_NET_DEVICES=$UCX_NET_DEVICES \ bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh \ No newline at end of file From f268bac70a8ba08f991e9c8b24c209c141d070eb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 2 Jan 2026 16:48:47 +0000 Subject: [PATCH 16/16] add --container-remap-root to b200 nv --- runners/launch_b200-nv.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 243e624f9..28286e2be 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -17,7 +17,9 @@ srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---no-container-mount-home --container-writable \ +--no-container-mount-home \ +--container-remap-root \ +--container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh