From 7da989d2a0cadc998888bc1955e9272c07ddaeb5 Mon Sep 17 00:00:00 2001 From: Po-Han Huang Date: Wed, 19 Nov 2025 23:57:54 -0800 Subject: [PATCH 1/5] Upgrade vLLM to v0.11.2 Updated configs: - Use FP8 kv-cache for GPT-OSS B200. - Remove "custom_ops" from compilation-config for GPT-OSS. - Remove "cudagraph_mode" from compilation-config for GPT-OSS. - Remove VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB env var for GPT-OSS. - Remove deprecated "--disable-log-requests" flag. - Rename "cuda-graph-sizes" flag. Signed-off-by: Po-Han Huang --- .github/configs/nvidia-master.yaml | 6 +++--- .github/workflows/README.md | 2 +- benchmarks/gptoss_fp4_b200_docker.sh | 10 +++++----- benchmarks/gptoss_fp4_h100_docker.sh | 7 +++---- benchmarks/gptoss_fp4_h100_slurm.sh | 5 ++--- benchmarks/gptoss_fp4_h200_slurm.sh | 5 ++--- 6 files changed, 16 insertions(+), 19 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a3d848475..c19813673 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -202,7 +202,7 @@ gptoss-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.11.0 + image: vllm/vllm-openai:v0.11.2 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200 @@ -232,7 +232,7 @@ gptoss-fp4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.11.0 + image: vllm/vllm-openai:v0.11.2 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -290,7 +290,7 @@ gptoss-fp4-h200-trt: - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.11.0 + image: vllm/vllm-openai:v0.11.2 model: openai/gpt-oss-120b model-prefix: gptoss runner: h200 diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 003b8809f..344c8c07a 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -115,7 +115,7 @@ full-sweep --model-prefix dsr1 --runner-type b200 --precision fp4 --framework sg Use the `custom` command to specify all parameters manually: ``` -custom --runner-label b200-nb_0 --image vllm/vllm-openai:v0.11.0 --model meta-llama/Llama-3.1-70B --framework vllm --precision fp8 --exp-name llama70b_test --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +custom --runner-label b200-nb_0 --image vllm/vllm-openai:v0.11.2 --model meta-llama/Llama-3.1-70B --framework vllm --precision fp8 --exp-name llama70b_test --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` This runs a single 1k1k test job with your custom parameters on the specified runner node. Useful for: diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index 4fbf4f50c..8068d2310 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -29,16 +29,16 @@ else fi cat > config.yaml << EOF -compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+rms_norm"],"cudagraph_mode":"FULL_AND_PIECEWISE"}' +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true}}' async-scheduling: true no-enable-prefix-caching: true -cuda-graph-sizes: 2048 +max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF export TORCH_CUDA_ARCH_LIST="10.0" -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 @@ -47,7 +47,7 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests > $SERVER_LOG 2>&1 & +> $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -69,4 +69,4 @@ run_benchmark_serving \ --num-prompts "$NUM_PROMPTS" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index 48b548e37..7ac644ddc 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -12,10 +12,9 @@ cat > config.yaml << EOF -compilation-config: '{"cudagraph_mode":"PIECEWISE"}' async-scheduling: true no-enable-prefix-caching: true -cuda-graph-sizes: 2048 +max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: 10240 EOF @@ -29,7 +28,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & +> $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -51,4 +50,4 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency 512 \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index a004f8892..9bc601fea 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -13,10 +13,9 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" cat > config.yaml << EOF -compilation-config: '{"cudagraph_mode":"PIECEWISE"}' async-scheduling: true no-enable-prefix-caching: true -cuda-graph-sizes: 2048 +max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: 10240 EOF @@ -30,7 +29,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & + > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 970b7ad35..0f4ed07f3 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -27,10 +27,9 @@ fi # Create config.yaml cat > config.yaml << EOF -compilation-config: '{"cudagraph_mode":"PIECEWISE"}' async-scheduling: true no-enable-prefix-caching: true -cuda-graph-sizes: 2048 +max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF @@ -42,7 +41,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & + > $SERVER_LOG 2>&1 & SERVER_PID=$! From f1e9e0d814ca450a3dd1171a05fbaf81e03fd93a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Dec 2025 13:42:56 -0600 Subject: [PATCH 2/5] make cw runners container writable --- runners/launch_h100-cw.sh | 1 + runners/launch_h200-cw.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 0179bdd57..d593eac77 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -11,6 +11,7 @@ set -x srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ +--container-writable \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-mount-home \ --container-workdir=/workspace/ \ diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index dd4937606..5e2fb7119 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -23,6 +23,7 @@ fi srun --jobid=$JOB_ID \ --container-image=$CONTAINER_IMAGE \ +--container-writable \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-mount-home \ --container-workdir=/workspace/ \ From bfacf4586a3b3609cc2d42c304c9c43e19f8e59b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Dec 2025 16:37:54 -0600 Subject: [PATCH 3/5] undo make cw runners container writable --- runners/launch_h100-cw.sh | 1 - runners/launch_h200-cw.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index d593eac77..0179bdd57 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -11,7 +11,6 @@ set -x srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ ---container-writable \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-mount-home \ --container-workdir=/workspace/ \ diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 5e2fb7119..dd4937606 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -23,7 +23,6 @@ fi srun --jobid=$JOB_ID \ --container-image=$CONTAINER_IMAGE \ ---container-writable \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-mount-home \ --container-workdir=/workspace/ \ From 15343febf69512921c1231a2a6ac36622f24b86f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Dec 2025 17:06:43 -0600 Subject: [PATCH 4/5] coreweave cleanup --- runners/launch_h200-cw.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index dd4937606..56759b49c 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -27,6 +27,6 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh +bash -c "benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh; rm -rf /dev/shm/sagemaker_sessions" scancel $JOB_ID From 2d4316b044b9fd3a7099295d7e3cac87e05373d4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Dec 2025 17:10:33 -0600 Subject: [PATCH 5/5] coreweave cleanup pt 2 --- runners/launch_h200-cw.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 56759b49c..c41d9bf65 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -27,6 +27,6 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash -c "benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh; rm -rf /dev/shm/sagemaker_sessions" +bash -c 'bash benchmarks/'"${EXP_NAME%%_*}_${PRECISION}"'_h200_slurm.sh; rm -rf /dev/shm/sagemaker_sessions' scancel $JOB_ID