From 431704f00a4606b7fe41d76665465c1324867d24 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Tue, 14 Oct 2025 12:57:43 -0700 Subject: [PATCH 1/8] Update gptoss b200 container and run options --- .github/workflows/gptoss-tmpl.yml | 2 +- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 18 +----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml index 0c505de07..95c501411 100644 --- a/.github/workflows/gptoss-tmpl.yml +++ b/.github/workflows/gptoss-tmpl.yml @@ -101,7 +101,7 @@ jobs: max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' model: 'openai/gpt-oss-120b' tp-list: '[1, 2, 4, 8]' framework: 'trt' diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 96745306a..9c11d92a3 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -30,28 +30,12 @@ EP_SIZE="1" MOE_BACKEND="TRTLLM" DP_ATTENTION=false -# Lower concurrencies: Concurrency < 256 -# MoE backend=TRTLLM -# Use TP Attention; Switch to MoE Expert parallel for conurrency >=16 (1k1k and 1k8k) -TEP_REQUIRED=false -if [[ "$TP" == "4" || "$TP" == "8" ]]; then - if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - TEP_REQUIRED=true - elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - TEP_REQUIRED=true - fi -fi -if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then - EP_SIZE="$TP" -fi - # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS # Use DP attention with expert parallel MoE if [[ $CONC -ge 256 ]]; then EP_SIZE="$TP" DP_ATTENTION=true - MOE_BACKEND="CUTLASS" fi echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" @@ -65,7 +49,7 @@ cuda_graph_config: max_batch_size: $CONC enable_attention_dp: $DP_ATTENTION kv_cache_config: - dtype: auto + dtype: fp8 enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true From 06783ce16d10eaf6a3b325054a8d2d7113b80a7f Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Tue, 14 Oct 2025 13:39:06 -0700 Subject: [PATCH 2/8] update container list --- .github/workflows/runner-sweep-test.yml | 1 + .github/workflows/runner-test.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml index 6a1b4d4e8..fd100474f 100644 --- a/.github/workflows/runner-sweep-test.yml +++ b/.github/workflows/runner-sweep-test.yml @@ -30,6 +30,7 @@ on: - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' diff --git a/.github/workflows/runner-test.yml b/.github/workflows/runner-test.yml index e0fcbaf3b..983394035 100644 --- a/.github/workflows/runner-test.yml +++ b/.github/workflows/runner-test.yml @@ -60,6 +60,7 @@ on: - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.10.1_instinct_rc1' - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_sgl-dev-v0.5.2rc2-mi30x_rc1' From cafe0ec63a0d4b08b620a7c0ef0a7f504d5ab978 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 15 Oct 2025 10:49:13 -0700 Subject: [PATCH 3/8] Extending num exps temporarily --- .github/workflows/runner-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/runner-test.yml b/.github/workflows/runner-test.yml index 983394035..16ab97fd9 100644 --- a/.github/workflows/runner-test.yml +++ b/.github/workflows/runner-test.yml @@ -125,8 +125,8 @@ jobs: osl: 1024 max-model-len: 2048 random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[4]' + tp-list: '[1,4,8]' + conc-list: '[4,64]' collect-test-results: needs: runner-test From 307c9c10955b8091e1ba24c36eca14a8199205ba Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 15 Oct 2025 12:39:45 -0700 Subject: [PATCH 4/8] Update error detection logic to avoid false positive --- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 9c11d92a3..e2e148a72 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -89,7 +89,7 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ UserWarning ]]; then sleep 5 tail -n100 $SERVER_LOG echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" From 71aa363e5b49f707c8fe7ec0edfe970eabe18ade Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 15 Oct 2025 15:05:32 -0700 Subject: [PATCH 5/8] leniency on error checking. update NCCL export --- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index e2e148a72..3b6bf1686 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -42,6 +42,7 @@ echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_AT EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 +export NCCL_GRAPH_REGISTER=0 cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: @@ -89,12 +90,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ UserWarning ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi From 0f133e61b1e532ef7db3b11382a983ad8795f487 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Wed, 15 Oct 2025 15:56:10 -0700 Subject: [PATCH 6/8] test --- .github/workflows/runner-test.yml | 4 ++-- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/runner-test.yml b/.github/workflows/runner-test.yml index 16ab97fd9..683153c00 100644 --- a/.github/workflows/runner-test.yml +++ b/.github/workflows/runner-test.yml @@ -125,8 +125,8 @@ jobs: osl: 1024 max-model-len: 2048 random-range-ratio: 0.8 - tp-list: '[1,4,8]' - conc-list: '[4,64]' + tp-list: '[1,2,4]' + conc-list: '[64]' collect-test-results: needs: runner-test diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 3b6bf1686..6342d8963 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -42,7 +42,7 @@ echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_AT EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 -export NCCL_GRAPH_REGISTER=0 +#export NCCL_GRAPH_REGISTER=0 cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: From cdec13f108873bdc38e1e9f466c85904bbcf0f91 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Thu, 16 Oct 2025 11:39:55 -0700 Subject: [PATCH 7/8] Final touches --- .github/workflows/runner-test.yml | 4 ++-- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/runner-test.yml b/.github/workflows/runner-test.yml index 683153c00..983394035 100644 --- a/.github/workflows/runner-test.yml +++ b/.github/workflows/runner-test.yml @@ -125,8 +125,8 @@ jobs: osl: 1024 max-model-len: 2048 random-range-ratio: 0.8 - tp-list: '[1,2,4]' - conc-list: '[64]' + tp-list: '[8]' + conc-list: '[4]' collect-test-results: needs: runner-test diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 6342d8963..f85f5c13f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -42,7 +42,7 @@ echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_AT EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 -#export NCCL_GRAPH_REGISTER=0 +export NCCL_GRAPH_REGISTER=0 cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: @@ -106,4 +106,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file +--result-filename $RESULT_FILENAME.json From 5bb29eb21ec5d126810f94be60d8087091e15343 Mon Sep 17 00:00:00 2001 From: Jatin Gangani Date: Mon, 20 Oct 2025 14:40:43 -0700 Subject: [PATCH 8/8] update runner --- .github/workflows/gptoss-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml index 95c501411..8bb8d13a6 100644 --- a/.github/workflows/gptoss-tmpl.yml +++ b/.github/workflows/gptoss-tmpl.yml @@ -100,7 +100,7 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - runner: b200-trt + runner: b200-nvs image: 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' model: 'openai/gpt-oss-120b' tp-list: '[1, 2, 4, 8]'