diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml index 0c505de07..8bb8d13a6 100644 --- a/.github/workflows/gptoss-tmpl.yml +++ b/.github/workflows/gptoss-tmpl.yml @@ -100,8 +100,8 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + runner: b200-nvs + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' model: 'openai/gpt-oss-120b' tp-list: '[1, 2, 4, 8]' framework: 'trt' diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml index 6a1b4d4e8..fd100474f 100644 --- a/.github/workflows/runner-sweep-test.yml +++ b/.github/workflows/runner-sweep-test.yml @@ -30,6 +30,7 @@ on: - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' diff --git a/.github/workflows/runner-test.yml b/.github/workflows/runner-test.yml index e0fcbaf3b..983394035 100644 --- a/.github/workflows/runner-test.yml +++ b/.github/workflows/runner-test.yml @@ -60,6 +60,7 @@ on: - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.10.1_instinct_rc1' - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_sgl-dev-v0.5.2rc2-mi30x_rc1' diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 96745306a..f85f5c13f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -30,34 +30,19 @@ EP_SIZE="1" MOE_BACKEND="TRTLLM" DP_ATTENTION=false -# Lower concurrencies: Concurrency < 256 -# MoE backend=TRTLLM -# Use TP Attention; Switch to MoE Expert parallel for conurrency >=16 (1k1k and 1k8k) -TEP_REQUIRED=false -if [[ "$TP" == "4" || "$TP" == "8" ]]; then - if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - TEP_REQUIRED=true - elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - TEP_REQUIRED=true - fi -fi -if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then - EP_SIZE="$TP" -fi - # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS # Use DP attention with expert parallel MoE if [[ $CONC -ge 256 ]]; then EP_SIZE="$TP" DP_ATTENTION=true - MOE_BACKEND="CUTLASS" fi echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 +export NCCL_GRAPH_REGISTER=0 cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: @@ -65,7 +50,7 @@ cuda_graph_config: max_batch_size: $CONC enable_attention_dp: $DP_ATTENTION kv_cache_config: - dtype: auto + dtype: fp8 enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true @@ -105,12 +90,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi @@ -127,4 +106,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file +--result-filename $RESULT_FILENAME.json