Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/gptoss-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ jobs:
osl: ${{ inputs.osl }}
max-model-len: ${{ inputs.max-model-len }}
random-range-ratio: ${{ inputs.random-range-ratio }}
runner: b200-trt
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
runner: b200-nvs
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1'
model: 'openai/gpt-oss-120b'
tp-list: '[1, 2, 4, 8]'
framework: 'trt'
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/runner-sweep-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ on:
- 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2'
- 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1'
- 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
- 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1'
- 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev'
- 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3'
- 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915'
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/runner-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ on:
- 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2'
- 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1'
- 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
- 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1'
- 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev'
- 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.10.1_instinct_rc1'
- 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_sgl-dev-v0.5.2rc2-mi30x_rc1'
Expand Down
27 changes: 3 additions & 24 deletions benchmarks/gptoss_fp4_b200_trt_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,42 +30,27 @@ EP_SIZE="1"
MOE_BACKEND="TRTLLM"
DP_ATTENTION=false

# Lower concurrencies: Concurrency < 256
# MoE backend=TRTLLM
# Use TP attention; switch to MoE expert parallelism for concurrency >= 16 (1k1k and 1k8k)
TEP_REQUIRED=false
if [[ "$TP" == "4" || "$TP" == "8" ]]; then
if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
TEP_REQUIRED=true
elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
TEP_REQUIRED=true
fi
fi
if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then
EP_SIZE="$TP"
fi

Comment thread
cquil11 marked this conversation as resolved.
# Higher concurrencies: Concurrency >= 256
# MoE Backend = CUTLASS
# Use DP attention with expert parallel MoE
if [[ $CONC -ge 256 ]]; then
EP_SIZE="$TP"
DP_ATTENTION=true
MOE_BACKEND="CUTLASS"
fi

echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"

EXTRA_CONFIG_FILE="gptoss-fp4.yml"
export TRTLLM_ENABLE_PDL=1
export NCCL_GRAPH_REGISTER=0

cat > $EXTRA_CONFIG_FILE << EOF
cuda_graph_config:
enable_padding: true
max_batch_size: $CONC
enable_attention_dp: $DP_ATTENTION
kv_cache_config:
dtype: auto
dtype: fp8
enable_block_reuse: false
free_gpu_memory_fraction: 0.85
print_iter_log: true
Expand Down Expand Up @@ -105,12 +90,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \
set +x
while IFS= read -r line; do
printf '%s\n' "$line"
if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
sleep 5
tail -n100 $SERVER_LOG
echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
exit 1
fi
if [[ "$line" == *"Application startup complete"* ]]; then
break
fi
Expand All @@ -127,4 +106,4 @@ python3 bench_serving/benchmark_serving.py \
--request-rate inf --ignore-eos \
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
--result-dir /workspace/ \
--result-filename $RESULT_FILENAME.json
--result-filename $RESULT_FILENAME.json