diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 34d85fcca..6b029001d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -180,30 +180,30 @@ gptoss-fp4-b200-trt:
   precision: fp4
   framework: trt
   multinode: false
-  # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true
   seq-len-configs:
+  # DP Attn at higher concurrencies, TP attn at middle to lower. TP=1 turns out to be highest as artifact of concurrency limit=128
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 2, dp-attn: true, conc-start: 32, conc-end: 128 }
-    - { tp: 4, dp-attn: true, conc-start: 32, conc-end: 64 }
-    - { tp: 1, conc-start: 64, conc-end: 128 }
-    - { tp: 2, conc-start: 4, conc-end: 32 }
-    - { tp: 4, conc-start: 4, conc-end: 64 }
+    - { tp: 1, conc-start: 128, conc-end: 128 }
+    - { tp: 2, ep: 2, dp-attn: true, conc-start: 64, conc-end: 128 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64 }
+    - { tp: 2, conc-start: 8, conc-end: 32 }
+    - { tp: 4, conc-start: 4, conc-end: 16 }
     - { tp: 8, conc-start: 4, conc-end: 8 }
   - isl: 1024
     osl: 8192
     search-space:
-    - { tp: 1, conc-start: 64, conc-end: 128 }
-    - { tp: 2, dp-attn: true, conc-start: 64, conc-end: 128 }
-    - { tp: 2, conc-start: 4, conc-end: 128 }
+    - { tp: 2, ep: 2, dp-attn: true, conc-start: 64, conc-end: 128 }
+    - { tp: 2, conc-start: 4, conc-end: 16 }
     - { tp: 4, conc-start: 4, conc-end: 128 }
-    - { tp: 8, conc-start: 4, conc-end: 16 }
+    - { tp: 8, conc-start: 4, conc-end: 8 }
+  # DP Attn at higher concurrencies, TP attn at middle to lower. TP=1 turns out to be highest as artifact of concurrency limit=128
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 1, conc-start: 64, conc-end: 128 }
-    - { tp: 2, dp-attn: true, conc-start: 64, conc-end: 128 }
+    - { tp: 1, conc-start: 128, conc-end: 128 }
+    - { tp: 2, ep: 2, dp-attn: true, conc-start: 64, conc-end: 128 }
     - { tp: 2, conc-start: 4, conc-end: 128 }
     - { tp: 4, conc-start: 4, conc-end: 32 }
     - { tp: 8, conc-start: 4, conc-end: 8 }
@@ -1047,3 +1047,275 @@ dsr1-fp4-gb200-dynamo-sglang:
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=8"
+
+gptoss-fp4-gb200-dynamo-trt:
+  image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  runner: gb200
+  precision: fp4
+  framework: dynamo-trt
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # Right of pareto
+    # P: 1xTP1 D:1xTP4
+    - spec-decoding: "none"
+      conc-list: [ 1, 2, 4, 16, 32, 64, 128 ]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=256"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+    # P: 1xTP1 D:4xTP2
+    - spec-decoding: "none"
+      conc-list: [ 16 ]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 4
+        tp: 2
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=32"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+    # P: 1xTP1 D:1xDEP2
+    - spec-decoding: "none"
+      conc-list: [ 256, 512, 1024, 2048, 2560 ]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 1
+        tp: 2
+        ep: 2
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=1536"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+    # P: 1xTP1 D:2xDEP2
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048, 2560 ]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 2
+        tp: 2
+        ep: 2
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=1536"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+    # P: 1xTP1 D:1xDEP4
+    - spec-decoding: "none"
+      conc-list: [ 256, 1024, 1536 ]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=512"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+    # P: 1xTP1 D:3xDEP4
+    - spec-decoding: "none"
+      conc-list: [ 3072 ]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 3
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=1024"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # Right side of pareto
+    # P: 1xTP1 D:1xTP8
+    - spec-decoding: "none"
+      conc-list: [1]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=4"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+    # P: 1xTP1 D:1xTP4
+    - spec-decoding: "none"
+      conc-list: [2, 4, 8, 16, 32, 64]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=128"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+    # Middle of pareto
+    # P: 2xTP1 D:1xTP4
+    - spec-decoding: "none"
+      conc-list: [128, 512]
+      prefill:
+        num-worker: 2
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=1024"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+    # P: 2xTP1 D:1xTP2
+    - spec-decoding: "none"
+      conc-list: [256, 384]
+      prefill:
+        num-worker: 2
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 1
+        tp: 2
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=512"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
+
+    # P: 2xTP1 D:1xDEP2
+    - spec-decoding: "none"
+      conc-list: [128, 512]
+      prefill:
+        num-worker: 2
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "PREFILL_MAX_NUM_TOKENS=20000"
+        - "PREFILL_MAX_BATCH_SIZE=32"
+      decode:
+        num-worker: 1
+        tp: 2
+        ep: 2
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MAX_NUM_TOKENS=20000"
+        - "DECODE_MAX_BATCH_SIZE=512"
+        - "DECODE_GPU_MEM_FRACTION=0.9"
diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
index 35ed2c58a..45bf152c5 100644
--- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
@@ -49,9 +49,16 @@ moe_config:
 EOF
 
 if [[ "$DP_ATTENTION" == "true" ]]; then
-    export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput"
-    export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL"
-    export TRTLLM_MOE_A2A_WORKSPACE_MB="2048"
+    # Choose the MoE all-to-all settings from the expert-parallel size.
+    if [[ "$EP_SIZE" -eq 1 ]]; then
+        # DP attention with MoE TP (EP_SIZE == 1): disable all-to-all.
+        export TRTLLM_FORCE_ALLTOALL_METHOD="NotEnabled"
+    elif [[ "$EP_SIZE" -gt 1 ]]; then
+        # DP attention with MoE EP (EP_SIZE > 1): force the MNNVL all-to-all method.
+        export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput"
+        export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL"
+        export TRTLLM_MOE_A2A_WORKSPACE_MB="2048"
+    fi
     cat << EOF >> $EXTRA_CONFIG_FILE
 attention_dp_config:
     enable_balance: true
diff --git a/benchmarks/gptoss_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/gptoss_fp4_gb200_dynamo-trt_slurm.sh
new file mode 100644
index 000000000..1bce1d770
--- /dev/null
+++ b/benchmarks/gptoss_fp4_gb200_dynamo-trt_slurm.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/bash
+
+# Submit the GPT-OSS FP4 disaggregated Dynamo + TensorRT-LLM benchmark on GB200:
+# clone Dynamo, enter its TRT-LLM performance_sweeps directory, and sbatch
+# benchmark_disagg.slurm with the prefill/decode settings from the environment.
+
+set -x
+
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# MODEL_PATH and SERVED_MODEL_NAME are forwarded to sbatch below, so require them too.
+check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING \
+    PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \
+    PREFILL_MAX_NUM_TOKENS PREFILL_MAX_BATCH_SIZE DECODE_MAX_NUM_TOKENS \
+    DECODE_MAX_BATCH_SIZE DECODE_GPU_MEM_FRACTION \
+    MODEL_PATH SERVED_MODEL_NAME
+
+# DECODE_MTP_SIZE is only required when SPEC_DECODING is "mtp"; otherwise pass 0.
+if [[ "$SPEC_DECODING" == "mtp" ]]; then
+    check_env_vars DECODE_MTP_SIZE
+else
+    DECODE_MTP_SIZE="0"
+fi
+
+PERFORMANCE_SWEEPS_PATH="components/backends/trtllm/performance_sweeps"
+
+echo "Cloning Dynamo repository..."
+git clone https://github.com/ai-dynamo/dynamo.git
+# Stop if the checkout is missing so the git commands below never run in the caller's directory.
+cd dynamo || exit 1
+git checkout release/0.5.1-rc0.20260105
+git submodule update --init --recursive
+
+cd "$PERFORMANCE_SWEEPS_PATH" || exit 1
+
+# Set up environment variables based on ISL/OSL
+if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
+    export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=1024
+elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then
+    export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
+else
+    echo "Unsupported ISL/OSL combination: $ISL/$OSL"
+    exit 1
+fi
+
+kind=dynamo_disagg
+additional_slurm_args="--time=04:00:00"
+ntasks_per_node=4
+
+# Node budget: each decode worker takes ceil(DECODE_TP / 4) nodes and each prefill
+# worker is counted as one node.
+# NOTE(review): this gives 4 decode nodes for 4 workers at TP=2, while the matching
+# nvidia-master.yaml entry sets DECODE_NODES=2 - confirm which packing is intended.
+gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS))
+total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes))
+total_tasks=$((total_nodes * ntasks_per_node))
+
+decode_eplb_num_slots=0
+
+sbatch --nodes=${total_nodes} \
+    --ntasks=${total_tasks} \
+    --ntasks-per-node=${ntasks_per_node} \
+    --segment=${total_nodes} ${additional_slurm_args} \
+    benchmark_disagg.slurm \
+    ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \
+    ${PREFILL_MAX_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \
+    ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \
+    ${DECODE_TP} ${DECODE_EP} ${DECODE_MAX_BATCH_SIZE} \
+    ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \
+    ${DECODE_GPU_MEM_FRACTION} ${decode_eplb_num_slots} \
+    ${DECODE_MTP_SIZE} "${CONC_LIST}" \
+    ${gen_nodes} ${kind} \
+    ${MODEL_PATH} ${SERVED_MODEL_NAME} \
+    ${IMAGE} ${ISL} ${OSL}
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c7f68885c..324e88d56 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -141,3 +141,11 @@
   description:
     - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/332
+
+- config-keys:
+    - gptoss-fp4-gb200-dynamo-trt
+    - gptoss-fp4-b200-trt
+  description:
+    - Explicitly add EP=TP for DP attention configs for B200 AGG nvidia-master file. Multinode Refactor inadvertently changed default EP=1
+    - Add GPTOSS DISAGG configurations for GB200 1k1k and 8k1k.
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/387
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index ff611bce8..40f6dc439 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -25,8 +25,16 @@ export MODEL_PATH=$MODEL
 if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
     export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
     export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
-else
-    export SERVED_MODEL_NAME="deepseek-r1-fp4"
+elif [[ $FRAMEWORK == "dynamo-trt" ]]; then
+    if [[ $MODEL_PREFIX == "gptoss" ]]; then
+        export MODEL_PATH="/mnt/lustre01/models/gpt-oss-120b"
+        export SERVED_MODEL_NAME="gpt-oss-120b"
+    elif [[ $MODEL_PREFIX == "dsr1" ]]; then
+        export SERVED_MODEL_NAME="deepseek-r1-fp4"
+    else
+        echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss, dsr1"
+        exit 1
+    fi
 fi
 
 export ISL="$ISL"
@@ -59,7 +67,7 @@ if [[ $FRAMEWORK == "dynamo-trt" ]]; then
         echo "Found logs directory: $LOGS_DIR"
 
         # Find all result subdirectories in this logs directory
-        RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d)
+        RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_*_batch*_eplb*_mtp*" -type d)
 
         if [ -z "$RESULT_SUBDIRS" ]; then
             echo "No result subdirectories found in $LOGS_DIR"