27 changes: 17 additions & 10 deletions benchmarks/70b_fp4_b200_slurm.sh
@@ -21,29 +21,36 @@ hf download $MODEL

pip install datasets pandas

nvidia-smi

sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py

# Calculate max-model-len based on ISL and OSL
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
else
CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
fi

cat > config.yaml << EOF
kv-cache-dtype: fp8
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}'
async-scheduling: true
no-enable-prefix-caching: true
max-num-batched-tokens: 8192
max-model-len: 10240
max-model-len: $CALCULATED_MAX_MODEL_LEN
EOF

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
PORT=$(( 8888 + $PORT_OFFSET ))


export TORCH_CUDA_ARCH_LIST="10.0"
export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
export PYTHONNOUSERSITE=1

set -x


PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
--disable-log-requests > $SERVER_LOG 2>&1 &
vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
--disable-log-requests > $SERVER_LOG 2>&1 &

set +x
while IFS= read -r line; do
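
For reference, a minimal sketch (not part of this PR) of what the max-model-len calculation added above works out to for the common benchmark shapes. The calc_max_model_len helper name is illustrative only, and ISL/OSL are assumed to be the plain integer values exported by the SLURM wrapper:

# Illustrative sketch only -- mirrors the branch added above, not part of the script.
calc_max_model_len() {
  local isl=$1 osl=$2
  if [ "$isl" = "1024" ] && [ "$osl" = "1024" ]; then
    echo $((isl + osl + 20))          # 1024/1024 -> 2068
  elif [ "$isl" = "8192" ] || [ "$osl" = "8192" ]; then
    echo $((isl + osl + 200))         # e.g. 8192/1024 -> 9416
  else
    echo "${MAX_MODEL_LEN:-10240}"    # fall back to the previous fixed default
  fi
}
calc_max_model_len 1024 1024   # prints 2068
calc_max_model_len 8192 1024   # prints 9416
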
31 changes: 23 additions & 8 deletions benchmarks/70b_fp8_b200_slurm.sh
@@ -21,29 +21,44 @@ hf download $MODEL

pip install datasets pandas

nvidia-smi

cat > config.yaml << EOF
sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py


FUSION_FLAG='{'\
'"pass_config": {"enable_fi_allreduce_fusion": true, "enable_attn_fusion": true, "enable_noop": true},'\
'"custom_ops": ["+quant_fp8", "+rms_norm"],'\
'"cudagraph_mode": "FULL_DECODE_ONLY",'\
'"splitting_ops": []'\
'}'
cat > config.yaml <<-EOF
kv-cache-dtype: fp8
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}'
compilation-config: '$FUSION_FLAG'
async-scheduling: true
no-enable-prefix-caching: true
max-num-batched-tokens: 8192
max-model-len: 10240
max-model-len: $MAX_MODEL_LEN
EOF

cat config.yaml # Debugging

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
PORT=$(( 8888 + $PORT_OFFSET ))


export TORCH_CUDA_ARCH_LIST="10.0"
export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'

set -x

export PYTHONNOUSERSITE=1

PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
--disable-log-requests > $SERVER_LOG 2>&1 &
set -x
vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
--gpu-memory-utilization=0.9 \
--tensor-parallel-size=$TP \
--max-num-seqs=512 \
--config config.yaml \
--disable-log-requests > $SERVER_LOG 2>&1 &

set +x
while IFS= read -r line; do
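
Because the compilation-config value is now assembled from concatenated shell strings rather than written inline, a quick sanity check can catch malformed JSON before vllm ever parses it. A hedged sketch follows; it assumes python3 is available in the container and is not part of this PR:

# Illustrative sketch only: confirm FUSION_FLAG expands to valid JSON.
echo "$FUSION_FLAG" | python3 -m json.tool > /dev/null \
  && echo "compilation-config JSON OK" \
  || { echo "compilation-config JSON is malformed" >&2; exit 1; }
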
14 changes: 7 additions & 7 deletions benchmarks/dsr1_fp4_b200_slurm.sh
@@ -9,25 +9,25 @@ set -x
PORT=$(( 8888 + $PORT_OFFSET ))


# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.

SCHEDULER_RECV_INTERVAL=10
sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py

# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
if [[ $CONC -ge 16 ]]; then
SCHEDULER_RECV_INTERVAL=30
SCHEDULER_RECV_INTERVAL=30
else
SCHEDULER_RECV_INTERVAL=10
fi



echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"


PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
--tensor-parallel-size=$TP --data-parallel-size=1 \
--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
--chunked-prefill-size 16384 \
--enable-ep-moe --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 10 \
--enable-ep-moe --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 10 \
> $SERVER_LOG 2>&1 &

set +x
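
For context, a minimal sketch (not part of the script) of the scheduler-recv-interval values the CONC threshold above selects at a few concurrency levels; CONC is assumed to be an integer exported by the SLURM wrapper:

# Illustrative sketch only -- mirrors the CONC >= 16 threshold used above.
for conc in 1 8 16 64; do
  if [[ $conc -ge 16 ]]; then
    echo "CONC=$conc -> scheduler-recv-interval 30"
  else
    echo "CONC=$conc -> scheduler-recv-interval 10"
  fi
done
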
23 changes: 18 additions & 5 deletions benchmarks/dsr1_fp8_b200_slurm.sh
@@ -7,13 +7,26 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)

set -x
PORT=$(( 8888 + $PORT_OFFSET ))
export SGL_ENABLE_JIT_DEEPGEMM=false

sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py

export SGL_ENABLE_JIT_DEEPGEMM=false
export SGLANG_ENABLE_FLASHINFER_GEMM=true
python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \

# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
if [[ $CONC -ge 16 ]]; then
SCHEDULER_RECV_INTERVAL=30
else
SCHEDULER_RECV_INTERVAL=10
fi

set -x
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
--tensor-parallel-size=$TP --data-parallel-size=1 \
--cuda-graph-max-bs 128 --max-running-requests 128 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \
--chunked-prefill-size 32768 --max-prefill-tokens 32768 \
--disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 1 \
--cuda-graph-max-bs 128 --max-running-requests 128 \
--mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \
--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \
--attention-backend trtllm_mla --stream-interval 30 --enable-flashinfer-trtllm-moe --quantization fp8 \
> $SERVER_LOG 2>&1 &

set +x
15 changes: 14 additions & 1 deletion benchmarks/gptoss_fp4_b200_slurm.sh
@@ -21,14 +21,27 @@ hf download $MODEL

pip install datasets pandas

nvidia-smi

sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py


# Calculate max-model-len based on ISL and OSL
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
else
CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
fi

cat > config.yaml << EOF
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+rms_norm"],"cudagraph_mode":"FULL_AND_PIECEWISE"}'
async-scheduling: true
no-enable-prefix-caching: true
cuda-graph-sizes: 2048
max-num-batched-tokens: 8192
max-model-len: 10240
max-model-len: $CALCULATED_MAX_MODEL_LEN
EOF

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)