diff --git a/benchmarks/70b_fp4_b200_slurm.sh b/benchmarks/70b_fp4_b200_slurm.sh
index 110facaf5..14896c7bb 100644
--- a/benchmarks/70b_fp4_b200_slurm.sh
+++ b/benchmarks/70b_fp4_b200_slurm.sh
@@ -21,6 +21,18 @@ hf download $MODEL
 
 pip install datasets pandas
 
+nvidia-smi
+
+sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py
+
+# Calculate max-model-len based on ISL and OSL
+if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
+    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
+elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
+    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
+else
+    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+fi
 
 cat > config.yaml << EOF
 kv-cache-dtype: fp8
@@ -28,22 +40,17 @@ compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_a
 async-scheduling: true
 no-enable-prefix-caching: true
 max-num-batched-tokens: 8192
-max-model-len: 10240
+max-model-len: $CALCULATED_MAX_MODEL_LEN
 EOF
 
-SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
-PORT=$(( 8888 + $PORT_OFFSET ))
-
-
 export TORCH_CUDA_ARCH_LIST="10.0"
 export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
+export PYTHONNOUSERSITE=1
 
 set -x
-
-
-PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
-    --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
-    --disable-log-requests > $SERVER_LOG 2>&1 &
+vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
+--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
+--disable-log-requests > $SERVER_LOG 2>&1 &
 set +x
 
 while IFS= read -r line; do
diff --git a/benchmarks/70b_fp8_b200_slurm.sh b/benchmarks/70b_fp8_b200_slurm.sh
index 110facaf5..176a3a0a2 100644
--- a/benchmarks/70b_fp8_b200_slurm.sh
+++ b/benchmarks/70b_fp8_b200_slurm.sh
@@ -21,16 +21,28 @@ hf download $MODEL
 
 pip install datasets pandas
 
+nvidia-smi
 
-cat > config.yaml << EOF
+sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py
+
+
+FUSION_FLAG='{'\
+'"pass_config": {"enable_fi_allreduce_fusion": true, "enable_attn_fusion": true, "enable_noop": true},'\
+'"custom_ops": ["+quant_fp8", "+rms_norm"],'\
+'"cudagraph_mode": "FULL_DECODE_ONLY",'\
+'"splitting_ops": []'\
+'}'
+cat > config.yaml <<-EOF
 kv-cache-dtype: fp8
-compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}'
+compilation-config: '$FUSION_FLAG'
 async-scheduling: true
 no-enable-prefix-caching: true
 max-num-batched-tokens: 8192
-max-model-len: 10240
+max-model-len: $MAX_MODEL_LEN
 EOF
 
+cat config.yaml # Debugging
+
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 
@@ -38,12 +50,15 @@ PORT=$(( 8888 + $PORT_OFFSET ))
 export TORCH_CUDA_ARCH_LIST="10.0"
 export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
 
-set -x
-
+export PYTHONNOUSERSITE=1
 
-PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
-    --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
-    --disable-log-requests > $SERVER_LOG 2>&1 &
+set -x
+vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
+--gpu-memory-utilization=0.9 \
+--tensor-parallel-size=$TP \
+--max-num-seqs=512 \
+--config config.yaml \
+--disable-log-requests > $SERVER_LOG 2>&1 &
 set +x
 
 while IFS= read -r line; do
diff --git a/benchmarks/dsr1_fp4_b200_slurm.sh b/benchmarks/dsr1_fp4_b200_slurm.sh
index d3a6b2222..f4a5175b0 100644
--- a/benchmarks/dsr1_fp4_b200_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_slurm.sh
@@ -9,16 +9,16 @@ set -x
 
 PORT=$(( 8888 + $PORT_OFFSET ))
 
-# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
-
-SCHEDULER_RECV_INTERVAL=10
+sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py
 
+# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
 if [[ $CONC -ge 16 ]]; then
-    SCHEDULER_RECV_INTERVAL=30
+    SCHEDULER_RECV_INTERVAL=30
+else
+    SCHEDULER_RECV_INTERVAL=10
 fi
-
 
 echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 
 
@@ -26,8 +26,8 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.
 --tensor-parallel-size=$TP --data-parallel-size=1 \
 --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
 --chunked-prefill-size 16384 \
---enable-ep-moe --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 10 \
+--enable-ep-moe --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 10 \
 > $SERVER_LOG 2>&1 &
 set +x
 
diff --git a/benchmarks/dsr1_fp8_b200_slurm.sh b/benchmarks/dsr1_fp8_b200_slurm.sh
index 28e9e2a32..e4247593d 100644
--- a/benchmarks/dsr1_fp8_b200_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_slurm.sh
@@ -7,13 +7,26 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 
 set -x
 PORT=$(( 8888 + $PORT_OFFSET ))
-export SGL_ENABLE_JIT_DEEPGEMM=false
+
+sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py
+
+export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
-python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+
+# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
+if [[ $CONC -ge 16 ]]; then
+    SCHEDULER_RECV_INTERVAL=30
+else
+    SCHEDULER_RECV_INTERVAL=10
+fi
+
+set -x
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
 --tensor-parallel-size=$TP --data-parallel-size=1 \
---cuda-graph-max-bs 128 --max-running-requests 128 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \
---chunked-prefill-size 32768 --max-prefill-tokens 32768 \
---disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 1 \
+--cuda-graph-max-bs 128 --max-running-requests 128 \
+--mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \
+--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \
+--attention-backend trtllm_mla --stream-interval 30 --enable-flashinfer-trtllm-moe --quantization fp8 \
 > $SERVER_LOG 2>&1 &
 set +x
 
diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh
index 8e7161611..096602856 100644
--- a/benchmarks/gptoss_fp4_b200_slurm.sh
+++ b/benchmarks/gptoss_fp4_b200_slurm.sh
@@ -21,6 +21,19 @@ hf download $MODEL
 
 pip install datasets pandas
 
+nvidia-smi
+
+sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py
+
+
+# Calculate max-model-len based on ISL and OSL
+if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
+    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
+elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
+    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
+else
+    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+fi
 
 cat > config.yaml << EOF
 compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+rms_norm"],"cudagraph_mode":"FULL_AND_PIECEWISE"}'
@@ -28,7 +41,7 @@ async-scheduling: true
 no-enable-prefix-caching: true
 cuda-graph-sizes: 2048
 max-num-batched-tokens: 8192
-max-model-len: 10240
+max-model-len: $CALCULATED_MAX_MODEL_LEN
 EOF
 
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)