diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 7dccbd805..66a8642bd 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -19,11 +19,9 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 -export ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 set -x diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index d37b2654b..8b657a085 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -30,11 +30,9 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 -export ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 set -x diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 147c0a84b..05250267f 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -9,12 +9,19 @@ # CONC # MAX_MODEL_LEN -export HSA_NO_SCRATCH_RECLAIM=1 -export NCCL_MIN_NCHANNELS=112 +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that feature to avoid crashes. 
+# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 -export ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0 set -x vllm serve $MODEL --port $PORT \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 7a26cde02..d89ed501c 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -21,12 +21,19 @@ huggingface-cli download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=8888 -export HSA_NO_SCRATCH_RECLAIM=1 -export NCCL_MIN_NCHANNELS=112 +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that feature to avoid crashes. 
+# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 -export ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0 set -x vllm serve $MODEL --port $PORT \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index a2fc54bad..103e77fe3 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -16,8 +16,6 @@ EOF sleep 5 cat config.yaml -export HSA_NO_SCRATCH_RECLAIM=1 -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 867a26233..657bc1fdf 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -22,10 +22,9 @@ EOF sleep 5 cat config.yaml -export HSA_NO_SCRATCH_RECLAIM=1 -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 +export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 set -x vllm serve $MODEL --port $PORT \