From 593bcf45ddc8205d9ffca66e75e16a57f8c53d72 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 11 Mar 2026 11:19:28 +0000 Subject: [PATCH 01/85] [AMD] Add vLLM disaggregated prefill-decode benchmark for MI355X Add multi-node vLLM PD disaggregation recipe using Nixl/RIXL KV transfer and vllm-router, mirroring the existing SGLang disagg recipe structure. - New benchmark config: dsr1-fp8-mi355x-vllm-disagg (1P2D, TP8) - New utils: vllm_disagg_utils/ (job.slurm, server.sh, submit.sh, etc.) - Runner: extend launch_mi355x-amds.sh for vllm-disagg framework --- .github/configs/amd-master.yaml | 71 +++ .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 47 ++ .../multi_node/vllm_disagg_utils/bench.sh | 70 +++ .../multi_node/vllm_disagg_utils/env.sh | 52 ++ .../multi_node/vllm_disagg_utils/job.slurm | 326 +++++++++++++ .../multi_node/vllm_disagg_utils/server.sh | 444 ++++++++++++++++++ .../vllm_disagg_utils/start_etcd.sh | 47 ++ .../multi_node/vllm_disagg_utils/submit.sh | 131 ++++++ .../multi_node/vllm_disagg_utils/sync.py | 198 ++++++++ runners/launch_mi355x-amds.sh | 15 +- 10 files changed, 1398 insertions(+), 3 deletions(-) create mode 100755 benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/bench.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/env.sh create mode 100644 benchmarks/multi_node/vllm_disagg_utils/job.slurm create mode 100755 benchmarks/multi_node/vllm_disagg_utils/server.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/submit.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/sync.py diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a3afb2f6b..62686b75f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1350,6 +1350,77 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" 
+dsr1-fp8-mi355x-vllm-disagg: + image: vllm_disagg_pd:latest + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node + 2 decode nodes + 1 proxy = 4 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 1024 + osl: 8192 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..a457a2714 --- /dev/null +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on 
$SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# vLLM disagg uses TP-only parallelism (no EP/DP). +# PREFILL_NODES and DECODE_NODES come from additional-settings in the YAML config. + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh new file mode 100755 index 000000000..cfe66d460 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# vLLM Disaggregated Benchmark Runner +# +# Usage: bash bench.sh \ +# \ +# + +n_prefill=$1 +n_decode=$2 +prefill_gpus=$3 +decode_gpus=$4 +model_path=$5 +model_name=$6 +# Prefer MODEL_PATH from environment (handles HF cache snapshot resolution) +MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" +log_path=$7 + +chosen_isl=${8:-1024} +chosen_osl=${9:-1024} +concurrency_list=${10:-"512x1"} +chosen_req_rate=${11:-inf} +random_range_ratio=${12:-0.8} +num_prompts_multiplier=${13:-10} + +IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" + +ROUTER_PORT="${ROUTER_PORT:-2584}" + +echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" + +profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}" +mkdir -p "$profile_folder" + +for max_concurrency in "${chosen_concurrencies[@]}"; do + + export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" + + num_prompts=$(( max_concurrency * num_prompts_multiplier )) + if [[ "$num_prompts" -lt 16 ]]; then + 
num_prompts=16 + fi + + echo "profile_folder: $profile_folder" + echo "max_concurrency: $max_concurrency" + echo "chosen_req_rate: $chosen_req_rate" + echo "MODEL_PATH: $MODEL_PATH" + echo "ROUTER_PORT: $ROUTER_PORT" + echo "chosen_isl: $chosen_isl" + echo "chosen_osl: $chosen_osl" + echo "num_prompts: $num_prompts" + echo "export_file: $export_file" + + vllm bench serve \ + --model "$MODEL_PATH" \ + --backend vllm \ + --host 127.0.0.1 \ + --port "$ROUTER_PORT" \ + --dataset-name "random" \ + --random-input-len "$chosen_isl" \ + --random-output-len "$chosen_osl" \ + --random-prefix-len 0 \ + --num-prompts "$num_prompts" \ + --request-rate "$chosen_req_rate" \ + --ignore-eos \ + --max-concurrency "$max_concurrency" \ + 2>&1 | tee "${export_file}.log" + + sleep 5 + echo "-----------------------------------------" +done diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh new file mode 100755 index 000000000..ebe77f09b --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# vLLM/Nixl environment setup for multi-node disaggregated serving. +# +# REQUIRED ENVIRONMENT VARIABLES: +# IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) +# Set by runner or auto-detected from hostname. +# +# The Docker image (built from vllm_disagg_inference.ubuntu.amd.Dockerfile) already +# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/RIXL/install/lib). 
+ +set -x + +# IBDEVICES configuration +# Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) +# Fall back to hostname detection if not set (for direct script execution) +if [[ -z "$IBDEVICES" ]]; then + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 + elif [[ $NODENAME == mia1* ]]; then + export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + else + DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') + if [[ -n "$DETECTED" ]]; then + export IBDEVICES="$DETECTED" + else + echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2 + fi + fi + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" +else + echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" +fi + +if [[ -z "$UCX_NET_DEVICES" ]]; then + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi + echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" +else + echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)" +fi + +export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} + +# RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing +export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} + +set +x +echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm new file mode 100644 index 000000000..710b7168a --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -0,0 +1,326 @@ +#!/bin/bash +#SBATCH --job-name=vllm-pd-bench +#SBATCH -N 4 # CHECK this to be right in batch jobs +#SBATCH 
-n 4 # CHECK this to be right in batch jobs +#SBATCH --ntasks-per-node=1 +#SBATCH --spread-job +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 +# --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR + +echo "=== Job Start Time ===" +echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" +echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "=======================" +echo "" + +# ============================================================================= +# Model Validation +# ============================================================================= + +VALID_MODELS=( + "Llama-3.1-405B-Instruct-FP8-KV" + "amd-Llama-3.3-70B-Instruct-FP8-KV" + "DeepSeek-V3" + "DeepSeek-R1-0528" + "gpt-oss-120b" +) + +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." + exit 1 +fi + +MODEL_NAME="${MODEL_NAME:-None}" +model_found=false +for m in "${VALID_MODELS[@]}"; do + [[ "$MODEL_NAME" == "$m" ]] && model_found=true && break +done +if [[ "$model_found" != "true" ]]; then + echo "Error: Model '$MODEL_NAME' not found. Available:" + printf ' - %s\n' "${VALID_MODELS[@]}" + exit 1 +fi +echo "Model found: $MODEL_NAME" + +RUN_FILE="server.sh" +echo "Runfile set: $RUN_FILE" + +# DI_REPO_DIR points to the repo root. +# $(pwd) is vllm_disagg_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. +export DI_REPO_DIR=$(cd "$(pwd)/../../.." 
&& pwd) + +xP="${xP:-1}" +yD="${yD:-1}" + +# Benchmark configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" + +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +# ============================================================================= +# Model Path Resolution +# ============================================================================= + +# HF cache directory names may differ from MODEL_NAME +declare -A MODEL_DIR_NAMES=( + ["DeepSeek-R1-0528"]="models--deepseek-ai--DeepSeek-R1-0528" +) + +# MODEL_DIR detection: prefer env var, fall back to hostname detection +if [[ -z "$MODEL_DIR" ]]; then + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + MODEL_DIR="/nfsdata" + elif [[ $NODENAME == mia1* ]]; then + MODEL_DIR="/it-share/data" + else + MODEL_DIR="/nfsdata" + fi + echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" +fi +export MODEL_DIR + +DISK_DIR_NAME="${MODEL_DIR_NAMES[$MODEL_NAME]:-$MODEL_NAME}" +echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" + +resolve_hf_cache_path() { + local base_path=$1 + if [[ -d "${base_path}/snapshots" ]]; then + local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) + if [[ -n "$snapshot" ]]; then + echo "${base_path}/snapshots/${snapshot}" + return 0 + fi + fi + echo "$base_path" + return 1 +} + +MODEL_PATH="" +SEARCH_PATHS=( + "${MODEL_DIR}/${DISK_DIR_NAME}" + "${MODEL_DIR}/${MODEL_NAME}" + "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" + "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" +) + +for search_path in "${SEARCH_PATHS[@]}"; do + if [[ -d "$search_path" ]]; then + RESOLVED=$(resolve_hf_cache_path "$search_path") + MODEL_PATH="$RESOLVED" + echo "Found 
MODEL_PATH: $MODEL_PATH" + break + fi +done + +if [[ -z "$MODEL_PATH" ]]; then + echo "FATAL: Model '$MODEL_NAME' not found. Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 +fi +echo "Final MODEL_PATH: $MODEL_PATH" + +# ============================================================================= +# Node Selection and vLLM-Specific NUM_NODES +# ============================================================================= + +# vLLM needs xP + yD + 1 (dedicated proxy node) +NUM_NODES=$((xP + yD + 1)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD + 1 proxy)" + +FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) +SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') + +# Update SLURM environment variables +export SLURM_NNODES=$NUM_NODES +export SLURM_NTASKS=$NUM_NODES +export SLURM_JOB_NUM_NODES=$NUM_NODES +export SLURM_NPROCS=$NUM_NODES +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" +export SLURM_NTASKS_PER_NODE=1 + +echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# ============================================================================= + +USER_NAME=$(whoami) +MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) +NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') +NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') + +IPS=() +for NODE in $SELECTED_NODES; do + IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') + IP=$(echo "$IP" | awk '/src/ {print $7}') + IPS+=("$IP") +done + +echo "Node IPs: ${IPS[*]}" + +DOCKER_MOUNT_PATH="/workspace" +VLLM_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/vllm_disagg_utils" + 
+NNODES=$NUM_NODES + +echo "MASTER_NODE: ${MASTER_NODE}" +echo "NODE0_ADDR: ${NODE0_ADDR}" +echo "NNODES: ${NNODES}" +echo "REPO DIR: ${DI_REPO_DIR}" +echo "USER: ${USER_NAME}" + +# Reduce log spam +export TQDM_MININTERVAL=20 + +# Translate the host-resolved MODEL_PATH to the Docker mount namespace +DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + +export DI_REPO_DIR=$DI_REPO_DIR +export VLLM_WS_PATH=$VLLM_WS_PATH +export NNODES=$NNODES +export NODE0_ADDR=$NODE0_ADDR +export MODEL_PATH=$MODEL_PATH +export MODEL_DIR=$MODEL_DIR +export xP=$xP +export yD=$yD +export MODEL_NAME=$MODEL_NAME +export USER_NAME=$USER_NAME +export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" +export GPUS_PER_NODE=$GPUS_PER_NODE +export BENCH_INPUT_LEN=$BENCH_INPUT_LEN +export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN +export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO +export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER +export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY +export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE +export DRY_RUN="${DRY_RUN:-0}" +export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" + +SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') +export DOCKER_CONT_NAME="container_vllm_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +export RUN_FILE_FULL="$VLLM_WS_PATH/${RUN_FILE}" + +SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) + +cleanup() { + echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." + sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true + echo "[${SLURM_JOB_ID}] cleanup done." +} + +trap cleanup INT TERM HUP + +# Force NFS cache refresh on all nodes +echo "Refreshing NFS caches on all nodes..." 
+srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' + sync + ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils > /dev/null 2>&1 + stat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 + cat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 + echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true + echo "NFS cache refreshed on $(hostname)" +' + +srun \ + --nodelist="$SELECTED_NODELIST_SRUN" \ + --kill-on-bad-exit=1 \ + --signal=TERM@30 \ + --unbuffered \ + bash -lc " +set -euo pipefail + +echo \"Rank \$SLURM_PROCID on \$(hostname)\" + +# Pre-clean (idempotent) +sudo docker ps -aq --filter \"name=^container_vllm_\" | xargs -r sudo docker rm -f || true +sudo docker ps -aq | xargs -r sudo docker stop || true + +exec sudo docker run --rm \ + --init \ + --stop-timeout 10 \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --device=/dev/infiniband/rdma_cm \ + --device=/dev/infiniband/uverbs0 \ + --device=/dev/infiniband/uverbs1 \ + --device=/dev/infiniband/uverbs2 \ + --device=/dev/infiniband/uverbs3 \ + --device=/dev/infiniband/uverbs4 \ + --device=/dev/infiniband/uverbs5 \ + --device=/dev/infiniband/uverbs6 \ + --device=/dev/infiniband/uverbs7 \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v /sys:/sys \ + -v /etc/libibverbs.d/ionic.driver:/etc/libibverbs.d/ionic.driver:ro \ + -v /lib/x86_64-linux-gnu/libionic.so.1:/lib/x86_64-linux-gnu/libionic.so.1:ro \ + -v /lib/x86_64-linux-gnu/libionic.so:/lib/x86_64-linux-gnu/libionic.so:ro \ + -v /usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:ro \ + -v ${MODEL_DIR}:/models \ + -v \$HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + -v /tmp:/run_logs \ + -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ + -v 
${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ + -e SLURM_JOB_ID=\$SLURM_JOB_ID \ + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ + -e NNODES=\$NNODES \ + -e NODE_RANK=\$SLURM_PROCID \ + -e NODE0_ADDR=\$NODE0_ADDR \ + -e MODEL_DIR=/models \ + -e MODEL_NAME=\$MODEL_NAME \ + -e MODEL_PATH=$DOCKER_MODEL_PATH \ + -e VLLM_WS_PATH=${VLLM_WS_PATH} \ + -e GPUS_PER_NODE=\$GPUS_PER_NODE \ + -e xP=\$xP \ + -e yD=\$yD \ + -e IPADDRS=\$IPADDRS \ + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ + -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ + -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE \ + -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ + -e DRY_RUN=\$DRY_RUN \ + -e BENCHMARK_LOGS_DIR=/benchmark_logs \ + -e UCX_TLS=all \ + -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ + -e UCX_MEMTYPE_CACHE=y \ + -e UCX_RNDV_SCHEME=get_zcopy \ + -e UCX_RNDV_THRESH=4k \ + -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ + -e UCX_LOG_LEVEL=info \ + -e HSA_ENABLE_SDMA=1 \ + --name \"$DOCKER_CONT_NAME\" \ + \"$DOCKER_IMAGE_NAME\" bash -lc ' + mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' + '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log + ' + +DOCKER_EXIT_CODE=\$? 
+if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then + echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\" + exit \$DOCKER_EXIT_CODE +fi +" + +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh new file mode 100755 index 000000000..b4ab7bce8 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -0,0 +1,444 @@ +#!/bin/bash +# vLLM Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= +# +# Node role assignment (by NODE_RANK): +# 0 -> Proxy/Router node +# 1..xP -> Prefill nodes (kv_producer) +# xP+1..xP+yD -> Decode nodes (kv_consumer) + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" +yD="${yD:-1}" + +IPADDRS="${IPADDRS:-localhost}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +DRY_RUN="${DRY_RUN:-0}" +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +ROUTER_PORT="${ROUTER_PORT:-2584}" +SERVER_PORT="${SERVER_PORT:-2584}" +ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}" + +# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution) +MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}" + +# ============================================================================= +# Dependencies and Environment 
Setup +# ============================================================================= +source $VLLM_WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}') +# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available) +rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' | head -1) +rdma_ip="${rdma_ip:-$host_ip}" +host_name=$(hostname) + +echo "[INFO] Management IP (barriers/proxy): $host_ip" +echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip" + +# --------------------------------------------------------------------------- +# RDMA route setup for Pensando ionic (RoCEv2) point-to-point /31 links. +# Each benic interface has a /31 to the TOR switch. Without explicit routes, +# traffic to other nodes' RDMA IPs falls through to the management network +# (no RDMA capability). Fix: add a /24 route via the TOR gateway so RoCEv2 +# stays on the ionic fabric. +# --------------------------------------------------------------------------- +if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then + rdma_subnet="${BASH_REMATCH[1]}" + rdma_host="${BASH_REMATCH[2]}" + rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))" # /31 peer = TOR switch + rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1) + if [[ -n "$rdma_iface" ]]; then + ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \ + echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \ + echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24" + fi +fi + +# Patch Nixl UCX backend: set ucx_error_handling_mode=none for shared-memory +# transport compatibility (Pensando ionic NICs don't support rdmacm, so the +# default UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors) +NIXL_API_FILE=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) +if [[ -n "$NIXL_API_FILE" ]]; then + if ! 
grep -q 'ucx_error_handling_mode' "$NIXL_API_FILE"; then + sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$NIXL_API_FILE" + echo "[PATCH] Added ucx_error_handling_mode=none to $NIXL_API_FILE" + else + echo "[PATCH] ucx_error_handling_mode already set in $NIXL_API_FILE" + fi +fi + +if [[ -z "$UCX_NET_DEVICES" ]]; then + echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2 + exit 1 +fi + +# ============================================================================= +# Model-Specific Configuration Maps +# ============================================================================= + +declare -A MODEL_PREFILL_CONFIGS=( + ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" + ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["gpt-oss-120b"]="--tensor-parallel-size 8" +) + +declare -A MODEL_DECODE_CONFIGS=( + ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" + ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["gpt-oss-120b"]="--tensor-parallel-size 8" +) + +declare -A MODEL_ENVS=( + ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 
VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + ["Llama-3.1-405B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + ["DeepSeek-V3"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + ["DeepSeek-R1-0528"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + ["gpt-oss-120b"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" +) + +get_model_config() { + local mode="$1" + local model_name="$2" + if [[ "$mode" == "prefill" ]]; then + echo "${MODEL_PREFILL_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" + elif [[ "$mode" == "decode" ]]; then + echo "${MODEL_DECODE_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" + fi +} + +get_model_envs() { + echo "${MODEL_ENVS[$1]:-""}" +} + +if [[ -z "$MODEL_NAME" ]]; then + echo "ERROR: MODEL_NAME is not set"; exit 1 +fi + +PREFILL_SERVER_CONFIG=$(get_model_config "prefill" "$MODEL_NAME") +DECODE_SERVER_CONFIG=$(get_model_config "decode" "$MODEL_NAME") +PREFILL_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") +DECODE_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") +echo "Using model-specific configuration for: $MODEL_NAME" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation 
barrier on $host_name" +python3 $VLLM_WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# ETCD Server Setup +# ============================================================================= + +echo "Proceeding to start etcd server on $host_name" +bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null & +etcd_pid=$! + +echo "Waiting at etcd server barrier on $host_name" +python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +echo "All etcd servers are up : $host_name" +sleep 3 + +echo "etcd endpoint health==================" +etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +echo "======================================" + +python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_ARGS="" +DECODE_ARGS="" + +for ((i=1; i<=xP && i<${#IP_ARRAY[@]}; i++)); do + PREFILL_ARGS+="${IP_ARRAY[$i]} " +done + +for ((i=xP+1; i<${#IP_ARRAY[@]}; i++)); do + DECODE_ARGS+="${IP_ARRAY[$i]} " +done + +echo "Prefill node IPs: ${PREFILL_ARGS}" +echo "Decode node IPs: ${DECODE_ARGS}" + +# Common UCX/Nixl environment for prefill and decode workers +setup_ucx_env() { + export UCX_TLS=all + export UCX_SOCKADDR_TLS_PRIORITY=tcp + export UCX_MEMTYPE_CACHE=y + export UCX_RNDV_SCHEME=get_zcopy + export UCX_RNDV_THRESH=4k + export UCX_ROCM_IPC_MIN_ZCOPY=0 + export HSA_ENABLE_SDMA=1 + export UCX_LOG_LEVEL=info + export VLLM_USE_V1=1 + export VLLM_SERVER_DEV_MODE=0 
# =============================================================================
# Node Role Assignment and Server Launch
# =============================================================================
#
# NODE_RANK 0      -> proxy node: waits for the workers, starts vllm-router,
#                     runs bench.sh, copies the logs and stops the router.
# NODE_RANK 1..xP  -> prefill worker (kv_role=kv_producer).
# remaining ranks  -> decode worker  (kv_role=kv_consumer).
#
# Workers keep serving until the router port on NODE0_ADDR stops accepting
# connections (sync.py wait), then stop their own server.

# Start a command string in the background with stdout+stderr tee'd to a log.
#   $1 - command string (evaluated, so quoting embedded in it is honoured)
#   $2 - log file path
# Sets LAUNCHED_PID to the PID of the command itself.
launch_logged() {
    local cmd=$1 log_file=$2
    set -x
    # With `cmd 2>&1 | tee log &`, $! is the PID of tee (last pipeline stage),
    # so a later `kill` would leave the server running. `exec` plus process
    # substitution keeps $! pointing at the server process.
    eval "exec $cmd" > >(tee "$log_file") 2>&1 &
    LAUNCHED_PID=$!
    set +x
}

# Evaluate a command string, or only print it when DRY_RUN=1.
# Returns the command's exit status (0 for a dry run).
run_or_echo() {
    if [[ "$DRY_RUN" -eq 1 ]]; then
        echo "DRY RUN: $1"
    else
        eval "$1"
    fi
}

# Run one prefill or decode worker for the lifetime of the proxy.
#   $1 - role in lower case, used in log lines and the log file name
#   $2 - role as displayed in the banner ("Prefill" / "Decode")
#   $3 - kv_role written into --kv-transfer-config
#   $4 - space-separated KEY=VALUE pairs to export before launching
#   $5 - extra `vllm serve` flags for this role
run_pd_worker() {
    local role=$1 title=$2 kv_role=$3 model_envs=$4 server_config=$5
    local env_pair serve_cmd barrier_cmd wait_cmd worker_pid=""

    echo "${host_name}:${host_ip} is ${title} Node (Model: ${MODEL_NAME})"
    echo "Using ${role} config: ${server_config}"

    setup_ucx_env
    # Deliberately unquoted: the list is split on whitespace into KEY=VALUE words.
    for env_pair in ${model_envs}; do
        export "$env_pair"
    done

    serve_cmd="vllm serve ${MODEL_PATH} \
        --port $SERVER_PORT \
        --trust-remote-code \
        --disable-log-requests \
        --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"${kv_role}\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \
        ${server_config}"

    if [[ "$DRY_RUN" -eq 1 ]]; then
        echo "DRY RUN: $serve_cmd"
    else
        launch_logged "$serve_cmd" "/run_logs/slurm_job-${SLURM_JOB_ID}/${role}_${host_name}.log"
        worker_pid=$LAUNCHED_PID
    fi

    echo "Waiting for proxy server to be up..."
    barrier_cmd="python3 $VLLM_WS_PATH/sync.py barrier \
        --node-ips ${NODE0_ADDR} \
        --node-ports ${ROUTER_PORT} \
        --wait-for-all-ports \
        --timeout 1800"
    # A barrier timeout is not fatal here: the wait below returns at once when
    # the router port is closed and the worker is then shut down normally.
    run_or_echo "$barrier_cmd"

    echo "Waiting until proxy server closes..."
    wait_cmd="python3 $VLLM_WS_PATH/sync.py wait \
        --remote-ip ${NODE0_ADDR} \
        --remote-port ${ROUTER_PORT}"
    run_or_echo "$wait_cmd"

    echo "Killing the ${role} server"
    if [[ "$DRY_RUN" -eq 0 && -n "$worker_pid" ]]; then
        kill "$worker_pid"
    fi
}

if [ "$NODE_RANK" -eq 0 ]; then
    echo "NODE INFO ======================================="
    echo "================================================"
    echo "Node List : ${SLURM_JOB_NODELIST}"
    echo "Node IPs : ${IPADDRS}"
    echo "Model : ${MODEL_NAME:-'Not specified'}"
    echo "================================================"

    echo "CLUSTER INFO ===================================="
    echo "================================================"
    echo "${host_name}:${host_ip} is Proxy Node"
    echo "Prefill servers: ${PREFILL_ARGS}"
    echo "Decode servers: ${DECODE_ARGS}"
    echo "================================================"

    # Drop the first entry (this proxy node) from the comma-separated IP list.
    PD_IPADDRS="${IPADDRS#*,}"
    echo "Waiting for all prefill and decode servers to be up . . ."
    SERVER_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \
        --node-ips ${PD_IPADDRS} \
        --node-ports $SERVER_PORT \
        --wait-for-all-ports \
        --timeout 1800"
    # sync.py exits non-zero on timeout; benchmarking against servers that
    # never came up only produces garbage results, so stop here.
    if ! run_or_echo "$SERVER_BARRIER_CMD"; then
        echo "ERROR: prefill/decode servers did not come up; aborting." >&2
        exit 1
    fi

    echo "Congratulations!!! All prefill and decode servers are up . . ."

    echo "Starting vLLM Router..."
    [ -f /root/.cargo/env ] && source /root/.cargo/env

    PREFILL_URLS=""
    DECODE_URLS=""
    for ip in ${PREFILL_ARGS}; do
        PREFILL_URLS+="--prefill http://${ip}:${SERVER_PORT} "
    done
    for ip in ${DECODE_ARGS}; do
        DECODE_URLS+="--decode http://${ip}:${SERVER_PORT} "
    done

    ROUTER_CMD="UCX_TLS=tcp,self,shm VLLM_USE_V1=1 \
        vllm-router \
        --host 0.0.0.0 \
        --port $ROUTER_PORT \
        --vllm-pd-disaggregation \
        $PREFILL_URLS \
        $DECODE_URLS \
        --policy round_robin \
        --prefill-policy round_robin \
        --decode-policy round_robin \
        --intra-node-data-parallel-size 1 \
        --retry-max-retries 3 \
        --health-check-endpoint /health \
        --prometheus-port 29000"

    proxy_pid=""
    if [[ "$DRY_RUN" -eq 1 ]]; then
        echo "DRY RUN: $ROUTER_CMD"
    else
        ROUTER_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_${host_name}.log"
        # ROUTER_CMD begins with VAR=value assignments, which `exec` cannot
        # take directly; `env` applies them and then execs vllm-router.
        launch_logged "env $ROUTER_CMD" "$ROUTER_LOG_FILE"
        proxy_pid=$LAUNCHED_PID
    fi

    HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \
        --node-ips ${NODE0_ADDR} \
        --node-ports ${ROUTER_PORT} \
        --wait-for-all-health \
        --health-endpoint /health \
        --timeout 1800"
    if ! run_or_echo "$HEALTH_BARRIER_CMD"; then
        echo "ERROR: vLLM router never became healthy; aborting." >&2
        [[ -n "$proxy_pid" ]] && kill "$proxy_pid"
        exit 1
    fi
    if [[ "$DRY_RUN" -ne 1 ]]; then
        echo "Router is ready for benchmarking"
    fi

    echo "Ready for benchmarking on ${host_name}:${host_ip}"
    echo "Benchmarking on ${host_name}:${host_ip}"
    cd "$VLLM_WS_PATH"

    export ROUTER_PORT=$ROUTER_PORT
    BENCH_CMD="bash $VLLM_WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \
        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
        ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"

    if [[ "$DRY_RUN" -eq 1 ]]; then
        echo "DRY RUN: $BENCH_CMD"
    else
        set -x
        eval "$BENCH_CMD"
        set +x
    fi

    # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
    mkdir -p "$LOGS_OUTPUT"

    if [[ "$DRY_RUN" -eq 0 ]]; then
        cp -r "/run_logs/slurm_job-${SLURM_JOB_ID}" "$LOGS_OUTPUT/"
        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
    fi

    # Stopping the router closes ROUTER_PORT, which is the signal the
    # prefill/decode nodes are waiting on to shut themselves down.
    echo "Killing the proxy server"
    if [[ "$DRY_RUN" -eq 0 && -n "$proxy_pid" ]]; then
        kill "$proxy_pid"
    fi

elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then
    run_pd_worker prefill Prefill kv_producer "${PREFILL_MODEL_ENVS}" "${PREFILL_SERVER_CONFIG}"

else
    run_pd_worker decode Decode kv_consumer "${DECODE_MODEL_ENVS}" "${DECODE_SERVER_CONFIG}"
fi
# Print the command-line synopsis, positional arguments and required
# environment variables of submit.sh to stdout.
usage() {
    cat << 'USAGE'
Usage:
  bash submit.sh <PREFILL_NODES> <PREFILL_WORKERS> <DECODE_NODES> <DECODE_WORKERS> \
                 <ISL> <OSL> <CONCURRENCIES> <REQUEST_RATE> [NODE_LIST]

Arguments:
  PREFILL_NODES    Number of prefill nodes
  PREFILL_WORKERS  Number of prefill workers (usually 1)
  DECODE_NODES     Number of decode nodes
  DECODE_WORKERS   Number of decode workers (usually 1)
  ISL              Input sequence length
  OSL              Output sequence length
  CONCURRENCIES    Concurrency levels, delimited by 'x' (e.g., "8x16x32")
  REQUEST_RATE     Request rate ("inf" for max throughput)
  NODE_LIST        Optional: comma-separated hostnames

Required environment variables:
  SLURM_ACCOUNT    SLURM account name
  SLURM_PARTITION  SLURM partition
  TIME_LIMIT       Job time limit (e.g., "08:00:00")
  MODEL_PATH       Path to model directory (e.g., /nfsdata)
  MODEL_NAME       Model name directory
  CONTAINER_IMAGE  Docker image name (e.g., vllm_disagg_pd:latest)
  RUNNER_NAME      Runner identifier (for job name)
USAGE
}

# Abort with the usage text unless the named environment variable is set and
# non-empty.
#   $1 - NAME of the variable to check (not its value)
# Exits the shell with status 1 on failure; diagnostics go to stderr.
check_env() {
    local name="$1"
    # ${!name} is indirect expansion: the value of the variable called $name.
    if [[ -z "${!name:-}" ]]; then
        printf 'Error: %s not specified\n' "${name}" >&2
        usage >&2
        exit 1
    fi
}
needs xP + yD + 1 nodes (dedicated proxy node) +NUM_NODES=$((PREFILL_NODES + DECODE_NODES + 1)) +profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" + +# Export variables for the SLURM job +export MODEL_DIR=$MODEL_PATH +export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE +export PROFILER_ARGS=$profiler_args + +# For vLLM, each worker = 1 node (TP=8 per node). +# xP/yD must match the node counts so job.slurm's NUM_NODES = xP+yD+1 is correct. +export xP=$PREFILL_NODES +export yD=$DECODE_NODES +export NUM_NODES=$NUM_NODES +export GPUS_PER_NODE=$GPUS_PER_NODE +export MODEL_NAME=$MODEL_NAME +export BENCH_INPUT_LEN=${ISL} +export BENCH_OUTPUT_LEN=${OSL} +export BENCH_RANDOM_RANGE_RATIO=${BENCH_RANDOM_RANGE_RATIO:-1} +export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} +export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} +export BENCH_REQUEST_RATE=${REQUEST_RATE} + +# Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. +export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +mkdir -p "$BENCHMARK_LOGS_DIR" + +# Optional: pass an explicit node list to sbatch. +NODELIST_OPT=() +if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then + IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" + if [[ "${#NODE_ARR[@]}" -ne "$NUM_NODES" ]]; then + echo "Error: NODE_LIST has ${#NODE_ARR[@]} nodes but NUM_NODES=${NUM_NODES}" >&2 + echo "Error: NODE_LIST='${NODE_LIST}'" >&2 + exit 1 + fi + NODELIST_CSV="$(IFS=,; echo "${NODE_ARR[*]}")" + NODELIST_OPT=(--nodelist "$NODELIST_CSV") +fi + +# Construct the sbatch command +sbatch_cmd=( + sbatch + --parsable + -N "$NUM_NODES" + -n "$NUM_NODES" + "${NODELIST_OPT[@]}" + --time "$TIME_LIMIT" + --partition "$SLURM_PARTITION" + --account "$SLURM_ACCOUNT" + --job-name "$RUNNER_NAME" + --output "${BENCHMARK_LOGS_DIR}/slurm_job-%j.out" + --error "${BENCHMARK_LOGS_DIR}/slurm_job-%j.err" + "$(dirname "$0")/job.slurm" +) + +JOB_ID=$("${sbatch_cmd[@]}") +if [[ $? 
def is_port_open(ip, port, timeout=2):
    """Return True if a TCP connection to ``ip:port`` succeeds within ``timeout`` seconds.

    Any socket-level failure, including a host name that cannot be resolved,
    is reported as "not open" instead of propagating: ``connect_ex`` only
    converts errors of the C-level ``connect()`` call into return codes.
    """
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(timeout)
            return s.connect_ex((ip, port)) == 0
    except OSError:
        return False


def check_health(ip, port, path="/health", timeout=2):
    """Return True if ``http://ip:port/path`` answers with HTTP 200.

    Connection errors, timeouts, non-2xx answers (raised by ``urlopen`` as
    ``HTTPError``) and malformed HTTP replies all yield False.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import block.
    import http.client

    url = f"http://{ip}:{port}{path}"
    try:
        with urllib.request.urlopen(urllib.request.Request(url), timeout=timeout) as resp:
            return getattr(resp, "status", 200) == 200
    except (urllib.error.URLError, http.client.HTTPException, OSError):
        # HTTPException covers half-started servers that send a broken status
        # line; it is not an OSError subclass.
        return False


def _wait_for_all(targets, probe, timeout, poll_seconds, waiting_msg, timeout_lines, describe):
    """Poll until ``probe(ip, port)`` is true for every target.

    targets       -- list of ``(ip, port)`` tuples
    probe         -- callable ``(ip, port) -> bool``
    timeout       -- seconds before giving up; ``0`` or less waits forever
    poll_seconds  -- pause between polling rounds
    waiting_msg   -- progress line printed after every unsuccessful round
    timeout_lines -- header lines printed when the timeout expires
    describe      -- callable ``(ip, port) -> str`` used to list stragglers

    Exits the process with status 1 on timeout.
    """
    start = time.monotonic()
    while True:
        pending = [(ip, port) for ip, port in targets if not probe(ip, port)]
        if not pending:
            return

        if timeout > 0:
            remaining = timeout - (time.monotonic() - start)
            if remaining <= 0:
                for line in timeout_lines:
                    print(line, flush=True)
                for ip, port in pending:
                    print(f"  - {describe(ip, port)}", flush=True)
                sys.exit(1)
            print(f"{waiting_msg} ({remaining:.0f}s remaining)", flush=True)
            time.sleep(min(poll_seconds, remaining))
        else:
            print(waiting_msg, flush=True)
            time.sleep(poll_seconds)


def _accept_forever(server_socket):
    """Accept and immediately close connections until the socket is closed."""
    while True:
        try:
            conn, _addr = server_socket.accept()
        except OSError:
            # Raised once the listening socket has been closed; normal shutdown.
            return
        conn.close()


# =============================================================================
# barrier subcommand
# =============================================================================

def cmd_barrier(args):
    """Wait until all nodes have opened the specified ports.

    Reads from ``args``: ``node_ips``, ``node_ports`` (comma-separated; a
    single port applies to every IP), ``timeout``, ``wait_for_all_ports``,
    ``wait_for_all_health``, ``health_endpoint``, ``enable_port``,
    ``local_ip`` and ``local_port``.

    With ``enable_port`` a local listening port is opened for the duration of
    the barrier and closed 30 seconds after it completes. Exits with status 1
    on invalid arguments or on timeout.
    """
    node_ips = [ip.strip() for ip in args.node_ips.split(",") if ip.strip()]
    node_ports = [int(p.strip()) for p in args.node_ports.split(",") if p.strip()]

    if not node_ips:
        print("Error: NODE_IPS argument is empty or not set.")
        sys.exit(1)

    if len(node_ports) == 1:
        node_ports *= len(node_ips)
    elif len(node_ports) != len(node_ips):
        print("Error: Number of ports must match number of node IPs or only one port should be given for all.")
        sys.exit(1)

    targets = list(zip(node_ips, node_ports))

    server_socket = None
    if args.enable_port:
        if not args.local_ip or args.local_port is None:
            print("Error: --enable-port requires --local-ip and --local-port.")
            sys.exit(1)
        # Bind in the main thread so that a bind failure is reported instead
        # of dying silently inside the daemon thread.
        server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            server_socket.bind((args.local_ip, args.local_port))
            server_socket.listen(5)
        except OSError as exc:
            print(f"Error: cannot open port {args.local_port} on {args.local_ip}: {exc}")
            server_socket.close()
            sys.exit(1)
        print(f"Port {args.local_port} is now open on {args.local_ip}.")
        threading.Thread(target=_accept_forever, args=(server_socket,), daemon=True).start()

    # Wait for all ports (TCP check)
    if args.wait_for_all_ports:
        _wait_for_all(
            targets,
            is_port_open,
            args.timeout,
            5,
            f"Waiting for nodes.{node_ports},{node_ips} . .",
            [
                f"ERROR: Timeout after {args.timeout} seconds waiting for ports to open.",
                "The following nodes/ports are still not responding:",
            ],
            lambda ip, port: f"{ip}:{port}",
        )

    # Wait for all health endpoints (HTTP check)
    if args.wait_for_all_health:
        health_path = args.health_endpoint
        _wait_for_all(
            targets,
            lambda ip, port: check_health(ip, port, health_path),
            args.timeout,
            30,
            f"Waiting for health on {targets} ({health_path}) ..",
            [
                f"ERROR: Timeout after {args.timeout} seconds waiting for health endpoints.",
                f"The following (http://ip:port{health_path}) are still not responding:",
            ],
            lambda ip, port: f"http://{ip}:{port}{health_path}",
        )

    if server_socket is not None:
        # Keep the port open a little longer so slower peers still see it.
        time.sleep(30)
        server_socket.close()
        print(f"Port {args.local_port} has been closed on {args.local_ip}.")


# =============================================================================
# wait subcommand
# =============================================================================

def cmd_wait(args):
    """Wait while a remote port remains open, exit when it closes.

    Polls ``args.remote_ip:args.remote_port`` every 5 seconds and returns the
    first time a connection attempt fails.
    """
    print(f"Waiting while port {args.remote_port} on {args.remote_ip} is open...")
    while is_port_open(args.remote_ip, args.remote_port):
        time.sleep(5)
    print(f"Port {args.remote_port} on {args.remote_ip} is now closed.")


# =============================================================================
# CLI
# =============================================================================

def main():
    """Parse the command line and dispatch to the chosen subcommand."""
    parser = argparse.ArgumentParser(description="Multi-node synchronization utilities.")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # barrier subcommand
    bp = subparsers.add_parser("barrier", help="Wait for all nodes to open specified ports.")
    bp.add_argument("--local-ip", required=False, help="Local IP address to bind the server.")
    bp.add_argument("--local-port", type=int, required=False, help="Port number to bind the server.")
    bp.add_argument("--enable-port", action="store_true", help="Enable opening and closing of local port.")
    bp.add_argument("--node-ips", required=True, help="Comma-separated list of node IPs.")
    bp.add_argument("--node-ports", required=True, help="Comma-separated list of ports to check.")
    bp.add_argument("--timeout", type=int, default=600,
                    help="Timeout in seconds (default: 600). Set to 0 for no timeout.")
    bp.add_argument("--wait-for-all-ports", action="store_true",
                    help="Wait until all node ports are open (TCP).")
    bp.add_argument("--wait-for-all-health", action="store_true",
                    help="Wait until http://ip:port/health returns 200 for all nodes.")
    bp.add_argument("--health-endpoint", default="/health",
                    help="Path for health check (default: /health).")
    bp.set_defaults(func=cmd_barrier)

    # wait subcommand
    wp = subparsers.add_parser("wait", help="Wait while a remote port remains open.")
    wp.add_argument("--remote-ip", required=True, help="Remote server IP address.")
    wp.add_argument("--remote-port", type=int, required=True, help="Remote port number.")
    wp.set_defaults(func=cmd_wait)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
"sglang-disagg" ]]; then + if [[ "$FRAMEWORK" == "sglang-disagg" || "$FRAMEWORK" == "vllm-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else BENCHMARK_SUBDIR="single_node" @@ -108,8 +108,17 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "${EVAL_ONLY:-false}" != "true" ]]; then cat > collect_latest_results.py <<'PY' import os, sys -sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +prefixes = ["sglang", "vllm"] +logs_root = f"{job_dir}/logs/" +candidates = [] +if os.path.isdir(logs_root): + for name in os.listdir(logs_root): + for pfx in prefixes: + subdir = f"{logs_root}{name}/{pfx}_isl_{isl}_osl_{osl}" + if os.path.isdir(subdir): + candidates.append(subdir) +for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY From f805b622c4ae6709c79adfefd284b0d3fb93f84c Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 11 Mar 2026 17:50:16 +0000 Subject: [PATCH 02/85] [AMD] Refactor vLLM disagg recipe: models.yaml, UCX cleanup, QoS support Extract hardcoded model configurations from server.sh bash maps and job.slurm VALID_MODELS into a declarative models.yaml, mirroring the SGLang disagg recipe pattern. Adding a new model now requires no script changes. 
Also: - Consolidate UCX transport vars in job.slurm Docker env; remove duplicated setup_ucx_env() from server.sh - Extract RDMA workarounds (ionic /31 route fix, Nixl UCX patch) into setup_rdma_env() helper - Lower UCX_LOG_LEVEL from info to warn - Add nicctl mount and QoS/DSCP auto-detection to env.sh - Remove stale host libionic bind-mounts (driver now built into image) --- .../multi_node/vllm_disagg_utils/env.sh | 54 +++++- .../multi_node/vllm_disagg_utils/job.slurm | 46 +++-- .../multi_node/vllm_disagg_utils/models.yaml | 41 +++++ .../multi_node/vllm_disagg_utils/server.sh | 162 ++++++++---------- 4 files changed, 184 insertions(+), 119 deletions(-) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/models.yaml diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh index ebe77f09b..f4340e812 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -33,9 +33,17 @@ else fi if [[ -z "$UCX_NET_DEVICES" ]]; then - FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) - if [[ -n "$FIRST_IB" ]]; then - export UCX_NET_DEVICES="${FIRST_IB}:1" + # Use the first benic interface for UCX TCP transport (maps to ionic RDMA NIC). + # We use TCP device names (benicXp1) instead of IB device names (ionic_X:1) + # because ud_verbs/ionic crashes in ucp_request_memory_dereg (UCX bug with ionic provider). 
# QoS/DSCP configuration for lossless RoCEv2 fabric.
# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname

# Export UCX_IB_TRAFFIC_CLASS from a known hostname prefix.
# Globals: NODENAME (written), UCX_IB_TRAFFIC_CLASS (exported on success)
# Returns: 0 if a value was exported, 1 if the hostname is not recognised.
detect_qos_from_hostname() {
    NODENAME=$(hostname -s)
    if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
        export UCX_IB_TRAFFIC_CLASS=96
    elif [[ $NODENAME == mia1* ]]; then
        export UCX_IB_TRAFFIC_CLASS=104
    else
        return 1
    fi
    echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
}

# Decide UCX_IB_TRAFFIC_CLASS (and UCX_IB_SL when nicctl provides it).
# Globals: UCX_IB_TRAFFIC_CLASS (read/exported), UCX_IB_SL (exported),
#          ND_PRIO, ND_DSCP, NODENAME (written)
configure_ucx_qos() {
    local qos_dump

    if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then
        echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)"
        return 0
    fi

    if command -v nicctl &> /dev/null; then
        # Query the NIC once and parse the same output twice.
        qos_dump=$(nicctl show qos 2>/dev/null)
        ND_PRIO=$(awk '/PFC no-drop priorities/ {print $NF; exit}' <<< "$qos_dump")
        # First "DSCP : <n> ... <prio>" line whose last field is the no-drop priority.
        ND_DSCP=$(awk -v p="$ND_PRIO" '
$1 == "DSCP" && $2 == ":" && $NF == p {
    print $3; exit
}' <<< "$qos_dump")
        # Require a purely numeric DSCP before using it in arithmetic.
        if [[ -n "$ND_PRIO" && "$ND_DSCP" =~ ^[0-9]+$ ]]; then
            # 10# forces base 10 so a value like "08" is not read as octal.
            export UCX_IB_TRAFFIC_CLASS=$(( 4 * 10#$ND_DSCP ))
            export UCX_IB_SL=$ND_PRIO
            echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL"
            return 0
        fi
        echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
        if ! detect_qos_from_hostname; then
            echo "[INFO] Unable to detect QoS from hostname $NODENAME. Skipping QoS configuration."
        fi
    else
        if ! detect_qos_from_hostname; then
            echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration."
        fi
    fi
    return 0
}

configure_ucx_qos
+MODELS_YAML="$(pwd)/models.yaml" + +if [[ ! -f "$MODELS_YAML" ]]; then + echo "Error: models.yaml not found at $MODELS_YAML" + exit 1 +fi if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then echo "Error: DOCKER_IMAGE_NAME is not set." @@ -32,13 +33,10 @@ if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then fi MODEL_NAME="${MODEL_NAME:-None}" -model_found=false -for m in "${VALID_MODELS[@]}"; do - [[ "$MODEL_NAME" == "$m" ]] && model_found=true && break -done -if [[ "$model_found" != "true" ]]; then - echo "Error: Model '$MODEL_NAME' not found. Available:" - printf ' - %s\n' "${VALID_MODELS[@]}" +if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then + echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Available models:" + grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" @@ -67,11 +65,6 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # Model Path Resolution # ============================================================================= -# HF cache directory names may differ from MODEL_NAME -declare -A MODEL_DIR_NAMES=( - ["DeepSeek-R1-0528"]="models--deepseek-ai--DeepSeek-R1-0528" -) - # MODEL_DIR detection: prefer env var, fall back to hostname detection if [[ -z "$MODEL_DIR" ]]; then NODENAME=$(hostname -s) @@ -86,7 +79,11 @@ if [[ -z "$MODEL_DIR" ]]; then fi export MODEL_DIR -DISK_DIR_NAME="${MODEL_DIR_NAMES[$MODEL_NAME]:-$MODEL_NAME}" +# Extract hf_dir from models.yaml (the line after the model's top-level key) +DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} + found && /^[^ ]/{exit} + found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") +DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" resolve_hf_cache_path() { @@ -270,10 +267,7 @@ exec sudo docker run --rm \ --security-opt seccomp=unconfined \ --privileged \ -v /sys:/sys \ - -v /etc/libibverbs.d/ionic.driver:/etc/libibverbs.d/ionic.driver:ro \ - -v 
# Model-specific vLLM server configurations for disaggregated inference.
#
# Each top-level key is a MODEL_NAME value (must match the model identifier
# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR).
#
# To add a new model: add a new top-level entry following the same schema.
# No script changes are required.
#
# Schema:
#   <MODEL_NAME>:
#     prefill_flags: str   # vLLM CLI flags for prefill workers
#     decode_flags: str    # vLLM CLI flags for decode workers
#     env: str             # Space-separated KEY=VALUE pairs exported before vllm serve
#     hf_dir: str          # (optional) On-disk directory name if it differs from the key
#                          # e.g. HF cache layout: models--deepseek-ai--DeepSeek-R1-0528
#
# Formatting constraints (job.slurm reads this file with grep/awk, not a YAML
# parser):
#   - A model key must start in column 0 and be followed directly by ':'
#     (validated with grep "^${MODEL_NAME}:"); the "Available models" listing
#     only shows keys that begin with a letter.
#   - Fields of an entry must be indented with spaces; the first following
#     line that starts in column 0 with a non-space character (including a
#     comment) ends the entry for the hf_dir lookup.
#   - hf_dir must sit on one line as `hf_dir: "value"`; the value must not
#     contain spaces (quotes and spaces are stripped from the second field).

Llama-3.1-405B-Instruct-FP8-KV:
  prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8"
  decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8"
  env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1"

amd-Llama-3.3-70B-Instruct-FP8-KV:
  prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8"
  decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8"
  env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1"

# DeepSeek-V3 and DeepSeek-R1-0528 use identical flags and env values.
DeepSeek-V3:
  prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1"
  decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1"
  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0"

DeepSeek-R1-0528:
  prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1"
  decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1"
  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0"
  # The model lives in an HF-cache style directory, not under its plain name.
  hf_dir: "models--deepseek-ai--DeepSeek-R1-0528"

gpt-oss-120b:
  prefill_flags: "--tensor-parallel-size 8"
  decode_flags: "--tensor-parallel-size 8"
  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0"
# =============================================================================
# RDMA / Nixl Workarounds
# =============================================================================

#######################################
# Apply the host-side workarounds needed before launching vLLM with Nixl.
#   1. Route the node's 192.168.<subnet>.0/24 RDMA range through the /31
#      point-to-point peer of the RDMA interface.
#   2. Patch rixl's _api.py so the UCX backend is initialised with
#      ucx_error_handling_mode=none.
# Globals:  rdma_ip (read)
# Outputs:  progress lines tagged [RDMA-ROUTE] / [PATCH] on stdout
#######################################
setup_rdma_env() {
    local rdma_subnet rdma_host rdma_gw rdma_iface nixl_api

    # Pensando ionic (RoCEv2) point-to-point /31 route fix.
    # Each benic interface has a /31 to the TOR switch. Without explicit routes,
    # traffic to other nodes' RDMA IPs falls through to the management network.
    if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then
        rdma_subnet="${BASH_REMATCH[1]}"
        rdma_host="${BASH_REMATCH[2]}"
        # The other end of a /31 differs only in the lowest bit, so flip it.
        # (OR-ing with 1 would return our own address when we hold the odd one.)
        # 10# guards against a leading zero being parsed as octal.
        rdma_gw="192.168.${rdma_subnet}.$(( 10#$rdma_host ^ 1 ))"
        # Compare the address exactly; a regex match on the raw field would
        # also accept e.g. 192.168.5.110 when looking for 192.168.5.11.
        rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '
            { split($4, cidr, "/") }
            cidr[1] == ip { print $2; exit }')
        if [[ -n "$rdma_iface" ]]; then
            if ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null; then
                echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface"
            else
                echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24"
            fi
        fi
    fi

    # Patch Nixl UCX backend: set ucx_error_handling_mode=none.
    # Pensando ionic NICs don't support rdmacm, so the default
    # UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors.
    nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null)
    if [[ -n "$nixl_api" ]]; then
        if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then
            # Insert the new assignment right after the num_threads line and
            # reuse that line's own indentation (\1) so the Python stays valid
            # whatever nesting depth the anchor line has.
            sed -i 's/^\([[:space:]]*\).*init\["num_threads"\] = str(nixl_conf\.num_threads).*$/&\n\1init["ucx_error_handling_mode"] = "none"/' "$nixl_api"
            if grep -q 'ucx_error_handling_mode' "$nixl_api"; then
                echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api"
            else
                echo "[PATCH] WARNING: num_threads anchor line not found in $nixl_api; nothing patched"
            fi
        else
            echo "[PATCH] ucx_error_handling_mode already set in $nixl_api"
        fi
    fi
}
["amd-Llama-3.3-70B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" - ["Llama-3.1-405B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" - ["DeepSeek-V3"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" - ["DeepSeek-R1-0528"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" - ["gpt-oss-120b"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" -) - -get_model_config() { - local mode="$1" - local model_name="$2" - if [[ "$mode" == "prefill" ]]; then - echo "${MODEL_PREFILL_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" - elif [[ "$mode" == "decode" ]]; then - echo "${MODEL_DECODE_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" - fi -} - -get_model_envs() { - echo "${MODEL_ENVS[$1]:-""}" -} +if [[ ! 
-f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi if [[ -z "$MODEL_NAME" ]]; then echo "ERROR: MODEL_NAME is not set"; exit 1 fi -PREFILL_SERVER_CONFIG=$(get_model_config "prefill" "$MODEL_NAME") -DECODE_SERVER_CONFIG=$(get_model_config "decode" "$MODEL_NAME") -PREFILL_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") -DECODE_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") -echo "Using model-specific configuration for: $MODEL_NAME" +eval "$(python3 -c " +import yaml, sys + +with open('${MODELS_YAML}') as f: + models = yaml.safe_load(f) + +model_name = '${MODEL_NAME}' +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def bash_escape(s): + \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\" + return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`') + +pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8')) +df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8')) +ev = bash_escape(m.get('env', '')) +print(f'PREFILL_SERVER_CONFIG=\"{pf}\"') +print(f'DECODE_SERVER_CONFIG=\"{df}\"') +print(f'MODEL_ENVS=\"{ev}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" # ============================================================================= # Container Synchronization @@ -203,20 +198,15 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# Common UCX/Nixl environment for prefill and decode workers -setup_ucx_env() { - export UCX_TLS=all - export UCX_SOCKADDR_TLS_PRIORITY=tcp - export UCX_MEMTYPE_CACHE=y - export UCX_RNDV_SCHEME=get_zcopy - export UCX_RNDV_THRESH=4k - export UCX_ROCM_IPC_MIN_ZCOPY=0 - export HSA_ENABLE_SDMA=1 - export UCX_LOG_LEVEL=info +# vLLM/Nixl-specific environment (UCX transport vars are set at the Docker level in job.slurm) +setup_vllm_env() { export VLLM_USE_V1=1 export 
VLLM_SERVER_DEV_MODE=0 export VLLM_NIXL_SIDE_CHANNEL_HOST=${host_ip} export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 + for env_pair in ${MODEL_ENVS}; do + export "$env_pair" + done } # ============================================================================= @@ -334,10 +324,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" - setup_ucx_env - for env_pair in ${PREFILL_MODEL_ENVS}; do - export "$env_pair" - done + setup_vllm_env PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ @@ -387,10 +374,7 @@ else echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" echo "Using decode config: $DECODE_SERVER_CONFIG" - setup_ucx_env - for env_pair in ${DECODE_MODEL_ENVS}; do - export "$env_pair" - done + setup_vllm_env DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ From a65d6bebd0fef41021dd3cbdd442b89af6006146 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 11 Mar 2026 20:20:51 +0000 Subject: [PATCH 03/85] [AMD] Update vLLM disagg recipe for v0.17.1 NixlConnector API Adapt server.sh to vLLM v0.17.1 breaking changes: - Use simplified kv-transfer-config (side channel via env vars instead of kv_ip/kv_port, add kv_load_failure_policy) - Remove deprecated --disable-log-requests (disabled by default in v0.17) - Route NIXL side channel through RDMA IP for correct fabric path - Fix RIXL ucx_error_handling_mode patch for updated _api.py layout --- benchmarks/multi_node/vllm_disagg_utils/env.sh | 2 +- benchmarks/multi_node/vllm_disagg_utils/server.sh | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh index f4340e812..cc9b9320b 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -6,7 +6,7 @@ # Set by runner or auto-detected 
from hostname. # # The Docker image (built from vllm_disagg_inference.ubuntu.amd.Dockerfile) already -# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/RIXL/install/lib). +# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/rixl/lib). set -x diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 21fe506cb..d90e4b240 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -81,7 +81,7 @@ setup_rdma_env() { nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) if [[ -n "$nixl_api" ]]; then if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" else echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" @@ -202,8 +202,8 @@ echo "Decode node IPs: ${DECODE_ARGS}" setup_vllm_env() { export VLLM_USE_V1=1 export VLLM_SERVER_DEV_MODE=0 - export VLLM_NIXL_SIDE_CHANNEL_HOST=${host_ip} - export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 + export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip} + export VLLM_NIXL_SIDE_CHANNEL_PORT=5600 for env_pair in ${MODEL_ENVS}; do export "$env_pair" done @@ -329,8 +329,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --disable-log-requests \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_producer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + --kv-transfer-config '{\"kv_connector\": 
\"NixlConnector\", \"kv_role\": \"kv_producer\", \"kv_load_failure_policy\": \"fail\"}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -379,8 +378,7 @@ else DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --disable-log-requests \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_consumer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_consumer\", \"kv_load_failure_policy\": \"fail\"}' \ ${DECODE_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then From d62d53cd39ffbacae2541383887619bbf0910d80 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 12 Mar 2026 12:13:36 +0000 Subject: [PATCH 04/85] [AMD] Make vLLM disagg recipe CI-compatible (mia1 cluster) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bench.sh: replace `vllm bench serve` (log-only output) with the shared run_benchmark_serving helper from benchmark_lib.sh, matching the SGLang disagg pattern. This produces the .json result files that the multinode CI workflow expects (benchmark-multinode-tmpl.yml → process_result.py). server.sh: make the Nixl ucx_error_handling_mode=none runtime patch conditional on Pensando ionic RDMA devices (IBDEVICES=*ionic*). On the mia1 cluster (ConnectX/mlx5, IBDEVICES=rdma*), UCX handles error mode natively and the patch is skipped. Model-path resolution and IBDEVICES/UCX/QoS auto-detection were verified to already work on mia1 — no changes needed. Tested locally (Job 2802, 1P+2D, ISL/OSL=1024): conc 8 → 507 tok/s conc 32 → 1778 tok/s conc 16 → 1004 tok/s conc 64 → 2480 tok/s All four .json result files produced; 100% external prefix cache hit rate. 
--- .../multi_node/vllm_disagg_utils/bench.sh | 27 ++++++++++--------- .../multi_node/vllm_disagg_utils/server.sh | 23 +++++++++------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index cfe66d460..69a178ca4 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -1,6 +1,9 @@ #!/bin/bash # vLLM Disaggregated Benchmark Runner # +# Produces JSON result files via benchmark_serving.py (same as SGLang bench.sh) +# so that the CI pipeline can collect and process results. +# # Usage: bash bench.sh \ # \ # @@ -11,7 +14,6 @@ prefill_gpus=$3 decode_gpus=$4 model_path=$5 model_name=$6 -# Prefer MODEL_PATH from environment (handles HF cache snapshot resolution) MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" log_path=$7 @@ -31,6 +33,10 @@ echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_ profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}" mkdir -p "$profile_folder" +source "$(dirname "$0")/../../benchmark_lib.sh" + +REPO_ROOT="$(cd "$(dirname "$0")/../../.." 
&& pwd)" + for max_concurrency in "${chosen_concurrencies[@]}"; do export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" @@ -50,21 +56,18 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do echo "num_prompts: $num_prompts" echo "export_file: $export_file" - vllm bench serve \ + run_benchmark_serving \ + --bench-serving-dir "$REPO_ROOT" \ --model "$MODEL_PATH" \ - --backend vllm \ - --host 127.0.0.1 \ --port "$ROUTER_PORT" \ - --dataset-name "random" \ - --random-input-len "$chosen_isl" \ - --random-output-len "$chosen_osl" \ - --random-prefix-len 0 \ + --backend openai \ + --input-len "$chosen_isl" \ + --output-len "$chosen_osl" \ + --random-range-ratio "$random_range_ratio" \ --num-prompts "$num_prompts" \ - --request-rate "$chosen_req_rate" \ - --ignore-eos \ --max-concurrency "$max_concurrency" \ - 2>&1 | tee "${export_file}.log" + --result-filename "$export_file" \ + --result-dir /workspace/ - sleep 5 echo "-----------------------------------------" done diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index d90e4b240..933019abe 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -75,17 +75,22 @@ setup_rdma_env() { fi # Patch Nixl UCX backend: set ucx_error_handling_mode=none. - # Pensando ionic NICs don't support rdmacm, so the default + # Only needed for Pensando ionic NICs which don't support rdmacm — the default # UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors. - local nixl_api - nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) - if [[ -n "$nixl_api" ]]; then - if ! 
grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" - echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" - else - echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" + # ConnectX/mlx5 NICs (mia1 cluster) handle error mode properly; skip the patch. + if [[ "${IBDEVICES:-}" == *ionic* ]]; then + local nixl_api + nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) + if [[ -n "$nixl_api" ]]; then + if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" + else + echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" + fi fi + else + echo "[INFO] Non-ionic RDMA devices (${IBDEVICES:-unset}); skipping ucx_error_handling_mode patch" fi } From 788aa2b5b01939c06a796e74c73bac61a3d28457 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 12 Mar 2026 13:46:47 +0000 Subject: [PATCH 05/85] [AMD] Co-locate vLLM disagg router with prefill on NODE_RANK=0 Move the vllm-router from a dedicated proxy node onto the first prefill node, mirroring SGLang's co-location pattern. This reduces the node count from xP + yD + 1 to xP + yD (e.g., 3 nodes instead of 4 for 1P+2D). 
- server.sh: NODE_RANK=0 now runs both vllm serve (prefill, port 2584) and vllm-router (port 30000); barrier waits on all nodes - submit.sh / job.slurm: NUM_NODES = PREFILL_NODES + DECODE_NODES - bench.sh: ROUTER_PORT default updated to 30000 Local 1P+2D benchmark (ISL/OSL=1024, DeepSeek-R1 FP8, MI355X): - Throughput: +1.6% to +8.4% across concurrency 8-64 - Mean TTFT: -22% to -63% (prefill is local to router) - TPOT/ITL: unchanged (within noise) - 25% fewer nodes, no performance regression --- .github/configs/amd-master.yaml | 2 +- .../multi_node/vllm_disagg_utils/bench.sh | 2 +- .../multi_node/vllm_disagg_utils/job.slurm | 10 ++-- .../multi_node/vllm_disagg_utils/server.sh | 49 ++++++++++++++----- .../multi_node/vllm_disagg_utils/submit.sh | 10 ++-- 5 files changed, 48 insertions(+), 25 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 62686b75f..a22e413e0 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1363,7 +1363,7 @@ dsr1-fp8-mi355x-vllm-disagg: - isl: 1024 osl: 1024 search-space: - # 1P2D: 1 prefill node + 2 decode nodes + 1 proxy = 4 nodes total + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - spec-decoding: "none" conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] prefill: diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index 69a178ca4..37b9d0b56 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -26,7 +26,7 @@ num_prompts_multiplier=${13:-10} IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" -ROUTER_PORT="${ROUTER_PORT:-2584}" +ROUTER_PORT="${ROUTER_PORT:-30000}" echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 494ef6901..7b25fd4b5 
100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --job-name=vllm-pd-bench -#SBATCH -N 4 # CHECK this to be right in batch jobs -#SBATCH -n 4 # CHECK this to be right in batch jobs +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job #SBATCH --gres=gpu:8 @@ -127,9 +127,9 @@ echo "Final MODEL_PATH: $MODEL_PATH" # Node Selection and vLLM-Specific NUM_NODES # ============================================================================= -# vLLM needs xP + yD + 1 (dedicated proxy node) -NUM_NODES=$((xP + yD + 1)) -echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD + 1 proxy)" +# Router co-located with first prefill: xP + yD nodes total (same as SGLang) +NUM_NODES=$((xP + yD)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD, proxy co-located with first prefill)" FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 933019abe..8447046c1 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -3,9 +3,11 @@ # ============================================================================= # # Node role assignment (by NODE_RANK): -# 0 -> Proxy/Router node -# 1..xP -> Prefill nodes (kv_producer) -# xP+1..xP+yD -> Decode nodes (kv_consumer) +# 0 -> Proxy/Router + first Prefill node (kv_producer) +# 1..xP-1 -> Additional Prefill nodes (kv_producer) +# xP..xP+yD-1 -> Decode nodes (kv_consumer) +# +# Total nodes = xP + yD (router co-located with first prefill, like SGLang). 
# ============================================================================= # Environment Configuration @@ -32,7 +34,7 @@ BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" DRY_RUN="${DRY_RUN:-0}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -ROUTER_PORT="${ROUTER_PORT:-2584}" +ROUTER_PORT="${ROUTER_PORT:-30000}" SERVER_PORT="${SERVER_PORT:-2584}" ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}" @@ -192,11 +194,11 @@ IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" PREFILL_ARGS="" DECODE_ARGS="" -for ((i=1; i<=xP && i<${#IP_ARRAY[@]}; i++)); do +for ((i=0; i&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + echo "Waiting for all prefill and decode servers to be up . . ." python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${PD_IPADDRS} \ + --node-ips ${IPADDRS} \ --node-ports $SERVER_PORT \ --wait-for-all-ports \ --timeout 1800 @@ -322,11 +342,14 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" fi - echo "Killing the proxy server" - [[ "$DRY_RUN" -eq 0 ]] && kill $proxy_pid + echo "Killing the proxy server and prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $proxy_pid + kill $prefill_pid + fi -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then - echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME})" +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" setup_vllm_env diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index a41a31d79..d60ed87e6 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -5,8 +5,8 @@ # This script submits a multi-node vLLM disaggregated benchmark job to SLURM. # It must be configured for your specific cluster before use. 
# -# Key difference from SGLang: vLLM uses a dedicated proxy node, so -# NUM_NODES = PREFILL_NODES + DECODE_NODES + 1. +# Router is co-located with the first prefill node (same as SGLang), so +# NUM_NODES = PREFILL_NODES + DECODE_NODES. usage() { cat << 'USAGE' @@ -67,8 +67,8 @@ CONCURRENCIES=$7 REQUEST_RATE=$8 NODE_LIST=${9} -# vLLM needs xP + yD + 1 nodes (dedicated proxy node) -NUM_NODES=$((PREFILL_NODES + DECODE_NODES + 1)) +# Router co-located with first prefill: xP + yD nodes total +NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" # Export variables for the SLURM job @@ -77,7 +77,7 @@ export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args # For vLLM, each worker = 1 node (TP=8 per node). -# xP/yD must match the node counts so job.slurm's NUM_NODES = xP+yD+1 is correct. +# xP/yD must match the node counts so NUM_NODES = xP+yD is correct. export xP=$PREFILL_NODES export yD=$DECODE_NODES export NUM_NODES=$NUM_NODES From efce933d5e591bc7c8ec8df955a7861209a04704 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 12 Mar 2026 17:31:07 +0000 Subject: [PATCH 06/85] [AMD] Use public vLLM base image with runtime dependency install MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the custom Docker image (vllm_disagg_pd:latest) with the public vllm/vllm-openai-rocm:v0.17.1 base image. Missing components (UCX, RIXL, etcd, libionic1, vllm-router) are now installed at container start via setup_deps.sh, which is sourced by server.sh. This eliminates the need to build, host, and maintain a custom image — CI nodes can pull directly from Docker Hub. Changes: - Add setup_deps.sh: idempotent installer for UCX (ROCm fork), RIXL, etcd, libionic1 (Pensando ionic), and vllm-router (NODE_RANK=0 only). Build steps run in subshells to avoid CWD pollution. 
- server.sh: source setup_deps.sh before any other logic - job.slurm: add --entrypoint "" to override the base image's vllm CLI entrypoint, allowing bash -lc to work correctly - env.sh: update comment (paths now set by setup_deps.sh, not image ENV) - amd-master.yaml: image changed to vllm/vllm-openai-rocm:v0.17.1 Tested locally (Job 2807, 3 nodes, ISL/OSL=1024): Setup overhead: ~2.5 min per node (all components built from source) Benchmark completed successfully across concurrency 8/16/32/64 --- .github/configs/amd-master.yaml | 2 +- .../multi_node/vllm_disagg_utils/env.sh | 4 +- .../multi_node/vllm_disagg_utils/job.slurm | 1 + .../multi_node/vllm_disagg_utils/server.sh | 5 + .../vllm_disagg_utils/setup_deps.sh | 186 ++++++++++++++++++ 5 files changed, 195 insertions(+), 3 deletions(-) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a22e413e0..5c6e6c013 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1351,7 +1351,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp8-mi355x-vllm-disagg: - image: vllm_disagg_pd:latest + image: vllm/vllm-openai-rocm:v0.17.1 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh index cc9b9320b..e1cc2f6af 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -5,8 +5,8 @@ # IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) # Set by runner or auto-detected from hostname. # -# The Docker image (built from vllm_disagg_inference.ubuntu.amd.Dockerfile) already -# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/rixl/lib). 
+# UCX and RIXL paths (LD_LIBRARY_PATH, PATH) are set by setup_deps.sh, which is +# sourced at the top of server.sh before this file. set -x diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 7b25fd4b5..3a71436fe 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -305,6 +305,7 @@ exec sudo docker run --rm \ -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 8447046c1..efabf5e32 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -9,6 +9,11 @@ # # Total nodes = xP + yD (router co-located with first prefill, like SGLang). +# ============================================================================= +# Dependency Setup (idempotent; required when using base vLLM image) +# ============================================================================= +source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh" + # ============================================================================= # Environment Configuration # ============================================================================= diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh new file mode 100644 index 000000000..ee2524979 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# ============================================================================= +# setup_deps.sh — Install missing vLLM disagg dependencies at container start. 
+# +# Base image: vllm/vllm-openai-rocm:v0.17.1 +# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. +# Idempotent: each component is skipped if already present. +# +# Build steps run in subshells to avoid CWD pollution between installers. +# ============================================================================= + +ROCM_PATH="${ROCM_PATH:-/opt/rocm}" +UCX_HOME="${UCX_HOME:-/usr/local/ucx}" +RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}" + +_SETUP_START=$(date +%s) +_SETUP_INSTALLED=() + +# --------------------------------------------------------------------------- +# 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl) +# --------------------------------------------------------------------------- +install_ucx() { + if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] UCX already present at ${UCX_HOME}" + return 0 + fi + + echo "[SETUP] Installing UCX build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + autoconf automake libtool pkg-config \ + librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ + infiniband-diags perftest ethtool rdma-core strace \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..." + ( + set -e + mkdir -p /usr/local/src && cd /usr/local/src + git clone --quiet https://github.com/ROCm/ucx.git && cd ucx + git checkout da3fac2a + ./autogen.sh && mkdir -p build && cd build + ../configure \ + --prefix="${UCX_HOME}" \ + --enable-shared --disable-static \ + --disable-doxygen-doc --enable-optimizations \ + --enable-devel-headers --enable-mt \ + --with-rocm="${ROCM_PATH}" --with-verbs --with-dm + make -j"$(nproc)" && make install + ) + rm -rf /usr/local/src/ucx + + if [[ ! -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] ERROR: UCX build failed"; exit 1 + fi + _SETUP_INSTALLED+=("UCX") +} + +# --------------------------------------------------------------------------- +# 2. 
RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM) +# --------------------------------------------------------------------------- +install_rixl() { + if python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] RIXL Python bindings already present" + return 0 + fi + + echo "[SETUP] Installing RIXL build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \ + libcpprest-dev libaio-dev \ + && rm -rf /var/lib/apt/lists/* + pip3 install --quiet meson "pybind11[global]" + + echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." + ( + set -e + git clone --quiet https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl + git checkout f33a5599 + meson setup build --prefix="${RIXL_HOME}" \ + -Ducx_path="${UCX_HOME}" \ + -Drocm_path="${ROCM_PATH}" + cd build && ninja && ninja install + cd /opt/rixl + pip install --quiet \ + --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \ + --config-settings=setup-args="-Ducx_path=${UCX_HOME}" . + ) + rm -rf /opt/rixl + + if ! python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] ERROR: RIXL build failed"; exit 1 + fi + _SETUP_INSTALLED+=("RIXL") +} + +# --------------------------------------------------------------------------- +# 3. etcd (distributed KV store for vLLM disagg service discovery) +# --------------------------------------------------------------------------- +install_etcd() { + if [[ -x /usr/local/bin/etcd/etcd ]]; then + echo "[SETUP] etcd already present" + return 0 + fi + + local version="v3.6.0-rc.5" + echo "[SETUP] Downloading etcd ${version}..." 
+ wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \ + -O /tmp/etcd.tar.gz + mkdir -p /usr/local/bin/etcd + tar -xf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 + rm /tmp/etcd.tar.gz + _SETUP_INSTALLED+=("etcd") +} + +# --------------------------------------------------------------------------- +# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer) +# Harmless on non-Pensando nodes (shared lib is simply unused). +# --------------------------------------------------------------------------- +install_libionic() { + if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then + echo "[SETUP] libionic1 already installed" + return 0 + fi + + echo "[SETUP] Downloading and installing libionic1..." + wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \ + -O /tmp/libionic1.deb + dpkg -i /tmp/libionic1.deb || true + rm -f /tmp/libionic1.deb + _SETUP_INSTALLED+=("libionic1") +} + +# --------------------------------------------------------------------------- +# 5. vllm-router (Rust-based proxy for PD disaggregation) +# Only needed on NODE_RANK=0 (proxy node). +# --------------------------------------------------------------------------- +install_vllm_router() { + if pip show vllm-router &>/dev/null; then + echo "[SETUP] vllm-router already installed" + return 0 + fi + + echo "[SETUP] Installing Rust toolchain..." + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + export PATH="/root/.cargo/bin:${PATH}" + fi + + echo "[SETUP] Installing vllm-router via pip..." + pip install --quiet vllm-router + + if ! 
pip show vllm-router &>/dev/null; then + echo "[SETUP] ERROR: vllm-router install failed"; exit 1 + fi + _SETUP_INSTALLED+=("vllm-router") +} + +# ============================================================================= +# Run installers +# ============================================================================= + +install_ucx +install_rixl +install_etcd +install_libionic + +if [[ "${NODE_RANK:-0}" -eq 0 ]]; then + install_vllm_router +fi + +# ============================================================================= +# Export paths (persists for server.sh since this file is sourced) +# ============================================================================= + +export ROCM_PATH="${ROCM_PATH}" +export UCX_HOME="${UCX_HOME}" +export RIXL_HOME="${RIXL_HOME}" +export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}" +export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" + +_SETUP_END=$(date +%s) +if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then + echo "[SETUP] All dependencies already present (${_SETUP_END}s wallclock)" +else + echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s" +fi From 2ffd37f06461f763e2337279fefa03f934195cca Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 13 Mar 2026 14:19:12 +0000 Subject: [PATCH 07/85] [AMD] Enable Expert Parallelism with MoRI all-to-all on vLLM disagg decode Enable MoRI-based Expert Parallelism (--enable-expert-parallel --all2all-backend mori) on decode workers for DeepSeek-R1-0528, while keeping TP=8 to preserve KV cache transfer compatibility with the prefill node via NixlConnector. This matches SGLang's approach of TP=8 + EP within the TP group. 
KV Transfer: RIXL/NixlConnector (unchanged) MoE All-to-All: NCCL (default) -> MoRI-EP (--all2all-backend mori) Changes: - models.yaml: Add --enable-expert-parallel --all2all-backend mori to decode_flags; increase engine ready timeout to 1200s - setup_deps.sh: Add MoRI install and vLLM v0.17.1 patches for MoRI-EP + FP8 compatibility (AITER assertion, defer_input_quant) - server.sh: Support decode_env from models.yaml for decode-specific environment overrides - dsr1_fp8_mi355x_vllm-disagg.sh: Pass NODELIST to submit.sh for Slurm node constraints --- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 4 +- .../multi_node/vllm_disagg_utils/models.yaml | 4 +- .../multi_node/vllm_disagg_utils/server.sh | 7 ++ .../vllm_disagg_utils/setup_deps.sh | 85 +++++++++++++++++++ 4 files changed, 96 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh index a457a2714..167aff5f3 100755 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -30,14 +30,14 @@ export MODEL_PATH=$MODEL_PATH export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE -# vLLM disagg uses TP-only parallelism (no EP/DP). # PREFILL_NODES and DECODE_NODES come from additional-settings in the YAML config. +# NODELIST (optional) constrains which Slurm nodes are used. JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf) + $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}") if [[ $? 
-ne 0 ]]; then echo "Failed to submit job" >&2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 31197ec52..4a720785a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -31,8 +31,8 @@ DeepSeek-V3: DeepSeek-R1-0528: prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=1200" hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" gpt-oss-120b: diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index efabf5e32..7778dfd34 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -142,9 +142,11 @@ def bash_escape(s): pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8')) df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8')) ev = bash_escape(m.get('env', '')) +dev = bash_escape(m.get('decode_env', '')) print(f'PREFILL_SERVER_CONFIG=\"{pf}\"') print(f'DECODE_SERVER_CONFIG=\"{df}\"') print(f'MODEL_ENVS=\"{ev}\"') +print(f'DECODE_MODEL_ENVS=\"{dev}\"') ")" echo "Loaded model configuration for: $MODEL_NAME" @@ -408,6 +410,11 @@ else setup_vllm_env + for env_pair in 
${DECODE_MODEL_ENVS}; do + export "$env_pair" + echo "[DECODE_ENV] $env_pair" + done + DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index ee2524979..8e2276d1c 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -155,6 +155,89 @@ install_vllm_router() { _SETUP_INSTALLED+=("vllm-router") } +# --------------------------------------------------------------------------- +# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) +# Required for --all2all-backend mori (Expert Parallelism via RDMA). +# GPU kernels are JIT-compiled on first use; no hipcc needed at install. +# --------------------------------------------------------------------------- +install_mori() { + if python3 -c "import mori" 2>/dev/null; then + echo "[SETUP] MoRI Python bindings already present" + return 0 + fi + + echo "[SETUP] Installing MoRI build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libopenmpi-dev openmpi-bin libpci-dev \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building MoRI from source (ROCm/mori @ b645fc8)..." + ( + set -e + git clone --quiet https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori + git checkout b645fc8 + pip install --quiet . + ) + rm -rf /opt/mori + + if ! python3 -c "import mori" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI build failed"; exit 1 + fi + _SETUP_INSTALLED+=("MoRI") +} + +# --------------------------------------------------------------------------- +# 7. Patch vLLM v0.17.1 MoRI-EP + FP8 incompatibility +# v0.17.1 asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel +# uses defer_input_quant=True which MoRI's prepare/finalize rejects. 
+# Patch: remove both the AITER requirement assertion and the +# defer_input_quant NotImplementedError so non-AITER kernels work. +# --------------------------------------------------------------------------- +patch_mori_fp8_compat() { + python3 -c ' +import re, os, sys +patched = [] + +# 1. Patch layer.py: remove multi-line AITER assertion for MoRI +try: + import vllm.model_executor.layers.fused_moe.layer as lm + f = lm.__file__ + src = open(f).read() + if "Mori needs to be used with aiter" in src: + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) + if new != src: + open(f, "w").write(new) + patched.append("layer.py") +except Exception as e: + print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + +# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction +try: + import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm + f = mm.__file__ + src = open(f).read() + if "defer_input_quant" in src: + new = re.sub( + r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", + "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", + src) + if new != src: + open(f, "w").write(new) + patched.append("mori_prepare_finalize.py") +except Exception as e: + print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) + +if patched: + print(f"[SETUP] Patched: {chr(44).join(patched)}") +else: + print("[SETUP] No MoRI-FP8 patches needed") +' + _SETUP_INSTALLED+=("MoRI-FP8-patch") +} + # ============================================================================= # Run installers # ============================================================================= @@ -163,6 +246,8 @@ install_ucx install_rixl install_etcd install_libionic +install_mori +patch_mori_fp8_compat if [[ "${NODE_RANK:-0}" -eq 0 ]]; then install_vllm_router From 25345ce537eceb1b19983ca93c56cf161f2c9bf7 Mon Sep 17 00:00:00 2001 From: Chun 
Fang Date: Fri, 13 Mar 2026 23:25:36 +0000 Subject: [PATCH 08/85] [AMD] Switch vLLM disagg KV transfer to MoRI-IO with protocol-aware proxy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace NixlConnector with MoRIIOConnector for KV cache transfer and replace the Rust-based vllm-router with a MoRI-IO-aware Python proxy that handles both HTTP routing and ZMQ-based RDMA endpoint discovery. The key architectural change is that the proxy enriches each request's kv_transfer_params with remote RDMA endpoint info (handshake_port, notify_port, host, port) before dispatching, enabling concurrent prefill+decode in WRITE mode — something vllm-router could not do because it only understands HTTP, not the MoRI-IO registration protocol. Changes: - Add moriio_proxy.py: MoRI-IO-aware proxy with ZMQ service discovery, request enrichment, and /health endpoint (adapted from vLLM upstream moriio_toy_proxy_server.py) - server.sh: switch --kv-transfer-config from NixlConnector to MoRIIOConnector with kv_connector_extra_config (proxy_ip, proxy_ping_port, http_port); launch proxy before prefill on NODE_RANK=0; set VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 as workaround for v0.17.1 completion-ID mismatch (upstream fix: vllm-project/vllm#34907) - setup_deps.sh: replace vllm-router/Rust install with lightweight Python deps (quart, aiohttp, msgpack, pyzmq) for the proxy Benchmark (Job 2853 vs 2818 NixlConnector baseline, ISL/OSL=1024): TTFT median: -37% to -55% across C8–C64 (e.g. 
384→241ms @C64) TTFT p99: -63% at C64 (6622→2469ms) Throughput: +8% at C64 (2634→2844 tok/s) TPOT: unchanged (~22ms @C64) --- .../vllm_disagg_utils/moriio_proxy.py | 309 ++++++++++++++++++ .../multi_node/vllm_disagg_utils/server.sh | 87 ++--- .../vllm_disagg_utils/setup_deps.sh | 29 +- 3 files changed, 358 insertions(+), 67 deletions(-) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py new file mode 100644 index 000000000..82272dd52 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# MoRI-IO proxy server for vLLM PD disaggregation. +# +# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py +# with the following adaptations for production multi-node use: +# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars +# - /health endpoint for sync.py barrier readiness checks +# - Uses stdlib `re` instead of `regex` to avoid extra dep +# +# The proxy performs two roles that vllm-router cannot: +# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports +# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params + +import asyncio +import copy +import logging +import os +import re +import socket +import threading +import uuid + +import aiohttp +import msgpack +import zmq +from quart import Quart, make_response, request + +logger = logging.getLogger("moriio_proxy") +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter( + "%(asctime)s %(levelname)s [%(name)s] %(message)s")) +logger.addHandler(handler) + +prefill_instances: list[dict] = [] +decode_instances: list[dict] = [] +request_nums = 0 +app = Quart(__name__) + +IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") + +TRANSFER_TYPE = None + + +def _append_whole_dict_unique(target_list, data_dict): + new_filtered = {k: v for k, v in data_dict.items() if k != "index"} + for existed in target_list: + existed_filtered = {k: v for k, v in existed.items() if k != "index"} + if existed_filtered == new_filtered: + return False + logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", + data_dict.get("role"), data_dict.get("request_address"), + data_dict.get("handshake_port"), data_dict.get("notify_port"), + data_dict.get("dp_size"), data_dict.get("tp_size")) + target_list.append(data_dict) + transfer_mode = data_dict.get("transfer_mode", "unknown") + global TRANSFER_TYPE + + if TRANSFER_TYPE is None: + TRANSFER_TYPE = transfer_mode + logger.info("Transfer mode set to: %s", TRANSFER_TYPE) + elif transfer_mode != TRANSFER_TYPE: + raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") + + return True + + +_list_lock = threading.RLock() + + +def _listen_for_register(hostname, port): + context = zmq.Context() + router_socket = context.socket(zmq.ROUTER) + router_socket.bind(f"tcp://{hostname}:{port}") + poller = zmq.Poller() + poller.register(router_socket, zmq.POLLIN) + global prefill_instances + global decode_instances + + while 
True: + socks = dict(poller.poll()) + if router_socket in socks: + remote_addr, msg = router_socket.recv_multipart() + data = msgpack.loads(msg) + if data["type"] == "HELLO": + pass + elif ( + data["type"] == "register" + and data["role"] == "P" + and data["request_address"] not in prefill_instances + ): + with _list_lock: + _append_whole_dict_unique(prefill_instances, data) + + elif ( + data["type"] == "register" + and data["role"] == "D" + and data["request_address"] not in decode_instances + ): + with _list_lock: + _append_whole_dict_unique(decode_instances, data) + + +def start_service_discovery(hostname, port): + if not hostname: + hostname = socket.gethostname() + if port == 0: + raise ValueError("Port cannot be 0") + + _listener_thread = threading.Thread( + target=_listen_for_register, args=(hostname, port), daemon=True + ) + _listener_thread.start() + logger.info("Service discovery listening on %s:%s", hostname, port) + return _listener_thread + + +async def send_request_to_prefill( + endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank +): + req_data_copy = req_data + + req_data_copy["kv_transfer_params"].update( + { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_handshake_port": d_endpoint["handshake_port"], + "remote_notify_port": d_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": dip, + "remote_port": dport, + } + ) + req_data_copy["stream"] = False + req_data_copy["max_tokens"] = 1 + if "max_completion_tokens" in req_data_copy: + req_data_copy["max_completion_tokens"] = 1 + if "stream_options" in req_data_copy: + del req_data_copy["stream_options"] + async with aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + if selected_prefill_dp_rank is not None: + headers["X-data-parallel-rank"] = 
str(selected_prefill_dp_rank) + async with session.post( + url=endpoint, json=req_data_copy, headers=headers + ) as response: + if response.status == 200: + return await response.json() + else: + raise RuntimeError( + f"Prefill response status={response.status}" + ) + + +async def start_decode_request(endpoint, req_data, request_id): + session = aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + response = await session.post(url=endpoint, json=req_data, headers=headers) + return session, response + + +async def stream_decode_response(session, response, request_id): + try: + if response.status == 200: + async for chunk_bytes in response.content.iter_chunked(1024): + yield chunk_bytes + else: + raise RuntimeError( + f"Decode response status={response.status}" + ) + finally: + await session.close() + + +@app.route("/health", methods=["GET"]) +async def health_check(): + with _list_lock: + p_count = len(prefill_instances) + d_count = len(decode_instances) + return await make_response( + ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) + ) + + +@app.route("/v1/completions", methods=["POST"]) +@app.route("/v1/chat/completions", methods=["POST"]) +async def handle_request(): + try: + with _list_lock: + global request_nums + request_nums += 1 + + def extract_ip_port_fast(url): + match = IP_PORT_PATTERN.search(url) + if not match: + raise ValueError(f"Invalid URL format: {url}") + return match.groups() + + req_data = await request.get_json() + request_id = str(uuid.uuid4()) + + if not prefill_instances or not decode_instances: + return await make_response( + ("Service Unavailable: No prefill or decode instances registered.", 503) + ) + + pid = request_nums % len(prefill_instances) + did = request_nums % len(decode_instances) + prefill_instance_endpoint = prefill_instances[pid] + 
decode_instance_endpoint = decode_instances[did] + + selected_prefill_dp_rank = None + if prefill_instance_endpoint["dp_size"] > 1: + selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] + + dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) + + req_data_to_prefill = copy.deepcopy(req_data) + req_data_to_prefill["kv_transfer_params"] = {} + req_data["kv_transfer_params"] = {} + req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( + decode_instance_endpoint["dp_size"] + ) + req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( + decode_instance_endpoint["tp_size"] + ) + + send_prefill_task = asyncio.create_task( + send_request_to_prefill( + prefill_instance_endpoint["request_address"], + req_data_to_prefill, + request_id, + decode_instance_endpoint, + dip, + dport, + selected_prefill_dp_rank, + ) + ) + ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) + + req_data["max_tokens"] -= 1 + + req_data["kv_transfer_params"] = { + "do_remote_decode": False, + "do_remote_prefill": True, + "remote_handshake_port": prefill_instance_endpoint["handshake_port"], + "remote_notify_port": prefill_instance_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": ip, + "remote_port": port, + } + if TRANSFER_TYPE == "READ": + prefill_response = await send_prefill_task + req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ + "kv_transfer_params" + ]["remote_engine_id"] + req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ + "kv_transfer_params" + ]["remote_block_ids"] + + req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ + "dp_size" + ] + req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ + "tp_size" + ] + + if selected_prefill_dp_rank is not None: + req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank + + decode_request_task = 
asyncio.create_task( + start_decode_request( + decode_instance_endpoint["request_address"], req_data, request_id + ) + ) + + session, decode_response = await decode_request_task + stream_generator = stream_decode_response(session, decode_response, request_id) + response = await make_response(stream_generator) + return response + except Exception as e: + logger.exception("Error handling request: %s", e) + return await make_response((f"Internal Server Error: {e!s}", 500)) + + +if __name__ == "__main__": + http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) + ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) + + t = start_service_discovery("0.0.0.0", ping_port) + app.debug = False + app.config["BODY_TIMEOUT"] = 360000 + app.config["RESPONSE_TIMEOUT"] = 360000 + + logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) + app.run(host="0.0.0.0", port=http_port) + t.join() diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 7778dfd34..f81ff68e1 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -212,12 +212,18 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# vLLM/Nixl-specific environment (UCX transport vars are set at the Docker level in job.slurm) +# MoRI-IO proxy ZMQ registration port (must match moriio_proxy.py PROXY_PING_PORT) +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + +# vLLM environment (UCX transport vars are set at the Docker level in job.slurm) setup_vllm_env() { export VLLM_USE_V1=1 export VLLM_SERVER_DEV_MODE=0 export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip} export VLLM_NIXL_SIDE_CHANNEL_PORT=5600 + # Workaround: disable request-ID randomization so MoRI-IO connector can + # match completion IDs between prefill and decode without PR #34907 patch. 
+ export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 for env_pair in ${MODEL_ENVS}; do export "$env_pair" done @@ -245,10 +251,26 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env + # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup + echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." + PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ + python3 $VLLM_WS_PATH/moriio_proxy.py" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PROXY_CMD" + else + PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" + set -x + eval "$PROXY_CMD" 2>&1 | tee "$PROXY_LOG_FILE" & + set +x + proxy_pid=$! + sleep 3 + fi + PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_producer\", \"kv_load_failure_policy\": \"fail\"}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -270,56 +292,19 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Congratulations!!! All prefill and decode servers are up . . ." - echo "Starting vLLM Router..." 
- [ -f /root/.cargo/env ] && source /root/.cargo/env - - PREFILL_URLS="" - DECODE_URLS="" - for ip in ${PREFILL_ARGS}; do - PREFILL_URLS+="--prefill http://${ip}:${SERVER_PORT} " - done - for ip in ${DECODE_ARGS}; do - DECODE_URLS+="--decode http://${ip}:${SERVER_PORT} " - done - - ROUTER_CMD="UCX_TLS=tcp,self,shm VLLM_USE_V1=1 \ - vllm-router \ - --host 0.0.0.0 \ - --port $ROUTER_PORT \ - --vllm-pd-disaggregation \ - $PREFILL_URLS \ - $DECODE_URLS \ - --policy round_robin \ - --prefill-policy round_robin \ - --decode-policy round_robin \ - --intra-node-data-parallel-size 1 \ - --retry-max-retries 3 \ - --health-check-endpoint /health \ - --prometheus-port 29000" + # Wait for proxy /health to confirm it is accepting requests + HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $ROUTER_CMD" + echo "DRY RUN: $HEALTH_BARRIER_CMD" else - ROUTER_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_${host_name}.log" - set -x - eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & - set +x - proxy_pid=$! 
- - HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports ${ROUTER_PORT} \ - --wait-for-all-health \ - --health-endpoint /health \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - fi - - echo "Router is ready for benchmarking" + eval "$HEALTH_BARRIER_CMD" + echo "MoRI-IO proxy is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -364,7 +349,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_producer\", \"kv_load_failure_policy\": \"fail\"}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -418,7 +403,7 @@ else DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_consumer\", \"kv_load_failure_policy\": \"fail\"}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ ${DECODE_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 8e2276d1c..3af1b5b0e 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -131,28 +131,25 @@ install_libionic() { } # 
--------------------------------------------------------------------------- -# 5. vllm-router (Rust-based proxy for PD disaggregation) +# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) +# The proxy replaces vllm-router: it handles both HTTP routing AND the +# MoRI-IO ZMQ registration/request-enrichment protocol. # Only needed on NODE_RANK=0 (proxy node). # --------------------------------------------------------------------------- -install_vllm_router() { - if pip show vllm-router &>/dev/null; then - echo "[SETUP] vllm-router already installed" +install_mori_proxy_deps() { + if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] MoRI-IO proxy Python deps already present" return 0 fi - echo "[SETUP] Installing Rust toolchain..." - if ! command -v cargo &>/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - export PATH="/root/.cargo/bin:${PATH}" - fi - - echo "[SETUP] Installing vllm-router via pip..." - pip install --quiet vllm-router + echo "[SETUP] Installing MoRI-IO proxy Python deps..." + pip install --quiet --ignore-installed blinker + pip install --quiet quart aiohttp msgpack pyzmq - if ! pip show vllm-router &>/dev/null; then - echo "[SETUP] ERROR: vllm-router install failed"; exit 1 + if ! 
python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 fi - _SETUP_INSTALLED+=("vllm-router") + _SETUP_INSTALLED+=("mori-proxy-deps") } # --------------------------------------------------------------------------- @@ -250,7 +247,7 @@ install_mori patch_mori_fp8_compat if [[ "${NODE_RANK:-0}" -eq 0 ]]; then - install_vllm_router + install_mori_proxy_deps fi # ============================================================================= From c50b3c8e6cc39c586c4e507a8ca81850b6dbc460 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 17 Mar 2026 08:47:54 +0000 Subject: [PATCH 09/85] [AMD] BUG fix: RANDOM_RANGE_RATIO never reaches bench.sh Signed-off-by: Theresa Shan --- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 3 ++- .../multi_node/vllm_disagg_utils/submit.sh | 24 ++++++++++--------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh index 167aff5f3..172ecdf51 100755 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -37,7 +37,8 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}") + $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}" \ + ${RANDOM_RANGE_RATIO}) if [[ $? 
-ne 0 ]]; then echo "Failed to submit job" >&2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index d60ed87e6..f210d7ac7 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -12,18 +12,19 @@ usage() { cat << 'USAGE' Usage: bash submit.sh \ - [NODE_LIST] + [NODE_LIST] [RANDOM_RANGE_RATIO] Arguments: - PREFILL_NODES Number of prefill nodes - PREFILL_WORKERS Number of prefill workers (usually 1) - DECODE_NODES Number of decode nodes - DECODE_WORKERS Number of decode workers (usually 1) - ISL Input sequence length - OSL Output sequence length - CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") - REQUEST_RATE Request rate ("inf" for max throughput) - NODE_LIST Optional: comma-separated hostnames + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + NODE_LIST Optional: comma-separated hostnames + RANDOM_RANGE_RATIO Optional: random range ratio for benchmark (default 0.8) Required environment variables: SLURM_ACCOUNT SLURM account name @@ -66,6 +67,7 @@ OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 NODE_LIST=${9} +RANDOM_RANGE_RATIO=${10} # Router co-located with first prefill: xP + yD nodes total NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) @@ -85,10 +87,10 @@ export GPUS_PER_NODE=$GPUS_PER_NODE export MODEL_NAME=$MODEL_NAME export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} -export BENCH_RANDOM_RANGE_RATIO=${BENCH_RANDOM_RANGE_RATIO:-1} export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} +export 
BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" From fa7794ddbd22e6821f0fc636fe91cf6422e2c68e Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 17 Mar 2026 10:22:58 +0000 Subject: [PATCH 10/85] Bug fix: 1. With DRY_RUN=1, node 0 skipped starting proxy/prefill but still ran the first barrier; 2. kill $proxy_pid and kill $prefill_pid run only when DRY_RUN=0 and only if the PID is set Signed-off-by: Theresa Shan --- .../multi_node/vllm_disagg_utils/server.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index f81ff68e1..55538d4fa 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -284,11 +284,15 @@ if [ "$NODE_RANK" -eq 0 ]; then fi echo "Waiting for all prefill and decode servers to be up . . ." - python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports $SERVER_PORT \ - --wait-for-all-ports \ - --timeout 1800 + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: skipping barrier (wait-for-all-ports)" + else + python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + fi echo "Congratulations!!! All prefill and decode servers are up . . ." 
@@ -336,8 +340,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Killing the proxy server and prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - kill $proxy_pid - kill $prefill_pid + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true fi elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then From 8fb6f4890c6450e7b4a4114c194e98aa561d4c47 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 19 Mar 2026 18:33:36 +0000 Subject: [PATCH 11/85] [AMD] Fix vLLM disagg hang: READ mode support + safety timeouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable READ-mode KV transfer (decode-initiated RDMA reads) with a critical scheduler assertion fix, and add safety timeouts to prevent indefinite hangs during RDMA transfers. Changes: - setup_deps.sh: Add patches — save_kv_layer/start_load_kv handshake timeouts (30s), RDMA transfer timeout (120s), deferred write task expiry (60s), write worker error handling, and scheduler assertion fix for READ-mode intermediate request states - moriio_proxy.py: Add stream idle timeout (PROXY_STREAM_IDLE_TIMEOUT) to abort stalled decode streams, and proper response.release() - submit.sh, job.slurm: Plumb PROXY_STREAM_IDLE_TIMEOUT and VLLM_MORIIO_CONNECTOR_READ_MODE env vars into Docker containers Validated: 1k/1k full sweep (C8–C512), 100% success rate at all concurrency levels, peak 8500 output tok/s at C512. 
--- .../multi_node/vllm_disagg_utils/job.slurm | 2 + .../vllm_disagg_utils/moriio_proxy.py | 21 +- .../vllm_disagg_utils/setup_deps.sh | 468 +++++++++++++++++- .../multi_node/vllm_disagg_utils/submit.sh | 3 + 4 files changed, 489 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 3a71436fe..b216f53f4 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -304,6 +304,8 @@ exec sudo docker run --rm \ -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ + -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py index 82272dd52..b2162c98a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py +++ b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py @@ -18,6 +18,7 @@ import re import socket import threading +import time import uuid import aiohttp @@ -37,6 +38,8 @@ request_nums = 0 app = Quart(__name__) +STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) + IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") TRANSFER_TYPE = None @@ -173,13 +176,27 @@ async def start_decode_request(endpoint, req_data, request_id): async def stream_decode_response(session, response, request_id): try: if response.status == 200: - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes + chunk_iter = response.content.iter_chunked(1024).__aiter__() + while True: + try: + chunk_bytes = await asyncio.wait_for( + chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, + ) + yield chunk_bytes + except StopAsyncIteration: + 
break + except asyncio.TimeoutError: + logger.error( + "Decode stream %s idle for %ds, aborting", + request_id, STREAM_IDLE_TIMEOUT, + ) + break else: raise RuntimeError( f"Decode response status={response.status}" ) finally: + await response.release() await session.close() diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 3af1b5b0e..467e1bd5a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -16,6 +16,19 @@ RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}" _SETUP_START=$(date +%s) _SETUP_INSTALLED=() +git_clone_retry() { + local url="$1" dest="$2" max_tries=3 try=1 + while (( try <= max_tries )); do + if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi + echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..." + rm -rf "$dest" + sleep 10 + (( try++ )) + done + echo "[SETUP] git clone failed after $max_tries attempts: $url" + return 1 +} + # --------------------------------------------------------------------------- # 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl) # --------------------------------------------------------------------------- @@ -36,7 +49,7 @@ install_ucx() { ( set -e mkdir -p /usr/local/src && cd /usr/local/src - git clone --quiet https://github.com/ROCm/ucx.git && cd ucx + git_clone_retry https://github.com/ROCm/ucx.git ucx && cd ucx git checkout da3fac2a ./autogen.sh && mkdir -p build && cd build ../configure \ @@ -74,7 +87,7 @@ install_rixl() { echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." 
( set -e - git clone --quiet https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl + git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl git checkout f33a5599 meson setup build --prefix="${RIXL_HOME}" \ -Ducx_path="${UCX_HOME}" \ @@ -171,7 +184,7 @@ install_mori() { echo "[SETUP] Building MoRI from source (ROCm/mori @ b645fc8)..." ( set -e - git clone --quiet https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori + git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori git checkout b645fc8 pip install --quiet . ) @@ -235,6 +248,451 @@ else: _SETUP_INSTALLED+=("MoRI-FP8-patch") } +# --------------------------------------------------------------------------- +# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) +# In WRITE mode, save_kv_layer spins forever waiting for the handshake +# callback to set write_ready_flags. This blocks the model worker thread, +# preventing it from responding to EngineCore shm_broadcast, causing a +# TimeoutError cascade and crash. +# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent +# the model worker from deadlocking. +# --------------------------------------------------------------------------- +patch_moriio_save_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + # Already patched? 
+ if "[PATCHED] save_kv_layer timeout" in src: + print("[SETUP] save_kv_layer timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + continue""" + + if old not in src: + print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") + sys.exit(0) + + new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep + if remote_engine_id is None: + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + _elapsed = _time.monotonic() - _wait_start + if _elapsed > _SAVE_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " + "write_ready_flags[%s], breaking to unblock model " + "worker", _elapsed, remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 9. Patch MoRIIO waiting_for_transfer_complete with bounded timeout +# The original status.Wait() blocks forever if an RDMA completion never +# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded +# wait with a polling loop using status.Succeeded() + configurable timeout. +# Also adds error handling to the write worker loop so a single failed +# transfer doesn't kill the background thread. 
+# --------------------------------------------------------------------------- +patch_moriio_transfer_timeout() { + python3 -c ' +import os, sys, textwrap + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me + f = me.__file__ + src = open(f).read() + + if "[PATCHED] transfer completion timeout" in src: + print("[SETUP] transfer completion timeout patch already applied") + sys.exit(0) + + # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- + old_wait = """ def waiting_for_transfer_complete(self): + if not self.transfer_status: + return + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + for status in transfers_to_wait: + try: + status.Wait() + if not status.Succeeded(): + logger.error( + "Transfer failed: %s, Code: %s", status.Message(), status.Code() + ) + raise TransferError("MoRIIO transfer failed!") + except Exception as e: + logger.error("Transfer %s failed: %s", status, e) + raise""" + + new_wait = """ def waiting_for_transfer_complete(self): + # [PATCHED] transfer completion timeout — bounded polling loop + import time as _time, os as _os + if not self.transfer_status: + return + + _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + _start = _time.monotonic() + remaining = list(transfers_to_wait) + _polls = 0 + _completed = 0 + + while remaining: + _elapsed = _time.monotonic() - _start + if _elapsed > _timeout: + logger.error( + "[HANGFIX] transfer_timeout elapsed=%.1fs " + "pending=%d/%d completed=%d polls=%d " + "action=raise_transfer_error", + _elapsed, len(remaining), len(transfers_to_wait), + _completed, _polls, + ) + raise TransferError( + f"RDMA transfer timeout after {_elapsed:.1f}s, " + f"{len(remaining)}/{len(transfers_to_wait)} pending" + ) + + still_waiting = [] + for 
status in remaining: + try: + if status.Succeeded(): + _completed += 1 + continue + still_waiting.append(status) + except Exception as e: + logger.error( + "[HANGFIX] transfer_poll_error error=%s", e) + raise TransferError( + f"Transfer failed during poll: {e}" + ) from e + + remaining = still_waiting + if remaining: + _time.sleep(0.005) + _polls += 1 + if _polls % 2000 == 0: + logger.warning( + "[HANGFIX] transfer_wait pending=%d " + "completed=%d elapsed=%.1fs timeout=%.0fs", + len(remaining), _completed, + _time.monotonic() - _start, _timeout, + )""" + + if old_wait not in src: + print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") + sys.exit(0) + + new_src = src.replace(old_wait, new_wait) + + # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- + old_loop = """ self._execute_write_task(task)""" + + new_loop = """ try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s write_task_failed error=%s " + "action=cleanup_and_mark_done", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None + ) + except Exception: + pass""" + + if old_loop in new_src: + new_src = new_src.replace(old_loop, new_loop, 1) + else: + print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") + + # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- + old_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + if not self._deferred_tasks: + return + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + if self._is_remote_ready(task): + self._execute_write_task(task) + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + new_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously 
deferred.\"\"\" + # [PATCHED] deferred task timeout — prune stale tasks + import time as _time, os as _os + if not self._deferred_tasks: + return + + _DEFER_TIMEOUT = float( + _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) + if _age > _DEFER_TIMEOUT: + logger.error( + "[HANGFIX] req=%s deferred_task_expired age=%.1fs " + "action=drop_and_mark_done", + task.request_id, _age, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + continue + if self._is_remote_ready(task): + try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s deferred_write_failed error=%s", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + if old_deferred in new_src: + new_src = new_src.replace(old_deferred, new_deferred, 1) + else: + print("[SETUP] WARN: _process_deferred_tasks pattern not found") + + # --- Patch 4: Stamp defer time when task is deferred --- + old_defer_add = """ self._deferred_tasks.append(task)""" + new_defer_add = """ import time as _time2 + if not hasattr(task, "_defer_ts"): + task._defer_ts = _time2.monotonic() + self._deferred_tasks.append(task)""" + if old_defer_add in new_src: + new_src = new_src.replace(old_defer_add, new_defer_add, 1) + else: + print("[SETUP] WARN: deferred task timestamp patch target not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: transfer timeout + writer error handling") + +except Exception as e: + print(f"[SETUP] WARN patch 
transfer_timeout: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) +# The READ-mode spin loop in start_load_kv has the same unbounded-spin +# issue as save_kv_layer. Add timeout + sleep + null guard. +# --------------------------------------------------------------------------- +patch_moriio_load_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + if "[PATCHED] start_load_kv timeout" in src: + print("[SETUP] start_load_kv timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + continue""" + + if old not in src: + print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") + sys.exit(0) + + new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock + if remote_engine_id is None and not wait_handshake_readd_req: + self._reqs_to_send.update(metadata.reqs_to_send) + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " + "load_ready_flag[%s]", _time.monotonic() - _wait_start, + remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: start_load_kv replacement had no effect") 
+ sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished +# vLLM v0.17.1 asserts that a request in finished_recving must be either +# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can +# transition to RUNNING before the aggregated recv notification arrives, +# crashing the engine with AssertionError. +# --------------------------------------------------------------------------- +patch_scheduler_read_mode_fix() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] read-mode recv assertion" in src: + print("[SETUP] scheduler read-mode assertion fix already applied") + sys.exit(0) + + old_recv = """ for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + assert req_id in self.requests + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + else: + assert RequestStatus.is_finished(req.status) + self._free_blocks(self.requests[req_id])""" + + new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states + for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping recv", req_id) + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + 
self._free_blocks(self.requests[req_id]) + else: + logger.debug( + "Request %s recv finished but status=%s (not " + "WAITING_FOR_REMOTE_KVS or finished), skipping " + "block free — will be freed on request completion", + req_id, req.status.name)""" + + if old_recv not in src: + print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping") + sys.exit(0) + + new_src = src.replace(old_recv, new_recv, 1) + + old_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + assert req_id in self.requests + self._free_blocks(self.requests[req_id])""" + + new_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + req = self.requests[req_id] + if RequestStatus.is_finished(req.status): + self._free_blocks(req) + else: + logger.debug( + "Request %s send finished but status=%s, " + "deferring block free to request completion", + req_id, req.status.name)""" + + if old_send in new_src: + new_src = new_src.replace(old_send, new_send, 1) + else: + print("[SETUP] WARN: scheduler finished_sending pattern not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") + +except Exception as e: + print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("scheduler-read-mode-fix") +} + # ============================================================================= # Run installers # ============================================================================= @@ -245,6 +703,10 @@ install_etcd install_libionic install_mori patch_mori_fp8_compat +patch_moriio_save_kv_timeout +patch_moriio_transfer_timeout +patch_moriio_load_kv_timeout +patch_scheduler_read_mode_fix if [[ "${NODE_RANK:-0}" -eq 0 ]]; 
then install_mori_proxy_deps diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index f210d7ac7..5d733b010 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -92,6 +92,9 @@ export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} +export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} +export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} + # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" From 5c5d072af6566f8145cf853720300b108ce06df5 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 21 Mar 2026 19:15:33 +0000 Subject: [PATCH 12/85] Adapt vLLM disagg recipe for 9N mia1 cluster (mlx5 NICs) Port the vLLM disaggregated serving pipeline from the 4N cluster (Pensando ionic NICs) to the 9N mia1 cluster (mlx5/rdma NICs). Key changes: - Fix C512 deadlock: apply ucx_error_handling_mode=none universally instead of only for ionic NICs. Under high concurrency, UCX's default UCP_ERR_HANDLING_MODE_PEER prevents RIXL RDMA READ retries from recovering after ibv_post_send queue exhaustion, causing prefill KV cache saturation and pipeline deadlock. - Force-reinstall MoRI from b645fc8 to fix PCI topology assertion failure on nodes with Broadcom PEX890xx PCIe switches. - Auto-detect Docker privilege (sudo vs non-sudo) for cross-cluster portability. - Add SLURM_EXCLUDE_NODES support to skip nodes with broken Docker sockets. - Increase VLLM_ENGINE_READY_TIMEOUT_S to 3600 to accommodate longer setup times (RIXL/MoRI source builds over NFS). 
--- .../multi_node/vllm_disagg_utils/job.slurm | 20 +++++++++---- .../multi_node/vllm_disagg_utils/models.yaml | 2 +- .../multi_node/vllm_disagg_utils/server.sh | 29 +++++++++---------- .../vllm_disagg_utils/setup_deps.sh | 25 ++++++++++++---- .../multi_node/vllm_disagg_utils/submit.sh | 8 +++++ 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index b216f53f4..904aaaff4 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -61,6 +61,16 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" +# ============================================================================= +# Docker privilege detection +# ============================================================================= +if docker ps &>/dev/null; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD + # ============================================================================= # Model Path Resolution # ============================================================================= @@ -212,7 +222,7 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) cleanup() { echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." 
} @@ -240,10 +250,10 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -sudo docker ps -aq --filter \"name=^container_vllm_\" | xargs -r sudo docker rm -f || true -sudo docker ps -aq | xargs -r sudo docker stop || true +$DOCKER_CMD ps -aq --filter \"name=^container_vllm_\" | xargs -r $DOCKER_CMD rm -f || true +$DOCKER_CMD ps -aq | xargs -r $DOCKER_CMD stop || true -exec sudo docker run --rm \ +exec $DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -320,4 +330,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c "$DOCKER_CMD rm -f \$DOCKER_CONT_NAME 2>/dev/null || true" diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 4a720785a..ef062e5f4 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -32,7 +32,7 @@ DeepSeek-V3: DeepSeek-R1-0528: prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=1200" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" gpt-oss-120b: diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 55538d4fa..d21bdbebb 
100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -82,22 +82,21 @@ setup_rdma_env() { fi # Patch Nixl UCX backend: set ucx_error_handling_mode=none. - # Only needed for Pensando ionic NICs which don't support rdmacm — the default - # UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors. - # ConnectX/mlx5 NICs (mia1 cluster) handle error mode properly; skip the patch. - if [[ "${IBDEVICES:-}" == *ionic* ]]; then - local nixl_api - nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) - if [[ -n "$nixl_api" ]]; then - if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" - echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" - else - echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" - fi + # Required for ALL NIC types under high concurrency (C512+). Without this, + # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error + # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from + # recovering gracefully. This causes the prefill KV cache to fill to 100% + # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm + # incompatibility); on mlx5 NICs it was incorrectly skipped. + local nixl_api + nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) + if [[ -n "$nixl_api" ]]; then + if ! 
grep -q 'ucx_error_handling_mode' "$nixl_api"; then + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})" + else + echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" fi - else - echo "[INFO] Non-ionic RDMA devices (${IBDEVICES:-unset}); skipping ucx_error_handling_mode patch" fi } diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 467e1bd5a..a6b1f79cb 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -171,8 +171,18 @@ install_mori_proxy_deps() { # GPU kernels are JIT-compiled on first use; no hipcc needed at install. # --------------------------------------------------------------------------- install_mori() { - if python3 -c "import mori" 2>/dev/null; then - echo "[SETUP] MoRI Python bindings already present" + local MORI_TARGET_COMMIT="b645fc8" + local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}" + + # The pre-installed MoRI in vllm base images has a PCI topology bug: it + # only maps the secondary bus of each bridge instead of the full + # secondary-to-subordinate range (dsp2dev). This causes an assertion + # failure in TopoSystemPci::Load() on nodes with deeply-nested PCIe + # switch topologies (e.g. Broadcom PEX890xx on MI355X mia1 nodes). + # Always rebuild from the target commit unless the marker file proves + # the correct version was already installed in this container. + if ls $MORI_MARKER &>/dev/null; then + echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" return 0 fi @@ -181,19 +191,22 @@ install_mori() { libopenmpi-dev openmpi-bin libpci-dev \ && rm -rf /var/lib/apt/lists/* - echo "[SETUP] Building MoRI from source (ROCm/mori @ b645fc8)..." 
+ echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." + echo "[SETUP] (overriding pre-installed version to fix PCI topology bug)" ( set -e git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori - git checkout b645fc8 - pip install --quiet . + git checkout "$MORI_TARGET_COMMIT" + pip install --quiet --force-reinstall . ) rm -rf /opt/mori if ! python3 -c "import mori" 2>/dev/null; then echo "[SETUP] ERROR: MoRI build failed"; exit 1 fi - _SETUP_INSTALLED+=("MoRI") + # Drop a marker so re-entry doesn't rebuild + touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT} + _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") } # --------------------------------------------------------------------------- diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index 5d733b010..c5404ec18 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -112,6 +112,13 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then NODELIST_OPT=(--nodelist "$NODELIST_CSV") fi +# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). +# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. 
+EXCLUDE_OPT=() +if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") +fi + # Construct the sbatch command sbatch_cmd=( sbatch @@ -119,6 +126,7 @@ sbatch_cmd=( -N "$NUM_NODES" -n "$NUM_NODES" "${NODELIST_OPT[@]}" + "${EXCLUDE_OPT[@]}" --time "$TIME_LIMIT" --partition "$SLURM_PARTITION" --account "$SLURM_ACCOUNT" From 776bde983fb7fae2b57b4c294ab2e887c0ba2f9d Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 12:38:46 +0000 Subject: [PATCH 13/85] [AMD] Fix vLLM disagg sweep hang: KV cache leak + benchmark client hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Server-side: RIXL can lose `finished_sending` notifications under high concurrency with ibv_post_send failures, permanently leaking prefill KV blocks. Over multiple benchmark rounds (sweep), leaked blocks accumulate and saturate the prefill KV cache, deadlocking C512. - Fix finished_sending handler to unconditionally free KV blocks (the conditional status check had no recovery path, causing leaks) - Add idle KV block reaper: detects engine idle >5s with finished requests still holding blocks, then force-frees them - Add 10s cooldown between benchmark rounds for reaper activation Client-side: SSE streaming loop did not break on the [DONE] sentinel, causing the benchmark client to hang when the proxy held connections open after request completion. - Break SSE loop on [DONE] in completions and chat completions - Share a single aiohttp.ClientSession across all requests (connection pooling via TCPConnector instead of per-request session creation) - Add asyncio.wait_for timeout around asyncio.gather with proper task cancellation and partial result collection - Reduce AIOHTTP_TIMEOUT from 6h to 30min Verified: sweep 1K/1K C128→C256→C512 all pass (Job 6222, 9N cluster). 
--- .../multi_node/vllm_disagg_utils/bench.sh | 2 + .../vllm_disagg_utils/setup_deps.sh | 123 ++++++++++++- utils/bench_serving/backend_request_func.py | 172 +++++++++++------- utils/bench_serving/benchmark_serving.py | 53 ++++-- 4 files changed, 264 insertions(+), 86 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index 37b9d0b56..5b9f5c772 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -70,4 +70,6 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do --result-dir /workspace/ echo "-----------------------------------------" + echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." + sleep 10 done diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index a6b1f79cb..a95591cb5 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -683,14 +683,7 @@ try: if req_id not in self.requests: logger.debug("Request %s already removed, skipping send", req_id) continue - req = self.requests[req_id] - if RequestStatus.is_finished(req.status): - self._free_blocks(req) - else: - logger.debug( - "Request %s send finished but status=%s, " - "deferring block free to request completion", - req_id, req.status.name)""" + self._free_blocks(self.requests[req_id])""" if old_send in new_src: new_src = new_src.replace(old_send, new_send, 1) @@ -706,6 +699,119 @@ except Exception as e: _SETUP_INSTALLED+=("scheduler-read-mode-fix") } +# --------------------------------------------------------------------------- +# 12. Idle KV block reaper for disaggregated prefill (READ mode) +# The RIXL notification path can lose `finished_sending` signals under +# high concurrency with ibv_post_send failures. 
This leaves KV blocks +# permanently allocated on the prefill engine even after the decode has +# finished reading. Over multiple benchmark rounds, leaked blocks +# accumulate and eventually saturate the prefill KV cache. +# +# Fix: instrument the scheduler's `schedule()` method to detect idle +# periods (0 running, 0 waiting for >5s) and force-free blocks for +# any remaining requests whose status is finished. +# --------------------------------------------------------------------------- +patch_prefill_idle_kv_reaper() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] idle-kv-reaper" in src: + print("[SETUP] idle KV block reaper already applied") + sys.exit(0) + + # Find the _update_from_kv_xfer_finished method end and add reaper logic + # We inject into the method that processes KV transfer completions. + marker = "[PATCHED] read-mode recv assertion" + if marker not in src: + print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") + sys.exit(0) + + # Add reaper state initialization to __init__ + old_init_marker = "self.finished_recving_kv_req_ids" + if old_init_marker not in src: + print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") + sys.exit(0) + + # Find the first occurrence to insert reaper state + init_pos = src.find(old_init_marker) + # Find the line containing it + line_end = src.find("\n", init_pos) + init_line = src[init_pos:line_end] + + # Add reaper state after this line + reaper_init = init_line + """ + # [PATCHED] idle-kv-reaper state + self._idle_kv_reaper_ts = 0.0 + self._idle_kv_reaper_active = False""" + + src = src.replace(init_line, reaper_init, 1) + + # Now add the reaper logic at the end of _update_from_kv_xfer_finished + # Find the finished_sending handler we patched + send_handler = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request 
%s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + reaper_logic = send_handler + """ + + # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks + import time as _time + _REAPER_IDLE_SECS = 5.0 + _num_running = sum(1 for r in self.requests.values() + if r.status == RequestStatus.RUNNING) + _num_waiting = sum(1 for r in self.requests.values() + if r.status == RequestStatus.WAITING) + _is_idle = (_num_running == 0 and _num_waiting == 0) + + if _is_idle: + if not self._idle_kv_reaper_active: + self._idle_kv_reaper_active = True + self._idle_kv_reaper_ts = _time.monotonic() + elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: + _reaped = 0 + _reap_ids = [] + for _rid, _req in list(self.requests.items()): + if RequestStatus.is_finished(_req.status): + _reap_ids.append(_rid) + for _rid in _reap_ids: + try: + _req = self.requests[_rid] + self._free_blocks(_req) + _reaped += 1 + except Exception as _e: + logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) + if _reaped > 0: + logger.warning( + "[KV-REAPER] Force-freed blocks for %d finished " + "requests after %.1fs idle", + _reaped, _time.monotonic() - self._idle_kv_reaper_ts) + self._idle_kv_reaper_ts = _time.monotonic() + else: + self._idle_kv_reaper_active = False""" + + if send_handler in src: + src = src.replace(send_handler, reaper_logic, 1) + else: + print("[SETUP] WARN: send handler not found for reaper injection") + sys.exit(0) + + open(f, "w").write(src) + print("[SETUP] Patched: idle KV block reaper for prefill") + +except Exception as e: + print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("idle-kv-reaper") +} + # ============================================================================= # Run installers # ============================================================================= @@ -720,6 +826,7 @@ 
patch_moriio_save_kv_timeout patch_moriio_transfer_timeout patch_moriio_load_kv_timeout patch_scheduler_read_mode_fix +patch_prefill_idle_kv_reaper if [[ "${NODE_RANK:-0}" -eq 0 ]]; then install_mori_proxy_deps diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 4c8820f8d..bd8e40bfd 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -14,7 +14,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=30 * 60) @dataclass @@ -49,12 +49,16 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -62,7 +66,6 @@ async def async_request_tgi( "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. "truncate": request_func_input.prompt_len, - # TGI does not accept ignore_eos flag. 
} payload = { "inputs": request_func_input.prompt, @@ -113,21 +116,28 @@ async def async_request_tgi( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_trt_llm( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -181,18 +191,25 @@ async def async_request_trt_llm( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { @@ -225,23 +242,30 @@ async def async_request_deepspeed_mii( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async 
def async_request_openai_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: payload = { "model": request_func_input.model_name \ if request_func_input.model_name else request_func_input.model, @@ -281,33 +305,35 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += text or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + if chunk == "[DONE]": + break + + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. 
for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -324,6 +350,9 @@ async def async_request_openai_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) @@ -333,15 +362,19 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: - content = request_func_input.prompt + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: + content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content = [{"type": "text", "text": request_func_input.prompt}] content.append(request_func_input.multi_modal_content) @@ -388,28 +421,30 @@ async def async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) + if chunk == "[DONE]": + break - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft + timestamp = time.perf_counter() + data = json.loads(chunk) - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) - most_recent_timestamp = timestamp + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + + most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True @@ -421,6 +456,9 @@ async def async_request_openai_chat_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) diff --git a/utils/bench_serving/benchmark_serving.py 
b/utils/bench_serving/benchmark_serving.py index 741e44236..0e491384c 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -39,9 +39,10 @@ from multiprocessing import Pool, cpu_count from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple +import aiohttp import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, - RequestFuncOutput) +from backend_request_func import (AIOHTTP_TIMEOUT, ASYNC_REQUEST_FUNCS, + RequestFuncInput, RequestFuncOutput) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -518,11 +519,14 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") + connector = aiohttp.TCPConnector(limit=0, enable_cleanup_closed=True) + shared_session = aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT, connector=connector) + print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len, test_mm_content = ( input_requests[0]) if backend != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat backend. 
raise ValueError( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( @@ -541,13 +545,15 @@ async def benchmark( if num_warmups > 0: print(f"Warming up with {num_warmups} requests...") warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups) - warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None + warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else asyncio.Semaphore(num_warmups) async def warmup_limited_req_fn(): if warmup_semaphore is None: return await request_func(request_func_input=test_input, pbar=warmup_pbar) async with warmup_semaphore: - return await request_func(request_func_input=test_input, pbar=warmup_pbar) + return await request_func( + request_func_input=test_input, pbar=warmup_pbar, + session=shared_session) warmup_tasks = [] for _ in range(num_warmups): @@ -560,7 +566,6 @@ async def warmup_limited_req_fn(): print("Warmup completed.") if lora_modules: - # For each input request, choose a LoRA module at random. 
lora_modules = iter( [random.choice(lora_modules) for _ in range(len(input_requests))]) @@ -577,7 +582,8 @@ async def warmup_limited_req_fn(): best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler started") @@ -598,10 +604,10 @@ async def warmup_limited_req_fn(): async def limited_request_func(request_func_input, pbar): if semaphore is None: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) async with semaphore: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) print("Starting main benchmark run...") @@ -629,7 +635,28 @@ async def limited_request_func(request_func_input, pbar): asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + gather_timeout = max(7200, len(input_requests) * 30) + try: + outputs: List[RequestFuncOutput] = await asyncio.wait_for( + asyncio.gather(*tasks), timeout=gather_timeout) + except asyncio.TimeoutError: + completed = pbar.n if pbar else "?" + print(f"\n[WARNING] Benchmark timed out after {gather_timeout}s " + f"({completed}/{len(tasks)} requests completed). 
" + "Collecting partial results...") + for task in tasks: + if not task.done(): + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + outputs = [] + for task in tasks: + if task.done() and not task.cancelled(): + try: + outputs.append(task.result()) + except Exception: + outputs.append(RequestFuncOutput()) + else: + outputs.append(RequestFuncOutput()) if profile: print("Stopping profiler...") @@ -642,10 +669,14 @@ async def limited_request_func(request_func_input, pbar): logprobs=logprobs, best_of=best_of, ) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler stopped") + await shared_session.close() + await connector.close() + if pbar is not None: pbar.close() From a4b3658fbba27ee26cd62317cc5c8732bb9905bc Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 18:21:22 +0000 Subject: [PATCH 14/85] [AMD] Fix vLLM disagg Slurm job never terminating after benchmark completion Background processes (proxy, prefill, decode, etcd) were started via `cmd 2>&1 | tee logfile &`, causing bash $! to capture the PID of tee rather than the actual process. `kill $pid` only killed tee, leaving the real process running. The proxy kept port 30000 open, so decode nodes' `sync.py wait` never detected shutdown and the Slurm job hung forever. Additionally, etcd's stderr was not redirected, holding the Docker container's main pipe open and preventing container exit even after server.sh completed. Changes: - Redirect all background processes to log files instead of piping through tee, so $! 
captures the correct PID (matches SGLang pattern) - Redirect etcd launcher's stderr to prevent pipe leak - Add pkill fallback cleanup for proxy, vllm, and etcd processes - Increase barrier grace period to handle node setup time variance - Increase container creation barrier timeout from 300s to 600s --- .../multi_node/vllm_disagg_utils/server.sh | 29 +++++++++++-------- .../multi_node/vllm_disagg_utils/sync.py | 5 +++- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index d21bdbebb..8a149e776 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -162,14 +162,14 @@ python3 $VLLM_WS_PATH/sync.py barrier \ --node-ips ${IPADDRS} \ --node-ports 5000 \ --wait-for-all-ports \ - --timeout 300 + --timeout 600 # ============================================================================= # ETCD Server Setup # ============================================================================= echo "Proceeding to start etcd server on $host_name" -bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null & +bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null 2>&1 & etcd_pid=$! echo "Waiting at etcd server barrier on $host_name" @@ -260,7 +260,7 @@ if [ "$NODE_RANK" -eq 0 ]; then else PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" set -x - eval "$PROXY_CMD" 2>&1 | tee "$PROXY_LOG_FILE" & + eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & set +x proxy_pid=$! sleep 3 @@ -275,9 +275,9 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $PREFILL_CMD" else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & set +x prefill_pid=$! 
fi @@ -341,6 +341,10 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "$DRY_RUN" -eq 0 ]]; then [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true + sleep 2 + # Fallback: ensure no orphaned processes keep ports open + pkill -f moriio_proxy 2>/dev/null || true + pkill -f "vllm serve" 2>/dev/null || true fi elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then @@ -358,9 +362,9 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $PREFILL_CMD" else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & set +x prefill_pid=$! fi @@ -390,7 +394,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then fi echo "Killing the prefill server" - [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid + [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true else echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" @@ -412,9 +416,9 @@ else if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $DECODE_CMD" else + DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log" set -x - eval "$DECODE_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + eval "$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 & set +x decode_pid=$! 
fi @@ -444,11 +448,12 @@ else fi echo "Killing the decode server" - [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid + [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true fi echo "Killing the etcd server" -kill $etcd_pid +kill $etcd_pid 2>/dev/null || true +pkill -f etcd 2>/dev/null || true echo "Script completed successfully" exit 0 diff --git a/benchmarks/multi_node/vllm_disagg_utils/sync.py b/benchmarks/multi_node/vllm_disagg_utils/sync.py index 140951519..3678e7614 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/sync.py +++ b/benchmarks/multi_node/vllm_disagg_utils/sync.py @@ -143,7 +143,10 @@ def close_port(): time.sleep(30) if args.enable_port: - time.sleep(30) + # Keep the port open long enough for slow nodes to pass their barrier. + # The previous 30s was too short when setup times vary by minutes. + grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() From a28dce56bd70619a21e7be069cc2d6daa2b1dc75 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 20:44:27 +0000 Subject: [PATCH 15/85] [AMD] Enable MoRI-IO READ mode by default for vLLM disagg --- .github/configs/amd-master.yaml | 3 +++ benchmarks/multi_node/vllm_disagg_utils/job.slurm | 2 +- benchmarks/multi_node/vllm_disagg_utils/submit.sh | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5c6e6c013..11f294bd1 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1373,6 +1373,7 @@ dsr1-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: num-worker: 2 tp: 8 @@ -1393,6 +1394,7 @@ dsr1-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: num-worker: 2 tp: 8 @@ -1413,6 +1415,7 @@ dsr1-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - 
"VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: num-worker: 2 tp: 8 diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 904aaaff4..c555f6948 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -315,7 +315,7 @@ exec $DOCKER_CMD run --rm \ -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ - -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index c5404ec18..7063aa7a8 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -93,7 +93,7 @@ export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} -export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} +export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. 
export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" From af1bbb4fc7ae15a8860312840b11ac22aacacf2b Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 20:57:24 +0000 Subject: [PATCH 16/85] [AMD] Fix CI checkout failure caused by root-owned __pycache__ files Fix per-node Docker privilege detection in vLLM disagg job.slurm --- .../multi_node/vllm_disagg_utils/job.slurm | 18 ++++++++++++++---- .../multi_node/vllm_disagg_utils/server.sh | 3 +++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index c555f6948..d33525081 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -64,6 +64,9 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # ============================================================================= # Docker privilege detection # ============================================================================= +# Detect on the batch host (used for post-srun cleanup). +# Per-node detection happens inside the srun inline script below because +# some nodes may require sudo while others do not. 
if docker ps &>/dev/null; then DOCKER_CMD="docker" else @@ -249,11 +252,18 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" +# Per-node Docker privilege detection (some nodes need sudo, others don't) +if docker ps &>/dev/null; then + _DCMD=docker +else + _DCMD='sudo docker' +fi + # Pre-clean (idempotent) -$DOCKER_CMD ps -aq --filter \"name=^container_vllm_\" | xargs -r $DOCKER_CMD rm -f || true -$DOCKER_CMD ps -aq | xargs -r $DOCKER_CMD stop || true +\$_DCMD ps -aq --filter \"name=^container_vllm_\" | xargs -r \$_DCMD rm -f || true +\$_DCMD ps -aq | xargs -r \$_DCMD stop || true -exec $DOCKER_CMD run --rm \ +exec \$_DCMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -330,4 +340,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c "$DOCKER_CMD rm -f \$DOCKER_CONT_NAME 2>/dev/null || true" +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'if docker ps &>/dev/null; then D=docker; else D="sudo docker"; fi; $D rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 8a149e776..85a50b38d 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -455,5 +455,8 @@ echo "Killing the etcd server" kill $etcd_pid 2>/dev/null || true pkill -f etcd 2>/dev/null || true +# Clean root-owned __pycache__ so the CI runner can delete the workspace on next checkout +find /workspace -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true + echo "Script completed successfully" exit 0 From 7eddefa9254e8bf87316a7f6ea38d407d11e54e4 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 23 Mar 2026 09:07:02 +0000 Subject: [PATCH 17/85] [AMD] Fix CI checkout EACCES by redirecting Python bytecache off NFS Docker containers run as root, so __pycache__/*.pyc files created during benchmark_serving.py import end up root-owned on the NFS 
workspace. The CI runner cannot delete them, breaking checkout. Set PYTHONPYCACHEPREFIX=/tmp/pycache in the Docker env so bytecache stays inside the container. Remove the previous server.sh find-and- delete workaround since the root cause is now addressed. --- benchmarks/multi_node/vllm_disagg_utils/job.slurm | 1 + benchmarks/multi_node/vllm_disagg_utils/server.sh | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index d33525081..bc04f3b61 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -326,6 +326,7 @@ exec \$_DCMD run --rm \ -e HSA_ENABLE_SDMA=1 \ -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ + -e PYTHONPYCACHEPREFIX=/tmp/pycache \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 85a50b38d..8a149e776 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -455,8 +455,5 @@ echo "Killing the etcd server" kill $etcd_pid 2>/dev/null || true pkill -f etcd 2>/dev/null || true -# Clean root-owned __pycache__ so the CI runner can delete the workspace on next checkout -find /workspace -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true - echo "Script completed successfully" exit 0 From 1b791b6b3e8a1d8085e14bde624ddbd53e80b5b7 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 23 Mar 2026 16:28:18 +0000 Subject: [PATCH 18/85] [AMD] Fix KV reaper deadlock on high-ISL disagg workloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idle KV block reaper only fired when both running=0 AND waiting=0. 
Under 8K ISL at C64+, leaked blocks filled the prefill KV cache while new requests queued in WAITING state — the non-empty wait queue prevented the reaper from ever triggering, causing a permanent hang. Remove the waiting-queue check so the reaper fires whenever no requests are actively running, which is precisely when leaked blocks can be safely reclaimed. Verified with 8K/1K sweep (C32–C512) completing without hangs. --- benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index a95591cb5..e8437a5c9 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -767,11 +767,9 @@ try: _REAPER_IDLE_SECS = 5.0 _num_running = sum(1 for r in self.requests.values() if r.status == RequestStatus.RUNNING) - _num_waiting = sum(1 for r in self.requests.values() - if r.status == RequestStatus.WAITING) - _is_idle = (_num_running == 0 and _num_waiting == 0) + _should_reap = (_num_running == 0) - if _is_idle: + if _should_reap: if not self._idle_kv_reaper_active: self._idle_kv_reaper_active = True self._idle_kv_reaper_ts = _time.monotonic() From 5c5f0b2fed3cd5d857d6ec33738e786640e02952 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 24 Mar 2026 08:35:21 +0000 Subject: [PATCH 19/85] [AMD] Enable reading PREFILL_TP,PREFILL_EP,PREFILL_DP_ATTN,DECODE_TP,DECODE_EP,DECODE_DP_ATTN from amd-master.yaml config. 
Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 6 +-- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 39 +++++++++++++-- .../multi_node/vllm_disagg_utils/job.slurm | 14 ++++++ .../multi_node/vllm_disagg_utils/server.sh | 31 ++++++++++++ .../multi_node/vllm_disagg_utils/submit.sh | 50 +++++++++++++------ 5 files changed, 119 insertions(+), 21 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 11f294bd1..183afd339 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1377,7 +1377,7 @@ dsr1-fp8-mi355x-vllm-disagg: decode: num-worker: 2 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "DECODE_NODES=2" @@ -1398,7 +1398,7 @@ dsr1-fp8-mi355x-vllm-disagg: decode: num-worker: 2 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "DECODE_NODES=2" @@ -1419,7 +1419,7 @@ dsr1-fp8-mi355x-vllm-disagg: decode: num-worker: 2 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "DECODE_NODES=2" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh index 172ecdf51..b21e9204a 100755 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -11,8 +11,12 @@ check_env_vars \ MODEL_PATH \ PREFILL_NUM_WORKERS \ PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ DECODE_NUM_WORKERS \ DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ RANDOM_RANGE_RATIO @@ -30,15 +34,42 @@ export MODEL_PATH=$MODEL_PATH export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE -# PREFILL_NODES and DECODE_NODES come from additional-settings in the YAML config. -# NODELIST (optional) constrains which Slurm nodes are used. 
+# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}" \ - ${RANDOM_RANGE_RATIO}) + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") if [[ $? 
-ne 0 ]]; then echo "Failed to submit job" >&2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index bc04f3b61..e1cad0817 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -217,6 +217,14 @@ export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +# TP / EP / DP (from vllm_disagg_utils/submit.sh; mirrors amd_utils disagg) +export PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +export PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +export DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +export DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +export PREFILL_TP="${PREFILL_TP:-8}" +export DECODE_TP="${DECODE_TP:-8}" + SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_vllm_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$VLLM_WS_PATH/${RUN_FILE}" @@ -327,6 +335,12 @@ exec \$_DCMD run --rm \ -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ -e PYTHONPYCACHEPREFIX=/tmp/pycache \ + -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ + -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ + -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ + -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ + -e PREFILL_TP=\$PREFILL_TP \ + -e DECODE_TP=\$DECODE_TP \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 8a149e776..9b0ff2ebb 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -150,6 +150,37 @@ print(f'DECODE_MODEL_ENVS=\"{dev}\"') echo "Loaded model configuration for: $MODEL_NAME" +# Apply tensor-parallel size 
and EP/DP flags from submit pipeline (YAML PREFILL_TP / dp-attn / ep). +if [[ -n "${PREFILL_TP:-}" ]]; then + if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP}/g") + else + PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP}" + fi +fi +if [[ -n "${DECODE_TP:-}" ]]; then + if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP}/g") + else + DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP}" + fi +fi +if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + PREFILL_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + PREFILL_SERVER_CONFIG+=" --enable-dp-attention" +fi +if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + DECODE_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && ! 
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + DECODE_SERVER_CONFIG+=" --enable-dp-attention" +fi + +echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" +echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" + # ============================================================================= # Container Synchronization # ============================================================================= diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index 7063aa7a8..ecb5a9876 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -12,19 +12,29 @@ usage() { cat << 'USAGE' Usage: bash submit.sh \ - [NODE_LIST] [RANDOM_RANGE_RATIO] + \ + \ + \ + \ + [NODE_LIST] Arguments: - PREFILL_NODES Number of prefill nodes - PREFILL_WORKERS Number of prefill workers (usually 1) - DECODE_NODES Number of decode nodes - DECODE_WORKERS Number of decode workers (usually 1) - ISL Input sequence length - OSL Output sequence length - CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") - REQUEST_RATE Request rate ("inf" for max throughput) - NODE_LIST Optional: comma-separated hostnames - RANDOM_RANGE_RATIO Optional: random range ratio for benchmark (default 0.8) + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + PREFILL_ENABLE_EP true/false (from PREFILL_EP in YAML; false when EP==1) + PREFILL_ENABLE_DP true/false (data-parallel attention on prefill) + DECODE_ENABLE_EP true/false (from DECODE_EP in YAML) + DECODE_ENABLE_DP true/false (data-parallel attention on decode) + 
PREFILL_TP Tensor parallel size per prefill node + DECODE_TP Tensor parallel size per decode node + RANDOM_RANGE_RATIO Random range ratio for benchmark client + NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) Required environment variables: SLURM_ACCOUNT SLURM account name @@ -57,7 +67,7 @@ check_env RUNNER_NAME GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -# COMMAND_LINE ARGS +# COMMAND_LINE ARGS (aligned with benchmarks/multi_node/amd_utils/submit.sh) PREFILL_NODES=$1 PREFILL_WORKERS=${2:-1} DECODE_NODES=$3 @@ -66,8 +76,14 @@ ISL=$5 OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 -NODE_LIST=${9} -RANDOM_RANGE_RATIO=${10} +PREFILL_ENABLE_EP=${9:-false} +PREFILL_ENABLE_DP=${10:-false} +DECODE_ENABLE_EP=${11:-false} +DECODE_ENABLE_DP=${12:-false} +PREFILL_TP=${13:-8} +DECODE_TP=${14:-8} +RANDOM_RANGE_RATIO=${15:-0.8} +NODE_LIST=${16} # Router co-located with first prefill: xP + yD nodes total NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) @@ -85,6 +101,12 @@ export yD=$DECODE_NODES export NUM_NODES=$NUM_NODES export GPUS_PER_NODE=$GPUS_PER_NODE export MODEL_NAME=$MODEL_NAME +export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} +export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} +export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} +export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} +export PREFILL_TP=${PREFILL_TP} +export DECODE_TP=${DECODE_TP} export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} From a337fae38bceb649d9d65972a75a6f76547d4f93 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 29 Mar 2026 17:13:53 +0000 Subject: [PATCH 20/85] [AMD] Upgrade vLLM disagg image from v0.17.1 to v0.18.0 Bump vllm/vllm-openai-rocm to v0.18.0 for the dsr1-fp8-mi355x-vllm-disagg config. 
Changes required by the new image: - setup_deps.sh: drop aiohttp/pyzmq installs (now pre-installed in v0.18.0); move install_mori_proxy_deps before patches and run on all nodes so msgpack is available when patch scripts import MoRI-IO connector modules - moriio_proxy.py: populate transfer_id in kv_transfer_params dicts (new required field in v0.18.0's moriio_connector.update_state_after_alloc) - MoRI PCI topology bug persists in v0.18.0; rebuild from b645fc8 retained Tested: 1K1K C8,16,32,64,128,256 on mia1 3-node (1P+2D) CONC512 is ongoing but it seems good so far --- .github/configs/amd-master.yaml | 2 +- .../vllm_disagg_utils/moriio_proxy.py | 5 +-- .../vllm_disagg_utils/setup_deps.sh | 34 +++++++++---------- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 183afd339..0de838729 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1351,7 +1351,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.17.1 + image: vllm/vllm-openai-rocm:v0.18.0 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py index b2162c98a..7d1e8454b 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py +++ b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py @@ -244,8 +244,8 @@ def extract_ip_port_fast(url): dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) req_data_to_prefill = copy.deepcopy(req_data) - req_data_to_prefill["kv_transfer_params"] = {} - req_data["kv_transfer_params"] = {} + req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} + req_data["kv_transfer_params"] = {"transfer_id": request_id} req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( 
decode_instance_endpoint["dp_size"] ) @@ -269,6 +269,7 @@ def extract_ip_port_fast(url): req_data["max_tokens"] -= 1 req_data["kv_transfer_params"] = { + "transfer_id": request_id, "do_remote_decode": False, "do_remote_prefill": True, "remote_handshake_port": prefill_instance_endpoint["handshake_port"], diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index e8437a5c9..42aa648b0 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -2,7 +2,7 @@ # ============================================================================= # setup_deps.sh — Install missing vLLM disagg dependencies at container start. # -# Base image: vllm/vllm-openai-rocm:v0.17.1 +# Base image: vllm/vllm-openai-rocm:v0.18.0 # Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. # Idempotent: each component is skipped if already present. # @@ -156,8 +156,11 @@ install_mori_proxy_deps() { fi echo "[SETUP] Installing MoRI-IO proxy Python deps..." + # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack + # are missing. --ignore-installed blinker avoids pip's distutils uninstall + # error when quart pulls a newer blinker version. pip install --quiet --ignore-installed blinker - pip install --quiet quart aiohttp msgpack pyzmq + pip install --quiet quart msgpack if ! python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 @@ -169,18 +172,16 @@ install_mori_proxy_deps() { # 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) # Required for --all2all-backend mori (Expert Parallelism via RDMA). # GPU kernels are JIT-compiled on first use; no hipcc needed at install. +# +# v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI +# topology bug (TopoSystemPci::Load assertion failure on Broadcom +# PEX890xx switches). 
Always rebuild from our target commit b645fc8 +# which includes the dsp2dev subordinate-range fix. # --------------------------------------------------------------------------- install_mori() { local MORI_TARGET_COMMIT="b645fc8" local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}" - # The pre-installed MoRI in vllm base images has a PCI topology bug: it - # only maps the secondary bus of each bridge instead of the full - # secondary-to-subordinate range (dsp2dev). This causes an assertion - # failure in TopoSystemPci::Load() on nodes with deeply-nested PCIe - # switch topologies (e.g. Broadcom PEX890xx on MI355X mia1 nodes). - # Always rebuild from the target commit unless the marker file proves - # the correct version was already installed in this container. if ls $MORI_MARKER &>/dev/null; then echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" return 0 @@ -192,7 +193,7 @@ install_mori() { && rm -rf /var/lib/apt/lists/* echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." - echo "[SETUP] (overriding pre-installed version to fix PCI topology bug)" + echo "[SETUP] (overriding image-provided version to fix PCI topology bug)" ( set -e git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori @@ -204,14 +205,13 @@ install_mori() { if ! python3 -c "import mori" 2>/dev/null; then echo "[SETUP] ERROR: MoRI build failed"; exit 1 fi - # Drop a marker so re-entry doesn't rebuild touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT} _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") } # --------------------------------------------------------------------------- -# 7. Patch vLLM v0.17.1 MoRI-EP + FP8 incompatibility -# v0.17.1 asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel +# 7. 
Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0) +# vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel # uses defer_input_quant=True which MoRI's prepare/finalize rejects. # Patch: remove both the AITER requirement assertion and the # defer_input_quant NotImplementedError so non-AITER kernels work. @@ -621,10 +621,11 @@ except Exception as e: # --------------------------------------------------------------------------- # 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished -# vLLM v0.17.1 asserts that a request in finished_recving must be either +# vLLM asserts that a request in finished_recving must be either # WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can # transition to RUNNING before the aggregated recv notification arrives, # crashing the engine with AssertionError. +# (present in v0.17.1 & v0.18.0) # --------------------------------------------------------------------------- patch_scheduler_read_mode_fix() { python3 -c ' @@ -819,6 +820,7 @@ install_rixl install_etcd install_libionic install_mori +install_mori_proxy_deps patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout @@ -826,10 +828,6 @@ patch_moriio_load_kv_timeout patch_scheduler_read_mode_fix patch_prefill_idle_kv_reaper -if [[ "${NODE_RANK:-0}" -eq 0 ]]; then - install_mori_proxy_deps -fi - # ============================================================================= # Export paths (persists for server.sh since this file is sourced) # ============================================================================= From fb211a4cad36f5850de200aef95f4314295e6a7d Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 30 Mar 2026 08:27:13 +0000 Subject: [PATCH 21/85] [AMD] Add Kimi-K2.5-MXFP4 disagg inference config (1P2D) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable vLLM disagg serving for amd/Kimi-K2.5-MXFP4 on MI355X with a 1P2D node topology (TP=8, 
decode EP=8). Changes: - amd-master.yaml: add kimik2.5-fp4-mi355x-vllm-disagg config with two seq-len scenarios (1K1K, 8K1K), READ mode enabled - models.yaml: add Kimi-K2.5-MXFP4 server flags (PIECEWISE cudagraph, --gpu-memory-utilization 0.90, --mm-encoder-tp-mode data) - bench.sh: add --trust-remote-code for models with custom code - setup_deps.sh: install amd-quark for MXFP4 quantization support - Add kimik2.5_fp4_mi355x_vllm-disagg.sh entry script Verified with full 1K/1K sweep (CONC 8–512) on SA4N and mia1 9N cluster; all concurrency levels completed without hang. --- .github/configs/amd-master.yaml | 33 +++++++- .../kimik2.5_fp4_mi355x_vllm-disagg.sh | 79 +++++++++++++++++++ .../multi_node/vllm_disagg_utils/bench.sh | 3 +- .../multi_node/vllm_disagg_utils/models.yaml | 6 ++ .../vllm_disagg_utils/setup_deps.sh | 22 ++++++ 5 files changed, 141 insertions(+), 2 deletions(-) create mode 100755 benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 0de838729..6f33178f3 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1403,9 +1403,20 @@ dsr1-fp8-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" +kimik2.5-fp4-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:v0.18.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x-disagg + precision: fp4 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: - isl: 1024 - osl: 8192 + osl: 1024 search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - spec-decoding: "none" conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] prefill: @@ -1424,6 +1435,26 @@ dsr1-fp8-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + -
"PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..b21e9204a --- /dev/null +++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. 
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index 5b9f5c772..274c5954e 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -67,7 +67,8 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do --num-prompts "$num_prompts" \ --max-concurrency "$max_concurrency" \ --result-filename "$export_file" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --trust-remote-code echo "-----------------------------------------" echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." 
diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index ef062e5f4..0ef2bc77f 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -35,6 +35,12 @@ DeepSeek-R1-0528: env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" +Kimi-K2.5-MXFP4: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--amd--Kimi-K2.5-MXFP4" + gpt-oss-120b: prefill_flags: "--tensor-parallel-size 8" decode_flags: "--tensor-parallel-size 8" diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 42aa648b0..848bd6918 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -209,6 +209,27 @@ install_mori() { _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") } +# --------------------------------------------------------------------------- +# 6b. 
amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar) +# Required due to ROCm vLLM missing the quark dependency: +# https://github.com/vllm-project/vllm/issues/35633 +# --------------------------------------------------------------------------- +install_amd_quark() { + if python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] amd-quark already present" + return 0 + fi + + echo "[SETUP] Installing amd-quark for MXFP4 quantization support..." + pip install --quiet amd-quark + + if ! python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)" + return 0 + fi + _SETUP_INSTALLED+=("amd-quark") +} + # --------------------------------------------------------------------------- # 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0) # vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel @@ -820,6 +841,7 @@ install_rixl install_etcd install_libionic install_mori +install_amd_quark install_mori_proxy_deps patch_mori_fp8_compat patch_moriio_save_kv_timeout From 9b8159e969647371651d128fcb8efdf154240a0c Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 15:08:57 +0000 Subject: [PATCH 22/85] feat: add MiniMax M2.5 PD disaggregation recipe (1P2D, MoRI-EP + MoRI-IO) Cherry-picked from ChuanLi1101/InferenceMAX:chuali/minimax-m25-vllm-disagg (commit 72a0002e). Resolved conflict in models.yaml to keep both Kimi-K2.5-MXFP4 and MiniMax-M2.5 entries. Add multi-node vLLM PD disaggregation support for MiniMax-M2.5 (FP8), following the DeepSeek R1 disagg recipe pattern. Includes: - models.yaml: MiniMax-M2.5 config with TP8 prefill / TP8+EP8+MoRI decode - Entry script: minimaxm25_fp8_mi355x_vllm-disagg.sh - amd-master.yaml: e2e test entry for 1P2D on MI355X (1k1k, 8k1k, 1k8k) MiniMax M2.5 (230B, 256 experts, top-8 sigmoid routing, GQA) uses the same disagg infrastructure as DSR1. 
Unlike DeepSeek MLA models, M2.5 uses standard GQA attention so AITER paged attention is fully supported and no block-size/cudagraph workarounds are needed. Co-authored-by: ChuanLi1101 Co-authored-by: Claude Made-with: Cursor --- .github/configs/amd-master.yaml | 75 ++++++++++++++++++ .../minimaxm25_fp8_mi355x_vllm-disagg.sh | 77 +++++++++++++++++++ .../multi_node/vllm_disagg_utils/models.yaml | 6 ++ 3 files changed, 158 insertions(+) create mode 100644 benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 6f33178f3..df3f90cfd 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1456,6 +1456,81 @@ kimik2.5-fp4-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" +minimaxm25-fp8-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:v0.18.0 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm25 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 1024 + osl: 8192 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + 
num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 diff --git a/benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh new file mode 100644 index 000000000..137ee0381 --- /dev/null +++ b/benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + 
${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 0ef2bc77f..3e62972b8 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -41,6 +41,12 @@ Kimi-K2.5-MXFP4: env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--amd--Kimi-K2.5-MXFP4" +MiniMax-M2.5: + prefill_flags: "--tensor-parallel-size 8 --no-enable-prefix-caching" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--MiniMaxAI--MiniMax-M2.5" + gpt-oss-120b: prefill_flags: "--tensor-parallel-size 8" decode_flags: "--tensor-parallel-size 8" From e3319a73ff68a1b4657554ddfacdcf4ced744565 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 15:09:47 +0000 Subject: [PATCH 23/85] feat: add Dockerfile and runtime patch for MiniMax M2.5 WideEP + MoRI Cherry-picked from ChuanLi1101/InferenceMAX:chuali/minimax-m25-vllm-disagg (commit bb6bd0ed). Adapted for v0.18.0 base: kept vllm/vllm-openai-rocm:v0.18.0 image (runtime patch via setup_deps.sh is sufficient; custom Docker image available in docker/minimax-m25-disagg/ if needed). Two deployment options for getting vLLM minimax_m2.py changes into the container: Option A -- Custom Docker image (docker/minimax-m25-disagg/): Builds from the public vLLM ROCm image and pre-installs UCX, etcd, RIXL, and patched minimax_m2.py with WideEP + MoRI + EPLB support baked in. 
Option B -- Runtime patch (setup_deps.sh): patch_minimax_m2_wideep_mori() copies patched minimax_m2.py from the mounted InferenceX repo into the container's vLLM installation at startup. Co-authored-by: ChuanLi1101 Co-authored-by: Claude Made-with: Cursor --- .../vllm_disagg_utils/patches/minimax_m2.py | 672 ++++++++++++++++++ .../vllm_disagg_utils/setup_deps.sh | 40 ++ docker/minimax-m25-disagg/Dockerfile | 91 +++ docker/minimax-m25-disagg/build.sh | 31 + .../minimax-m25-disagg/patches/minimax_m2.py | 672 ++++++++++++++++++ 5 files changed, 1506 insertions(+) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py create mode 100644 docker/minimax-m25-disagg/Dockerfile create mode 100644 docker/minimax-m25-disagg/build.sh create mode 100644 docker/minimax-m25-disagg/patches/minimax_m2.py diff --git a/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py b/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py new file mode 100644 index 000000000..c27b77ccf --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py @@ -0,0 +1,672 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The MiniMax AI team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only MiniMaxM2/M2.5 model.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.utils import sequence_parallel_chunk +from vllm.sequence import IntermediateTensors + +from .interfaces import MixtureOfExperts, SupportsLoRA, 
SupportsPP +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class MiniMaxM2MoE(nn.Module): + """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. + + Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with + expert parallelism, EPLB, and sequence parallel awareness. + """ + + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + self.ep_group = get_ep_group().device_group + self.ep_rank = get_ep_group().rank_in_group + self.ep_size = self.ep_group.size() + + self.n_routed_experts: int = config.num_local_experts + self.n_shared_experts: int = 0 + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) + self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + eplb_config = parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.use_routing_bias = getattr(config, "use_routing_bias", False) + if self.use_routing_bias: + self.e_score_correction_bias = nn.Parameter( + torch.empty(config.num_local_experts, dtype=torch.float32) + ) + self.e_score_correction_bias.weight_loader = ( + MiniMaxM2MoE.ebias_weight_loader + ) + else: + self.e_score_correction_bias = None + + self.gate = GateLinear( + config.hidden_size, + 
config.num_local_experts, + params_dtype=torch.float32, + prefix=f"{prefix}.gate", + ) + + self.experts = FusedMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=False, + renormalize=True, + scoring_func=getattr(config, "scoring_func", "softmax"), + e_score_correction_bias=self.e_score_correction_bias, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + router_logits_dtype=torch.float32, + gate=self.gate, + routed_scaling_factor=1.0 + if not self.is_rocm_aiter_moe_enabled + else self.routed_scaling_factor, + ) + + @staticmethod + def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight.to(torch.float32)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.is_sequence_parallel: + hidden_states = sequence_parallel_chunk(hidden_states) + + if self.experts.is_internal_router: + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=hidden_states + ) + else: + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) + + if hidden_states.dtype != torch.float16: + if not self.is_rocm_aiter_moe_enabled: + final_hidden_states = final_hidden_states * self.routed_scaling_factor + + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0 + ) + final_hidden_states = final_hidden_states[:num_tokens] + elif self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + 
final_hidden_states + ) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class MiniMaxM2Attention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rotary_dim: int, + rope_parameters: dict[str, Any] | None = None, + attn_window_size: int | None = None, + max_position_embeddings: int = 8192, + head_dim: int | None = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + if ( + rope_parameters is not None + and "partial_rotary_factor" not in rope_parameters + ): + rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim + self.rotary_emb = get_rope( + self.head_dim, + max_position=max_position_embeddings, + rope_parameters=rope_parameters, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + per_layer_sliding_window=attn_window_size, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + self.q_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_heads, eps=rms_norm_eps + ) + self.k_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = MiniMaxText01RMSNormTP.forward_qk( + self.q_norm, self.k_norm, q.contiguous(), k.contiguous() + ) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class MiniMaxM2DecoderLayer(nn.Module): + def 
__init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): + max_position_embeddings = max( + config.max_position_embeddings, config.max_model_len + ) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + layer_idx = int(prefix.split(sep=".")[-1]) + + self.layer_idx = layer_idx + self.self_attn = MiniMaxM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rotary_dim=config.rotary_dim, + rope_parameters=config.rope_parameters, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, "attention_bias", False), + head_dim=getattr(config, "head_dim", None), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + self.block_sparse_moe = MiniMaxM2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = 
self.post_attention_layernorm(hidden_states, residual) + + hidden_states = self.block_sparse_moe(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class MiniMaxM2Model(nn.Module): + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config + + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=None, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MiniMaxM2DecoderLayer( + config, + prefix, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = 
intermediate_tensors["residual"] + + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + num_redundant_experts=0, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = self.get_expert_mapping() + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." 
in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class MiniMaxM2MixtureOfExperts(MixtureOfExperts): + """EPLB protocol implementation for MiniMax M2/M2.5.""" + + moe_mlp_layers: list[MiniMaxM2MoE] + + def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("MiniMax M2: No MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = 
example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class MiniMaxM2ForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + if hasattr(vllm_config.model_config, "max_model_len"): + self.config.max_model_len = vllm_config.model_config.max_model_len + self.model = MiniMaxM2Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=None + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + self.num_moe_layers = config.num_hidden_layers + self._set_moe_parameters() + + def _set_moe_parameters(self): + 
self.expert_weights: list = [] + self.num_expert_groups = 1 + self.moe_layers: list = [] + self.moe_mlp_layers: list[MiniMaxM2MoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, MiniMaxM2DecoderLayer) + if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): + example_moe = layer.block_sparse_moe + self.moe_mlp_layers.append(layer.block_sparse_moe) + self.moe_layers.append(layer.block_sparse_moe.experts) + self.extract_moe_parameters(example_moe) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +def get_spec_layer_idx_from_weight_name( + config: PretrainedConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_mtp_modules): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 848bd6918..7f691d141 100644 --- 
a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -832,6 +832,45 @@ except Exception as e: _SETUP_INSTALLED+=("idle-kv-reaper") } +# --------------------------------------------------------------------------- +# 13. Patch MiniMax M2.5 WideEP + MoRI + EPLB support +# Replaces the upstream minimax_m2.py with our patched version that adds +# GateLinear, EP group integration, sequence parallelism, and the +# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched. +# --------------------------------------------------------------------------- +patch_minimax_m2_wideep_mori() { + local patch_file="${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}/patches/minimax_m2.py" + if [[ ! -f "$patch_file" ]]; then + # Also check the Docker-baked location + patch_file="/opt/vllm_disagg/patches/minimax_m2.py" + fi + if [[ ! -f "$patch_file" ]]; then + echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)" + return 0 + fi + + python3 -c ' +import os, sys, shutil + +try: + import vllm.model_executor.models.minimax_m2 as mmod + target = mmod.__file__ + src = sys.argv[1] + + with open(target) as f: + if "get_ep_group" in f.read(): + print("[SETUP] minimax_m2.py already has WideEP+MoRI support") + sys.exit(0) + + shutil.copy2(src, target) + print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}") + +except Exception as e: + print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr) +' "$patch_file" + _SETUP_INSTALLED+=("minimax-m2-wideep-mori") +} + # ============================================================================= # Run installers # ============================================================================= @@ -849,6 +888,7 @@ patch_moriio_transfer_timeout patch_moriio_load_kv_timeout patch_scheduler_read_mode_fix patch_prefill_idle_kv_reaper +patch_minimax_m2_wideep_mori # ============================================================================= # Export 
paths (persists for server.sh since this file is sourced) diff --git a/docker/minimax-m25-disagg/Dockerfile b/docker/minimax-m25-disagg/Dockerfile new file mode 100644 index 000000000..3bced3f91 --- /dev/null +++ b/docker/minimax-m25-disagg/Dockerfile @@ -0,0 +1,91 @@ +# MiniMax M2.5 PD Disaggregation Docker Image +# +# Extends the public vLLM ROCm image with: +# 1. WideEP + MoRI support for MiniMax M2.5 (minimax_m2.py patch) +# 2. Pre-installed runtime deps (UCX, RIXL, etcd, MoRI) +# 3. Disagg orchestration scripts baked in +# +# Build: +# docker build -t minimax-m25-disagg:latest -f docker/minimax-m25-disagg/Dockerfile . +# +# The image still sources setup_deps.sh at startup for idempotent patching +# (scheduler KV reaper, MoRI-IO read mode, etc.) but the heavy build steps +# (UCX, RIXL) are cached in the image layer. + +ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.18.0 +FROM ${BASE_IMAGE} + +ARG ROCM_PATH=/opt/rocm +ARG UCX_HOME=/usr/local/ucx +ARG RIXL_HOME=/usr/local/rixl + +# ---------------------------------------------------------------- +# 1. Patch vLLM: MiniMax M2.5 WideEP + MoRI + EPLB support +# ---------------------------------------------------------------- +COPY docker/minimax-m25-disagg/patches/minimax_m2.py /tmp/patches/minimax_m2.py +RUN VLLM_MODELS=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \ + cp /tmp/patches/minimax_m2.py "${VLLM_MODELS}/minimax_m2.py" && \ + echo "[DOCKER] Patched minimax_m2.py -> ${VLLM_MODELS}/minimax_m2.py" && \ + rm -rf /tmp/patches + +# ---------------------------------------------------------------- +# 2. 
Pre-install UCX build deps (speeds up setup_deps.sh at runtime) +# ---------------------------------------------------------------- +RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ + autoconf automake libtool pkg-config \ + librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ + infiniband-diags perftest ethtool rdma-core strace \ + && rm -rf /var/lib/apt/lists/* + +# ---------------------------------------------------------------- +# 3. Pre-build UCX (ROCm fork) — the longest step in setup_deps.sh +# ---------------------------------------------------------------- +RUN git clone --quiet https://github.com/ROCm/ucx.git /usr/local/src/ucx && \ + cd /usr/local/src/ucx && \ + git checkout da3fac2a && \ + ./autogen.sh && mkdir -p build && cd build && \ + ../configure \ + --prefix="${UCX_HOME}" \ + --enable-shared --disable-static \ + --disable-doxygen-doc --enable-optimizations \ + --enable-devel-headers --enable-mt \ + --with-rocm="${ROCM_PATH}" --with-verbs --with-dm && \ + make -j"$(nproc)" && make install && \ + rm -rf /usr/local/src/ucx + +# ---------------------------------------------------------------- +# 4. Pre-install etcd +# ---------------------------------------------------------------- +RUN ARCH=$(uname -m) && \ + if [ "$ARCH" = "x86_64" ]; then ETCD_ARCH=amd64; else ETCD_ARCH=arm64; fi && \ + ETCD_VER=v3.5.21 && \ + curl -fsSL "https://github.com/etcd-io/etcd/releases/download/${ETCD_VER}/etcd-${ETCD_VER}-linux-${ETCD_ARCH}.tar.gz" | \ + tar xz -C /usr/local/bin --strip-components=1 \ + "etcd-${ETCD_VER}-linux-${ETCD_ARCH}/etcd" \ + "etcd-${ETCD_VER}-linux-${ETCD_ARCH}/etcdctl" && \ + etcd --version + +# ---------------------------------------------------------------- +# 5. 
Pre-install RIXL (Nixl KV transfer) +# ---------------------------------------------------------------- +RUN pip install --no-cache-dir nixl && \ + python3 -c "import nixl; print('RIXL installed:', nixl.__file__)" || \ + echo "[DOCKER] WARN: nixl pip install failed, will fallback to setup_deps.sh" + +# ---------------------------------------------------------------- +# 6. Copy disagg orchestration scripts into the image +# ---------------------------------------------------------------- +COPY benchmarks/multi_node/vllm_disagg_utils/ /opt/vllm_disagg/ +COPY benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh /opt/vllm_disagg/ + +# ---------------------------------------------------------------- +# 7. Environment +# ---------------------------------------------------------------- +ENV UCX_HOME=${UCX_HOME} \ + RIXL_HOME=${RIXL_HOME} \ + ROCM_PATH=${ROCM_PATH} \ + PATH="${UCX_HOME}/bin:/usr/local/bin:${PATH}" \ + LD_LIBRARY_PATH="${UCX_HOME}/lib:${LD_LIBRARY_PATH:-}" \ + PYTHONPYCACHEPREFIX=/tmp/pycache + +WORKDIR /workspace diff --git a/docker/minimax-m25-disagg/build.sh b/docker/minimax-m25-disagg/build.sh new file mode 100644 index 000000000..b36227caf --- /dev/null +++ b/docker/minimax-m25-disagg/build.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Build the MiniMax M2.5 PD Disagg Docker image. +# +# Usage: +# cd +# bash docker/minimax-m25-disagg/build.sh [tag] [base_image] +# +# Examples: +# bash docker/minimax-m25-disagg/build.sh # default tag + base +# bash docker/minimax-m25-disagg/build.sh my-tag:v1 # custom tag +# bash docker/minimax-m25-disagg/build.sh latest vllm/vllm-openai-rocm:v0.19.0 # custom base +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +TAG="${1:-minimax-m25-disagg:latest}" +BASE_IMAGE="${2:-vllm/vllm-openai-rocm:v0.18.0}" + +echo "Building MiniMax M2.5 Disagg image..." 
+echo " Tag: $TAG" +echo " Base image: $BASE_IMAGE" +echo " Context: $REPO_ROOT" + +docker build \ + -t "$TAG" \ + --build-arg BASE_IMAGE="$BASE_IMAGE" \ + -f "$REPO_ROOT/docker/minimax-m25-disagg/Dockerfile" \ + "$REPO_ROOT" + +echo "" +echo "Done. Image: $TAG" +echo "To push: docker tag $TAG /$TAG && docker push /$TAG" diff --git a/docker/minimax-m25-disagg/patches/minimax_m2.py b/docker/minimax-m25-disagg/patches/minimax_m2.py new file mode 100644 index 000000000..c27b77ccf --- /dev/null +++ b/docker/minimax-m25-disagg/patches/minimax_m2.py @@ -0,0 +1,672 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The MiniMax AI team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only MiniMaxM2/M2.5 model.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.utils import sequence_parallel_chunk +from vllm.sequence import IntermediateTensors + +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class MiniMaxM2MoE(nn.Module): + """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. 
+ + Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with + expert parallelism, EPLB, and sequence parallel awareness. + """ + + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + self.ep_group = get_ep_group().device_group + self.ep_rank = get_ep_group().rank_in_group + self.ep_size = self.ep_group.size() + + self.n_routed_experts: int = config.num_local_experts + self.n_shared_experts: int = 0 + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) + self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + eplb_config = parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.use_routing_bias = getattr(config, "use_routing_bias", False) + if self.use_routing_bias: + self.e_score_correction_bias = nn.Parameter( + torch.empty(config.num_local_experts, dtype=torch.float32) + ) + self.e_score_correction_bias.weight_loader = ( + MiniMaxM2MoE.ebias_weight_loader + ) + else: + self.e_score_correction_bias = None + + self.gate = GateLinear( + config.hidden_size, + config.num_local_experts, + params_dtype=torch.float32, + prefix=f"{prefix}.gate", + ) + + self.experts = FusedMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=False, + 
renormalize=True, + scoring_func=getattr(config, "scoring_func", "softmax"), + e_score_correction_bias=self.e_score_correction_bias, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + router_logits_dtype=torch.float32, + gate=self.gate, + routed_scaling_factor=1.0 + if not self.is_rocm_aiter_moe_enabled + else self.routed_scaling_factor, + ) + + @staticmethod + def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight.to(torch.float32)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.is_sequence_parallel: + hidden_states = sequence_parallel_chunk(hidden_states) + + if self.experts.is_internal_router: + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=hidden_states + ) + else: + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) + + if hidden_states.dtype != torch.float16: + if not self.is_rocm_aiter_moe_enabled: + final_hidden_states = final_hidden_states * self.routed_scaling_factor + + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0 + ) + final_hidden_states = final_hidden_states[:num_tokens] + elif self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states + ) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class MiniMaxM2Attention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rotary_dim: int, + rope_parameters: dict[str, Any] | None = None, + attn_window_size: int | None = None, 
+ max_position_embeddings: int = 8192, + head_dim: int | None = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + if ( + rope_parameters is not None + and "partial_rotary_factor" not in rope_parameters + ): + rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim + self.rotary_emb = get_rope( + self.head_dim, + max_position=max_position_embeddings, + rope_parameters=rope_parameters, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + 
num_kv_heads=self.num_kv_heads, + per_layer_sliding_window=attn_window_size, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + self.q_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_heads, eps=rms_norm_eps + ) + self.k_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = MiniMaxText01RMSNormTP.forward_qk( + self.q_norm, self.k_norm, q.contiguous(), k.contiguous() + ) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class MiniMaxM2DecoderLayer(nn.Module): + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): + max_position_embeddings = max( + config.max_position_embeddings, config.max_model_len + ) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. 
+ layer_idx = int(prefix.split(sep=".")[-1]) + + self.layer_idx = layer_idx + self.self_attn = MiniMaxM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rotary_dim=config.rotary_dim, + rope_parameters=config.rope_parameters, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, "attention_bias", False), + head_dim=getattr(config, "head_dim", None), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + self.block_sparse_moe = MiniMaxM2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + + hidden_states = self.block_sparse_moe(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class MiniMaxM2Model(nn.Module): + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config + + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = 
VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=None, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MiniMaxM2DecoderLayer( + config, + prefix, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + 
num_redundant_experts=0, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = self.get_expert_mapping() + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class MiniMaxM2MixtureOfExperts(MixtureOfExperts): + """EPLB protocol implementation for MiniMax M2/M2.5.""" + + moe_mlp_layers: list[MiniMaxM2MoE] + + def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("MiniMax M2: No MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + 
self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class MiniMaxM2ForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + if hasattr(vllm_config.model_config, "max_model_len"): + self.config.max_model_len = vllm_config.model_config.max_model_len + self.model = MiniMaxM2Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=None + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + self.num_moe_layers = config.num_hidden_layers + self._set_moe_parameters() + + def _set_moe_parameters(self): + self.expert_weights: list = [] + self.num_expert_groups = 1 + self.moe_layers: list = [] + self.moe_mlp_layers: list[MiniMaxM2MoE] = [] + example_moe = None + for 
layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, MiniMaxM2DecoderLayer) + if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): + example_moe = layer.block_sparse_moe + self.moe_mlp_layers.append(layer.block_sparse_moe) + self.moe_layers.append(layer.block_sparse_moe.experts) + self.extract_moe_parameters(example_moe) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +def get_spec_layer_idx_from_weight_name( + config: PretrainedConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_mtp_modules): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None From 17a4abfd54e34793eef38fa0b4f263354a9d009d Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 15:21:45 +0000 Subject: [PATCH 24/85] Fix: rename minimaxm25 to minimaxm2.5 for CI naming consistency Align MiniMax M2.5 disagg naming with existing single-node configs (minimaxm2.5_fp8_mi355x.sh, minimaxm2.5_fp8_mi300x.sh, etc.). 
- amd-master.yaml: minimaxm25 -> minimaxm2.5 in config key + model-prefix - Rename entry script: minimaxm25_fp8_mi355x_vllm-disagg.sh -> minimaxm2.5_fp8_mi355x_vllm-disagg.sh - Dockerfile: update COPY path to match renamed script --- .github/configs/amd-master.yaml | 4 ++-- ...x_vllm-disagg.sh => minimaxm2.5_fp8_mi355x_vllm-disagg.sh} | 0 docker/minimax-m25-disagg/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename benchmarks/multi_node/{minimaxm25_fp8_mi355x_vllm-disagg.sh => minimaxm2.5_fp8_mi355x_vllm-disagg.sh} (100%) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index df3f90cfd..b82850cdd 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1456,10 +1456,10 @@ kimik2.5-fp4-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" -minimaxm25-fp8-mi355x-vllm-disagg: +minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm25 + model-prefix: minimaxm2.5 runner: mi355x-disagg precision: fp8 framework: vllm-disagg diff --git a/benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh similarity index 100% rename from benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh rename to benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh diff --git a/docker/minimax-m25-disagg/Dockerfile b/docker/minimax-m25-disagg/Dockerfile index 3bced3f91..88e9ce764 100644 --- a/docker/minimax-m25-disagg/Dockerfile +++ b/docker/minimax-m25-disagg/Dockerfile @@ -76,7 +76,7 @@ RUN pip install --no-cache-dir nixl && \ # 6. 
Copy disagg orchestration scripts into the image # ---------------------------------------------------------------- COPY benchmarks/multi_node/vllm_disagg_utils/ /opt/vllm_disagg/ -COPY benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh /opt/vllm_disagg/ +COPY benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh /opt/vllm_disagg/ # ---------------------------------------------------------------- # 7. Environment From fec9fe253ea248bd6ab9e7dd3ba376f5b637293f Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 16:07:42 +0000 Subject: [PATCH 25/85] Optimize: add --gpu-memory-utilization 0.95 and --block-size 32 to MiniMax M2.5 disagg Align MiniMax M2.5 disagg serve parameters with the proven single-node config (minimaxm2.5_fp8_mi355x.sh). MiniMax M2.5 uses GQA (not MLA), so block-size 32 is optimal (vs block-size 1 for DeepSeek/Kimi MLA). The extra 5% GPU memory (0.95 vs default 0.9) increases KV cache capacity for high-concurrency sweeps (C256/C512). --- benchmarks/multi_node/vllm_disagg_utils/models.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 3e62972b8..0b4629b13 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -42,8 +42,8 @@ Kimi-K2.5-MXFP4: hf_dir: "models--amd--Kimi-K2.5-MXFP4" MiniMax-M2.5: - prefill_flags: "--tensor-parallel-size 8 --no-enable-prefix-caching" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching" + prefill_flags: "--tensor-parallel-size 8 --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 
VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--MiniMaxAI--MiniMax-M2.5" From 4a0a81a9a22ef1b9e6c77820064fa2fd6886a286 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 18:17:08 +0000 Subject: [PATCH 26/85] =?UTF-8?q?Fix:=20MiniMax=20M2.5=20disagg=20?= =?UTF-8?q?=E2=80=94=20require=20EP=3D8=20for=20prefill,=20fix=20ROCm=20ga?= =?UTF-8?q?te=20dtype?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MiniMax M2.5 has expert intermediate_size=1536; with TP=8 and no EP the sharded dimension (192) is not divisible by FP8 block_n=128, crashing the prefill node. Set prefill EP=8 (matching decode and single-node) and add --enable-expert-parallel --all2all-backend mori to prefill_flags. Fix GateLinear to use out_dtype=torch.float32 instead of params_dtype=torch.float32 so the GEMM runs in bf16 (ROCm compatible) and only the output is cast to fp32 for routing precision. Remove the 1K/8K benchmark scenario (not needed). --- .github/configs/amd-master.yaml | 26 +++---------------- .../multi_node/vllm_disagg_utils/models.yaml | 2 +- .../vllm_disagg_utils/patches/minimax_m2.py | 2 +- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b82850cdd..132a41f4f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1470,12 +1470,14 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: osl: 1024 search-space: # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, + # TP8 shards to 192 which is not divisible by FP8 block_n=128. 
- spec-decoding: "none" conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] prefill: num-worker: 1 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "PREFILL_NODES=1" @@ -1496,30 +1498,9 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: prefill: num-worker: 1 tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 ep: 8 dp-attn: false additional-settings: - - "DECODE_NODES=2" - - - isl: 1024 - osl: 8192 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - "PREFILL_NODES=1" - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: @@ -1530,7 +1511,6 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" - dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 0b4629b13..c6d27b5ae 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -42,7 +42,7 @@ Kimi-K2.5-MXFP4: hf_dir: "models--amd--Kimi-K2.5-MXFP4" MiniMax-M2.5: - prefill_flags: "--tensor-parallel-size 8 --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--MiniMaxAI--MiniMax-M2.5" diff --git a/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py 
b/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py index c27b77ccf..8290276fb 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py +++ b/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py @@ -128,7 +128,7 @@ def __init__( self.gate = GateLinear( config.hidden_size, config.num_local_experts, - params_dtype=torch.float32, + out_dtype=torch.float32, prefix=f"{prefix}.gate", ) From 9445f6a04279ca3a531d08ac26886eb7fb8b9891 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 19:28:13 +0000 Subject: [PATCH 27/85] Remove unused docker/minimax-m25-disagg/ directory The Dockerfile, build.sh, and duplicate minimax_m2.py patch were never used by the CI pipeline or local tests. --- docker/minimax-m25-disagg/Dockerfile | 91 --- docker/minimax-m25-disagg/build.sh | 31 - .../minimax-m25-disagg/patches/minimax_m2.py | 672 ------------------ 3 files changed, 794 deletions(-) delete mode 100644 docker/minimax-m25-disagg/Dockerfile delete mode 100644 docker/minimax-m25-disagg/build.sh delete mode 100644 docker/minimax-m25-disagg/patches/minimax_m2.py diff --git a/docker/minimax-m25-disagg/Dockerfile b/docker/minimax-m25-disagg/Dockerfile deleted file mode 100644 index 88e9ce764..000000000 --- a/docker/minimax-m25-disagg/Dockerfile +++ /dev/null @@ -1,91 +0,0 @@ -# MiniMax M2.5 PD Disaggregation Docker Image -# -# Extends the public vLLM ROCm image with: -# 1. WideEP + MoRI support for MiniMax M2.5 (minimax_m2.py patch) -# 2. Pre-installed runtime deps (UCX, RIXL, etcd, MoRI) -# 3. Disagg orchestration scripts baked in -# -# Build: -# docker build -t minimax-m25-disagg:latest -f docker/minimax-m25-disagg/Dockerfile . -# -# The image still sources setup_deps.sh at startup for idempotent patching -# (scheduler KV reaper, MoRI-IO read mode, etc.) but the heavy build steps -# (UCX, RIXL) are cached in the image layer. 
- -ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.18.0 -FROM ${BASE_IMAGE} - -ARG ROCM_PATH=/opt/rocm -ARG UCX_HOME=/usr/local/ucx -ARG RIXL_HOME=/usr/local/rixl - -# ---------------------------------------------------------------- -# 1. Patch vLLM: MiniMax M2.5 WideEP + MoRI + EPLB support -# ---------------------------------------------------------------- -COPY docker/minimax-m25-disagg/patches/minimax_m2.py /tmp/patches/minimax_m2.py -RUN VLLM_MODELS=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \ - cp /tmp/patches/minimax_m2.py "${VLLM_MODELS}/minimax_m2.py" && \ - echo "[DOCKER] Patched minimax_m2.py -> ${VLLM_MODELS}/minimax_m2.py" && \ - rm -rf /tmp/patches - -# ---------------------------------------------------------------- -# 2. Pre-install UCX build deps (speeds up setup_deps.sh at runtime) -# ---------------------------------------------------------------- -RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ - autoconf automake libtool pkg-config \ - librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ - infiniband-diags perftest ethtool rdma-core strace \ - && rm -rf /var/lib/apt/lists/* - -# ---------------------------------------------------------------- -# 3. 
Pre-build UCX (ROCm fork) — the longest step in setup_deps.sh -# ---------------------------------------------------------------- -RUN git clone --quiet https://github.com/ROCm/ucx.git /usr/local/src/ucx && \ - cd /usr/local/src/ucx && \ - git checkout da3fac2a && \ - ./autogen.sh && mkdir -p build && cd build && \ - ../configure \ - --prefix="${UCX_HOME}" \ - --enable-shared --disable-static \ - --disable-doxygen-doc --enable-optimizations \ - --enable-devel-headers --enable-mt \ - --with-rocm="${ROCM_PATH}" --with-verbs --with-dm && \ - make -j"$(nproc)" && make install && \ - rm -rf /usr/local/src/ucx - -# ---------------------------------------------------------------- -# 4. Pre-install etcd -# ---------------------------------------------------------------- -RUN ARCH=$(uname -m) && \ - if [ "$ARCH" = "x86_64" ]; then ETCD_ARCH=amd64; else ETCD_ARCH=arm64; fi && \ - ETCD_VER=v3.5.21 && \ - curl -fsSL "https://github.com/etcd-io/etcd/releases/download/${ETCD_VER}/etcd-${ETCD_VER}-linux-${ETCD_ARCH}.tar.gz" | \ - tar xz -C /usr/local/bin --strip-components=1 \ - "etcd-${ETCD_VER}-linux-${ETCD_ARCH}/etcd" \ - "etcd-${ETCD_VER}-linux-${ETCD_ARCH}/etcdctl" && \ - etcd --version - -# ---------------------------------------------------------------- -# 5. Pre-install RIXL (Nixl KV transfer) -# ---------------------------------------------------------------- -RUN pip install --no-cache-dir nixl && \ - python3 -c "import nixl; print('RIXL installed:', nixl.__file__)" || \ - echo "[DOCKER] WARN: nixl pip install failed, will fallback to setup_deps.sh" - -# ---------------------------------------------------------------- -# 6. Copy disagg orchestration scripts into the image -# ---------------------------------------------------------------- -COPY benchmarks/multi_node/vllm_disagg_utils/ /opt/vllm_disagg/ -COPY benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh /opt/vllm_disagg/ - -# ---------------------------------------------------------------- -# 7. 
Environment -# ---------------------------------------------------------------- -ENV UCX_HOME=${UCX_HOME} \ - RIXL_HOME=${RIXL_HOME} \ - ROCM_PATH=${ROCM_PATH} \ - PATH="${UCX_HOME}/bin:/usr/local/bin:${PATH}" \ - LD_LIBRARY_PATH="${UCX_HOME}/lib:${LD_LIBRARY_PATH:-}" \ - PYTHONPYCACHEPREFIX=/tmp/pycache - -WORKDIR /workspace diff --git a/docker/minimax-m25-disagg/build.sh b/docker/minimax-m25-disagg/build.sh deleted file mode 100644 index b36227caf..000000000 --- a/docker/minimax-m25-disagg/build.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -# Build the MiniMax M2.5 PD Disagg Docker image. -# -# Usage: -# cd -# bash docker/minimax-m25-disagg/build.sh [tag] [base_image] -# -# Examples: -# bash docker/minimax-m25-disagg/build.sh # default tag + base -# bash docker/minimax-m25-disagg/build.sh my-tag:v1 # custom tag -# bash docker/minimax-m25-disagg/build.sh latest vllm/vllm-openai-rocm:v0.19.0 # custom base -set -euo pipefail - -REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -TAG="${1:-minimax-m25-disagg:latest}" -BASE_IMAGE="${2:-vllm/vllm-openai-rocm:v0.18.0}" - -echo "Building MiniMax M2.5 Disagg image..." -echo " Tag: $TAG" -echo " Base image: $BASE_IMAGE" -echo " Context: $REPO_ROOT" - -docker build \ - -t "$TAG" \ - --build-arg BASE_IMAGE="$BASE_IMAGE" \ - -f "$REPO_ROOT/docker/minimax-m25-disagg/Dockerfile" \ - "$REPO_ROOT" - -echo "" -echo "Done. Image: $TAG" -echo "To push: docker tag $TAG /$TAG && docker push /$TAG" diff --git a/docker/minimax-m25-disagg/patches/minimax_m2.py b/docker/minimax-m25-disagg/patches/minimax_m2.py deleted file mode 100644 index c27b77ccf..000000000 --- a/docker/minimax-m25-disagg/patches/minimax_m2.py +++ /dev/null @@ -1,672 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copyright 2025 The MiniMax AI team. -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
-# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only MiniMaxM2/M2.5 model.""" - -from collections.abc import Iterable -from typing import Any - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm._aiter_ops import rocm_aiter_ops -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config -from vllm.distributed import ( - get_ep_group, - get_pp_group, - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_gather, -) -from vllm.logger import init_logger -from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - QKVParallelLinear, - RowParallelLinear, -) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP -from vllm.model_executor.layers.quantization import QuantizationConfig -from 
vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, - VocabParallelEmbedding, -) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) -from vllm.model_executor.models.utils import sequence_parallel_chunk -from vllm.sequence import IntermediateTensors - -from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP -from .utils import ( - AutoWeightsLoader, - PPMissingLayer, - is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, - make_layers, - maybe_prefix, -) - -logger = init_logger(__name__) - - -class MiniMaxM2MoE(nn.Module): - """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. - - Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with - expert parallelism, EPLB, and sequence parallel awareness. - """ - - def __init__( - self, - config: PretrainedConfig, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ): - super().__init__() - vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config - - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - - self.ep_group = get_ep_group().device_group - self.ep_rank = get_ep_group().rank_in_group - self.ep_size = self.ep_group.size() - - self.n_routed_experts: int = config.num_local_experts - self.n_shared_experts: int = 0 - - self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe - self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) - self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - - eplb_config = parallel_config.eplb_config - self.enable_eplb = parallel_config.enable_eplb - self.n_redundant_experts = eplb_config.num_redundant_experts - self.n_logical_experts = self.n_routed_experts - self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts - 
self.n_local_physical_experts = self.n_physical_experts // self.ep_size - - self.use_routing_bias = getattr(config, "use_routing_bias", False) - if self.use_routing_bias: - self.e_score_correction_bias = nn.Parameter( - torch.empty(config.num_local_experts, dtype=torch.float32) - ) - self.e_score_correction_bias.weight_loader = ( - MiniMaxM2MoE.ebias_weight_loader - ) - else: - self.e_score_correction_bias = None - - self.gate = GateLinear( - config.hidden_size, - config.num_local_experts, - params_dtype=torch.float32, - prefix=f"{prefix}.gate", - ) - - self.experts = FusedMoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - reduce_results=False, - renormalize=True, - scoring_func=getattr(config, "scoring_func", "softmax"), - e_score_correction_bias=self.e_score_correction_bias, - quant_config=quant_config, - prefix=f"{prefix}.experts", - enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts, - is_sequence_parallel=self.is_sequence_parallel, - router_logits_dtype=torch.float32, - gate=self.gate, - routed_scaling_factor=1.0 - if not self.is_rocm_aiter_moe_enabled - else self.routed_scaling_factor, - ) - - @staticmethod - def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: - assert param.size() == loaded_weight.size() - param.data.copy_(loaded_weight.to(torch.float32)) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - - if self.is_sequence_parallel: - hidden_states = sequence_parallel_chunk(hidden_states) - - if self.experts.is_internal_router: - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=hidden_states - ) - else: - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=hidden_states, 
router_logits=router_logits - ) - - if hidden_states.dtype != torch.float16: - if not self.is_rocm_aiter_moe_enabled: - final_hidden_states = final_hidden_states * self.routed_scaling_factor - - if self.is_sequence_parallel: - final_hidden_states = tensor_model_parallel_all_gather( - final_hidden_states, 0 - ) - final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) - - return final_hidden_states.view(num_tokens, hidden_dim) - - -class MiniMaxM2Attention(nn.Module): - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rotary_dim: int, - rope_parameters: dict[str, Any] | None = None, - attn_window_size: int | None = None, - max_position_embeddings: int = 8192, - head_dim: int | None = None, - rms_norm_eps: float = 1e-06, - qkv_bias: bool = False, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = head_dim or (hidden_size // self.total_num_heads) - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - - if ( - rope_parameters is not None - and "partial_rotary_factor" not in rope_parameters - ): - rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim - self.rotary_emb = get_rope( - self.head_dim, - max_position=max_position_embeddings, - rope_parameters=rope_parameters, - ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - per_layer_sliding_window=attn_window_size, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - ) - - self.q_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_heads, eps=rms_norm_eps - ) - self.k_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = MiniMaxText01RMSNormTP.forward_qk( - self.q_norm, self.k_norm, q.contiguous(), k.contiguous() - ) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output - - -class MiniMaxM2DecoderLayer(nn.Module): - def 
__init__( - self, - config: PretrainedConfig, - prefix: str, - model_config: ModelConfig, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): - max_position_embeddings = max( - config.max_position_embeddings, config.max_model_len - ) - # DecoderLayers are created with `make_layers` which passes the prefix - # with the layer's index. - layer_idx = int(prefix.split(sep=".")[-1]) - - self.layer_idx = layer_idx - self.self_attn = MiniMaxM2Attention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rotary_dim=config.rotary_dim, - rope_parameters=config.rope_parameters, - max_position_embeddings=max_position_embeddings, - rms_norm_eps=config.rms_norm_eps, - qkv_bias=getattr(config, "attention_bias", False), - head_dim=getattr(config, "head_dim", None), - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - - self.block_sparse_moe = MiniMaxM2MoE( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor | None, - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - - # Fully Connected - hidden_states, residual = 
self.post_attention_layernorm(hidden_states, residual) - - hidden_states = self.block_sparse_moe(hidden_states) - - return hidden_states, residual - - -@support_torch_compile -class MiniMaxM2Model(nn.Module): - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - self.config = config - - self.vocab_size = config.vocab_size - - if get_pp_group().is_first_rank: - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - quant_config=None, - prefix=f"{prefix}.embed_tokens", - ) - else: - self.embed_tokens = PPMissingLayer() - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: MiniMaxM2DecoderLayer( - config, - prefix, - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, - ), - prefix=f"{prefix}.layers", - ) - - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size - ) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None, - inputs_embeds: torch.Tensor | None = None, - ) -> torch.Tensor | IntermediateTensors: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.embed_input_ids(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = 
intermediate_tensors["residual"] - - for layer in self.layers[self.start_layer : self.end_layer]: - hidden_states, residual = layer(positions, hidden_states, residual) - - if not get_pp_group().is_last_rank: - return IntermediateTensors( - {"hidden_states": hidden_states, "residual": residual} - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return FusedMoE.make_expert_params_mapping( - self, - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts, - num_redundant_experts=0, - ) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = self.get_expert_mapping() - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) - if spec_layer is not None: - continue # skip spec decode layers for main model - - for param_name, weight_name, shard_id in stacked_params_mapping: - # Skip non-stacked layers and experts (experts handled below). - if weight_name not in name: - continue - # We have mlp.experts[0].gate_proj in the checkpoint. - # Since we handle the experts below in expert_params_mapping, - # we need to skip here BEFORE we update the name, otherwise - # name will be updated to mlp.experts[0].gate_up_proj, which - # will then be updated below in expert_params_mapping - # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if ("mlp.experts." 
in name) and name not in params_dict: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( - param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id, - ) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class MiniMaxM2MixtureOfExperts(MixtureOfExperts): - """EPLB protocol implementation for MiniMax M2/M2.5.""" - - moe_mlp_layers: list[MiniMaxM2MoE] - - def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): - if example_moe is None: - self.num_moe_layers = 0 - self.num_expert_groups = 0 - self.num_logical_experts = 0 - self.num_physical_experts = 0 - self.num_local_physical_experts = 0 - self.num_routed_experts = 0 - self.num_shared_experts = 0 - self.num_redundant_experts = 0 - logger.warning("MiniMax M2: No MoE layer found in model.layers.") - else: - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = 
example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for moe in self.moe_mlp_layers: - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() - - -class MiniMaxM2ForCausalLM( - nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts -): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - if hasattr(vllm_config.model_config, "max_model_len"): - self.config.max_model_len = vllm_config.model_config.max_model_len - self.model = MiniMaxM2Model( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") - ) - if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead( - config.vocab_size, config.hidden_size, quant_config=None - ) - else: - self.lm_head = PPMissingLayer() - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) - - self.num_moe_layers = config.num_hidden_layers - self._set_moe_parameters() - - def _set_moe_parameters(self): - 
self.expert_weights: list = [] - self.num_expert_groups = 1 - self.moe_layers: list = [] - self.moe_mlp_layers: list[MiniMaxM2MoE] = [] - example_moe = None - for layer in self.model.layers: - if isinstance(layer, PPMissingLayer): - continue - assert isinstance(layer, MiniMaxM2DecoderLayer) - if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): - example_moe = layer.block_sparse_moe - self.moe_mlp_layers.append(layer.block_sparse_moe) - self.moe_layers.append(layer.block_sparse_moe.experts) - self.extract_moe_parameters(example_moe) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.embed_input_ids(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: torch.Tensor | None = None, - **kwargs, - ) -> torch.Tensor | IntermediateTensors: - hidden_states = self.model( - input_ids, positions, intermediate_tensors, inputs_embeds - ) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - ) -> torch.Tensor | None: - logits = self.logits_processor(self.lm_head, hidden_states) - return logits - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return self.model.get_expert_mapping() - - -def get_spec_layer_idx_from_weight_name( - config: PretrainedConfig, weight_name: str -) -> int | None: - if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): - layer_idx = config.num_hidden_layers - for i in range(config.num_mtp_modules): - if weight_name.startswith(f"model.layers.{layer_idx + i}."): - return layer_idx + i - return None From 4b94881e4a0a28a2c8ec32e2b686d3b97646ee80 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Mon, 13 Apr 2026 03:00:45 +0000 Subject: [PATCH 28/85] remove vllm disagg for 
dpsr1 and dpv3 Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 53 ------------- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 79 ------------------- .../multi_node/vllm_disagg_utils/models.yaml | 13 +-- 3 files changed, 1 insertion(+), 144 deletions(-) delete mode 100755 benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 132a41f4f..26a34ebcb 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1350,59 +1350,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" -dsr1-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp8 - framework: vllm-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - kimik2.5-fp4-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:v0.18.0 model: amd/Kimi-K2.5-MXFP4 diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh deleted file mode 100755 index b21e9204a..000000000 --- 
a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - CONC_LIST \ - ISL \ - OSL \ - IMAGE \ - SPEC_DECODING \ - MODEL_PATH \ - PREFILL_NUM_WORKERS \ - PREFILL_TP \ - PREFILL_EP \ - PREFILL_DP_ATTN \ - DECODE_NUM_WORKERS \ - DECODE_TP \ - DECODE_EP \ - DECODE_DP_ATTN \ - PREFILL_NODES \ - DECODE_NODES \ - RANDOM_RANGE_RATIO - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -set -x - -cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 - -export TIME_LIMIT="08:00:00" -export MODEL_PATH=$MODEL_PATH -export MODEL_NAME=$MODEL_NAME -export CONTAINER_IMAGE=$IMAGE - -# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh -if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then - export PREFILL_ENABLE_EP=false -else - export PREFILL_ENABLE_EP=true -fi - -if [[ "$PREFILL_DP_ATTN" == "true" ]]; then - export PREFILL_ENABLE_DP=true -else - export PREFILL_ENABLE_DP=false -fi - -if [[ "${DECODE_EP:-1}" -eq 1 ]]; then - export DECODE_ENABLE_EP=false -else - export DECODE_ENABLE_EP=true -fi - -if [[ "$DECODE_DP_ATTN" == "true" ]]; then - export DECODE_ENABLE_DP=true -else - export DECODE_ENABLE_DP=false -fi - -# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. -JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ - $PREFILL_NUM_WORKERS \ - $DECODE_NODES \ - $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf \ - ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ - ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ - ${PREFILL_TP} ${DECODE_TP} \ - ${RANDOM_RANGE_RATIO} \ - "${NODELIST:-}") - -if [[ $? 
-ne 0 ]]; then - echo "Failed to submit job" >&2 - exit 1 -fi - -echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index c6d27b5ae..c68bb46e3 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -12,7 +12,7 @@ # decode_flags: str # vLLM CLI flags for decode workers # env: str # Space-separated KEY=VALUE pairs exported before vllm serve # hf_dir: str # (optional) On-disk directory name if it differs from the key -# # e.g. HF cache layout: models--deepseek-ai--DeepSeek-R1-0528 +# # e.g. HF cache layout: models--amd--Kimi-K2.5-MXFP4 Llama-3.1-405B-Instruct-FP8-KV: prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" @@ -24,17 +24,6 @@ amd-Llama-3.3-70B-Instruct-FP8-KV: decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" -DeepSeek-V3: - prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" - -DeepSeek-R1-0528: - prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' 
--no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" - hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" - Kimi-K2.5-MXFP4: prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" From c5ba7eaff990061006d64f913a37437db7721e24 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 21 Apr 2026 06:40:27 +0000 Subject: [PATCH 29/85] consolidate amd_utils for sglang and vllm Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/bench.sh | 72 +- benchmarks/multi_node/amd_utils/env.sh | 232 +++-- benchmarks/multi_node/amd_utils/job.slurm | 468 ++++----- .../multi_node/amd_utils/models_vllm.yaml | 42 + .../multi_node/amd_utils/moriio_proxy.py | 327 +++++++ .../amd_utils/patches/minimax_m2.py | 672 +++++++++++++ benchmarks/multi_node/amd_utils/server.sh | 783 +-------------- .../multi_node/amd_utils/server_sglang.sh | 624 ++++++++++++ .../multi_node/amd_utils/server_vllm.sh | 490 ++++++++++ benchmarks/multi_node/amd_utils/setup_deps.sh | 908 ++++++++++++++++++ benchmarks/multi_node/amd_utils/start_etcd.sh | 47 + benchmarks/multi_node/amd_utils/submit.sh | 112 ++- benchmarks/multi_node/amd_utils/sync.py | 5 +- .../dsr1_fp4_mi355x_sglang-disagg.sh | 3 +- .../dsr1_fp8_mi355x_sglang-disagg.sh | 3 +- .../kimik2.5_fp4_mi355x_vllm-disagg.sh | 5 +- .../minimaxm2.5_fp8_mi355x_vllm-disagg.sh | 5 +- 17 files changed, 3645 insertions(+), 1153 deletions(-) create mode 100644 benchmarks/multi_node/amd_utils/models_vllm.yaml create mode 
100644 benchmarks/multi_node/amd_utils/moriio_proxy.py create mode 100644 benchmarks/multi_node/amd_utils/patches/minimax_m2.py create mode 100755 benchmarks/multi_node/amd_utils/server_sglang.sh create mode 100755 benchmarks/multi_node/amd_utils/server_vllm.sh create mode 100644 benchmarks/multi_node/amd_utils/setup_deps.sh create mode 100755 benchmarks/multi_node/amd_utils/start_etcd.sh diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index ac996c5a9..87f3b1e8a 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -1,4 +1,17 @@ #!/bin/bash +# Dual-Engine Disaggregated Benchmark Runner +# +# ENGINE=sglang (default): SGLang benchmark +# ENGINE=vllm: vLLM benchmark +# +# Produces JSON result files via benchmark_serving.py so that the CI pipeline +# can collect and process results. +# +# Usage: bash bench.sh \ +# \ +# + +ENGINE="${ENGINE:-sglang}" n_prefill=$1 n_decode=$2 @@ -6,58 +19,81 @@ prefill_gpus=$3 decode_gpus=$4 model_path=$5 model_name=$6 -MODEL_PATH="${model_path}/${model_name}" +MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" log_path=$7 chosen_isl=${8:-1024} chosen_osl=${9:-1024} concurrency_list=${10:-"512x1"} -chosen_req_rate=${11:-1} +if [[ "$ENGINE" == "vllm" ]]; then + chosen_req_rate=${11:-inf} +else + chosen_req_rate=${11:-1} +fi random_range_ratio=${12:-0.8} num_prompts_multiplier=${13:-10} IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" -echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" - -head_node="localhost" -head_port="30000" +ROUTER_PORT="${ROUTER_PORT:-30000}" +echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" -profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}" -mkdir -p $profile_folder +profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}" +mkdir -p "$profile_folder" source "$(dirname 
"$0")/../../benchmark_lib.sh" -# Repo root inside the container (3 levels up from this script's directory) REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" -for max_concurrency in ${chosen_concurrencies[@]}; do +for max_concurrency in "${chosen_concurrencies[@]}"; do export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" + num_prompts=$(( max_concurrency * num_prompts_multiplier )) + if [[ "$num_prompts" -lt 16 ]]; then + num_prompts=16 + fi + echo "profile_folder: $profile_folder" echo "max_concurrency: $max_concurrency" echo "chosen_req_rate: $chosen_req_rate" echo "MODEL_PATH: $MODEL_PATH" - echo "head_port: $head_port" + echo "ROUTER_PORT: $ROUTER_PORT" echo "chosen_isl: $chosen_isl" echo "chosen_osl: $chosen_osl" + echo "num_prompts: $num_prompts" echo "export_file: $export_file" + # Engine-specific extra flags + extra_flags="" + if [[ "$ENGINE" == "vllm" ]]; then + extra_flags="--trust-remote-code" + else + if [ "$IS_MTP" = "true" ]; then + extra_flags="--use-chat-template" + fi + fi + run_benchmark_serving \ --bench-serving-dir "$REPO_ROOT" \ - --model ${MODEL_PATH} \ - --port ${head_port} \ + --model "$MODEL_PATH" \ + --port "$ROUTER_PORT" \ --backend openai \ - --input-len ${chosen_isl} \ - --output-len ${chosen_osl} \ - --random-range-ratio ${random_range_ratio} \ - --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \ + --input-len "$chosen_isl" \ + --output-len "$chosen_osl" \ + --random-range-ratio "$random_range_ratio" \ + --num-prompts "$num_prompts" \ --max-concurrency "$max_concurrency" \ --result-filename "$export_file" \ --result-dir /workspace/ \ - $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" ) + $extra_flags echo "-----------------------------------------" + + # vLLM: cooldown between rounds for idle KV block reaper + if [[ "$ENGINE" == "vllm" ]]; then + echo "[BENCH] Cooldown: waiting 10s for idle KV block 
reaper..." + sleep 10 + fi done diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 904576003..c5a438541 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -1,142 +1,198 @@ #!/bin/bash -# SGLang/MoRI environment setup for multi-node disaggregated serving. +# Dual-engine environment setup for multi-node disaggregated serving. +# +# ENGINE=sglang (default): SGLang/MoRI environment +# ENGINE=vllm: vLLM/Nixl environment # # REQUIRED ENVIRONMENT VARIABLES: # IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) -# This must be set by the runner script (runners/launch_mi355x-amds.sh) -# -# OPTIONAL ENVIRONMENT VARIABLES: -# MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by runner if cluster uses QoS. - +# Set by runner or auto-detected from hostname. set -x + +ENGINE="${ENGINE:-sglang}" export PYTHONDONTWRITEBYTECODE=1 -# IBDEVICES configuration +# ============================================================================= +# Shared: IBDEVICES detection +# ============================================================================= + # Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) # Fall back to hostname detection if not set (for direct script execution) if [[ -z "$IBDEVICES" ]]; then - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 - elif [[ $NODENAME == mia1* ]]; then - export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') + if [[ -n "$DETECTED" ]]; then + export IBDEVICES="$DETECTED" else - echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2 - exit 1 + echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." 
>&2 fi - echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME" + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" else echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" fi export IBDEVICES -# Auto-detect default network interface (portable across clusters) +# Shared: Auto-detect default network interface (portable across clusters) export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +set +x -export NCCL_IB_HCA=$IBDEVICES +export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} -export SGLANG_USE_AITER=1 +# ============================================================================= +# Engine-specific environment +# ============================================================================= -export SGLANG_MORI_DISPATCH_DTYPE=auto -export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast -export MORI_COMBINE_DTYPE_DECODE=fp8 -export SGLANG_MORI_QP_PER_TRANSFER=4 -export SGLANG_MORI_NUM_WORKERS=4 -export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 +if [[ "$ENGINE" == "vllm" ]]; then + # ========================================================================= + # vLLM/Nixl-specific environment + # ========================================================================= + set -x -export MORI_IO_QP_MAX_SEND_WR=16384 -export MORI_IO_QP_MAX_CQE=32768 -export MORI_IO_QP_MAX_SGE=4 + # UCX_NET_DEVICES: Use the first benic interface for UCX TCP transport + if [[ -z "$UCX_NET_DEVICES" ]]; then + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) + if [[ -n "$UCX_NET_DEV" ]]; then + export UCX_NET_DEVICES="$UCX_NET_DEV" + else + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi + fi + echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" + else + echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES 
(set by environment)" + fi -export MORI_IO_TC_DISABLE=0 + # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing + export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} -export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 -export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 + # QoS/DSCP configuration for lossless RoCEv2 fabric. + if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then + echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' +$1 == "DSCP" && $2 == ":" && $NF == p { + print $3; exit +}') + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) + export UCX_IB_SL=$ND_PRIO + echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + fi + fi + else + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + else + echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." 
+ fi + fi -# Disable allocating memory in one pass -export MORI_SHMEM_MODE=ISOLATION + set +x + echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" -# Enable spec v2 -export SGLANG_ENABLE_SPEC_V2=1 -export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0 +else + # ========================================================================= + # SGLang/MoRI-specific environment + # ========================================================================= -export SGLANG_LOG_MS=true -export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 + export SGLANG_USE_AITER=1 + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 + export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 -export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 -export MORI_MAX_DISPATCH_TOKENS_DECODE=512 + # Disable allocating memory in one pass + export MORI_SHMEM_MODE=ISOLATION + export SGLANG_MORI_FP8_DISP=True -export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768 -export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export SGLANG_MORI_FP8_DISP=False + fi + + export SGLANG_MORI_FP4_DISP=False + export SGLANG_MORI_FP8_COMB=False -# set MTP size=1 when EP16 -export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 + fi + export MORI_MAX_DISPATCH_TOKENS_DECODE=160 -export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + # set MTP size=1 when EP16 + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 -export MORI_APP_LOG_LEVEL=INFO 
+ export MORI_APP_LOG_LEVEL=INFO -# Router logging control: -# 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. -# 1 mirrors router logs to stdout via tee (useful for live debugging). -export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" + # Router logging control + export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" -# QoS/DSCP configuration -# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname -if [[ -n "$MORI_RDMA_TC" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" -elif command -v nicctl &> /dev/null; then - ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') - ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' + # QoS/DSCP configuration + if [[ -n "$MORI_RDMA_TC" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' $1 == "DSCP" && $2 == ":" && $NF == p { print $3; exit }') - if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then - TC=$(( 4 * ND_DSCP )) - export MORI_RDMA_SL=$ND_PRIO - export MORI_IO_SL=$ND_PRIO - export MORI_RDMA_TC=$TC - export MORI_IO_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + TC=$(( 4 * ND_DSCP )) + export MORI_RDMA_SL=$ND_PRIO + export MORI_RDMA_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." 
+ # Fall back to hostname-based detection + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export MORI_RDMA_TC=96 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export MORI_RDMA_TC=104 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + else + echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + fi + fi else - echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." - # Fall back to hostname-based detection + # nicctl not available, try hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 - export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 - export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else - echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." + echo " This is normal for clusters without QoS or outside Docker containers." fi fi -else - # nicctl not available, try hostname-based detection - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export MORI_RDMA_TC=96 - export MORI_IO_TC=96 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export MORI_RDMA_TC=104 - export MORI_IO_TC=104 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - else - echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." 
- echo " This is normal for clusters without QoS or outside Docker containers." - fi -fi - -# FIXME: WA for latest upstream 0305 image -export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} + # FIXME: WA for latest upstream 0305 image + export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} -set +x +fi diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 824605c46..56fefb0ed 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -1,265 +1,260 @@ #!/bin/bash -#SBATCH --job-name=1p2d_bench-serving # Specify a custom string for your slurm batch job -#SBATCH -N 3 # CHECK this to be right in batch jobs -#SBATCH -n 3 # CHECK this to be right in batch jobs +#SBATCH --job-name=disagg-bench +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job -#SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed) -#SBATCH --time=24:00:00 # Set a time limit for the job (HH:MM:SS) +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR +ENGINE="${ENGINE:-sglang}" -# ------------------------ -# Print current time in UTC and PST formats -# ------------------------ echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "ENGINE: $ENGINE" echo "=======================" echo "" # ============================================================================= -# Model validation from models.yaml (replaces hardcoded VALID_MODELS array) +# Model Validation # ============================================================================= -# DI_REPO_DIR is set below from $(pwd); use the submit-time working directory -# because sbatch copies this script to /var/spool/slurmd/ at runtime. 
-MODELS_YAML="$(pwd)/models.yaml" + +# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ +# at runtime, but the CWD remains the submit-time directory (amd_utils/). +if [[ "$ENGINE" == "vllm" ]]; then + MODELS_YAML="$(pwd)/models_vllm.yaml" +else + MODELS_YAML="$(pwd)/models.yaml" +fi if [[ ! -f "$MODELS_YAML" ]]; then - echo "Error: models.yaml not found at $MODELS_YAML" + echo "Error: models YAML not found at $MODELS_YAML" exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." + exit 1 +fi + +MODEL_NAME="${MODEL_NAME:-None}" if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Error: Model '$MODEL_NAME' not found in $MODELS_YAML" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" -# All models use server.sh as the entrypoint RUN_FILE="server.sh" echo "Runfile set: $RUN_FILE" -if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then - echo "Error: DOCKER_IMAGE_NAME is not set." - exit 1 -fi - -# DI_REPO_DIR points to the repo root so Docker can access both benchmarks/ and utils/. +# DI_REPO_DIR points to the repo root. # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. export DI_REPO_DIR=$(cd "$(pwd)/../../.." 
&& pwd) -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers +xP="${xP:-1}" +yD="${yD:-1}" -# Parallelism Configuration with defaults -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP - -# Benchmark Configuration with defaults +# Benchmark configuration BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -MODEL_NAME="${MODEL_NAME:-None}" +# Engine-specific defaults +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} + +# ============================================================================= +# Docker privilege detection +# ============================================================================= +# Detect on the batch host. Per-node detection happens inside srun below. 
+if docker ps &>/dev/null; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD + +# ============================================================================= +# Model Path Resolution +# ============================================================================= # MODEL_DIR detection: prefer env var, fall back to hostname detection if [[ -z "$MODEL_DIR" ]]; then NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then MODEL_DIR="/nfsdata" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then MODEL_DIR="/it-share/data" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" else - MODEL_DIR="/nfsdata" # Default fallback - echo "[INFO] Using default MODEL_DIR=$MODEL_DIR (hostname $NODENAME not recognized)" + MODEL_DIR="/nfsdata" fi + echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" fi export MODEL_DIR -# ------------------------ -# Model path validation and selection across all nodes -# ------------------------ -echo "Looking for model: $MODEL_NAME" -echo "Checking model availability across all allocated nodes..." 
- -# Get all allocated nodes -ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) - -echo "Total allocated nodes: $TOTAL_NODES" -echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" - -# Function to check model path on all nodes -check_model_path() { - local path=$1 - local check_name=$2 - - echo "Checking $check_name: $path" +if [[ "$ENGINE" == "vllm" ]]; then + # vLLM: Extract hf_dir from models.yaml, search multiple paths, resolve HF cache snapshots + DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} + found && /^[^ ]/{exit} + found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") + DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" + echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" + + resolve_hf_cache_path() { + local base_path=$1 + if [[ -d "${base_path}/snapshots" ]]; then + local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) + if [[ -n "$snapshot" ]]; then + echo "${base_path}/snapshots/${snapshot}" + return 0 + fi + fi + echo "$base_path" + return 1 + } + + MODEL_PATH="" + SEARCH_PATHS=( + "${MODEL_DIR}/${DISK_DIR_NAME}" + "${MODEL_DIR}/${MODEL_NAME}" + "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" + "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" + ) + + for search_path in "${SEARCH_PATHS[@]}"; do + if [[ -d "$search_path" ]]; then + RESOLVED=$(resolve_hf_cache_path "$search_path") + MODEL_PATH="$RESOLVED" + echo "Found MODEL_PATH: $MODEL_PATH" + break + fi + done - # Run check on all nodes in parallel - srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " - if [ -d '$path' ]; then - echo \"\$(hostname): ✓ Found $path\" - exit 0 + if [[ -z "$MODEL_PATH" ]]; then + echo "FATAL: Model '$MODEL_NAME' not found. 
Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 + fi + echo "Final MODEL_PATH: $MODEL_PATH" +else + # SGLang: Validate model path across all allocated nodes + echo "Looking for model: $MODEL_NAME" + echo "Checking model availability across all allocated nodes..." + + ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") + TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) + echo "Total allocated nodes: $TOTAL_NODES" + echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" + + check_model_path() { + local path=$1 + local check_name=$2 + echo "Checking $check_name: $path" + srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " + if [ -d '$path' ]; then + echo \"\$(hostname): Found $path\" + exit 0 + else + echo \"\$(hostname): Missing $path\" + exit 1 + fi + " + local exit_code=$? + if [ $exit_code -eq 0 ]; then + echo "$check_name available on ALL nodes" + return 0 else - echo \"\$(hostname): ✗ Missing $path\" - exit 1 + echo "$check_name NOT available on all nodes" + return 1 fi - " + } - # Check if all nodes succeeded (exit code 0) - local exit_code=$? 
- if [ $exit_code -eq 0 ]; then - echo "✓ $check_name available on ALL nodes" - return 0 + if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then + MODEL_PATH="$MODEL_DIR/$MODEL_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" else - echo "✗ $check_name NOT available on all nodes" - return 1 + echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:" + echo " - $MODEL_DIR/$MODEL_NAME" + exit 1 fi -} - -# Check model weights exist on "$MODEL_DIR/$MODEL_NAME" -if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then - MODEL_PATH="$MODEL_DIR/$MODEL_NAME" - echo "" - echo "✓ Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" -else - echo "" - echo "✗ FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in the following:" - echo " - $MODEL_DIR/$MODEL_NAME" - echo "" - echo "Model must be accessible from all nodes for distributed execution." - echo "Please ensure the model is available on all allocated nodes." - exit 1 + echo "Final MODEL_PATH: $MODEL_PATH" fi -echo "Final MODEL_PATH: $MODEL_PATH" -echo "" - -NUM_NODES="${NUM_NODES}" +# ============================================================================= +# Node Selection +# ============================================================================= -# ------------------------ -# Extract first NUM_NODES from SLURM allocation and update SLURM variables -# ------------------------ -echo "Original SLURM allocation:" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" +NUM_NODES=$((xP + yD)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD)" -# Get the full nodelist and extract first NUM_NODES FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Create new nodelist in SLURM format -# This is a simplified approach - for 
complex ranges, you might need more sophisticated parsing -NEW_SLURM_NODELIST=$(echo "$SELECTED_NODES" | paste -sd, | sed 's/,/,/g') - # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES export SLURM_JOB_NUM_NODES=$NUM_NODES export SLURM_NPROCS=$NUM_NODES -export SLURM_JOB_NODELIST="$NEW_SLURM_NODELIST" -export SLURM_NODELIST="$NEW_SLURM_NODELIST" - -# Keep other SLURM variables as they were or set defaults +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" -export SLURM_SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$HOME}" -export SLURM_CLUSTER_NAME="${SLURM_CLUSTER_NAME}" # Let SLURM set this automatically -export SLURM_JOB_CPUS_PER_NODE="${SLURM_JOB_CPUS_PER_NODE}" -export SLURM_JOB_PARTITION="${SLURM_JOB_PARTITION}" # Should be set by sbatch/runner -export SLURM_JOBID="${SLURM_JOBID:-$SLURM_JOB_ID}" -export SLURM_JOB_QOS="${SLURM_JOB_QOS}" # Should be set by sbatch/runner if needed -export SLURM_JOB_ACCOUNT="${SLURM_JOB_ACCOUNT}" # Should be set by sbatch/runner export SLURM_NTASKS_PER_NODE=1 -export SLURM_SUBMIT_HOST="${SLURM_SUBMIT_HOST}" -export SLURM_JOB_ID="${SLURM_JOB_ID}" -# SLURM_CONF is auto-set by SLURM, no need to override -export SLURM_JOB_NAME="${SLURM_JOB_NAME:-1p1d_bench-serving}" echo "" -echo "Updated SLURM Environment Variables:" -echo "SLURM_JOB_ID: $SLURM_JOB_ID" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" -echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "SLURM_JOB_CPUS_PER_NODE: $SLURM_JOB_CPUS_PER_NODE" -echo "SLURM_JOB_PARTITION: $SLURM_JOB_PARTITION" -echo "SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES" -echo "SLURM_JOBID: $SLURM_JOBID" -echo "SLURM_JOB_QOS: $SLURM_JOB_QOS" -echo "SLURM_NODELIST: $SLURM_NODELIST" -echo "SLURM_JOB_ACCOUNT: $SLURM_JOB_ACCOUNT" -echo "SLURM_NPROCS: $SLURM_NPROCS" -echo "SLURM_SUBMIT_HOST: 
$SLURM_SUBMIT_HOST" -echo "SLURM_CONF: $SLURM_CONF" -echo "SLURM_JOB_NAME: $SLURM_JOB_NAME" -echo "SLURM_NTASKS_PER_NODE: $SLURM_NTASKS_PER_NODE" -echo "SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "SLURM_CLUSTER_NAME: $SLURM_CLUSTER_NAME" -echo "ulimit: $(ulimit -a)" -echo "" -echo "Selected nodes for execution:" -echo "$SELECTED_NODES" -echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# ============================================================================= -# Node information USER_NAME=$(whoami) MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') IPS=() - -GW_NIC=$(ip route | awk '/^default/ {print $5; exit}') for NODE in $SELECTED_NODES; do IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') IP=$(echo "$IP" | awk '/src/ {print $7}') IPS+=("$IP") done -echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g' +echo "Node IPs: ${IPS[*]}" DOCKER_MOUNT_PATH="/workspace" -SGLANG_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" -timestamp=$(date +"%Y-%m-%d_%H-%M-%S") +WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" NNODES=$NUM_NODES -echo "MASTER_NODE is ${MASTER_NODE}" -echo "NODE0_ADDR is ${NODE0_ADDR}" -echo "NNODES is ${NNODES}" -echo "REPO Directory is ${DI_REPO_DIR}" -echo "USER_NAME is ${USER_NAME}" - -# Get the RDMA priority and DSCP value from the NIC -if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. Please ensure nicctl is installed and available." 
>&2 - exit 1 -fi +echo "MASTER_NODE: ${MASTER_NODE}" +echo "NODE0_ADDR: ${NODE0_ADDR}" +echo "NNODES: ${NNODES}" +echo "REPO DIR: ${DI_REPO_DIR}" +echo "USER: ${USER_NAME}" # Reduce log spam export TQDM_MININTERVAL=20 +# Translate the host-resolved MODEL_PATH to the Docker mount namespace +DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + export DI_REPO_DIR=$DI_REPO_DIR -export SGLANG_WS_PATH=$SGLANG_WS_PATH +export WS_PATH=$WS_PATH export NNODES=$NNODES export NODE0_ADDR=$NODE0_ADDR export MODEL_PATH=$MODEL_PATH @@ -269,21 +264,16 @@ export yD=$yD export MODEL_NAME=$MODEL_NAME export USER_NAME=$USER_NAME export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" -export PREFILL_TP_SIZE=$PREFILL_TP_SIZE -export PREFILL_ENABLE_EP=$PREFILL_ENABLE_EP -export PREFILL_ENABLE_DP=$PREFILL_ENABLE_DP -export DECODE_TP_SIZE=$DECODE_TP_SIZE -export DECODE_ENABLE_EP=$DECODE_ENABLE_EP -export DECODE_ENABLE_DP=$DECODE_ENABLE_DP -export DECODE_MTP_SIZE=$DECODE_MTP_SIZE export GPUS_PER_NODE=$GPUS_PER_NODE export BENCH_INPUT_LEN=$BENCH_INPUT_LEN export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY +export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" @@ -298,38 +288,101 @@ export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" - +export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" -# Use only the 
selected nodes for srun execution SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) - cleanup() { - echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." - # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true - + echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } trap cleanup INT TERM HUP - -# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors +# Force NFS cache refresh on all nodes echo "Refreshing NFS caches on all nodes..." srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' sync - # Force re-stat of the mounted directory to refresh NFS handles ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1 stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 - # Drop caches if we have permission (optional, requires root) echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Build engine-specific Docker environment variables +# ============================================================================= + +# Common env vars (always passed) +DOCKER_ENV_COMMON=( + -e SLURM_JOB_ID=\$SLURM_JOB_ID + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST + -e NNODES=\$NNODES + -e NODE_RANK=\$SLURM_PROCID + -e NODE0_ADDR=\$NODE0_ADDR + -e MODEL_DIR=/models + -e MODEL_NAME=\$MODEL_NAME + -e GPUS_PER_NODE=\$GPUS_PER_NODE + -e xP=\$xP + -e yD=\$yD + -e IPADDRS=\$IPADDRS + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER + -e 
BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e TQDM_MININTERVAL=\$TQDM_MININTERVAL + -e DRY_RUN=\$DRY_RUN + -e BENCHMARK_LOGS_DIR=/benchmark_logs + -e ENGINE=\$ENGINE + -e WS_PATH=${WS_PATH} + -e RUN_EVAL=\$RUN_EVAL + -e EVAL_ONLY=\$EVAL_ONLY + -e EVAL_CONC=\$EVAL_CONC + -e FRAMEWORK=\$FRAMEWORK + -e PRECISION=\$PRECISION + -e MODEL_PREFIX=\$MODEL_PREFIX + -e RUNNER_TYPE=\$RUNNER_TYPE + -e RESULT_FILENAME=\$RESULT_FILENAME + -e SPEC_DECODING=\$SPEC_DECODING + -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE + -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP + -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP + -e DECODE_TP_SIZE=\$DECODE_TP_SIZE + -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP + -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP + -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE +) + +# Engine-specific env vars +if [[ "$ENGINE" == "vllm" ]]; then + DOCKER_ENV_ENGINE=( + -e VLLM_WS_PATH=${WS_PATH} + -e MODEL_PATH=$DOCKER_MODEL_PATH + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma + -e UCX_SOCKADDR_TLS_PRIORITY=tcp + -e UCX_MEMTYPE_CACHE=y + -e UCX_RNDV_SCHEME=get_zcopy + -e UCX_RNDV_THRESH=4k + -e UCX_ROCM_IPC_MIN_ZCOPY=0 + -e UCX_LOG_LEVEL=warn + -e HSA_ENABLE_SDMA=1 + -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} + -e PYTHONPYCACHEPREFIX=/tmp/pycache + ) +else + DOCKER_ENV_ENGINE=( + -e SGLANG_WS_PATH=${WS_PATH} + ) +fi + +# Engine-specific container filter for pre-clean +CONT_FILTER="name=^container_${ENGINE}_" + srun \ --nodelist="$SELECTED_NODELIST_SRUN" \ --kill-on-bad-exit=1 \ @@ -341,10 +394,10 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -sudo docker ps -aq --filter \"name=^container_sbatch_\" | xargs -r sudo docker rm -f || true -sudo docker ps -aq | xargs -r sudo docker stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true -exec sudo docker run --rm \ +exec 
\$DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -367,51 +420,18 @@ exec sudo docker run --rm \ --cap-add SYS_PTRACE \ --security-opt seccomp=unconfined \ --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ - -e SLURM_JOB_ID=\$SLURM_JOB_ID \ - -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ - -e NNODES=\$NNODES \ - -e NODE_RANK=\$SLURM_PROCID \ - -e NODE0_ADDR=\$NODE0_ADDR \ - -e MODEL_DIR=/models \ - -e SGLANG_WS_PATH=${SGLANG_WS_PATH} \ - -e GPUS_PER_NODE=\$GPUS_PER_NODE \ - -e xP=\$xP \ - -e yD=\$yD \ - -e MODEL_NAME=\$MODEL_NAME \ - -e IPADDRS=\$IPADDRS \ - -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ - -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ - -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ - -e DECODE_TP_SIZE=\$DECODE_TP_SIZE \ - -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ - -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ - -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \ - -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ - -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ - -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ - -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ - -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ - -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ - -e DRY_RUN=\$DRY_RUN \ - -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e RUN_EVAL=\$RUN_EVAL \ - -e EVAL_ONLY=\$EVAL_ONLY \ - -e EVAL_CONC=\$EVAL_CONC \ - -e FRAMEWORK=\$FRAMEWORK \ - -e PRECISION=\$PRECISION \ - -e MODEL_PREFIX=\$MODEL_PREFIX \ - -e RUNNER_TYPE=\$RUNNER_TYPE \ - -e RESULT_FILENAME=\$RESULT_FILENAME \ - -e SPEC_DECODING=\$SPEC_DECODING \ - -e IS_MULTINODE=\$IS_MULTINODE \ + ${DOCKER_ENV_COMMON[*]} \ + ${DOCKER_ENV_ENGINE[*]} \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" 
bash -lc ' set -o pipefail mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' @@ -425,4 +445,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml new file mode 100644 index 000000000..c68bb46e3 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -0,0 +1,42 @@ +# Model-specific vLLM server configurations for disaggregated inference. +# +# Each top-level key is a MODEL_NAME value (must match the model identifier +# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR). +# +# To add a new model: add a new top-level entry following the same schema. +# No script changes are required. +# +# Schema: +# : +# prefill_flags: str # vLLM CLI flags for prefill workers +# decode_flags: str # vLLM CLI flags for decode workers +# env: str # Space-separated KEY=VALUE pairs exported before vllm serve +# hf_dir: str # (optional) On-disk directory name if it differs from the key +# # e.g. 
HF cache layout: models--amd--Kimi-K2.5-MXFP4 + +Llama-3.1-405B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +amd-Llama-3.3-70B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +Kimi-K2.5-MXFP4: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--amd--Kimi-K2.5-MXFP4" + +MiniMax-M2.5: + prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching 
--gpu-memory-utilization 0.95 --block-size 32" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--MiniMaxAI--MiniMax-M2.5" + +gpt-oss-120b: + prefill_flags: "--tensor-parallel-size 8" + decode_flags: "--tensor-parallel-size 8" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py new file mode 100644 index 000000000..7d1e8454b --- /dev/null +++ b/benchmarks/multi_node/amd_utils/moriio_proxy.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python3 +# MoRI-IO proxy server for vLLM PD disaggregation. +# +# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py +# with the following adaptations for production multi-node use: +# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars +# - /health endpoint for sync.py barrier readiness checks +# - Uses stdlib `re` instead of `regex` to avoid extra dep +# +# The proxy performs two roles that vllm-router cannot: +# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports +# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params + +import asyncio +import copy +import logging +import os +import re +import socket +import threading +import time +import uuid + +import aiohttp +import msgpack +import zmq +from quart import Quart, make_response, request + +logger = logging.getLogger("moriio_proxy") +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter( + "%(asctime)s %(levelname)s [%(name)s] %(message)s")) +logger.addHandler(handler) + +prefill_instances: list[dict] = [] +decode_instances: list[dict] = [] +request_nums = 0 +app = Quart(__name__) + +STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) + +IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") + +TRANSFER_TYPE = None + + +def _append_whole_dict_unique(target_list, data_dict): + new_filtered = {k: v for k, v in data_dict.items() if k != "index"} + for existed in target_list: + existed_filtered = {k: v for k, v in existed.items() if k != "index"} + if existed_filtered == new_filtered: + return False + logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", + data_dict.get("role"), data_dict.get("request_address"), + data_dict.get("handshake_port"), data_dict.get("notify_port"), + data_dict.get("dp_size"), data_dict.get("tp_size")) + target_list.append(data_dict) + transfer_mode = data_dict.get("transfer_mode", "unknown") + global TRANSFER_TYPE + + if TRANSFER_TYPE is None: + TRANSFER_TYPE = transfer_mode + logger.info("Transfer mode set to: %s", TRANSFER_TYPE) + elif transfer_mode != TRANSFER_TYPE: + raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") + + return True + + +_list_lock = threading.RLock() + + +def _listen_for_register(hostname, port): + context = zmq.Context() + router_socket = context.socket(zmq.ROUTER) + router_socket.bind(f"tcp://{hostname}:{port}") + poller = zmq.Poller() + 
poller.register(router_socket, zmq.POLLIN) + global prefill_instances + global decode_instances + + while True: + socks = dict(poller.poll()) + if router_socket in socks: + remote_addr, msg = router_socket.recv_multipart() + data = msgpack.loads(msg) + if data["type"] == "HELLO": + pass + elif ( + data["type"] == "register" + and data["role"] == "P" + and data["request_address"] not in prefill_instances + ): + with _list_lock: + _append_whole_dict_unique(prefill_instances, data) + + elif ( + data["type"] == "register" + and data["role"] == "D" + and data["request_address"] not in decode_instances + ): + with _list_lock: + _append_whole_dict_unique(decode_instances, data) + + +def start_service_discovery(hostname, port): + if not hostname: + hostname = socket.gethostname() + if port == 0: + raise ValueError("Port cannot be 0") + + _listener_thread = threading.Thread( + target=_listen_for_register, args=(hostname, port), daemon=True + ) + _listener_thread.start() + logger.info("Service discovery listening on %s:%s", hostname, port) + return _listener_thread + + +async def send_request_to_prefill( + endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank +): + req_data_copy = req_data + + req_data_copy["kv_transfer_params"].update( + { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_handshake_port": d_endpoint["handshake_port"], + "remote_notify_port": d_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": dip, + "remote_port": dport, + } + ) + req_data_copy["stream"] = False + req_data_copy["max_tokens"] = 1 + if "max_completion_tokens" in req_data_copy: + req_data_copy["max_completion_tokens"] = 1 + if "stream_options" in req_data_copy: + del req_data_copy["stream_options"] + async with aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": 
request_id, + } + if selected_prefill_dp_rank is not None: + headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) + async with session.post( + url=endpoint, json=req_data_copy, headers=headers + ) as response: + if response.status == 200: + return await response.json() + else: + raise RuntimeError( + f"Prefill response status={response.status}" + ) + + +async def start_decode_request(endpoint, req_data, request_id): + session = aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + response = await session.post(url=endpoint, json=req_data, headers=headers) + return session, response + + +async def stream_decode_response(session, response, request_id): + try: + if response.status == 200: + chunk_iter = response.content.iter_chunked(1024).__aiter__() + while True: + try: + chunk_bytes = await asyncio.wait_for( + chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, + ) + yield chunk_bytes + except StopAsyncIteration: + break + except asyncio.TimeoutError: + logger.error( + "Decode stream %s idle for %ds, aborting", + request_id, STREAM_IDLE_TIMEOUT, + ) + break + else: + raise RuntimeError( + f"Decode response status={response.status}" + ) + finally: + await response.release() + await session.close() + + +@app.route("/health", methods=["GET"]) +async def health_check(): + with _list_lock: + p_count = len(prefill_instances) + d_count = len(decode_instances) + return await make_response( + ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) + ) + + +@app.route("/v1/completions", methods=["POST"]) +@app.route("/v1/chat/completions", methods=["POST"]) +async def handle_request(): + try: + with _list_lock: + global request_nums + request_nums += 1 + + def extract_ip_port_fast(url): + match = IP_PORT_PATTERN.search(url) + if not match: + raise ValueError(f"Invalid URL format: {url}") + return 
match.groups() + + req_data = await request.get_json() + request_id = str(uuid.uuid4()) + + if not prefill_instances or not decode_instances: + return await make_response( + ("Service Unavailable: No prefill or decode instances registered.", 503) + ) + + pid = request_nums % len(prefill_instances) + did = request_nums % len(decode_instances) + prefill_instance_endpoint = prefill_instances[pid] + decode_instance_endpoint = decode_instances[did] + + selected_prefill_dp_rank = None + if prefill_instance_endpoint["dp_size"] > 1: + selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] + + dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) + + req_data_to_prefill = copy.deepcopy(req_data) + req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} + req_data["kv_transfer_params"] = {"transfer_id": request_id} + req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( + decode_instance_endpoint["dp_size"] + ) + req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( + decode_instance_endpoint["tp_size"] + ) + + send_prefill_task = asyncio.create_task( + send_request_to_prefill( + prefill_instance_endpoint["request_address"], + req_data_to_prefill, + request_id, + decode_instance_endpoint, + dip, + dport, + selected_prefill_dp_rank, + ) + ) + ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) + + req_data["max_tokens"] -= 1 + + req_data["kv_transfer_params"] = { + "transfer_id": request_id, + "do_remote_decode": False, + "do_remote_prefill": True, + "remote_handshake_port": prefill_instance_endpoint["handshake_port"], + "remote_notify_port": prefill_instance_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": ip, + "remote_port": port, + } + if TRANSFER_TYPE == "READ": + prefill_response = await send_prefill_task + req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ + "kv_transfer_params" + 
]["remote_engine_id"] + req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ + "kv_transfer_params" + ]["remote_block_ids"] + + req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ + "dp_size" + ] + req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ + "tp_size" + ] + + if selected_prefill_dp_rank is not None: + req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank + + decode_request_task = asyncio.create_task( + start_decode_request( + decode_instance_endpoint["request_address"], req_data, request_id + ) + ) + + session, decode_response = await decode_request_task + stream_generator = stream_decode_response(session, decode_response, request_id) + response = await make_response(stream_generator) + return response + except Exception as e: + logger.exception("Error handling request: %s", e) + return await make_response((f"Internal Server Error: {e!s}", 500)) + + +if __name__ == "__main__": + http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) + ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) + + t = start_service_discovery("0.0.0.0", ping_port) + app.debug = False + app.config["BODY_TIMEOUT"] = 360000 + app.config["RESPONSE_TIMEOUT"] = 360000 + + logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) + app.run(host="0.0.0.0", port=http_port) + t.join() diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py new file mode 100644 index 000000000..8290276fb --- /dev/null +++ b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py @@ -0,0 +1,672 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The MiniMax AI team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
+# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only MiniMaxM2/M2.5 model.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP +from vllm.model_executor.layers.quantization import QuantizationConfig +from 
vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.utils import sequence_parallel_chunk +from vllm.sequence import IntermediateTensors + +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class MiniMaxM2MoE(nn.Module): + """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. + + Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with + expert parallelism, EPLB, and sequence parallel awareness. + """ + + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + self.ep_group = get_ep_group().device_group + self.ep_rank = get_ep_group().rank_in_group + self.ep_size = self.ep_group.size() + + self.n_routed_experts: int = config.num_local_experts + self.n_shared_experts: int = 0 + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) + self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + eplb_config = parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + 
self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.use_routing_bias = getattr(config, "use_routing_bias", False) + if self.use_routing_bias: + self.e_score_correction_bias = nn.Parameter( + torch.empty(config.num_local_experts, dtype=torch.float32) + ) + self.e_score_correction_bias.weight_loader = ( + MiniMaxM2MoE.ebias_weight_loader + ) + else: + self.e_score_correction_bias = None + + self.gate = GateLinear( + config.hidden_size, + config.num_local_experts, + out_dtype=torch.float32, + prefix=f"{prefix}.gate", + ) + + self.experts = FusedMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=False, + renormalize=True, + scoring_func=getattr(config, "scoring_func", "softmax"), + e_score_correction_bias=self.e_score_correction_bias, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + router_logits_dtype=torch.float32, + gate=self.gate, + routed_scaling_factor=1.0 + if not self.is_rocm_aiter_moe_enabled + else self.routed_scaling_factor, + ) + + @staticmethod + def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight.to(torch.float32)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.is_sequence_parallel: + hidden_states = sequence_parallel_chunk(hidden_states) + + if self.experts.is_internal_router: + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=hidden_states + ) + else: + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, 
router_logits=router_logits + ) + + if hidden_states.dtype != torch.float16: + if not self.is_rocm_aiter_moe_enabled: + final_hidden_states = final_hidden_states * self.routed_scaling_factor + + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0 + ) + final_hidden_states = final_hidden_states[:num_tokens] + elif self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states + ) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class MiniMaxM2Attention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rotary_dim: int, + rope_parameters: dict[str, Any] | None = None, + attn_window_size: int | None = None, + max_position_embeddings: int = 8192, + head_dim: int | None = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + if ( + rope_parameters is not None + and "partial_rotary_factor" not in rope_parameters + ): + rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim + self.rotary_emb = get_rope( + self.head_dim, + max_position=max_position_embeddings, + rope_parameters=rope_parameters, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + per_layer_sliding_window=attn_window_size, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + self.q_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_heads, eps=rms_norm_eps + ) + self.k_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = MiniMaxText01RMSNormTP.forward_qk( + self.q_norm, self.k_norm, q.contiguous(), k.contiguous() + ) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class MiniMaxM2DecoderLayer(nn.Module): + def 
__init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): + max_position_embeddings = max( + config.max_position_embeddings, config.max_model_len + ) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + layer_idx = int(prefix.split(sep=".")[-1]) + + self.layer_idx = layer_idx + self.self_attn = MiniMaxM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rotary_dim=config.rotary_dim, + rope_parameters=config.rope_parameters, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, "attention_bias", False), + head_dim=getattr(config, "head_dim", None), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + self.block_sparse_moe = MiniMaxM2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = 
self.post_attention_layernorm(hidden_states, residual) + + hidden_states = self.block_sparse_moe(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class MiniMaxM2Model(nn.Module): + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config + + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=None, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MiniMaxM2DecoderLayer( + config, + prefix, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = 
intermediate_tensors["residual"] + + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + num_redundant_experts=0, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = self.get_expert_mapping() + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." 
in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class MiniMaxM2MixtureOfExperts(MixtureOfExperts): + """EPLB protocol implementation for MiniMax M2/M2.5.""" + + moe_mlp_layers: list[MiniMaxM2MoE] + + def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("MiniMax M2: No MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = 
example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class MiniMaxM2ForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + if hasattr(vllm_config.model_config, "max_model_len"): + self.config.max_model_len = vllm_config.model_config.max_model_len + self.model = MiniMaxM2Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=None + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + self.num_moe_layers = config.num_hidden_layers + self._set_moe_parameters() + + def _set_moe_parameters(self): + 
self.expert_weights: list = [] + self.num_expert_groups = 1 + self.moe_layers: list = [] + self.moe_mlp_layers: list[MiniMaxM2MoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, MiniMaxM2DecoderLayer) + if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): + example_moe = layer.block_sparse_moe + self.moe_mlp_layers.append(layer.block_sparse_moe) + self.moe_layers.append(layer.block_sparse_moe.experts) + self.extract_moe_parameters(example_moe) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +def get_spec_layer_idx_from_weight_name( + config: PretrainedConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_mtp_modules): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 7eb7414a6..cf08b3c2a 100755 --- 
a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -1,780 +1,19 @@ #!/bin/bash -# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# Dual-Engine Disaggregated Server Dispatcher # ============================================================================= - -# ============================================================================= -# Environment Configuration -# ============================================================================= - -NODE0_ADDR="${NODE0_ADDR:-localhost}" -NODE_RANK="${NODE_RANK:-0}" -MODEL_DIR="${MODEL_DIR:-}" -MODEL_NAME="${MODEL_NAME:-}" - -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers - -IPADDRS="${IPADDRS:-localhost}" -HEADNODE_PORT="${HEADNODE_PORT:-20000}" -# Parallelism Configuration -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" - -# Benchmark Configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" - -# Extract the maximum concurrency from the x-delimited list -BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) - -# Dry Run for debugging purpose -DRY_RUN="${DRY_RUN:-0}" - -# GPU count (expandable for different hardware) -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - - -# ============================================================================= -# Dependencies and Environment Setup -# 
============================================================================= -source $SGLANG_WS_PATH/env.sh - -host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') -host_name=$(hostname) - -# MORI_RDMA_TC configuration (optional) -# If set by runner, use it for RDMA traffic class configuration -# If not set, RDMA operations will proceed without QoS/traffic class settings -if [[ -n "${MORI_RDMA_TC}" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" - echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" -else - echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." - echo "[INFO] This is normal for clusters without QoS requirements." -fi - -# ============================================================================= -# Model-Specific Configuration from YAML +# Dispatches to the engine-specific server launcher based on ENGINE env var. +# ENGINE=sglang (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" -if [[ ! -f "$MODELS_YAML" ]]; then - echo "ERROR: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -# Load model config via inline Python (PyYAML is available in SGLang containers) -# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") -# is done here in Python to avoid bash glob-expanding the * characters. 
-eval "$(python3 -c " -import yaml, sys, os - -config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' - -with open(config_path) as f: - models = yaml.safe_load(f) - -if model_name not in models: - print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') - sys.exit(0) - -m = models[model_name] - -def eval_formula(val): - \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" - if isinstance(val, (int, float)): - return int(val) - s = str(val) - # Build a namespace from env vars (convert numeric values to int) - ns = {} - for k, v in os.environ.items(): - try: - ns[k] = int(v) - except (ValueError, TypeError): - pass - try: - return int(eval(s, {'__builtins__': {}}, ns)) - except Exception as e: - print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) - return val - -def parse_range(cuda_range, default_start, default_end): - if '-' in str(cuda_range): - s, e = str(cuda_range).split('-') - return s, e - return str(default_start), str(default_end) - -# Output shell variables -print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') -print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') -print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') - -prefill = m.get('prefill', {}) -decode = m.get('decode', {}) +ENGINE="${ENGINE:-sglang}" +WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" +export WS_PATH ENGINE -print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') -print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') +echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -dp = prefill.get('dp', {}) -no_dp = prefill.get('no_dp', {}) -print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') 
-print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') -print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"') -print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') -print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') -print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') - -print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') -print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') - -dp = decode.get('dp', {}) -ep_only = decode.get('ep_only', {}) -no_dp = decode.get('no_dp', {}) - -# Decode DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) -print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') - -# Decode EP-only config (EP enabled but DP disabled) -print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') - -# Decode no-DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') 
-print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') -")" - -echo "Loaded model configuration for: $MODEL_NAME" - -# Compute DP-dependent prefill parameters -if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then - prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP - prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP - prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP - prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP +if [[ "$ENGINE" == "vllm" ]]; then + source "$WS_PATH/server_vllm.sh" else - prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP - prefill_context_length="" - prefill_max_total_tokens="" - prefill_enable_two_batch_overlap="false" + source "$WS_PATH/server_sglang.sh" fi - -# When both DP and EP are enabled, override max-running-requests with max bench concurrency -if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; then - prefill_max_running_requests=$BENCH_MAX_CONC_VALUE - prefill_dp_ranks=$PREFILL_TP_SIZE - # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change) - MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) - echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" -fi - -# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) -if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START 
$DECODE_CUDA_GRAPH_BS_DP_END)) - decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) -elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY -else - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP -fi - -# When both DP and EP are enabled, override max-running-requests and dispatch tokens -if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_max_running_requests=$BENCH_MAX_CONC_VALUE - decode_dp_ranks=$DECODE_TP_SIZE - MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks)) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) - # Update derived variable - SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) - export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD - echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD" -fi - -# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " -if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" -fi -if [[ -n "$prefill_context_length" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length 
${prefill_context_length}" -fi -if [[ -n "$prefill_max_total_tokens" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}" -fi -if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" - PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true" -fi - -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " - -if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then - DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" -fi - -if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then - MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) -fi - -# ============================================================================= -# Cluster Topology Configuration -# ============================================================================= -IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" - -# Ceiling division by GPUS_PER_NODE for nodes-per-worker -PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) -DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE)) -NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) - -# Build prefill arguments dynamically based on xP -PREFILL_HEADNODE_URLS=() -PREFILL_ARGS="" -for i in $(seq 0 $((xP - 1))); do - prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) - PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" - PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" -done - -# Build decode arguments dynamically based on yD -DECODE_HEADNODE_URLS=() -DECODE_ARGS="" -for i in $(seq 0 $((yD - 1))); do - decode_idx=$((i * 
DECODE_NODES_PER_WORKER + NODE_OFFSET)) - DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" - DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" -done - -echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" -echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" - -# ============================================================================= -# Configuration Builder Functions -# ============================================================================= - -build_server_config() { - local mode="$1" - local model_name="$2" - local tp_size="$3" - local enable_ep="$4" - local enable_dp="$5" - local decode_mtp_size="$6" - - # Calculate EP and DP sizes based on enable flags - local ep_size=1 - local dp_size=1 - - if [[ "$enable_ep" == "true" ]]; then - ep_size=$tp_size - fi - - if [[ "$enable_dp" == "true" ]]; then - dp_size=$tp_size - fi - - # Build parallelism arguments - local parallel_args="--tp-size ${tp_size}" - - if [[ "$enable_ep" == "true" ]]; then - parallel_args="$parallel_args --ep-size ${ep_size}" - fi - - if [[ "$enable_dp" == "true" ]]; then - parallel_args="$parallel_args --dp-size ${dp_size}" - fi - - # Get model-specific configuration from YAML-loaded variables - local base_config="$MODEL_BASE_FLAGS" - local mtp_config="" - local dp_config="" - local specific_config="" - - # MTP config (only if MTP is enabled and mode is decode) - if [ "$decode_mtp_size" -gt 0 ]; then - mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" - fi - - # DP config (only if DP is enabled) - if [[ "$enable_dp" == "true" ]]; then - dp_config="$MODEL_DP_FLAGS" - fi - - # Mode-specific config - if [[ "$mode" == "prefill" ]]; then - specific_config="$PREFILL_MODE_FLAGS" - elif [[ "$mode" == "decode" ]]; then - specific_config="$DECODE_MODE_FLAGS" - fi - - # Combine: parallel args + base config + mtp config (decode only) + dp config + 
specific config - local full_config="$parallel_args" - if [[ -n "$base_config" ]]; then - full_config="$full_config $base_config" - fi - if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then - full_config="$full_config $mtp_config" - fi - if [[ -n "$dp_config" ]]; then - full_config="$full_config $dp_config" - fi - if [[ -n "$specific_config" ]]; then - full_config="$full_config $specific_config" - fi - - echo "$full_config" -} - -# Build complete server configurations -PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") -DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") - -if [[ -n "$MODEL_NAME" ]]; then - echo "Using model-specific configuration for: $MODEL_NAME" -fi - -if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then - PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') - DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') - unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL - unset MORI_MOE_MAX_INPUT_TOKENS_DECODE -fi - -# ============================================================================= -# Container Synchronization -# ============================================================================= - -echo "Waiting at the container creation barrier on $host_name" -python3 $SGLANG_WS_PATH/sync.py barrier \ - --local-ip ${host_ip} \ - --local-port 5000 \ - --enable-port \ - --node-ips ${IPADDRS} \ - --node-ports 5000 \ - --wait-for-all-ports \ - --timeout 300 - - -# ============================================================================= -# Node Role Assignment and Server Launch -# ============================================================================= - -if [ "$NODE_RANK" -eq 0 ]; then - echo "NODE INFO 
=======================================" - echo "================================================" - echo "Node List : ${SLURM_JOB_NODELIST}" - echo "Node IPs : ${IPADDRS}" - echo "Model Name : ${MODEL_NAME:-'Not specified'}" - echo "================================================" - - echo "CLUSTER INFO ====================================" - echo "================================================" - echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" - echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " - echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} " - - echo "================================================" - - # start the head prefill server - PREFILL_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then - PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" - fi - set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/$MODEL_NAME \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - 
${PREFILL_SERVER_CONFIG} " - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" - fi - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill0_pid=$! - fi - - - echo "Waiting for all prefill and decode servers to be up . . ." - - - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 8000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - echo "Congratulations!!! All prefill and decode servers are up . . ." - - ROUTER_CMD="python -m sglang_router.launch_router \ - --pd-disaggregation \ - --port 30000 \ - --policy random \ - --prefill-policy random \ - --decode-policy random \ - ${PREFILL_ARGS} \ - ${DECODE_ARGS}" - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $ROUTER_CMD" - else - ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" - set -x - if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then - eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & - else - eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & - fi - set +x - proxy_pid=$! 
- - # Wait for router to be ready via health endpoint - HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-health \ - --health-endpoint /readiness \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - fi - - echo "Router is ready for benchmarking" - fi - - - echo "Ready for benchmarking on ${host_name}:${host_ip}" - - echo "Benchmarking on ${host_name}:${host_ip}" - cd $SGLANG_WS_PATH - - # Export IS_MTP based on whether MTP is enabled - if [ "$DECODE_MTP_SIZE" -gt 0 ]; then - export IS_MTP=true - else - export IS_MTP=false - fi - - # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ - $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ - ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ - ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - - if [[ "${EVAL_ONLY:-false}" == "true" ]]; then - echo "EVAL_ONLY mode: skipping throughput benchmark" - elif [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BENCH_CMD" - else - set -x - eval "$BENCH_CMD" - set +x - fi - - # Run evaluation if requested (before killing router) - if [[ "${RUN_EVAL:-false}" == "true" ]]; then - echo "Running lm-eval evaluation on Node 0..." - - # Health check: verify the router is still serving before running eval. - # The throughput benchmark may have crashed/exhausted decode workers. - EVAL_HEALTH_OK=false - for _attempt in 1 2 3; do - if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then - EVAL_HEALTH_OK=true - break - fi - echo "Eval health check attempt $_attempt failed, retrying in 10s..." 
- sleep 10 - done - - if [[ "$EVAL_HEALTH_OK" != "true" ]]; then - echo "WARNING: Router health check failed after 3 attempts. Skipping eval." - else - # Must run from repo root so utils/evals/${task}.yaml resolves - pushd /workspace - - # Source eval functions from benchmark_lib.sh - source /workspace/benchmarks/benchmark_lib.sh - - # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list - if [[ -n "${EVAL_CONC:-}" ]]; then - export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" - else - export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) - fi - - # Override eval context length with model's configured context_length - if [[ -n "$prefill_context_length" ]]; then - export EVAL_MAX_MODEL_LEN="$prefill_context_length" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" - else - # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 - eval_rc=$? 
- - if [[ $eval_rc -ne 0 ]]; then - echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 - EVAL_FAILED=1 - else - # Set metadata env vars for append_lm_eval_summary - export TP="${PREFILL_TP_SIZE}" - export CONC="${EVAL_CONCURRENT_REQUESTS}" - export EP_SIZE=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" - export PREFILL_TP="${PREFILL_TP_SIZE}" - export PREFILL_EP=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" - export PREFILL_NUM_WORKERS="${xP}" - export DECODE_TP="${DECODE_TP_SIZE}" - export DECODE_EP=1 - [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" - export DECODE_NUM_WORKERS="${yD}" - export DP_ATTENTION="${PREFILL_ENABLE_DP}" - export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" - export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" - export ISL="${BENCH_INPUT_LEN}" - export OSL="${BENCH_OUTPUT_LEN}" - # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, - # RESULT_FILENAME are already set via Docker -e flags from job.slurm - - append_lm_eval_summary - # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace - - # Copy eval artifacts to run_logs for NFS extraction by runner - EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" - mkdir -p "$EVAL_COPY_DIR" - for f in meta_env.json; do - [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" - done - # Use find for glob patterns to avoid "no match" errors - find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; - find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; - - echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" - fi - fi - - popd - fi - fi - - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) - LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" - mkdir -p "$LOGS_OUTPUT" - - if [[ "$DRY_RUN" -eq 0 ]]; then - cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" - echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" - fi - - echo "Killing the proxy server and prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $proxy_pid - kill $prefill0_pid - fi - - if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then - echo "ERROR: eval failed; exiting node-0 with rc=1" - exit 1 - fi - -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then - echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - - PREFILL_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then - PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" - fi - set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/${MODEL_NAME} \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} " - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) - prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval 
"$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill_pid=$! - fi - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $NODE_RANK prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $prefill_pid - fi - -else - RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) - echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" - echo "Using decode config: $DECODE_SERVER_CONFIG" - echo "Decode node rank: $RANK" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - - DECODE_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then - DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" - fi - set +x - DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ - --model-path ${MODEL_DIR}/${MODEL_NAME} \ - --disaggregation-mode decode \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${DECODE_SERVER_CONFIG} " - - if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then - rank=$((RANK % DECODE_NODES_PER_WORKER)) - decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) - DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes 
${DECODE_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $DECODE_CMD" - else - set -x - eval "$DECODE_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & - - set +x - decode_pid=$! - fi - - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $RANK decode server" - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $decode_pid - fi - -fi - -echo "Script completed successfully" -exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh new file mode 100755 index 000000000..53ca29cc5 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -0,0 +1,624 @@ +#!/bin/bash +# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" #-> Number of Prefill Workers +yD="${yD:-1}" #-> Number of Decode Workers + +IPADDRS="${IPADDRS:-localhost}" +HEADNODE_PORT="${HEADNODE_PORT:-20000}" +# Parallelism Configuration +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" 
+PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +# Dry Run for debugging purpose +DRY_RUN="${DRY_RUN:-0}" + +# GPU count (expandable for different hardware) +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') +host_name=$(hostname) + +# MORI_RDMA_TC configuration (optional) +# If set by runner, use it for RDMA traffic class configuration +# If not set, RDMA operations will proceed without QoS/traffic class settings +if [[ -n "${MORI_RDMA_TC}" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" + echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +else + echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." + echo "[INFO] This is normal for clusters without QoS requirements." +fi + +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +MODELS_YAML="${WS_PATH}/models.yaml" + +if [[ ! 
-f "$MODELS_YAML" ]]; then
+    echo "ERROR: models.yaml not found at $MODELS_YAML"
+    exit 1
+fi
+
+# Load model config via inline Python (PyYAML is available in SGLang containers)
+# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP")
+# is done here in Python to avoid bash glob-expanding the * characters.
+# The Python program prints VAR="value" assignments on stdout and the
+# surrounding eval applies them to this shell. Anything that must not be
+# executed (warnings) is written to stderr instead.
+eval "$(python3 -c "
+import yaml, sys, os
+
+config_path = '${MODELS_YAML}'
+model_name = '${MODEL_NAME}'
+
+with open(config_path) as f:
+    models = yaml.safe_load(f)
+
+if model_name not in models:
+    # Emitted on stdout on purpose: the calling shell evals it and exits.
+    print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1')
+    sys.exit(0)
+
+m = models[model_name]
+
+def eval_formula(val):
+    \"\"\"Resolve a chunked_prefill_size entry to an int.
+
+    Numbers are returned as int. Strings are evaluated as an arithmetic
+    expression whose variable names are looked up among the environment
+    variables that hold integer values. If evaluation fails, a warning is
+    written to stderr and the original value is returned unchanged.
+    \"\"\"
+    if isinstance(val, (int, float)):
+        return int(val)
+    s = str(val)
+    # Build a namespace from env vars (convert numeric values to int)
+    ns = {}
+    for k, v in os.environ.items():
+        try:
+            ns[k] = int(v)
+        except (ValueError, TypeError):
+            pass
+    try:
+        # NOTE(review): eval() runs an expression taken from models.yaml.
+        # Builtins are removed, but that is not a sandbox, so models.yaml
+        # must be a trusted file.
+        return int(eval(s, {'__builtins__': {}}, ns))
+    except Exception as e:
+        # stderr is not evaluated by the shell, so print the plain message.
+        print(f'WARNING: Cannot evaluate formula: {s} ({e})', file=sys.stderr)
+        return val
+
+def parse_range(cuda_range, default_start, default_end):
+    \"\"\"Split a START-END range into (start, end) strings.
+
+    Falls back to the given defaults when the value is not of the form
+    digits-digits. Both bounds are validated as plain digits because they
+    are printed into shell assignments that the caller evals.
+    \"\"\"
+    start, sep, end = str(cuda_range).partition('-')
+    start, end = start.strip(), end.strip()
+    if sep and start.isdigit() and end.isdigit():
+        return start, end
+    return str(default_start), str(default_end)
+
+# Output shell variables
+print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"')
+print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"')
+print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"')
+
+prefill = m.get('prefill', {})
+decode = m.get('decode', {})
+
+print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"')
+print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"')
+
+dp = prefill.get('dp', {})
+no_dp = prefill.get('no_dp', {})
+print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"')
+print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') + +print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') +print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') + +dp = decode.get('dp', {}) +ep_only = decode.get('ep_only', {}) +no_dp = decode.get('no_dp', {}) + +# Decode DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) +print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') + +# Decode EP-only config (EP enabled but DP disabled) +print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') + +# Decode no-DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) 
+print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" + +# Compute DP-dependent prefill parameters +if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then + prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP +else + prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP +fi + +# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) +if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) + decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) +elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY +else + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP +fi + +# Use Decode configuration to configure different TP/DP size between P and D +PREFILL_DECODE_DIFFERENT_TP="" +if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then + if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" + else + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" + fi +fi + +# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) +PREFILL_MODE_FLAGS="--mem-fraction-static 
${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}"
+if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
+fi
+
+DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}"
+if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
+    DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
+fi
+
+# With MTP enabled the decode dispatch-token budget is scaled by (MTP size + 1).
+if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
+    MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
+fi
+
+# =============================================================================
+# Cluster Topology Configuration
+# =============================================================================
+IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
+
+# Ceiling division by GPUS_PER_NODE for nodes-per-worker.
+# (Uses GPUS_PER_NODE - 1 rather than a literal 7 so the result stays a true
+# ceiling when GPUS_PER_NODE is not 8.)
+PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE))
+DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE))
+# Index in IP_ARRAY of the first decode node (all prefill nodes come first).
+NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))
+
+# Build prefill arguments dynamically based on xP
+PREFILL_HEADNODE_URLS=()
+PREFILL_ARGS=""
+for ((i = 0; i < xP; i++)); do
+    prefill_idx=$((i * PREFILL_NODES_PER_WORKER))
+    PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}"
+    PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000"
+done
+
+# Build decode arguments dynamically based on yD
+DECODE_HEADNODE_URLS=()
+DECODE_ARGS=""
+for ((i = 0; i < yD; i++)); do
+    decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET))
+    DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}"
+    DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000"
+done
+
+echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[*]}"
+echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[*]}"
+
+# =============================================================================
+# Configuration Builder Functions
+# =============================================================================
+
+# build_server_config MODE MODEL_NAME TP_SIZE ENABLE_EP ENABLE_DP [DECODE_MTP_SIZE]
+#
+# Prints the sglang.launch_server flag string for one role on stdout.
+#   MODE             "prefill" or "decode"; selects PREFILL_MODE_FLAGS or
+#                    DECODE_MODE_FLAGS and gates the MTP flags (decode only)
+#   MODEL_NAME       accepted for call-site compatibility; not used here
+#   TP_SIZE          value for --tp-size, reused for --ep-size / --dp-size
+#   ENABLE_EP        "true" adds --ep-size TP_SIZE
+#   ENABLE_DP        "true" adds --dp-size TP_SIZE and MODEL_DP_FLAGS
+#   DECODE_MTP_SIZE  speculative steps; defaults to 0 (MTP off) when omitted
+# Globals read: MODEL_BASE_FLAGS, MODEL_MTP_FLAGS, MODEL_DP_FLAGS,
+#               PREFILL_MODE_FLAGS, DECODE_MODE_FLAGS
+# Flag order: parallelism, base, MTP, DP, mode-specific.
+build_server_config() {
+    local mode="$1"
+    local tp_size="$3"
+    local enable_ep="$4"
+    local enable_dp="$5"
+    # Default to 0 so an empty or missing argument does not break the
+    # integer comparison below.
+    local decode_mtp_size="${6:-0}"
+
+    # Parallelism arguments: EP and DP sizes equal the TP size when enabled.
+    local full_config="--tp-size ${tp_size}"
+    if [[ "$enable_ep" == "true" ]]; then
+        full_config+=" --ep-size ${tp_size}"
+    fi
+    if [[ "$enable_dp" == "true" ]]; then
+        full_config+=" --dp-size ${tp_size}"
+    fi
+
+    # Model-wide flags loaded from models.yaml.
+    if [[ -n "$MODEL_BASE_FLAGS" ]]; then
+        full_config+=" $MODEL_BASE_FLAGS"
+    fi
+
+    # MTP flags apply only to decode servers and only when MTP is enabled.
+    if [[ "$mode" == "decode" ]] && (( decode_mtp_size > 0 )); then
+        full_config+=" ${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))"
+    fi
+
+    # DP-specific flags (only if DP is enabled).
+    if [[ "$enable_dp" == "true" && -n "$MODEL_DP_FLAGS" ]]; then
+        full_config+=" $MODEL_DP_FLAGS"
+    fi
+
+    # Mode-specific flags; any other mode contributes nothing.
+    local specific_config=""
+    case "$mode" in
+        prefill) specific_config="$PREFILL_MODE_FLAGS" ;;
+        decode) specific_config="$DECODE_MODE_FLAGS" ;;
+    esac
+    if [[ -n "$specific_config" ]]; then
+        full_config+=" $specific_config"
+    fi
+
+    echo "$full_config"
+}
+
+# Build complete server configurations
+PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE")
+DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE")
+
+if [[ -n "$MODEL_NAME" ]]; then
+    echo "Using model-specific configuration for: $MODEL_NAME"
+fi
+
+# =============================================================================
+# Container Synchronization
+# =============================================================================
+
+echo "Waiting at the container creation barrier on $host_name"
+python3 $WS_PATH/sync.py barrier \
+    --local-ip ${host_ip} \
+    --local-port 5000 \
+    --enable-port \
+    --node-ips ${IPADDRS} \
+    --node-ports 5000 \
+    --wait-for-all-ports \
+    --timeout 300
+
+
+# =============================================================================
+# Node Role Assignment and Server Launch
+# =============================================================================
+
+if [ "$NODE_RANK" -eq 0 ]; then
+    echo "NODE INFO ======================================="
+    echo "================================================"
+    echo "Node List : ${SLURM_JOB_NODELIST}"
+    echo "Node IPs : ${IPADDRS}"
+    echo "Model Name : ${MODEL_NAME:-'Not specified'}"
+    echo "================================================"
+
+    echo "CLUSTER INFO ===================================="
+    echo "================================================"
+    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo
"Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" + echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" + echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" + echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" + echo "================================================" + + # start the head prefill server + PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/$MODEL_NAME \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${PREFILL_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" + fi + + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill0_pid=$! + fi + + + echo "Waiting for all prefill and decode servers to be up . . ." + + + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 8000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + echo "Congratulations!!! All prefill and decode servers are up . . ." 
+
+    # Router command: PD-disaggregated sglang_router listening on port 30000,
+    # fronting every prefill/decode endpoint collected in PREFILL_ARGS/DECODE_ARGS.
+    ROUTER_CMD="python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --port 30000 \
+    --policy random \
+    --prefill-policy random \
+    --decode-policy random \
+    ${PREFILL_ARGS} \
+    ${DECODE_ARGS}"
+
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $ROUTER_CMD"
+    else
+        ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log"
+        set -x
+        # SGLANG_ROUTER_STDOUT_LOGS=1 mirrors router output to stdout as well
+        # as the log file; otherwise it goes to the log file only.
+        if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then
+            eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" &
+        else
+            eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 &
+        fi
+        set +x
+        proxy_pid=$!
+
+        # Wait for router to be ready via health endpoint
+        HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports 30000 \
+        --wait-for-all-health \
+        --health-endpoint /readiness \
+        --timeout 1800"
+
+        # This branch only runs when DRY_RUN is not 1, so the barrier is
+        # always executed here (no dry-run echo is reachable).
+        eval "$HEALTH_BARRIER_CMD"
+
+        echo "Router is ready for benchmarking"
+    fi
+
+
+    echo "Ready for benchmarking on ${host_name}:${host_ip}"
+
+    echo "Benchmarking on ${host_name}:${host_ip}"
+    cd "$WS_PATH"
+
+    # Export IS_MTP based on whether MTP is enabled
+    if [ "$DECODE_MTP_SIZE" -gt 0 ]; then
+        export IS_MTP=true
+    else
+        export IS_MTP=false
+    fi
+
+    # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
+    # The quotes around BENCH_MAX_CONCURRENCY are escaped so they survive into
+    # the string that eval re-parses, keeping the concurrency list one argument.
+    BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
+        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
+        ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
+        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BENCH_CMD"
+    else
+        set -x
+        eval "$BENCH_CMD"
+        set +x
+    fi
+
+    # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
+    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
+    mkdir -p "$LOGS_OUTPUT"
+
+    if [[ "$DRY_RUN"
-eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server and prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $proxy_pid + kill $prefill0_pid + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then + echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" + + PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/${MODEL_NAME} \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${PREFILL_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) + prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." 
+ WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $NODE_RANK prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $prefill_pid + fi + +else + RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + echo "Decode node rank: $RANK" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" + + DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + --model-path ${MODEL_DIR}/${MODEL_NAME} \ + --disaggregation-mode decode \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${DECODE_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then + rank=$((RANK % DECODE_NODES_PER_WORKER)) + decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) + DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + + set +x + decode_pid=$! + fi + + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + + echo "Waiting until proxy server closes..." 
+ WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $RANK decode server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $decode_pid + fi + +fi + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh new file mode 100755 index 000000000..a10e45d6d --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -0,0 +1,490 @@ +#!/bin/bash +# vLLM Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= +# +# Node role assignment (by NODE_RANK): +# 0 -> Proxy/Router + first Prefill node (kv_producer) +# 1..xP-1 -> Additional Prefill nodes (kv_producer) +# xP..xP+yD-1 -> Decode nodes (kv_consumer) +# +# Total nodes = xP + yD (router co-located with first prefill, like SGLang). 
+ +# ============================================================================= +# Dependency Setup (idempotent; required when using base vLLM image) +# ============================================================================= +source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh" + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" +yD="${yD:-1}" + +IPADDRS="${IPADDRS:-localhost}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +DRY_RUN="${DRY_RUN:-0}" +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +ROUTER_PORT="${ROUTER_PORT:-30000}" +SERVER_PORT="${SERVER_PORT:-2584}" +ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}" + +# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution) +MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}" + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}') +# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available) +rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' 
| head -1)
+rdma_ip="${rdma_ip:-$host_ip}"
+host_name=$(hostname)
+
+echo "[INFO] Management IP (barriers/proxy): $host_ip"
+echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip"
+
+# =============================================================================
+# RDMA / Nixl Workarounds
+# =============================================================================
+
+# setup_rdma_env
+#
+# Applies two best-effort workarounds on the local node; neither aborts the
+# script on failure.
+#   1. If rdma_ip is a 192.168.S.H address, install a route for 192.168.S.0/24
+#      via the /31 peer of that address on the interface that owns it.
+#   2. If the rixl Python package is importable and its _api module does not
+#      already mention ucx_error_handling_mode, patch the module in place so
+#      the UCX backend is created with ucx_error_handling_mode="none".
+# Globals read: rdma_ip, IBDEVICES (log message only)
+setup_rdma_env() {
+    # Pensando ionic (RoCEv2) point-to-point /31 route fix.
+    # Each benic interface has a /31 to the TOR switch. Without explicit routes,
+    # traffic to other nodes' RDMA IPs falls through to the management network.
+    if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then
+        local rdma_subnet="${BASH_REMATCH[1]}"
+        local rdma_host="${BASH_REMATCH[2]}"
+        # The two addresses of a /31 differ only in the lowest bit, so the peer
+        # is host XOR 1. (host OR 1 would return our own address whenever the
+        # local side holds the odd address.) 10# forces base-10 parsing.
+        local rdma_gw="192.168.${rdma_subnet}.$(( 10#${rdma_host} ^ 1 ))"
+        local rdma_iface
+        # Exact address comparison: a regex match on the bare IP would also
+        # accept longer addresses sharing the prefix (e.g. .1 vs .10).
+        rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 == ip || index($4, ip "/") == 1 {print $2; exit}')
+        if [[ -n "$rdma_iface" ]]; then
+            if ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null; then
+                echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface"
+            else
+                echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24"
+            fi
+        fi
+    fi
+
+    # Patch Nixl UCX backend: set ucx_error_handling_mode=none.
+    # Required for ALL NIC types under high concurrency (C512+). Without this,
+    # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error
+    # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from
+    # recovering gracefully. This causes the prefill KV cache to fill to 100%
+    # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm
+    # incompatibility); on mlx5 NICs it was incorrectly skipped.
+    local nixl_api
+    nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null)
+    if [[ -n "$nixl_api" ]]; then
+        if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then
+            # Insert the assignment on its own line directly above the
+            # create_backend call, reusing that line's leading whitespace so
+            # the inserted Python statement lands at the same indentation.
+            sed -i -E 's/^([[:space:]]*)(.*self\.create_backend\(bknd, init\))/\1init["ucx_error_handling_mode"] = "none"\n\1\2/' "$nixl_api"
+            echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})"
+        else
+            echo "[PATCH] ucx_error_handling_mode already set in $nixl_api"
+        fi
+    fi
+}
+
+setup_rdma_env
+
+if [[ -z "$UCX_NET_DEVICES" ]]; then
+    echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2
+    exit 1
+fi
+
+# =============================================================================
+# Model-Specific Configuration from YAML
+# =============================================================================
+MODELS_YAML="${WS_PATH}/models_vllm.yaml"
+
+if [[ ! -f "$MODELS_YAML" ]]; then
+    echo "ERROR: models.yaml not found at $MODELS_YAML"
+    exit 1
+fi
+
+if [[ -z "$MODEL_NAME" ]]; then
+    echo "ERROR: MODEL_NAME is not set"; exit 1
+fi
+
+# The inline Python prints VAR="value" assignments for the selected model;
+# eval applies them to this shell.
+eval "$(python3 -c "
+import yaml, sys
+
+with open('${MODELS_YAML}') as f:
+    models = yaml.safe_load(f)
+
+model_name = '${MODEL_NAME}'
+if model_name not in models:
+    print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1')
+    sys.exit(0)
+
+m = models[model_name]
+
+def bash_escape(s):
+    \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\"
+    return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`')
+
+pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8'))
+df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8'))
+ev = bash_escape(m.get('env', ''))
+dev = bash_escape(m.get('decode_env', ''))
+print(f'PREFILL_SERVER_CONFIG=\"{pf}\"')
+print(f'DECODE_SERVER_CONFIG=\"{df}\"')
+print(f'MODEL_ENVS=\"{ev}\"')
+print(f'DECODE_MODEL_ENVS=\"{dev}\"')
+")"
+
+echo "Loaded model configuration for: $MODEL_NAME"
+
+# Apply tensor-parallel size and EP/DP flags from submit pipeline.
+if [[ -n "${PREFILL_TP_SIZE:-}" ]]; then + if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP_SIZE}/g") + else + PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP_SIZE}" + fi +fi +if [[ -n "${DECODE_TP_SIZE:-}" ]]; then + if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP_SIZE}/g") + else + DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP_SIZE}" + fi +fi +if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + PREFILL_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + PREFILL_SERVER_CONFIG+=" --enable-dp-attention" +fi +if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + DECODE_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && ! 
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + DECODE_SERVER_CONFIG+=" --enable-dp-attention" +fi + +echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" +echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 600 + +# ============================================================================= +# ETCD Server Setup +# ============================================================================= + +echo "Proceeding to start etcd server on $host_name" +bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & +etcd_pid=$! + +echo "Waiting at etcd server barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +echo "All etcd servers are up : $host_name" +sleep 3 + +echo "etcd endpoint health==================" +etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +echo "======================================" + +python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_ARGS="" +DECODE_ARGS="" + +for ((i=0; i "$PROXY_LOG_FILE" 2>&1 & + set +x + proxy_pid=$! 
+ sleep 3 + fi + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for all prefill and decode servers to be up . . ." + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: skipping barrier (wait-for-all-ports)" + else + python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + fi + + echo "Congratulations!!! All prefill and decode servers are up . . ." 
+ + # Wait for proxy /health to confirm it is accepting requests + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + echo "MoRI-IO proxy is ready for benchmarking" + fi + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + echo "Benchmarking on ${host_name}:${host_ip}" + cd $WS_PATH + + export ROUTER_PORT=$ROUTER_PORT + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server and prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true + sleep 2 + # Fallback: ensure no orphaned processes keep ports open + pkill -f moriio_proxy 2>/dev/null || true + pkill -f "vllm serve" 2>/dev/null || true + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + + setup_vllm_env + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port 
$SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the prefill server" + [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true + +else + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + + setup_vllm_env + + for env_pair in ${DECODE_MODEL_ENVS}; do + export "$env_pair" + echo "[DECODE_ENV] $env_pair" + done + + DECODE_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${DECODE_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log" + set -x + eval 
"$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 & + set +x + decode_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the decode server" + [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true +fi + +echo "Killing the etcd server" +kill $etcd_pid 2>/dev/null || true +pkill -f etcd 2>/dev/null || true + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh new file mode 100644 index 000000000..8c7a9f07a --- /dev/null +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -0,0 +1,908 @@ +#!/bin/bash +# ============================================================================= +# setup_deps.sh — Install missing vLLM disagg dependencies at container start. +# +# Base image: vllm/vllm-openai-rocm:v0.18.0 +# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. +# Idempotent: each component is skipped if already present. +# +# Build steps run in subshells to avoid CWD pollution between installers. 
# =============================================================================

ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
UCX_HOME="${UCX_HOME:-/usr/local/ucx}"
RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}"

_SETUP_START=$(date +%s)
_SETUP_INSTALLED=()

#######################################
# Clone a git repository, retrying on failure.
# Arguments: $1 - repository URL
#            $2 - destination directory (removed after every failed attempt)
# Outputs:   progress / failure messages on stdout
# Returns:   0 as soon as one clone succeeds, 1 after 3 failed attempts
#######################################
git_clone_retry() {
    local url="$1" dest="$2" max_tries=3 try=1
    while (( try <= max_tries )); do
        if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi
        # A failed clone can leave a partial checkout behind; clear it so the
        # next attempt (or the caller) starts from a clean slate.
        rm -rf -- "$dest"
        if (( try < max_tries )); then
            echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..."
            sleep 10
        else
            # Last attempt: nothing left to retry, so do not sleep.
            echo "[SETUP] git clone attempt $try/$max_tries failed for $url"
        fi
        try=$(( try + 1 ))
    done
    echo "[SETUP] git clone failed after $max_tries attempts: $url"
    return 1
}

# ---------------------------------------------------------------------------
# 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl)
# ---------------------------------------------------------------------------
# Builds UCX into ${UCX_HOME} unless ${UCX_HOME}/bin/ucx_info already exists.
# Globals: UCX_HOME, ROCM_PATH (read); _SETUP_INSTALLED (appended on success).
# Exits the calling shell with status 1 if ucx_info is missing after the build.
install_ucx() {
    if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then
        echo "[SETUP] UCX already present at ${UCX_HOME}"
        return 0
    fi

    echo "[SETUP] Installing UCX build dependencies..."
    apt-get update -q -y && apt-get install -q -y \
        autoconf automake libtool pkg-config \
        librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \
        infiniband-diags perftest ethtool rdma-core strace \
        && rm -rf /var/lib/apt/lists/*

    echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..."
    (
        # Every step aborts the subshell explicitly. `set -e` alone is not
        # enough: a failing non-final command of an `a && b` list does not
        # trigger errexit, so later steps would run in the wrong directory.
        mkdir -p /usr/local/src || exit 1
        cd /usr/local/src || exit 1
        git_clone_retry https://github.com/ROCm/ucx.git ucx || exit 1
        cd ucx || exit 1
        git checkout da3fac2a || exit 1
        ./autogen.sh || exit 1
        mkdir -p build || exit 1
        cd build || exit 1
        ../configure \
            --prefix="${UCX_HOME}" \
            --enable-shared --disable-static \
            --disable-doxygen-doc --enable-optimizations \
            --enable-devel-headers --enable-mt \
            --with-rocm="${ROCM_PATH}" --with-verbs --with-dm || exit 1
        make -j"$(nproc)" || exit 1
        make install || exit 1
    )
    rm -rf /usr/local/src/ucx

    if [[ ! -x "${UCX_HOME}/bin/ucx_info" ]]; then
        echo "[SETUP] ERROR: UCX build failed"; exit 1
    fi
    _SETUP_INSTALLED+=("UCX")
}

# ---------------------------------------------------------------------------
# 2. RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM)
# ---------------------------------------------------------------------------
# Builds RIXL and its Python bindings unless `import rixl` already works.
# Globals: RIXL_HOME, UCX_HOME, ROCM_PATH (read); _SETUP_INSTALLED (appended).
# Exits the calling shell with status 1 if `import rixl` fails after the build.
install_rixl() {
    if python3 -c "import rixl" 2>/dev/null; then
        echo "[SETUP] RIXL Python bindings already present"
        return 0
    fi

    echo "[SETUP] Installing RIXL build dependencies..."
    apt-get update -q -y && apt-get install -q -y \
        libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \
        libcpprest-dev libaio-dev \
        && rm -rf /var/lib/apt/lists/*
    pip3 install --quiet meson "pybind11[global]"

    echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..."
    (
        # Explicit aborts instead of `set -e` (see install_ucx for the reason).
        git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl || exit 1
        cd /opt/rixl || exit 1
        git checkout f33a5599 || exit 1
        meson setup build --prefix="${RIXL_HOME}" \
            -Ducx_path="${UCX_HOME}" \
            -Drocm_path="${ROCM_PATH}" || exit 1
        cd build || exit 1
        ninja || exit 1
        ninja install || exit 1
        cd /opt/rixl || exit 1
        pip install --quiet \
            --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \
            --config-settings=setup-args="-Ducx_path=${UCX_HOME}" . || exit 1
    )
    rm -rf /opt/rixl

    if ! python3 -c "import rixl" 2>/dev/null; then
        echo "[SETUP] ERROR: RIXL build failed"; exit 1
    fi
    _SETUP_INSTALLED+=("RIXL")
}

# ---------------------------------------------------------------------------
# 3. etcd (distributed KV store for vLLM disagg service discovery)
# ---------------------------------------------------------------------------
# Downloads the etcd release tarball into /usr/local/bin/etcd unless the etcd
# binary is already there.
# Globals: _SETUP_INSTALLED (appended on success).
# Exits the calling shell with status 1 if the download fails or the binary
# is missing after extraction (same policy as the other required installers).
install_etcd() {
    if [[ -x /usr/local/bin/etcd/etcd ]]; then
        echo "[SETUP] etcd already present"
        return 0
    fi

    local version="v3.6.0-rc.5"
    local tarball="/tmp/etcd.tar.gz"
    echo "[SETUP] Downloading etcd ${version}..."
    if ! wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \
            -O "$tarball"; then
        # wget -O creates the output file even on failure; do not leave it.
        rm -f -- "$tarball"
        echo "[SETUP] ERROR: etcd download failed"; exit 1
    fi
    mkdir -p /usr/local/bin/etcd
    tar -xf "$tarball" -C /usr/local/bin/etcd --strip-components=1
    rm -f -- "$tarball"

    # Verify with the same path the idempotency check above uses.
    if [[ ! -x /usr/local/bin/etcd/etcd ]]; then
        echo "[SETUP] ERROR: etcd install failed"; exit 1
    fi
    _SETUP_INSTALLED+=("etcd")
}

# ---------------------------------------------------------------------------
# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer)
#    Harmless on non-Pensando nodes (shared lib is simply unused).
# ---------------------------------------------------------------------------
# Best-effort: a failed download/install is reported but never aborts setup.
# Globals: _SETUP_INSTALLED (appended only when dpkg reports success).
install_libionic() {
    if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then
        echo "[SETUP] libionic1 already installed"
        return 0
    fi

    echo "[SETUP] Downloading and installing libionic1..."
    wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \
        -O /tmp/libionic1.deb
    if dpkg -i /tmp/libionic1.deb; then
        _SETUP_INSTALLED+=("libionic1")
    else
        echo "[SETUP] WARN: libionic1 install failed (continuing)"
    fi
    rm -f /tmp/libionic1.deb
}

# ---------------------------------------------------------------------------
# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server)
#    The proxy replaces vllm-router: it handles both HTTP routing AND the
#    MoRI-IO ZMQ registration/request-enrichment protocol.
#    Only needed on NODE_RANK=0 (proxy node).
# ---------------------------------------------------------------------------
# Globals: _SETUP_INSTALLED (appended on success).
# Exits the calling shell with status 1 if the imports still fail afterwards.
install_mori_proxy_deps() {
    if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then
        echo "[SETUP] MoRI-IO proxy Python deps already present"
        return 0
    fi

    echo "[SETUP] Installing MoRI-IO proxy Python deps..."
    # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack
    # are missing. --ignore-installed blinker avoids pip's distutils uninstall
    # error when quart pulls a newer blinker version.
    pip install --quiet --ignore-installed blinker
    pip install --quiet quart msgpack

    if ! python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then
        echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1
    fi
    _SETUP_INSTALLED+=("mori-proxy-deps")
}

# ---------------------------------------------------------------------------
# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE)
#    Required for --all2all-backend mori (Expert Parallelism via RDMA).
#    GPU kernels are JIT-compiled on first use; no hipcc needed at install.
#
#    v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI
#    topology bug (TopoSystemPci::Load assertion failure on Broadcom
#    PEX890xx switches). Always rebuild from our target commit b645fc8
#    which includes the dsp2dev subordinate-range fix.
# ---------------------------------------------------------------------------
# Idempotency is tracked with a marker file in Python's purelib directory.
# The marker is looked up in the same directory it is written to, so the
# check also works when purelib is not /usr/local/lib/python3.*/dist-packages.
# Globals: _SETUP_INSTALLED (appended on success).
# Exits the calling shell with status 1 if `import mori` fails after the build.
install_mori() {
    local MORI_TARGET_COMMIT="b645fc8"
    local site_dir marker
    site_dir=$(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])" 2>/dev/null)
    marker="${site_dir}/.mori_commit_${MORI_TARGET_COMMIT}"

    if [[ -n "$site_dir" && -e "$marker" ]]; then
        echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)"
        return 0
    fi

    echo "[SETUP] Installing MoRI build dependencies..."
    apt-get update -q -y && apt-get install -q -y \
        libopenmpi-dev openmpi-bin libpci-dev \
        && rm -rf /var/lib/apt/lists/*

    echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..."
    echo "[SETUP] (overriding image-provided version to fix PCI topology bug)"
    (
        # Explicit aborts instead of `set -e` (see install_ucx for the reason).
        git_clone_retry https://github.com/ROCm/mori.git /opt/mori || exit 1
        cd /opt/mori || exit 1
        git checkout "$MORI_TARGET_COMMIT" || exit 1
        pip install --quiet --force-reinstall . || exit 1
    )
    rm -rf /opt/mori

    if ! python3 -c "import mori" 2>/dev/null; then
        echo "[SETUP] ERROR: MoRI build failed"; exit 1
    fi
    if [[ -z "$site_dir" ]] || ! touch "$marker"; then
        echo "[SETUP] WARN: could not write MoRI marker; MoRI will be rebuilt on next start"
    fi
    _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT")
}

# ---------------------------------------------------------------------------
# 6b. amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar)
#     Required due to ROCm vLLM missing the quark dependency:
#     https://github.com/vllm-project/vllm/issues/35633
# ---------------------------------------------------------------------------
# Best-effort: an install failure only prints a warning and returns 0.
# Globals: _SETUP_INSTALLED (appended on success).
install_amd_quark() {
    if python3 -c "import quark" 2>/dev/null; then
        echo "[SETUP] amd-quark already present"
        return 0
    fi

    echo "[SETUP] Installing amd-quark for MXFP4 quantization support..."
    pip install --quiet amd-quark

    if ! python3 -c "import quark" 2>/dev/null; then
        echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)"
        return 0
    fi
    _SETUP_INSTALLED+=("amd-quark")
}

# ---------------------------------------------------------------------------
# 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0)
#    vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel
#    uses defer_input_quant=True which MoRI's prepare/finalize rejects.
#    Patch: remove both the AITER requirement assertion and the
#    defer_input_quant NotImplementedError so non-AITER kernels work.
# ---------------------------------------------------------------------------
# Rewrites the installed vLLM sources in place via regex substitution. Each
# file is patched independently; a failure on one only prints a warning.
# Globals: _SETUP_INSTALLED (always appended, whether or not a file changed).
patch_mori_fp8_compat() {
    python3 -c '
import re, os, sys
patched = []

# 1. Patch layer.py: remove multi-line AITER assertion for MoRI
try:
    import vllm.model_executor.layers.fused_moe.layer as lm
    f = lm.__file__
    src = open(f).read()
    if "Mori needs to be used with aiter" in src:
        new = re.sub(
            r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)",
            "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8",
            src, flags=re.DOTALL)
        if new != src:
            open(f, "w").write(new)
            patched.append("layer.py")
except Exception as e:
    print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr)

# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction
try:
    import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm
    f = mm.__file__
    src = open(f).read()
    if "defer_input_quant" in src:
        new = re.sub(
            r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)",
            "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8",
            src)
        if new != src:
            open(f, "w").write(new)
            patched.append("mori_prepare_finalize.py")
except Exception as e:
    print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr)

if patched:
    print(f"[SETUP] Patched: {chr(44).join(patched)}")
else:
    print("[SETUP] No MoRI-FP8 patches needed")
'
    _SETUP_INSTALLED+=("MoRI-FP8-patch")
}

# ---------------------------------------------------------------------------
# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock)
#    In WRITE mode, save_kv_layer spins forever waiting for the handshake
#    callback to set write_ready_flags. This blocks the model worker thread,
#    preventing it from responding to EngineCore shm_broadcast, causing a
#    TimeoutError cascade and crash.
#    Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent
#    the model worker from deadlocking.
+# --------------------------------------------------------------------------- +patch_moriio_save_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + # Already patched? + if "[PATCHED] save_kv_layer timeout" in src: + print("[SETUP] save_kv_layer timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + continue""" + + if old not in src: + print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") + sys.exit(0) + + new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep + if remote_engine_id is None: + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + _elapsed = _time.monotonic() - _wait_start + if _elapsed > _SAVE_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " + "write_ready_flags[%s], breaking to unblock model " + "worker", _elapsed, remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 9. 
Patch MoRIIO waiting_for_transfer_complete with bounded timeout +# The original status.Wait() blocks forever if an RDMA completion never +# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded +# wait with a polling loop using status.Succeeded() + configurable timeout. +# Also adds error handling to the write worker loop so a single failed +# transfer doesn't kill the background thread. +# --------------------------------------------------------------------------- +patch_moriio_transfer_timeout() { + python3 -c ' +import os, sys, textwrap + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me + f = me.__file__ + src = open(f).read() + + if "[PATCHED] transfer completion timeout" in src: + print("[SETUP] transfer completion timeout patch already applied") + sys.exit(0) + + # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- + old_wait = """ def waiting_for_transfer_complete(self): + if not self.transfer_status: + return + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + for status in transfers_to_wait: + try: + status.Wait() + if not status.Succeeded(): + logger.error( + "Transfer failed: %s, Code: %s", status.Message(), status.Code() + ) + raise TransferError("MoRIIO transfer failed!") + except Exception as e: + logger.error("Transfer %s failed: %s", status, e) + raise""" + + new_wait = """ def waiting_for_transfer_complete(self): + # [PATCHED] transfer completion timeout — bounded polling loop + import time as _time, os as _os + if not self.transfer_status: + return + + _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + _start = _time.monotonic() + remaining = list(transfers_to_wait) + _polls = 0 + _completed = 0 + + while remaining: + _elapsed = _time.monotonic() 
- _start + if _elapsed > _timeout: + logger.error( + "[HANGFIX] transfer_timeout elapsed=%.1fs " + "pending=%d/%d completed=%d polls=%d " + "action=raise_transfer_error", + _elapsed, len(remaining), len(transfers_to_wait), + _completed, _polls, + ) + raise TransferError( + f"RDMA transfer timeout after {_elapsed:.1f}s, " + f"{len(remaining)}/{len(transfers_to_wait)} pending" + ) + + still_waiting = [] + for status in remaining: + try: + if status.Succeeded(): + _completed += 1 + continue + still_waiting.append(status) + except Exception as e: + logger.error( + "[HANGFIX] transfer_poll_error error=%s", e) + raise TransferError( + f"Transfer failed during poll: {e}" + ) from e + + remaining = still_waiting + if remaining: + _time.sleep(0.005) + _polls += 1 + if _polls % 2000 == 0: + logger.warning( + "[HANGFIX] transfer_wait pending=%d " + "completed=%d elapsed=%.1fs timeout=%.0fs", + len(remaining), _completed, + _time.monotonic() - _start, _timeout, + )""" + + if old_wait not in src: + print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") + sys.exit(0) + + new_src = src.replace(old_wait, new_wait) + + # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- + old_loop = """ self._execute_write_task(task)""" + + new_loop = """ try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s write_task_failed error=%s " + "action=cleanup_and_mark_done", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None + ) + except Exception: + pass""" + + if old_loop in new_src: + new_src = new_src.replace(old_loop, new_loop, 1) + else: + print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") + + # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- + old_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks 
that were previously deferred.\"\"\" + if not self._deferred_tasks: + return + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + if self._is_remote_ready(task): + self._execute_write_task(task) + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + new_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + # [PATCHED] deferred task timeout — prune stale tasks + import time as _time, os as _os + if not self._deferred_tasks: + return + + _DEFER_TIMEOUT = float( + _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) + if _age > _DEFER_TIMEOUT: + logger.error( + "[HANGFIX] req=%s deferred_task_expired age=%.1fs " + "action=drop_and_mark_done", + task.request_id, _age, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + continue + if self._is_remote_ready(task): + try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s deferred_write_failed error=%s", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + if old_deferred in new_src: + new_src = new_src.replace(old_deferred, new_deferred, 1) + else: + print("[SETUP] WARN: _process_deferred_tasks pattern not found") + + # --- Patch 4: Stamp defer time when task is deferred --- + old_defer_add = """ self._deferred_tasks.append(task)""" + new_defer_add = """ import time as _time2 + if not hasattr(task, "_defer_ts"): 
+ task._defer_ts = _time2.monotonic() + self._deferred_tasks.append(task)""" + if old_defer_add in new_src: + new_src = new_src.replace(old_defer_add, new_defer_add, 1) + else: + print("[SETUP] WARN: deferred task timestamp patch target not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: transfer timeout + writer error handling") + +except Exception as e: + print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) +# The READ-mode spin loop in start_load_kv has the same unbounded-spin +# issue as save_kv_layer. Add timeout + sleep + null guard. +# --------------------------------------------------------------------------- +patch_moriio_load_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + if "[PATCHED] start_load_kv timeout" in src: + print("[SETUP] start_load_kv timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + continue""" + + if old not in src: + print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") + sys.exit(0) + + new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock + if remote_engine_id is None and not wait_handshake_readd_req: + self._reqs_to_send.update(metadata.reqs_to_send) + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + if _time.monotonic() - 
_wait_start > _LOAD_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " + "load_ready_flag[%s]", _time.monotonic() - _wait_start, + remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: start_load_kv replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished +# vLLM asserts that a request in finished_recving must be either +# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can +# transition to RUNNING before the aggregated recv notification arrives, +# crashing the engine with AssertionError. 
+# (present in v0.17.1 & v0.18.0) +# --------------------------------------------------------------------------- +patch_scheduler_read_mode_fix() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] read-mode recv assertion" in src: + print("[SETUP] scheduler read-mode assertion fix already applied") + sys.exit(0) + + old_recv = """ for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + assert req_id in self.requests + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + else: + assert RequestStatus.is_finished(req.status) + self._free_blocks(self.requests[req_id])""" + + new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states + for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping recv", req_id) + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + self._free_blocks(self.requests[req_id]) + else: + logger.debug( + "Request %s recv finished but status=%s (not " + "WAITING_FOR_REMOTE_KVS or finished), skipping " + "block free — will be freed on request completion", + req_id, req.status.name)""" + + if old_recv not in src: + print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping") + sys.exit(0) + + new_src = src.replace(old_recv, new_recv, 1) + + old_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + assert req_id in self.requests + self._free_blocks(self.requests[req_id])""" + + new_send = 
""" for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + if old_send in new_src: + new_src = new_src.replace(old_send, new_send, 1) + else: + print("[SETUP] WARN: scheduler finished_sending pattern not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") + +except Exception as e: + print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("scheduler-read-mode-fix") +} + +# --------------------------------------------------------------------------- +# 12. Idle KV block reaper for disaggregated prefill (READ mode) +# The RIXL notification path can lose `finished_sending` signals under +# high concurrency with ibv_post_send failures. This leaves KV blocks +# permanently allocated on the prefill engine even after the decode has +# finished reading. Over multiple benchmark rounds, leaked blocks +# accumulate and eventually saturate the prefill KV cache. +# +# Fix: instrument the scheduler's `schedule()` method to detect idle +# periods (0 running, 0 waiting for >5s) and force-free blocks for +# any remaining requests whose status is finished. +# --------------------------------------------------------------------------- +patch_prefill_idle_kv_reaper() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] idle-kv-reaper" in src: + print("[SETUP] idle KV block reaper already applied") + sys.exit(0) + + # Find the _update_from_kv_xfer_finished method end and add reaper logic + # We inject into the method that processes KV transfer completions. 
+ marker = "[PATCHED] read-mode recv assertion" + if marker not in src: + print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") + sys.exit(0) + + # Add reaper state initialization to __init__ + old_init_marker = "self.finished_recving_kv_req_ids" + if old_init_marker not in src: + print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") + sys.exit(0) + + # Find the first occurrence to insert reaper state + init_pos = src.find(old_init_marker) + # Find the line containing it + line_end = src.find("\n", init_pos) + init_line = src[init_pos:line_end] + + # Add reaper state after this line + reaper_init = init_line + """ + # [PATCHED] idle-kv-reaper state + self._idle_kv_reaper_ts = 0.0 + self._idle_kv_reaper_active = False""" + + src = src.replace(init_line, reaper_init, 1) + + # Now add the reaper logic at the end of _update_from_kv_xfer_finished + # Find the finished_sending handler we patched + send_handler = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + reaper_logic = send_handler + """ + + # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks + import time as _time + _REAPER_IDLE_SECS = 5.0 + _num_running = sum(1 for r in self.requests.values() + if r.status == RequestStatus.RUNNING) + _should_reap = (_num_running == 0) + + if _should_reap: + if not self._idle_kv_reaper_active: + self._idle_kv_reaper_active = True + self._idle_kv_reaper_ts = _time.monotonic() + elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: + _reaped = 0 + _reap_ids = [] + for _rid, _req in list(self.requests.items()): + if RequestStatus.is_finished(_req.status): + _reap_ids.append(_rid) + for _rid in _reap_ids: + try: + _req = self.requests[_rid] + self._free_blocks(_req) + 
_reaped += 1 + except Exception as _e: + logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) + if _reaped > 0: + logger.warning( + "[KV-REAPER] Force-freed blocks for %d finished " + "requests after %.1fs idle", + _reaped, _time.monotonic() - self._idle_kv_reaper_ts) + self._idle_kv_reaper_ts = _time.monotonic() + else: + self._idle_kv_reaper_active = False""" + + if send_handler in src: + src = src.replace(send_handler, reaper_logic, 1) + else: + print("[SETUP] WARN: send handler not found for reaper injection") + sys.exit(0) + + open(f, "w").write(src) + print("[SETUP] Patched: idle KV block reaper for prefill") + +except Exception as e: + print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("idle-kv-reaper") +} + +# --------------------------------------------------------------------------- +# 13. Patch MiniMax M2.5 WideEP + MoRI + EPLB support +# Replaces the upstream minimax_m2.py with our patched version that adds +# GateLinear, EP group integration, sequence parallelism, and the +# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched. +# --------------------------------------------------------------------------- +patch_minimax_m2_wideep_mori() { + local patch_file="${WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}/patches/minimax_m2.py" + if [[ ! -f "$patch_file" ]]; then + # Also check the Docker-baked location + patch_file="/opt/vllm_disagg/patches/minimax_m2.py" + fi + if [[ ! 
-f "$patch_file" ]]; then + echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)" + return 0 + fi + + python3 -c ' +import os, sys, shutil + +try: + import vllm.model_executor.models.minimax_m2 as mmod + target = mmod.__file__ + src = sys.argv[1] + + with open(target) as f: + if "get_ep_group" in f.read(): + print("[SETUP] minimax_m2.py already has WideEP+MoRI support") + sys.exit(0) + + shutil.copy2(src, target) + print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}") + +except Exception as e: + print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr) +' "$patch_file" + _SETUP_INSTALLED+=("minimax-m2-wideep-mori") +} + +# ============================================================================= +# Run installers +# ============================================================================= + +install_ucx +install_rixl +install_etcd +install_libionic +install_mori +install_amd_quark +install_mori_proxy_deps +patch_mori_fp8_compat +patch_moriio_save_kv_timeout +patch_moriio_transfer_timeout +patch_moriio_load_kv_timeout +patch_scheduler_read_mode_fix +patch_prefill_idle_kv_reaper +patch_minimax_m2_wideep_mori + +# ============================================================================= +# Export paths (persists for server.sh since this file is sourced) +# ============================================================================= + +export ROCM_PATH="${ROCM_PATH}" +export UCX_HOME="${UCX_HOME}" +export RIXL_HOME="${RIXL_HOME}" +export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}" +export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" + +_SETUP_END=$(date +%s) +if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then + echo "[SETUP] All dependencies already present (${_SETUP_END}s wallclock)" +else + echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s" +fi diff --git a/benchmarks/multi_node/amd_utils/start_etcd.sh 
b/benchmarks/multi_node/amd_utils/start_etcd.sh new file mode 100755 index 000000000..46bbd2964 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/start_etcd.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -x + +IPADDRS="${IPADDRS:-localhost}" + +# Use management network IP (matching what the Slurm script resolved) +host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p') +if [[ -z "$host_ip" ]]; then + host_ip=$(hostname -I | awk '{print $1}') +fi + +IFS=',' read -ra ADDR <<< "$IPADDRS" + +# Determine node name based on position in the IPADDRS list +index=0 +for ip in "${ADDR[@]}"; do + if [[ "$ip" == "$host_ip" ]]; then + break + fi + index=$((index + 1)) +done +node_name="etcd-$((index+1))" + +# Build initial cluster string +initial_cluster="" +for i in "${!ADDR[@]}"; do + peer_name="etcd-$((i+1))" + initial_cluster+="$peer_name=http://${ADDR[i]}:2380" + if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then + initial_cluster+="," + fi +done + +mkdir -p /var/lib/etcd +rm -rf /var/lib/etcd/* + +/usr/local/bin/etcd/etcd \ + --name "$node_name" \ + --data-dir /var/lib/etcd \ + --initial-advertise-peer-urls http://$host_ip:2380 \ + --listen-peer-urls http://0.0.0.0:2380 \ + --listen-client-urls http://0.0.0.0:2379 \ + --advertise-client-urls http://$host_ip:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster "$initial_cluster" \ + --initial-cluster-state new \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index d2c49bc9e..a77462fc5 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -2,37 +2,51 @@ # # Cluster Configuration Template for Multi-Node Disaggregated Serving # -# This script submits a multi-node SGLang disaggregated benchmark job to SLURM. +# This script submits a multi-node disaggregated benchmark job to SLURM. 
# It must be configured for your specific cluster before use. +# +# ENGINE=sglang (default): SGLang disaggregated serving +# ENGINE=vllm: vLLM disaggregated serving +# +# Router is co-located with the first prefill node (same for both engines), +# so NUM_NODES = PREFILL_NODES + DECODE_NODES. usage() { cat << 'USAGE' -This script aims to provide a one-liner call to the submit_job_script.py, -so that the deployment process can be further simplified. - -To use this script, fill in the following script and run it under your `slurm_jobs` directory: -======== begin script area ======== -# REQUIRED: Cluster-specific configuration -export SLURM_ACCOUNT= # Your SLURM account name -export SLURM_PARTITION= # SLURM partition to submit to -export TIME_LIMIT= # Job time limit (e.g., "08:00:00") - -# REQUIRED: Model and container paths -export MODEL_PATH= # Path to model directory (e.g., /mnt/models, /nfsdata) -export CONTAINER_IMAGE= # Path to container squash file - -# REQUIRED: Hardware configuration -export GPUS_PER_NODE= # GPUs per node (e.g., 8 for MI355X, 4 for MI325X) - -# OPTIONAL: RDMA/Network configuration (set in runners/launch_mi355x-amds.sh for AMD) -# export IBDEVICES= # RDMA device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) 
-# export MORI_RDMA_TC= # RDMA traffic class (e.g., 96, 104) - -bash submit.sh \ -$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \ -$ADDITIONAL_FRONTENDS \ -$ISL $OSL $CONCURRENCIES $REQUEST_RATE -======== end script area ======== +Usage: + bash submit.sh \ + \ + \ + \ + \ + [NODE_LIST] + +Arguments: + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + PREFILL_ENABLE_EP true/false or 1/0 (expert parallelism on prefill) + PREFILL_ENABLE_DP true/false or 1/0 (data-parallel attention on prefill) + DECODE_ENABLE_EP true/false or 1/0 (expert parallelism on decode) + DECODE_ENABLE_DP true/false or 1/0 (data-parallel attention on decode) + PREFILL_TP Tensor parallel size per prefill node + DECODE_TP Tensor parallel size per decode node + RANDOM_RANGE_RATIO Random range ratio for benchmark client + NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) + +Required environment variables: + SLURM_ACCOUNT SLURM account name + SLURM_PARTITION SLURM partition + TIME_LIMIT Job time limit (e.g., "08:00:00") + MODEL_PATH Path to model directory (e.g., /nfsdata) + MODEL_NAME Model name directory + CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) + RUNNER_NAME Runner identifier (for job name) USAGE } @@ -53,6 +67,7 @@ check_env MODEL_PATH check_env MODEL_NAME check_env CONTAINER_IMAGE check_env RUNNER_NAME +check_env FRAMEWORK # GPUS_PER_NODE defaults to 8 (MI355X). Set to 4 for MI325X if needed. 
GPUS_PER_NODE="${GPUS_PER_NODE:-8}" @@ -66,31 +81,32 @@ ISL=$5 OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 -PREFILL_ENABLE_EP=${9:-1} -PREFILL_ENABLE_DP=${10:-1} -DECODE_ENABLE_EP=${11:-1} -DECODE_ENABLE_DP=${12:-1} +PREFILL_ENABLE_EP=${9:-true} +PREFILL_ENABLE_DP=${10:-true} +DECODE_ENABLE_EP=${11:-true} +DECODE_ENABLE_DP=${12:-true} PREFILL_TP=${13:-8} DECODE_TP=${14:-8} -RANDOM_RANGE_RATIO=${15} +RANDOM_RANGE_RATIO=${15:-0.8} NODE_LIST=${16} - NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" # Export variables for the SLURM job +export ENGINE="${FRAMEWORK:-sglang}" export MODEL_DIR=$MODEL_PATH export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args - - +# Engine-specific xP/yD semantics and TP exports +if [[ "$ENGINE" == "vllm" ]]; then + export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} + export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} +fi +# xP = prefill workers, yD = decode workers (may span multiple nodes) export xP=$PREFILL_WORKERS export yD=$DECODE_WORKERS -export NUM_NODES=$NUM_NODES -export GPUS_PER_NODE=$GPUS_PER_NODE -export MODEL_NAME=$MODEL_NAME export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS )) export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} @@ -98,12 +114,16 @@ export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS )) export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} export DECODE_MTP_SIZE=${DECODE_MTP_SIZE} + +export NUM_NODES=$NUM_NODES +export GPUS_PER_NODE=$GPUS_PER_NODE +export MODEL_NAME=$MODEL_NAME export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} -export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO} -export BENCH_NUM_PROMPTS_MULTIPLIER=10 +export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export 
BENCH_REQUEST_RATE=${REQUEST_RATE} +export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) export RUN_EVAL="${RUN_EVAL:-false}" @@ -118,13 +138,10 @@ export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. -# SLURM writes output files on the batch node, so /tmp won't work (node-local). -# Defaults to a sibling directory of the submit working directory. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" # Optional: pass an explicit node list to sbatch. -# NODE_LIST is expected to be comma-separated hostnames. NODELIST_OPT=() if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" @@ -137,6 +154,13 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then NODELIST_OPT=(--nodelist "$NODELIST_CSV") fi +# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). +# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. +EXCLUDE_OPT=() +if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") +fi + # Construct the sbatch command sbatch_cmd=( sbatch @@ -145,6 +169,7 @@ sbatch_cmd=( -N "$NUM_NODES" -n "$NUM_NODES" "${NODELIST_OPT[@]}" + "${EXCLUDE_OPT[@]}" --time "$TIME_LIMIT" --partition "$SLURM_PARTITION" --account "$SLURM_ACCOUNT" @@ -154,7 +179,6 @@ sbatch_cmd=( "$(dirname "$0")/job.slurm" ) -# todo: --parsable outputs only the jobid and cluster name, test if jobid;clustername is correct JOB_ID=$("${sbatch_cmd[@]}") if [[ $? 
-ne 0 ]]; then echo "Error: Failed to submit job with sbatch" >&2 diff --git a/benchmarks/multi_node/amd_utils/sync.py b/benchmarks/multi_node/amd_utils/sync.py index 140951519..3678e7614 100755 --- a/benchmarks/multi_node/amd_utils/sync.py +++ b/benchmarks/multi_node/amd_utils/sync.py @@ -143,7 +143,10 @@ def close_port(): time.sleep(30) if args.enable_port: - time.sleep(30) + # Keep the port open long enough for slow nodes to pass their barrier. + # The previous 30s was too short when setup times vary by minutes. + grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index 6a7314ab4..d17d1a323 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 0124d4b4d..a8c0d2743 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh index b21e9204a..d7995fb25 100755 --- a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ 
DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -27,7 +28,7 @@ fi set -x -cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 export TIME_LIMIT="08:00:00" export MODEL_PATH=$MODEL_PATH diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh index 137ee0381..a9a28d889 100644 --- a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -27,7 +28,7 @@ fi set -x -cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 export TIME_LIMIT="08:00:00" export MODEL_PATH=$MODEL_PATH From ac064a882dc80737c518b1c62feb4a2389cae550 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 21 Apr 2026 07:57:08 +0000 Subject: [PATCH 30/85] use vLLM router as default router for vllm disagg Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/job.slurm | 34 ++++++++++++++++ .../multi_node/amd_utils/server_vllm.sh | 40 +++++++++++-------- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 56fefb0ed..491f27aa8 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -77,6 +77,11 @@ PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} +# Router selection: "vllm-router" (external container) or "moriio" (in-container 
proxy) +ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" +ROUTER_PORT="${ROUTER_PORT:-30000}" +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + # ============================================================================= # Docker privilege detection # ============================================================================= @@ -289,6 +294,10 @@ export IS_MULTINODE="${IS_MULTINODE:-false}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" + +# vLLM external router container +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) @@ -397,6 +406,24 @@ echo \"Rank \$SLURM_PROCID on \$(hostname)\" \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true +# Start vLLM external router container on node 0 +if [[ \"$ENGINE\" == \"vllm\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true + \$DOCKER_CMD run -d \\ + --name \"$ROUTER_CONT_NAME\" \\ + --network host \\ + \"$VLLM_ROUTER_IMAGE\" \\ + vllm-router \\ + --vllm-pd-disaggregation \\ + --vllm-discovery-address \"0.0.0.0:${PROXY_PING_PORT}\" \\ + --port \"${ROUTER_PORT}\" \\ + --host 0.0.0.0 \\ + --policy consistent_hash \\ + --prefill-policy consistent_hash \\ + --decode-policy consistent_hash \\ + --log-level info +fi + exec \$DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ @@ -446,3 +473,10 @@ fi " srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + +# Clean up vLLM external router container on node 0 +if [[ "$ENGINE" == "vllm" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun 
--nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' +fi diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index a10e45d6d..6b70014ee 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -282,19 +282,24 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup - echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." - PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $WS_PATH/moriio_proxy.py" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PROXY_CMD" + # Skipped when ROUTER_TYPE=vllm-router (external router container started by job.slurm) + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." + PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ + python3 $WS_PATH/moriio_proxy.py" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PROXY_CMD" + else + PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" + set -x + eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & + set +x + proxy_pid=$! + sleep 3 + fi else - PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" - set -x - eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! 
- sleep 3 + echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" fi PREFILL_CMD="vllm serve ${MODEL_PATH} \ @@ -368,13 +373,16 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" fi - echo "Killing the proxy server and prefill server" + echo "Killing the prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + fi [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true sleep 2 - # Fallback: ensure no orphaned processes keep ports open - pkill -f moriio_proxy 2>/dev/null || true + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + pkill -f moriio_proxy 2>/dev/null || true + fi pkill -f "vllm serve" 2>/dev/null || true fi From 75b18c65b59429a2d1bd67f1a95209706e0e13aa Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 23 Apr 2026 01:49:52 +0000 Subject: [PATCH 31/85] fix bugs Signed-off-by: Chun Fang --- benchmarks/multi_node/amd_utils/bench.sh | 6 +- benchmarks/multi_node/amd_utils/env.sh | 4 +- benchmarks/multi_node/amd_utils/job.slurm | 60 ++++++++++--------- benchmarks/multi_node/amd_utils/server.sh | 8 +-- .../multi_node/amd_utils/server_vllm.sh | 54 ++++++++--------- benchmarks/multi_node/amd_utils/setup_deps.sh | 10 ++-- benchmarks/multi_node/amd_utils/submit.sh | 2 +- 7 files changed, 74 insertions(+), 70 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 87f3b1e8a..aecc29e83 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -11,7 +11,7 @@ # \ # -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" n_prefill=$1 n_decode=$2 @@ -67,7 +67,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do # Engine-specific extra flags extra_flags="" - if [[ "$ENGINE" == 
"vllm" ]]; then + if [[ "$ENGINE" == "vllm-disagg" ]]; then extra_flags="--trust-remote-code" else if [ "$IS_MTP" = "true" ]; then @@ -92,7 +92,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do echo "-----------------------------------------" # vLLM: cooldown between rounds for idle KV block reaper - if [[ "$ENGINE" == "vllm" ]]; then + if [[ "$ENGINE" == "vllm-disagg" ]]; then echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." sleep 10 fi diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index c5a438541..81da415e8 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -9,7 +9,7 @@ # Set by runner or auto-detected from hostname. set -x -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" export PYTHONDONTWRITEBYTECODE=1 # ============================================================================= @@ -43,7 +43,7 @@ export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} # Engine-specific environment # ============================================================================= -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then # ========================================================================= # vLLM/Nixl-specific environment # ========================================================================= diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 491f27aa8..b9a83941a 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -8,7 +8,7 @@ #SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" @@ -23,7 +23,7 @@ echo "" # Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ # at runtime, but the CWD remains the 
submit-time directory (amd_utils/). -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then MODELS_YAML="$(pwd)/models_vllm.yaml" else MODELS_YAML="$(pwd)/models.yaml" @@ -111,7 +111,7 @@ if [[ -z "$MODEL_DIR" ]]; then fi export MODEL_DIR -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then # vLLM: Extract hf_dir from models.yaml, search multiple paths, resolve HF cache snapshots DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} found && /^[^ ]/{exit} @@ -278,6 +278,7 @@ export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}" export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) @@ -367,7 +368,7 @@ DOCKER_ENV_COMMON=( ) # Engine-specific env vars -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then DOCKER_ENV_ENGINE=( -e VLLM_WS_PATH=${WS_PATH} -e MODEL_PATH=$DOCKER_MODEL_PATH @@ -403,28 +404,29 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true -\$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true # Start vLLM external router container on node 0 -if [[ \"$ENGINE\" == \"vllm\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true - \$DOCKER_CMD run -d \\ - --name \"$ROUTER_CONT_NAME\" \\ - --network host \\ - \"$VLLM_ROUTER_IMAGE\" \\ - vllm-router \\ - --vllm-pd-disaggregation \\ - --vllm-discovery-address 
\"0.0.0.0:${PROXY_PING_PORT}\" \\ - --port \"${ROUTER_PORT}\" \\ - --host 0.0.0.0 \\ - --policy consistent_hash \\ - --prefill-policy consistent_hash \\ - --decode-policy consistent_hash \\ - --log-level info + \$DOCKER_CMD run -d \ + --name \"$ROUTER_CONT_NAME\" \ + --network host \ + -v /tmp:/run_logs \ + \"$VLLM_ROUTER_IMAGE\" \ + bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ + --vllm-pd-disaggregation \ + --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ + --port ${ROUTER_PORT} \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" fi -exec \$DOCKER_CMD run --rm \ +exec \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -472,11 +474,13 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' +if [[ "${KEEP_CONTAINERS}" != "1" ]]; then + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' -# Clean up vLLM external router container on node 0 -if [[ "$ENGINE" == "vllm" && "$ROUTER_TYPE" == "vllm-router" ]]; then - srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' - '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true - ' -fi + # Clean up vLLM external router container on node 0 + if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' + fi +fi \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index cf08b3c2a..5c441a793 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -2,17 +2,17 
@@ # Dual-Engine Disaggregated Server Dispatcher # ============================================================================= # Dispatches to the engine-specific server launcher based on ENGINE env var. -# ENGINE=sglang (default) -> server_sglang.sh (SGLang + MoRI) -# ENGINE=vllm -> server_vllm.sh (vLLM + Nixl/MoRI-IO) +# ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm-disagg -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" export WS_PATH ENGINE echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then source "$WS_PATH/server_vllm.sh" else source "$WS_PATH/server_sglang.sh" diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 6b70014ee..73cad3adc 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -199,29 +199,29 @@ python3 $WS_PATH/sync.py barrier \ # ETCD Server Setup # ============================================================================= -echo "Proceeding to start etcd server on $host_name" -bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & -etcd_pid=$! 
- -echo "Waiting at etcd server barrier on $host_name" -python3 $WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 2379 \ - --wait-for-all-ports \ - --timeout 300 - -echo "All etcd servers are up : $host_name" -sleep 3 - -echo "etcd endpoint health==================" -etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true -echo "======================================" - -python3 $WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 2379 \ - --wait-for-all-ports \ - --timeout 300 +# echo "Proceeding to start etcd server on $host_name" +# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & +# etcd_pid=$! + +# echo "Waiting at etcd server barrier on $host_name" +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 + +# echo "All etcd servers are up : $host_name" +# sleep 3 + +# echo "etcd endpoint health==================" +# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +# echo "======================================" + +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 # ============================================================================= # Cluster Topology Configuration @@ -343,7 +343,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "DRY RUN: $HEALTH_BARRIER_CMD" else eval "$HEALTH_BARRIER_CMD" - echo "MoRI-IO proxy is ready for benchmarking" + echo "${ROUTER_TYPE} is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -490,9 +490,9 @@ else [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true fi -echo "Killing the etcd server" -kill $etcd_pid 2>/dev/null || true -pkill -f etcd 2>/dev/null || true +# echo "Killing the etcd server" +# kill $etcd_pid 2>/dev/null || true +# pkill -f etcd 2>/dev/null || true echo "Script completed successfully" exit 0 diff --git 
a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 8c7a9f07a..589399f74 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -875,11 +875,11 @@ except Exception as e: # Run installers # ============================================================================= -install_ucx -install_rixl -install_etcd -install_libionic -install_mori +# install_ucx +# install_rixl +# install_etcd +# install_libionic +# install_mori install_amd_quark install_mori_proxy_deps patch_mori_fp8_compat diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index a77462fc5..f6670b5ee 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -100,7 +100,7 @@ export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args # Engine-specific xP/yD semantics and TP exports -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} fi From 5fcca879bf1243b68cc07daf8e5e7b213856a866 Mon Sep 17 00:00:00 2001 From: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com> Date: Mon, 4 May 2026 12:58:19 +0200 Subject: [PATCH 32/85] [AMD] Bump to nightly vllm and vllm-router images (#1208) --------- Signed-off-by: Simon Danielsson --- .github/configs/amd-master.yaml | 4 +- benchmarks/multi_node/amd_utils/env.sh | 9 +- benchmarks/multi_node/amd_utils/job.slurm | 5 +- .../multi_node/amd_utils/moriio_proxy.py | 327 ------------------ .../amd_utils/patches/minimax_m2.py | 4 +- .../multi_node/amd_utils/server_vllm.sh | 32 +- benchmarks/multi_node/amd_utils/setup_deps.sh | 46 +-- 7 files changed, 43 insertions(+), 384 deletions(-) delete mode 100644 benchmarks/multi_node/amd_utils/moriio_proxy.py diff --git 
a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 26a34ebcb..2f9c21907 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1351,7 +1351,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1404,7 +1404,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 81da415e8..cd4794ed5 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -32,8 +32,13 @@ fi export IBDEVICES # Shared: Auto-detect default network interface (portable across clusters) -export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +# Only auto-detect if not already set by the runner/environment +if [[ -z "$GLOO_SOCKET_IFNAME" ]]; then + export GLOO_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi +if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then + export NCCL_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi set +x diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index b9a83941a..70f501df6 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -297,7 +297,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export 
DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" # vLLM external router container -VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260503-e8992ca}" ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" @@ -417,6 +417,7 @@ if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \ \"$VLLM_ROUTER_IMAGE\" \ bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ --vllm-pd-disaggregation \ + --kv-connector moriio \ --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ --port ${ROUTER_PORT} \ --host 0.0.0.0 \ @@ -483,4 +484,4 @@ if [[ "${KEEP_CONTAINERS}" != "1" ]]; then '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true ' fi -fi \ No newline at end of file +fi diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py deleted file mode 100644 index 7d1e8454b..000000000 --- a/benchmarks/multi_node/amd_utils/moriio_proxy.py +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env python3 -# MoRI-IO proxy server for vLLM PD disaggregation. -# -# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py -# with the following adaptations for production multi-node use: -# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars -# - /health endpoint for sync.py barrier readiness checks -# - Uses stdlib `re` instead of `regex` to avoid extra dep -# -# The proxy performs two roles that vllm-router cannot: -# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports -# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params - -import asyncio -import copy -import logging -import os -import re -import socket -import threading -import time -import uuid - -import aiohttp -import msgpack -import zmq -from quart import Quart, make_response, request - -logger = logging.getLogger("moriio_proxy") -logger.setLevel(logging.DEBUG) -handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter( - "%(asctime)s %(levelname)s [%(name)s] %(message)s")) -logger.addHandler(handler) - -prefill_instances: list[dict] = [] -decode_instances: list[dict] = [] -request_nums = 0 -app = Quart(__name__) - -STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) - -IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") - -TRANSFER_TYPE = None - - -def _append_whole_dict_unique(target_list, data_dict): - new_filtered = {k: v for k, v in data_dict.items() if k != "index"} - for existed in target_list: - existed_filtered = {k: v for k, v in existed.items() if k != "index"} - if existed_filtered == new_filtered: - return False - logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", - data_dict.get("role"), data_dict.get("request_address"), - data_dict.get("handshake_port"), data_dict.get("notify_port"), - data_dict.get("dp_size"), data_dict.get("tp_size")) - target_list.append(data_dict) - transfer_mode = data_dict.get("transfer_mode", "unknown") - global TRANSFER_TYPE - - if TRANSFER_TYPE is None: - TRANSFER_TYPE = transfer_mode - logger.info("Transfer mode set to: %s", TRANSFER_TYPE) - elif transfer_mode != TRANSFER_TYPE: - raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") - - return True - - -_list_lock = threading.RLock() - - -def _listen_for_register(hostname, port): - context = zmq.Context() - router_socket = context.socket(zmq.ROUTER) - router_socket.bind(f"tcp://{hostname}:{port}") - poller = zmq.Poller() - 
poller.register(router_socket, zmq.POLLIN) - global prefill_instances - global decode_instances - - while True: - socks = dict(poller.poll()) - if router_socket in socks: - remote_addr, msg = router_socket.recv_multipart() - data = msgpack.loads(msg) - if data["type"] == "HELLO": - pass - elif ( - data["type"] == "register" - and data["role"] == "P" - and data["request_address"] not in prefill_instances - ): - with _list_lock: - _append_whole_dict_unique(prefill_instances, data) - - elif ( - data["type"] == "register" - and data["role"] == "D" - and data["request_address"] not in decode_instances - ): - with _list_lock: - _append_whole_dict_unique(decode_instances, data) - - -def start_service_discovery(hostname, port): - if not hostname: - hostname = socket.gethostname() - if port == 0: - raise ValueError("Port cannot be 0") - - _listener_thread = threading.Thread( - target=_listen_for_register, args=(hostname, port), daemon=True - ) - _listener_thread.start() - logger.info("Service discovery listening on %s:%s", hostname, port) - return _listener_thread - - -async def send_request_to_prefill( - endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank -): - req_data_copy = req_data - - req_data_copy["kv_transfer_params"].update( - { - "do_remote_decode": True, - "do_remote_prefill": False, - "remote_handshake_port": d_endpoint["handshake_port"], - "remote_notify_port": d_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": dip, - "remote_port": dport, - } - ) - req_data_copy["stream"] = False - req_data_copy["max_tokens"] = 1 - if "max_completion_tokens" in req_data_copy: - req_data_copy["max_completion_tokens"] = 1 - if "stream_options" in req_data_copy: - del req_data_copy["stream_options"] - async with aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) as session: - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": 
request_id, - } - if selected_prefill_dp_rank is not None: - headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) - async with session.post( - url=endpoint, json=req_data_copy, headers=headers - ) as response: - if response.status == 200: - return await response.json() - else: - raise RuntimeError( - f"Prefill response status={response.status}" - ) - - -async def start_decode_request(endpoint, req_data, request_id): - session = aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id, - } - response = await session.post(url=endpoint, json=req_data, headers=headers) - return session, response - - -async def stream_decode_response(session, response, request_id): - try: - if response.status == 200: - chunk_iter = response.content.iter_chunked(1024).__aiter__() - while True: - try: - chunk_bytes = await asyncio.wait_for( - chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, - ) - yield chunk_bytes - except StopAsyncIteration: - break - except asyncio.TimeoutError: - logger.error( - "Decode stream %s idle for %ds, aborting", - request_id, STREAM_IDLE_TIMEOUT, - ) - break - else: - raise RuntimeError( - f"Decode response status={response.status}" - ) - finally: - await response.release() - await session.close() - - -@app.route("/health", methods=["GET"]) -async def health_check(): - with _list_lock: - p_count = len(prefill_instances) - d_count = len(decode_instances) - return await make_response( - ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) - ) - - -@app.route("/v1/completions", methods=["POST"]) -@app.route("/v1/chat/completions", methods=["POST"]) -async def handle_request(): - try: - with _list_lock: - global request_nums - request_nums += 1 - - def extract_ip_port_fast(url): - match = IP_PORT_PATTERN.search(url) - if not match: - raise ValueError(f"Invalid URL format: {url}") - return 
match.groups() - - req_data = await request.get_json() - request_id = str(uuid.uuid4()) - - if not prefill_instances or not decode_instances: - return await make_response( - ("Service Unavailable: No prefill or decode instances registered.", 503) - ) - - pid = request_nums % len(prefill_instances) - did = request_nums % len(decode_instances) - prefill_instance_endpoint = prefill_instances[pid] - decode_instance_endpoint = decode_instances[did] - - selected_prefill_dp_rank = None - if prefill_instance_endpoint["dp_size"] > 1: - selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] - - dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) - - req_data_to_prefill = copy.deepcopy(req_data) - req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} - req_data["kv_transfer_params"] = {"transfer_id": request_id} - req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( - decode_instance_endpoint["dp_size"] - ) - req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( - decode_instance_endpoint["tp_size"] - ) - - send_prefill_task = asyncio.create_task( - send_request_to_prefill( - prefill_instance_endpoint["request_address"], - req_data_to_prefill, - request_id, - decode_instance_endpoint, - dip, - dport, - selected_prefill_dp_rank, - ) - ) - ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) - - req_data["max_tokens"] -= 1 - - req_data["kv_transfer_params"] = { - "transfer_id": request_id, - "do_remote_decode": False, - "do_remote_prefill": True, - "remote_handshake_port": prefill_instance_endpoint["handshake_port"], - "remote_notify_port": prefill_instance_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": ip, - "remote_port": port, - } - if TRANSFER_TYPE == "READ": - prefill_response = await send_prefill_task - req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ - "kv_transfer_params" - 
]["remote_engine_id"] - req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ - "kv_transfer_params" - ]["remote_block_ids"] - - req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ - "dp_size" - ] - req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ - "tp_size" - ] - - if selected_prefill_dp_rank is not None: - req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank - - decode_request_task = asyncio.create_task( - start_decode_request( - decode_instance_endpoint["request_address"], req_data, request_id - ) - ) - - session, decode_response = await decode_request_task - stream_generator = stream_decode_response(session, decode_response, request_id) - response = await make_response(stream_generator) - return response - except Exception as e: - logger.exception("Error handling request: %s", e) - return await make_response((f"Internal Server Error: {e!s}", 500)) - - -if __name__ == "__main__": - http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) - ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) - - t = start_service_discovery("0.0.0.0", ping_port) - app.debug = False - app.config["BODY_TIMEOUT"] = 360000 - app.config["RESPONSE_TIMEOUT"] = 360000 - - logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) - app.run(host="0.0.0.0", port=http_port) - t.join() diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py index 8290276fb..ac830eb1f 100644 --- a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py +++ b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py @@ -137,7 +137,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - reduce_results=False, renormalize=True, scoring_func=getattr(config, "scoring_func", "softmax"), e_score_correction_bias=self.e_score_correction_bias, @@ -185,7 +184,8 @@ 
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ) final_hidden_states = final_hidden_states[:num_tokens] elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + from vllm.distributed.communication_op import tensor_model_parallel_all_reduce + final_hidden_states = tensor_model_parallel_all_reduce( final_hidden_states ) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 73cad3adc..9acb05f54 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -242,7 +242,7 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# MoRI-IO proxy ZMQ registration port (must match moriio_proxy.py PROXY_PING_PORT) +# MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address) PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # vLLM environment (UCX transport vars are set at the Docker level in job.slurm) @@ -281,26 +281,8 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env - # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup - # Skipped when ROUTER_TYPE=vllm-router (external router container started by job.slurm) - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." - PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $WS_PATH/moriio_proxy.py" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PROXY_CMD" - else - PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" - set -x - eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! 
- sleep 3 - fi - else - echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" - fi + # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) + echo "Using external vllm-router container (started by job.slurm on this node)" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ @@ -343,7 +325,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "DRY RUN: $HEALTH_BARRIER_CMD" else eval "$HEALTH_BARRIER_CMD" - echo "${ROUTER_TYPE} is ready for benchmarking" + echo "MoRI-IO proxy is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -375,14 +357,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Killing the prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true - fi [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true sleep 2 - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - pkill -f moriio_proxy 2>/dev/null || true - fi pkill -f "vllm serve" 2>/dev/null || true fi diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 589399f74..958cb9808 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -242,43 +242,48 @@ patch_mori_fp8_compat() { import re, os, sys patched = [] -# 1. 
Patch layer.py: remove multi-line AITER assertion for MoRI +# Patch layer.py: remove AITER requirement assertion(s) for MoRI try: import vllm.model_executor.layers.fused_moe.layer as lm f = lm.__file__ src = open(f).read() - if "Mori needs to be used with aiter" in src: + if "[PATCHED] AITER requirement removed for MoRI-EP + FP8" in src: + print("[SETUP] layer.py MoRI-FP8 patch already applied") + elif "Mori needs to be used with aiter" in src: + # v0.19+: two consecutive assertions inside `if self.moe_config.use_mori_kernels:` new = re.sub( - r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)\s*" + r"assert not self\.aiter_fmoe_shared_expert_enabled,\s*\([^)]*\)", "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", src, flags=re.DOTALL) + if new == src: + # v0.17.1/v0.18.0: only the first assertion existed + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) if new != src: open(f, "w").write(new) patched.append("layer.py") + else: + print("[SETUP] ERROR: layer.py pattern found but regex had no effect", file=sys.stderr) + sys.exit(1) + else: + print("[SETUP] ERROR: layer.py AITER assertion pattern not found — vLLM API may have changed", file=sys.stderr) + sys.exit(1) except Exception as e: - print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + print(f"[SETUP] ERROR patch layer.py: {e}", file=sys.stderr) + sys.exit(1) -# 2. 
Patch mori_prepare_finalize.py: remove defer_input_quant restriction -try: - import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm - f = mm.__file__ - src = open(f).read() - if "defer_input_quant" in src: - new = re.sub( - r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", - "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", - src) - if new != src: - open(f, "w").write(new) - patched.append("mori_prepare_finalize.py") -except Exception as e: - print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) +# prepare_finalize/mori.py (v0.19+) already handles defer_input_quant correctly +# (skips FP8 quant when True). No patch needed for that file. +# Added in 0.18.1: https://github.com/vllm-project/vllm/commit/6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209 if patched: print(f"[SETUP] Patched: {chr(44).join(patched)}") else: print("[SETUP] No MoRI-FP8 patches needed") -' +' || exit 1 _SETUP_INSTALLED+=("MoRI-FP8-patch") } @@ -881,7 +886,6 @@ except Exception as e: # install_libionic # install_mori install_amd_quark -install_mori_proxy_deps patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout From b4d0b4890942d35e5ab2038a60334016ed81e6b3 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 12 May 2026 08:33:11 +0000 Subject: [PATCH 33/85] update vllm image and vllm router image --- .github/configs/amd-master.yaml | 2 +- benchmarks/multi_node/amd_utils/job.slurm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2f9c21907..f30f4ca53 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1351,7 +1351,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c + image: aigmkt/vllm-dev:ainic2 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg diff --git 
a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 70f501df6..47eed2149 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -297,7 +297,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" # vLLM external router container -VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260503-e8992ca}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}" ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" From b51320d824a1823adcbce9ff047c74c342c3b4ce Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 12 May 2026 10:12:22 +0000 Subject: [PATCH 34/85] update the interface prefix for tw cluster Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/env.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index cd4794ed5..ffdc9682e 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -54,9 +54,9 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # ========================================================================= set -x - # UCX_NET_DEVICES: Use the first benic interface for UCX TCP transport + # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then - UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) if [[ -n "$UCX_NET_DEV" ]]; then export UCX_NET_DEVICES="$UCX_NET_DEV" else From 7d84712ca88b5e1ae676b1bd7124104f9c68b5e1 Mon Sep 17 00:00:00 2001 From: Shan Theresa Date: Wed, 13 May 2026 06:33:57 +0000 Subject: [PATCH 35/85] add deps for ib 
device auto-detection Signed-off-by: Shan Theresa --- benchmarks/multi_node/amd_utils/env.sh | 4 ++ benchmarks/multi_node/amd_utils/setup_deps.sh | 31 ++++++------ benchmarks/multi_node/amd_utils/submit.sh | 49 +++++++++++++++++++ 3 files changed, 68 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index ffdc9682e..e01365503 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -56,7 +56,11 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then +<<<<<<< Updated upstream UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) +======= + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth0/{print $2}' | head -1) +>>>>>>> Stashed changes if [[ -n "$UCX_NET_DEV" ]]; then export UCX_NET_DEVICES="$UCX_NET_DEV" else diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 958cb9808..860cecf96 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -144,28 +144,26 @@ install_libionic() { } # --------------------------------------------------------------------------- -# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) -# The proxy replaces vllm-router: it handles both HTTP routing AND the -# MoRI-IO ZMQ registration/request-enrichment protocol. -# Only needed on NODE_RANK=0 (proxy node). +# 5. Container RDMA/net tools +# - ibv_devinfo comes from ibverbs-utils +# - iproute2 provides the `ip` command +# Used for in-container NIC/RDMA validation and routing checks. 
# --------------------------------------------------------------------------- -install_mori_proxy_deps() { - if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then - echo "[SETUP] MoRI-IO proxy Python deps already present" +install_recipe_deps() { + if command -v ibv_devinfo >/dev/null 2>&1 && command -v ip >/dev/null 2>&1; then + echo "[SETUP] Container RDMA/net tools already present" return 0 fi - echo "[SETUP] Installing MoRI-IO proxy Python deps..." - # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack - # are missing. --ignore-installed blinker avoids pip's distutils uninstall - # error when quart pulls a newer blinker version. - pip install --quiet --ignore-installed blinker - pip install --quiet quart msgpack + echo "[SETUP] Installing ibv_devinfo + iproute2 in container..." + apt-get update -q -y && apt-get install -q -y \ + ibverbs-utils iproute2 \ + && rm -rf /var/lib/apt/lists/* - if ! python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then - echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 + if ! command -v ibv_devinfo >/dev/null 2>&1 || ! 
command -v ip >/dev/null 2>&1; then + echo "[SETUP] ERROR: Failed to install ibv_devinfo/iproute2"; exit 1 fi - _SETUP_INSTALLED+=("mori-proxy-deps") + _SETUP_INSTALLED+=("ibverbs-utils+iproute2") } # --------------------------------------------------------------------------- @@ -885,6 +883,7 @@ except Exception as e: # install_etcd # install_libionic # install_mori +install_recipe_deps install_amd_quark patch_mori_fp8_compat patch_moriio_save_kv_timeout diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index f6670b5ee..524b00c65 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -161,6 +161,55 @@ if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") fi +# ============================================================================= +# Reuse existing allocation (skip sbatch) +# ============================================================================= +# When SLURM_REUSE_JOBID is set, run job.slurm directly in the current shell, +# attaching to the existing allocation. Inner `srun` calls pick up the +# allocation via SLURM_JOB_ID; SLURM_OVERLAP=1 lets them share task slots with +# the interactive shell already holding the allocation. +if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then + REUSE_JID="$SLURM_REUSE_JOBID" + echo "Reusing existing Slurm allocation ${REUSE_JID} (skipping sbatch)" >&2 + + # Resolve allocation's nodelist if not already provided. 
+ ALLOC_NODELIST="${SLURM_JOB_NODELIST:-$(squeue -h -j "$REUSE_JID" -o '%N' 2>/dev/null)}" + if [[ -z "$ALLOC_NODELIST" ]]; then + echo "Error: could not resolve nodelist for job ${REUSE_JID}" >&2 + exit 1 + fi + ALLOC_NNODES=$(scontrol show hostnames "$ALLOC_NODELIST" | wc -l) + if [[ "$ALLOC_NNODES" -lt "$NUM_NODES" ]]; then + echo "Error: allocation ${REUSE_JID} has ${ALLOC_NNODES} nodes, need ${NUM_NODES}" >&2 + exit 1 + fi + + export SLURM_JOB_ID="$REUSE_JID" + export SLURM_JOBID="$REUSE_JID" + export SLURM_JOB_NODELIST="$ALLOC_NODELIST" + export SLURM_NODELIST="$ALLOC_NODELIST" + export SLURM_NNODES="$ALLOC_NNODES" + export SLURM_JOB_NUM_NODES="$ALLOC_NNODES" + export SLURM_NTASKS="$ALLOC_NNODES" + export SLURM_NPROCS="$ALLOC_NNODES" + export SLURM_NTASKS_PER_NODE=1 + export SLURM_TASKS_PER_NODE="1(x${ALLOC_NNODES})" + export SLURM_OVERLAP=1 + export SLURM_SUBMIT_DIR="$(pwd)" + + STDOUT_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.out" + STDERR_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.err" + rm -f "$STDOUT_LOG" "$STDERR_LOG" + + nohup bash "$(dirname "$0")/job.slurm" >"$STDOUT_LOG" 2>"$STDERR_LOG" & + INLINE_PID=$! 
+ echo "$INLINE_PID" > "${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.pid" + echo "Started job.slurm (pid=${INLINE_PID}); logs: ${STDOUT_LOG}" >&2 + + echo "$REUSE_JID" + exit 0 +fi + # Construct the sbatch command sbatch_cmd=( sbatch From f377527754a3ba6ddc8d0838094381d84096227b Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 10:42:03 +0000 Subject: [PATCH 36/85] update vllm image Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f30f4ca53..b7ffcf8bc 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1351,7 +1351,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: aigmkt/vllm-dev:ainic2 + image: ghcr.io/simondanielsson/vllm-dev:ainic-test-hydra model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg From d868a772b48dfdc63e8cf3d8502b0622a310ddc7 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 13:57:43 +0000 Subject: [PATCH 37/85] fix indentation and add missing finally block in async_request_openai_chat_completions Co-Authored-By: Claude Opus 4 --- utils/bench_serving/backend_request_func.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index bd8e40bfd..1b22b1b91 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -460,9 +460,9 @@ async def async_request_openai_chat_completions( if _own_session: await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output def get_model(pretrained_model_name_or_path: str) -> str: From cd033111b937d1a4bd147a925ec433489ab0eb22 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 13:59:32 +0000 Subject: [PATCH 
38/85] fix tw-eth interface detection pattern in env.sh Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/env.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index e01365503..ffdc9682e 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -56,11 +56,7 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then -<<<<<<< Updated upstream UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) -======= - UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth0/{print $2}' | head -1) ->>>>>>> Stashed changes if [[ -n "$UCX_NET_DEV" ]]; then export UCX_NET_DEVICES="$UCX_NET_DEV" else From e46ffbbe362e507a95063d343dc7d8c4ab122050 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 14:09:40 +0000 Subject: [PATCH 39/85] fix vllm-disagg config schema: use scenarios.fixed-seq-len Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 178 ++++++++++++++++---------------- 1 file changed, 90 insertions(+), 88 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b7ffcf8bc..67c71a9bb 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1359,49 +1359,50 @@ kimik2.5-fp4-mi355x-vllm-disagg: framework: vllm-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - 
- isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c @@ -1412,51 +1413,52 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: framework: vllm-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, - # TP8 shards to 192 which is not divisible by FP8 block_n=128. 
- - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, + # TP8 shards to 192 which is not divisible by FP8 block_n=128. + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 From fecf422303ad909eb5ed39fa0b88545ea102a880 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 15:10:04 +0000 Subject: [PATCH 40/85] fix vllm-disagg routing to multi_node benchmark subdir Co-Authored-By: Claude Opus 4 --- 
runners/launch_mi355x-amds.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 6b47b34b7..4d4943631 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -56,7 +56,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" - if [[ "$FRAMEWORK" == "sglang-disagg" || "$FRAMEWORK" == "vllm-disagg" ]]; then + if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else BENCHMARK_SUBDIR="single_node" From b2664d0dcc4a8e92fe80148ad0b1c4b3ccac20b8 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 15:51:26 +0000 Subject: [PATCH 41/85] fix result collection to use FRAMEWORK as log directory prefix The inline collect_latest_results.py hardcoded "sglang" as the log directory prefix, causing "No logs directory found" for vllm-disagg runs where bench.sh creates directories named vllm-disagg_isl_X_osl_Y. 
Co-Authored-By: Claude Opus 4 --- runners/launch_mi355x-amds.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 4d4943631..26714930e 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -122,7 +122,7 @@ for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1 "$FRAMEWORK") if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 From 8a6c46442b9e3eb9f846db3912b941587f206da2 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 02:23:11 +0000 Subject: [PATCH 42/85] suppress tokenizer warnings and debug output in bench.sh Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/bench.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index aecc29e83..33cc918bf 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -37,6 +37,9 @@ IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" ROUTER_PORT="${ROUTER_PORT:-30000}" +export TRANSFORMERS_VERBOSITY=error +export TOKENIZERS_PARALLELISM=false + echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}" From 6ed08fb928e280dc476bd7e8270faada6d499a34 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 02:44:58 +0000 Subject: [PATCH 43/85] fix vllm-disagg deadlock: stop router after rank 0 container exits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vllm-router runs as a separate container on node 0. 
After node 0's main container finishes the benchmark and exits, decode nodes remain stuck waiting for the router port to close. The router cleanup in job.slurm can't run until srun completes, but srun can't complete because decode nodes are blocked — deadlock. Fix: skip exec on rank 0 for vllm-disagg so the srun bash script continues after docker exits and can stop the router container, allowing decode nodes to detect the port closure and exit. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 47eed2149..20ecb6683 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -427,7 +427,16 @@ if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \ --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" fi -exec \$DOCKER_CMD run \ +# Skip exec on vllm-disagg rank 0 so we can stop the router after the main +# container exits. Without this, decode nodes block forever waiting for the +# router port to close (the router is a separate container). +MAYBE_EXEC=exec +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + MAYBE_EXEC= + set +e +fi + +\$MAYBE_EXEC \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -468,11 +477,11 @@ exec \$DOCKER_CMD run \ '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log ' +# Only reached when exec was skipped (vllm-disagg rank 0) DOCKER_EXIT_CODE=\$? -if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then - echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\" - exit \$DOCKER_EXIT_CODE -fi +echo \"[rank 0] Main container exited (rc=\$DOCKER_EXIT_CODE). 
Stopping vllm-router...\" +\$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true +exit \$DOCKER_EXIT_CODE " if [[ "${KEEP_CONTAINERS}" != "1" ]]; then From 9fba8281d0294c875e520c0dd1beee6dfc138ef7 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 02:57:46 +0000 Subject: [PATCH 44/85] reduce vllm-disagg concurrency sweep to single point for faster iteration Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 67c71a9bb..42e5be0f4 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1364,9 +1364,9 @@ kimik2.5-fp4-mi355x-vllm-disagg: - isl: 1024 osl: 1024 search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total , 16, 32, 64, 128, 256, 512 - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [ 8 ] prefill: num-worker: 1 tp: 8 @@ -1387,7 +1387,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [ 8 ] prefill: num-worker: 1 tp: 8 From 4ea260d40200ef2716790dceb738b0d13b07cc8b Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 03:30:18 +0000 Subject: [PATCH 45/85] preserve slurm logs on failure and print stderr inline The EXIT trap deleted benchmark_logs/ before saving artifacts, making it impossible to debug container startup failures. Now the trap always copies slurm .out/.err to the artifact directory and prints the last 100 lines of .err inline in the CI output. 
Co-Authored-By: Claude Opus 4 --- runners/launch_mi355x-amds.sh | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 26714930e..e05572a43 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -52,8 +52,24 @@ if [[ "$IS_MULTINODE" == "true" ]]; then sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true # Ensure root-owned files are cleaned up even on early exit to prevent - # EACCES errors when the next GH Actions job checks out on this runner - trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT + # EACCES errors when the next GH Actions job checks out on this runner. + # Always preserve slurm logs as CI artifacts for debugging. + cleanup_and_save_logs() { + if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then + local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts" + mkdir -p "$art_dir" + cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true + fi + # Print .err inline so failures are visible in CI output + local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err" + if [[ -s "$err_file" ]]; then + echo "=== Slurm job stderr ===" + tail -100 "$err_file" + echo "========================" + fi + sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + } + trap cleanup_and_save_logs EXIT SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then @@ -171,16 +187,7 @@ PY sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true - # Upload logs as artifact if running in GitHub Actions - if [[ -n "${GITHUB_ACTIONS:-}" ]]; then - ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" - mkdir -p "$ARTIFACT_DIR" - cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true - echo "Logs copied to $ARTIFACT_DIR for artifact upload" - fi - - # 
Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup - sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + # Log preservation and cleanup handled by EXIT trap (cleanup_and_save_logs) else From 756becb0b735be44be5eb8c366602f428780a1fc Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 09:16:42 +0000 Subject: [PATCH 46/85] enable set -x around docker privilege detection for CI debugging Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 20ecb6683..8d904044a 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -86,12 +86,14 @@ PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # Docker privilege detection # ============================================================================= # Detect on the batch host. Per-node detection happens inside srun below. +set -x if docker ps &>/dev/null; then DOCKER_CMD="docker" else DOCKER_CMD="sudo docker" fi export DOCKER_CMD +set +x # ============================================================================= # Model Path Resolution From 7f9025ff2242b1a95afa724db8e743957b272aad Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 10:16:43 +0000 Subject: [PATCH 47/85] fix docker detection: test on compute node, not batch host The batch host has docker socket permissions but the compute nodes do not, causing "permission denied" on all srun tasks. Move the detection after SELECTED_NODES is known and probe via srun. 
Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 8d904044a..1da4b4890 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -82,19 +82,6 @@ ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" ROUTER_PORT="${ROUTER_PORT:-30000}" PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" -# ============================================================================= -# Docker privilege detection -# ============================================================================= -# Detect on the batch host. Per-node detection happens inside srun below. -set -x -if docker ps &>/dev/null; then - DOCKER_CMD="docker" -else - DOCKER_CMD="sudo docker" -fi -export DOCKER_CMD -set +x - # ============================================================================= # Model Path Resolution # ============================================================================= @@ -212,6 +199,16 @@ FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') +# Docker privilege detection — test on a compute node, not the batch host. 
+FIRST_NODE=$(echo "$SELECTED_NODES" | head -1) +if srun --nodelist="$FIRST_NODE" -N1 -n1 --overlap bash -c 'docker ps &>/dev/null'; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD +echo "[docker-detect] DOCKER_CMD=$DOCKER_CMD (tested on $FIRST_NODE)" + # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES From 400ef364be4f5c359be98847291eff4ffb037497 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 10:50:01 +0000 Subject: [PATCH 48/85] fix docker detection: per-node probe since group membership varies Export DOCKER_CMD_DETECT as a shell snippet that each srun participant evaluates locally, instead of testing a single node and assuming all nodes have the same docker socket permissions. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 1da4b4890..22b1ebcb3 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -199,15 +199,9 @@ FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Docker privilege detection — test on a compute node, not the batch host. -FIRST_NODE=$(echo "$SELECTED_NODES" | head -1) -if srun --nodelist="$FIRST_NODE" -N1 -n1 --overlap bash -c 'docker ps &>/dev/null'; then - DOCKER_CMD="docker" -else - DOCKER_CMD="sudo docker" -fi -export DOCKER_CMD -echo "[docker-detect] DOCKER_CMD=$DOCKER_CMD (tested on $FIRST_NODE)" +# Docker privilege detection — evaluated per-node since group membership varies. +# Exported as a snippet so every srun participant resolves it locally. 
+export DOCKER_CMD_DETECT='if docker ps &>/dev/null 2>&1; then DOCKER_CMD=docker; else DOCKER_CMD="sudo docker"; fi' # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES @@ -402,6 +396,10 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" +# Per-node docker privilege detection +eval \"\$DOCKER_CMD_DETECT\" +echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\" + # Pre-clean (idempotent) \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true @@ -484,12 +482,12 @@ exit \$DOCKER_EXIT_CODE " if [[ "${KEEP_CONTAINERS}" != "1" ]]; then - srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' # Clean up vLLM external router container on node 0 if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' - '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true ' fi fi From 21983add4ebdf06368f63e53f659da98cd9dd1d2 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 14:19:27 +0000 Subject: [PATCH 49/85] add vllm-disagg changelog entries and update kimi conc-list - Add perf-changelog entries for kimik2.5-fp4-mi355x-vllm-disagg and minimaxm2.5-fp8-mi355x-vllm-disagg to trigger CI benchmarks - Update kimi 1k1k conc-list from [8] to [16] - Comment out kimi 8k1k config until eval pipeline is wired up Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 44 ++++++++++++++++----------------- perf-changelog.yaml | 10 ++++++++ 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml 
index 42e5be0f4..4d8e13064 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1364,9 +1364,9 @@ kimik2.5-fp4-mi355x-vllm-disagg: - isl: 1024 osl: 1024 search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total , 16, 32, 64, 128, 256, 512 + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - spec-decoding: "none" - conc-list: [ 8 ] + conc-list: [ 16 ] prefill: num-worker: 1 tp: 8 @@ -1383,26 +1383,26 @@ kimik2.5-fp4-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" + # - isl: 8192 + # osl: 1024 + # search-space: + # - spec-decoding: "none" + # conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + # decode: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: false + # additional-settings: + # - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ad37e0c27..821f0454b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2974,6 +2974,16 @@ - "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440 +- config-keys: + - kimik2.5-fp4-mi355x-vllm-disagg + description: + - "Add vLLM disaggregated prefill-decode benchmark for Kimi-K2.5-MXFP4 on MI355X" + +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm-disagg + description: + - "Add vLLM disaggregated 
prefill-decode benchmark for MiniMax-M2.5 on MI355X" + - config-keys: - dsv4-fp4-mi355x-vllm description: From 898e90126aa4a0869d01d1f054c1a299813047a5 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 14:50:15 +0000 Subject: [PATCH 50/85] switch vllm-disagg to 8k1k config to trigger multi-node eval Comment out 1k1k config and enable 8k1k with conc-list [16] so mark_eval_entries picks it up for the eval pipeline. Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 46 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4d8e13064..3a04ecbe3 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1361,31 +1361,10 @@ kimik2.5-fp4-mi355x-vllm-disagg: disagg: true scenarios: fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - - spec-decoding: "none" - conc-list: [ 16 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - # - isl: 8192 + # - isl: 1024 # osl: 1024 # search-space: + # # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total # - spec-decoding: "none" # conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] # prefill: @@ -1404,6 +1383,27 @@ kimik2.5-fp4-mi355x-vllm-disagg: # additional-settings: # - "DECODE_NODES=2" + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 16 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + minimaxm2.5-fp8-mi355x-vllm-disagg: 
image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: MiniMaxAI/MiniMax-M2.5 From f311bfd8f2357661179ad510c73e4968772f16e6 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 02:52:17 +0000 Subject: [PATCH 51/85] add multi-node eval feature Signed-off-by: Theresa Shan --- .../multi_node/amd_utils/server_sglang.sh | 209 +++++++++++++++--- .../multi_node/amd_utils/server_vllm.sh | 84 ++++++- 2 files changed, 255 insertions(+), 38 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 53ca29cc5..b410bc978 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -43,7 +43,7 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # ============================================================================= # Dependencies and Environment Setup # ============================================================================= -source $WS_PATH/env.sh +source $SGLANG_WS_PATH/env.sh host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') host_name=$(hostname) @@ -62,7 +62,7 @@ fi # ============================================================================= # Model-Specific Configuration from YAML # ============================================================================= -MODELS_YAML="${WS_PATH}/models.yaml" +MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" if [[ ! 
-f "$MODELS_YAML" ]]; then echo "ERROR: models.yaml not found at $MODELS_YAML" @@ -127,6 +127,9 @@ no_dp = prefill.get('no_dp', {}) print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"') +print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') +print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) @@ -169,10 +172,16 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP + prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP + prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP + prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP else prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP + prefill_context_length="" + prefill_max_total_tokens="" + prefill_enable_two_batch_overlap="false" fi # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) @@ -187,29 +196,31 @@ else decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP fi -# Use Decode configuration to configure different TP/DP size between P and D -PREFILL_DECODE_DIFFERENT_TP="" -if [[ "$PREFILL_ENABLE_DP" != 
"$DECODE_ENABLE_DP" ]]; then - if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" - else - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" - fi -fi - # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi +if [[ -n "$prefill_context_length" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}" +fi +if [[ -n "$prefill_max_total_tokens" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}" +fi +if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" + PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true" +fi + +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] 
|| [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" fi if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) + MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi # ============================================================================= @@ -327,12 +338,24 @@ if [[ -n "$MODEL_NAME" ]]; then echo "Using model-specific configuration for: $MODEL_NAME" fi +if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') + unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL + unset MORI_MOE_MAX_INPUT_TOKENS_DECODE + # NOTE: that currently with fp8_combine set, the evals do not pass on InferenceX eval harness + # or on SGLang native harness for high concurrency 4k and gets no where near the golden score of + # 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD + # and as confirmed by @Oseltamivir. 
This was initially merged with @billishyahao promising + # that a fast follow-up PR would fix the evals via having quant correction in the fp8 combine +fi + # ============================================================================= # Container Synchronization # ============================================================================= echo "Waiting at the container creation barrier on $host_name" -python3 $WS_PATH/sync.py barrier \ +python3 $SGLANG_WS_PATH/sync.py barrier \ --local-ip ${host_ip} \ --local-port 5000 \ --enable-port \ @@ -362,20 +385,27 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" + echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" + echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " + echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} " + echo "================================================" # start the head prefill server - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then + PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" + fi + set +x + PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ 
--model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" + ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" @@ -396,7 +426,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Waiting for all prefill and decode servers to be up . . ." - BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${IPADDRS} \ --node-ports 8000 \ --wait-for-all-ports \ @@ -433,7 +463,7 @@ if [ "$NODE_RANK" -eq 0 ]; then proxy_pid=$! # Wait for router to be ready via health endpoint - HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports 30000 \ --wait-for-all-health \ @@ -453,7 +483,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Ready for benchmarking on ${host_name}:${host_ip}" echo "Benchmarking on ${host_name}:${host_ip}" - cd $WS_PATH + cd $SGLANG_WS_PATH # Export IS_MTP based on whether MTP is enabled if [ "$DECODE_MTP_SIZE" -gt 0 ]; then @@ -463,12 +493,14 @@ if [ "$NODE_RANK" -eq 0 ]; then fi # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ + BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - if [[ "$DRY_RUN" -eq 1 ]]; then + if [[ "${EVAL_ONLY:-false}" == "true" ]]; 
then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BENCH_CMD" else set -x @@ -476,6 +508,96 @@ if [ "$NODE_RANK" -eq 0 ]; then set +x fi + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." + + # Health check: verify the router is still serving before running eval. + # The throughput benchmark may have crashed/exhausted decode workers. + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." + else + # Must run from repo root so utils/evals/${task}.yaml resolves + pushd /workspace + + # Source eval functions from benchmark_lib.sh + source /workspace/benchmarks/benchmark_lib.sh + + # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list + if [[ -n "${EVAL_CONC:-}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi + + # Override eval context length with model's configured context_length + if [[ -n "$prefill_context_length" ]]; then + export EVAL_MAX_MODEL_LEN="$prefill_context_length" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" + else + # Run lm-eval against the router on port 30000 + run_eval --framework lm-eval --port 30000 + eval_rc=$? 
+ + if [[ $eval_rc -ne 0 ]]; then + echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 + EVAL_FAILED=1 + else + # Set metadata env vars for append_lm_eval_summary + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONCURRENT_REQUESTS}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export PREFILL_TP="${PREFILL_TP_SIZE}" + export PREFILL_EP=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" + export PREFILL_NUM_WORKERS="${xP}" + export DECODE_TP="${DECODE_TP_SIZE}" + export DECODE_EP=1 + [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" + export DECODE_NUM_WORKERS="${yD}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" + export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, + # RESULT_FILENAME are already set via Docker -e flags from job.slurm + + append_lm_eval_summary + # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace + + # Copy eval artifacts to run_logs for NFS extraction by runner + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + # Use find for glob patterns to avoid "no match" errors + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + fi + + popd + fi + fi + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" mkdir -p "$LOGS_OUTPUT" @@ -492,20 +614,30 @@ if [ "$NODE_RANK" -eq 0 ]; then kill $prefill0_pid fi + if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then + echo "ERROR: eval failed; exiting node-0 with rc=1" + exit 1 + fi + elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then + PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" + fi + set +x + PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" + ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) @@ -524,7 +656,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then fi echo "Waiting for proxy server to be up..." 
- BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports 30000 \ --wait-for-all-ports \ @@ -537,7 +669,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then fi echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $WS_PATH/sync.py wait \ + WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ --remote-ip ${NODE0_ADDR} \ --remote-port 30000" @@ -560,15 +692,20 @@ else echo "Decode node rank: $RANK" echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then + DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" + fi + set +x + DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${DECODE_SERVER_CONFIG} \ - --log-level-http warning" + ${DECODE_SERVER_CONFIG} " if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then rank=$((RANK % DECODE_NODES_PER_WORKER)) @@ -589,7 +726,7 @@ else echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports 30000 \ --wait-for-all-ports \ @@ -603,7 +740,7 @@ else echo "Waiting until proxy server closes..." 
- WAIT_CMD="python3 $WS_PATH/sync.py wait \ + WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ --remote-ip ${NODE0_ADDR} \ --remote-port 30000" @@ -621,4 +758,4 @@ else fi echo "Script completed successfully" -exit 0 +exit 0 \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 9acb05f54..60b0adb92 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -338,7 +338,9 @@ if [ "$NODE_RANK" -eq 0 ]; then ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - if [[ "$DRY_RUN" -eq 1 ]]; then + if [[ "${EVAL_ONLY:-false}" == "true" ]]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BENCH_CMD" else set -x @@ -346,7 +348,80 @@ if [ "$NODE_RANK" -eq 0 ]; then set +x fi - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." + + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:${ROUTER_PORT}/health" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." 
+ else + pushd /workspace + + source /workspace/benchmarks/benchmark_lib.sh + + if [[ -n "${EVAL_CONC:-}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" + else + run_eval --framework lm-eval --port "$ROUTER_PORT" + eval_rc=$? + + if [[ $eval_rc -ne 0 ]]; then + echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 + EVAL_FAILED=1 + else + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONCURRENT_REQUESTS}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export PREFILL_TP="${PREFILL_TP_SIZE}" + export PREFILL_EP=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" + export PREFILL_NUM_WORKERS="${xP}" + export DECODE_TP="${DECODE_TP_SIZE}" + export DECODE_EP=1 + [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" + export DECODE_NUM_WORKERS="${yD}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" + export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + + append_lm_eval_summary + + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + fi + + popd + fi + fi + + # Copy benchmark/eval results to BENCHMARK_LOGS_DIR (mounted from host) LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" mkdir -p "$LOGS_OUTPUT" @@ -362,6 +437,11 @@ if [ "$NODE_RANK" -eq 0 ]; then pkill -f "vllm serve" 2>/dev/null || true fi + if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then + echo "ERROR: eval failed; exiting node-0 with rc=1" + exit 1 + fi + elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" From 7b92e576cdf81659aa2df30281f0b64c5fbdea58 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 02:53:02 +0000 Subject: [PATCH 52/85] remove start_etcd.sh Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/start_etcd.sh | 47 ------------------- 1 file changed, 47 deletions(-) delete mode 100755 benchmarks/multi_node/amd_utils/start_etcd.sh diff --git a/benchmarks/multi_node/amd_utils/start_etcd.sh b/benchmarks/multi_node/amd_utils/start_etcd.sh deleted file mode 100755 index 46bbd2964..000000000 --- a/benchmarks/multi_node/amd_utils/start_etcd.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -x - -IPADDRS="${IPADDRS:-localhost}" - -# Use management network IP (matching what the Slurm script resolved) -host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p') -if [[ -z "$host_ip" ]]; then - host_ip=$(hostname -I | awk '{print $1}') -fi - -IFS=',' read -ra ADDR <<< "$IPADDRS" - -# Determine node name based on position in the IPADDRS list -index=0 -for ip in "${ADDR[@]}"; do - if [[ "$ip" == "$host_ip" ]]; then - break - fi - index=$((index + 1)) -done -node_name="etcd-$((index+1))" - -# Build initial cluster string -initial_cluster="" -for i in "${!ADDR[@]}"; do - peer_name="etcd-$((i+1))" - initial_cluster+="$peer_name=http://${ADDR[i]}:2380" - if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then - 
initial_cluster+="," - fi -done - -mkdir -p /var/lib/etcd -rm -rf /var/lib/etcd/* - -/usr/local/bin/etcd/etcd \ - --name "$node_name" \ - --data-dir /var/lib/etcd \ - --initial-advertise-peer-urls http://$host_ip:2380 \ - --listen-peer-urls http://0.0.0.0:2380 \ - --listen-client-urls http://0.0.0.0:2379 \ - --advertise-client-urls http://$host_ip:2379 \ - --initial-cluster-token etcd-cluster-1 \ - --initial-cluster "$initial_cluster" \ - --initial-cluster-state new \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log From e18e09de6e7c7b8c6ce029179f0c925ae4e21ad7 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 03:03:23 +0000 Subject: [PATCH 53/85] change decode to 1, easier for testing Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3a04ecbe3..89e19713b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1397,12 +1397,12 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "PREFILL_NODES=1" - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 dp-attn: false additional-settings: - - "DECODE_NODES=2" + - "DECODE_NODES=1" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c From 21eab91fe9ba433917af086e89873321349b3ede Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 06:49:13 +0000 Subject: [PATCH 54/85] add --served-model-name to vllm serve commands and wire up eval Set --served-model-name on all prefill/decode vllm serve commands so the model name matches what run_lm_eval sends in API requests. Also add eval pipeline support (health check, run_eval, artifact staging) mirroring server_sglang.sh. 
Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/server_vllm.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 60b0adb92..35da4ad27 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -284,7 +284,9 @@ if [ "$NODE_RANK" -eq 0 ]; then # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) echo "Using external vllm-router container (started by job.slurm on this node)" + SERVED_MODEL="${MODEL:-${MODEL_NAME}}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ @@ -448,7 +450,9 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then setup_vllm_env + SERVED_MODEL="${MODEL:-${MODEL_NAME}}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ @@ -502,7 +506,9 @@ else echo "[DECODE_ENV] $env_pair" done + SERVED_MODEL="${MODEL:-${MODEL_NAME}}" DECODE_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ From 
58bb2a3040b72951dc6e34b15bfd4422956793fc Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 08:31:41 +0000 Subject: [PATCH 55/85] fix model name consistency between vllm serve and bench client bench.sh now uses MODEL_NAME for vllm-disagg to match --served-model-name, and MODEL_PATH for sglang to match its default. Simplified SERVED_MODEL to use MODEL_NAME directly since MODEL env var is not available inside the container. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/bench.sh | 8 +++++++- benchmarks/multi_node/amd_utils/server_vllm.sh | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 33cc918bf..24dfbf587 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -20,6 +20,12 @@ decode_gpus=$4 model_path=$5 model_name=$6 MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" +# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH +if [[ "$ENGINE" == "vllm-disagg" ]]; then + BENCH_MODEL="${MODEL_NAME:-${MODEL_PATH}}" +else + BENCH_MODEL="${MODEL_PATH}" +fi log_path=$7 chosen_isl=${8:-1024} @@ -80,7 +86,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do run_benchmark_serving \ --bench-serving-dir "$REPO_ROOT" \ - --model "$MODEL_PATH" \ + --model "$BENCH_MODEL" \ --port "$ROUTER_PORT" \ --backend openai \ --input-len "$chosen_isl" \ diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 35da4ad27..ecab81656 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -284,7 +284,7 @@ if [ "$NODE_RANK" -eq 0 ]; then # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) echo "Using external vllm-router container (started by job.slurm on this node)" - SERVED_MODEL="${MODEL:-${MODEL_NAME}}" + 
SERVED_MODEL="${MODEL_NAME}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ @@ -450,7 +450,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then setup_vllm_env - SERVED_MODEL="${MODEL:-${MODEL_NAME}}" + SERVED_MODEL="${MODEL_NAME}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ @@ -506,7 +506,7 @@ else echo "[DECODE_ENV] $env_pair" done - SERVED_MODEL="${MODEL:-${MODEL_NAME}}" + SERVED_MODEL="${MODEL_NAME}" DECODE_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ From c17d4c1e6aaff8dc9abccffe7829c1ee4018b4be Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 09:28:13 +0000 Subject: [PATCH 56/85] add token patch to bench for vllm Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 24dfbf587..554db8b91 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -77,7 +77,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do # Engine-specific extra flags extra_flags="" if [[ "$ENGINE" == "vllm-disagg" ]]; then - extra_flags="--trust-remote-code" + extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else if [ "$IS_MTP" = "true" ]; then extra_flags="--use-chat-template" From 47455c4170b1503960a71cb0d8a1021466456cfc Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 09:50:34 +0000 Subject: [PATCH 57/85] add --tokenizer passthrough to run_benchmark_serving MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit benchmark_lib.sh rejected unknown flags — add --tokenizer support so vllm-disagg bench can resolve the tokenizer from the local model path instead of attempting an HF download with the short model name. 
Co-Authored-By: Claude Opus 4 --- benchmarks/benchmark_lib.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f5e39b4cf..7dbbaaaa8 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -210,6 +210,7 @@ run_benchmark_serving() { local dsv4=false local trust_remote_code=false local server_pid="" + local tokenizer="" while [[ $# -gt 0 ]]; do case $1 in @@ -278,6 +279,10 @@ run_benchmark_serving() { server_pid="$2" shift 2 ;; + --tokenizer) + tokenizer="$2" + shift 2 + ;; *) echo "Unknown parameter: $1" return 1 @@ -385,6 +390,10 @@ run_benchmark_serving() { benchmark_cmd+=(--trust-remote-code) fi + if [[ -n "$tokenizer" ]]; then + benchmark_cmd+=(--tokenizer "$tokenizer") + fi + # Run benchmark with optional server monitoring set -x if [[ -n "$server_pid" ]]; then From 839b5476d5934cda6f35fec89570047b1bdb1fa5 Mon Sep 17 00:00:00 2001 From: Shan Theresa Date: Fri, 15 May 2026 10:38:14 +0000 Subject: [PATCH 58/85] update vllm image for kimi2.5 and Minimax disagg. 
Signed-off-by: Shan Theresa --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 89e19713b..1e8ea34ca 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1351,7 +1351,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: ghcr.io/simondanielsson/vllm-dev:ainic-test-hydra + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1405,7 +1405,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=1" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg From 3f43d1409aa1d321905437952c9ad38dc878c2ff Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Mon, 18 May 2026 15:52:08 +0800 Subject: [PATCH 59/85] Update setup_deps.sh --- benchmarks/multi_node/amd_utils/setup_deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 860cecf96..c65412bac 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -885,7 +885,7 @@ except Exception as e: # install_mori install_recipe_deps install_amd_quark -patch_mori_fp8_compat +# patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout patch_moriio_load_kv_timeout From e4852e231ace5eb5787d1c2e82217ec7188e0ef1 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Mon, 18 May 2026 23:27:34 +0800 Subject: [PATCH 60/85] Update amd-master.yaml restore the kimi k2.5 settings --- .github/configs/amd-master.yaml | 48 
++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1e8ea34ca..eb3a1de9e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1361,33 +1361,33 @@ kimik2.5-fp4-mi355x-vllm-disagg: disagg: true scenarios: fixed-seq-len: - # - isl: 1024 - # osl: 1024 - # search-space: - # # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - # - spec-decoding: "none" - # conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" - isl: 8192 osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 16 ] + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] prefill: num-worker: 1 tp: 8 @@ -1397,12 +1397,12 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "PREFILL_NODES=1" - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: - num-worker: 1 + num-worker: 2 tp: 8 ep: 8 dp-attn: false additional-settings: - - "DECODE_NODES=1" + - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 From 61bc8b9174d36052cb6b57bf5d074484d0deb1b3 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 19 May 2026 14:48:09 +0000 Subject: [PATCH 61/85] update req rate for vllm. 
Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 554db8b91..05384f435 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -31,7 +31,7 @@ log_path=$7 chosen_isl=${8:-1024} chosen_osl=${9:-1024} concurrency_list=${10:-"512x1"} -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then chosen_req_rate=${11:-inf} else chosen_req_rate=${11:-1} From 81203a352cdc8a2de2e830d40278e0155a99d5a3 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 19 May 2026 15:20:28 +0000 Subject: [PATCH 62/85] make the sglang env consistent with upstream Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/env.sh | 55 +++++++++++++++++--------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index ffdc9682e..aa69d0e46 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -119,41 +119,52 @@ else # ========================================================================= export SGLANG_USE_AITER=1 - export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 - export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 + + export SGLANG_MORI_DISPATCH_DTYPE=auto + export SGLANG_MORI_FP8_COMB=true + export SGLANG_MORI_QP_PER_TRANSFER=4 + export SGLANG_MORI_NUM_WORKERS=4 + export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 + + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 + + export MORI_IO_TC_DISABLE=0 + + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 + export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 # Disable allocating memory in one pass export MORI_SHMEM_MODE=ISOLATION - export SGLANG_MORI_FP8_DISP=True - if [[ "$MODEL_NAME" == *mxfp4* ]]; then - export 
SGLANG_MORI_FP8_DISP=False - fi + # Enable spec v2 + export SGLANG_ENABLE_SPEC_V2=1 + export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 - export SGLANG_MORI_FP4_DISP=False - export SGLANG_MORI_FP8_COMB=False + export SGLANG_LOG_MS=true + export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 - # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) - export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 - if [[ "$MODEL_NAME" == *mxfp4* ]]; then - export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 - fi - export MORI_MAX_DISPATCH_TOKENS_DECODE=160 + export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 + export MORI_MAX_DISPATCH_TOKENS_DECODE=512 + + export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768 + export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 # set MTP size=1 when EP16 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export MORI_EP_LAUNCH_CONFIG_MODE=AUTO - export MORI_IO_QP_MAX_SEND_WR=16384 - export MORI_IO_QP_MAX_CQE=32768 - export MORI_IO_QP_MAX_SGE=4 export MORI_APP_LOG_LEVEL=INFO - # Router logging control + # Router logging control: + # 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. + # 1 mirrors router logs to stdout via tee (useful for live debugging). 
export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" # QoS/DSCP configuration + # Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname if [[ -n "$MORI_RDMA_TC" ]]; then echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" elif command -v nicctl &> /dev/null; then @@ -166,17 +177,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p { if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then TC=$(( 4 * ND_DSCP )) export MORI_RDMA_SL=$ND_PRIO + export MORI_IO_SL=$ND_PRIO export MORI_RDMA_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + export MORI_IO_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" else echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." # Fall back to hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." @@ -187,9 +202,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p { NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." 
From 895ba67604860cb442cea73643e9de61e1261359 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 19 May 2026 15:31:32 +0000 Subject: [PATCH 63/85] node blacklist Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/submit.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index 524b00c65..fa3d65418 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -157,6 +157,7 @@ fi # Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). # Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. EXCLUDE_OPT=() +SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES-mia1-p01-g11,mia1-p01-g12,mia1-p01-g15}" if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") fi From dab93b8e52b8c21a8d2b569b2661fe161d24ee9f Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 15:57:01 +0200 Subject: [PATCH 64/85] fix: remove faulty minimax patch Signed-off-by: simondanielsson --- .../amd_utils/patches/minimax_m2.py | 672 ------------------ benchmarks/multi_node/amd_utils/setup_deps.sh | 40 -- 2 files changed, 712 deletions(-) delete mode 100644 benchmarks/multi_node/amd_utils/patches/minimax_m2.py diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py deleted file mode 100644 index ac830eb1f..000000000 --- a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py +++ /dev/null @@ -1,672 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copyright 2025 The MiniMax AI team. -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library.
It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only MiniMaxM2/M2.5 model.""" - -from collections.abc import Iterable -from typing import Any - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm._aiter_ops import rocm_aiter_ops -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config -from vllm.distributed import ( - get_ep_group, - get_pp_group, - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_gather, -) -from vllm.logger import init_logger -from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - QKVParallelLinear, - RowParallelLinear, -) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, - 
VocabParallelEmbedding, -) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) -from vllm.model_executor.models.utils import sequence_parallel_chunk -from vllm.sequence import IntermediateTensors - -from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP -from .utils import ( - AutoWeightsLoader, - PPMissingLayer, - is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, - make_layers, - maybe_prefix, -) - -logger = init_logger(__name__) - - -class MiniMaxM2MoE(nn.Module): - """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. - - Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with - expert parallelism, EPLB, and sequence parallel awareness. - """ - - def __init__( - self, - config: PretrainedConfig, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ): - super().__init__() - vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config - - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - - self.ep_group = get_ep_group().device_group - self.ep_rank = get_ep_group().rank_in_group - self.ep_size = self.ep_group.size() - - self.n_routed_experts: int = config.num_local_experts - self.n_shared_experts: int = 0 - - self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe - self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) - self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - - eplb_config = parallel_config.eplb_config - self.enable_eplb = parallel_config.enable_eplb - self.n_redundant_experts = eplb_config.num_redundant_experts - self.n_logical_experts = self.n_routed_experts - self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts - self.n_local_physical_experts = self.n_physical_experts // self.ep_size - - self.use_routing_bias = getattr(config, "use_routing_bias", False) - if 
self.use_routing_bias: - self.e_score_correction_bias = nn.Parameter( - torch.empty(config.num_local_experts, dtype=torch.float32) - ) - self.e_score_correction_bias.weight_loader = ( - MiniMaxM2MoE.ebias_weight_loader - ) - else: - self.e_score_correction_bias = None - - self.gate = GateLinear( - config.hidden_size, - config.num_local_experts, - out_dtype=torch.float32, - prefix=f"{prefix}.gate", - ) - - self.experts = FusedMoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - renormalize=True, - scoring_func=getattr(config, "scoring_func", "softmax"), - e_score_correction_bias=self.e_score_correction_bias, - quant_config=quant_config, - prefix=f"{prefix}.experts", - enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts, - is_sequence_parallel=self.is_sequence_parallel, - router_logits_dtype=torch.float32, - gate=self.gate, - routed_scaling_factor=1.0 - if not self.is_rocm_aiter_moe_enabled - else self.routed_scaling_factor, - ) - - @staticmethod - def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: - assert param.size() == loaded_weight.size() - param.data.copy_(loaded_weight.to(torch.float32)) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - - if self.is_sequence_parallel: - hidden_states = sequence_parallel_chunk(hidden_states) - - if self.experts.is_internal_router: - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=hidden_states - ) - else: - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=router_logits - ) - - if hidden_states.dtype != torch.float16: - if not self.is_rocm_aiter_moe_enabled: - final_hidden_states = final_hidden_states * self.routed_scaling_factor - - if 
self.is_sequence_parallel: - final_hidden_states = tensor_model_parallel_all_gather( - final_hidden_states, 0 - ) - final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - from vllm.distributed.communication_op import tensor_model_parallel_all_reduce - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states - ) - - return final_hidden_states.view(num_tokens, hidden_dim) - - -class MiniMaxM2Attention(nn.Module): - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rotary_dim: int, - rope_parameters: dict[str, Any] | None = None, - attn_window_size: int | None = None, - max_position_embeddings: int = 8192, - head_dim: int | None = None, - rms_norm_eps: float = 1e-06, - qkv_bias: bool = False, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = head_dim or (hidden_size // self.total_num_heads) - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - - if ( - rope_parameters is not None - and "partial_rotary_factor" not in rope_parameters - ): - rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim - self.rotary_emb = get_rope( - self.head_dim, - max_position=max_position_embeddings, - rope_parameters=rope_parameters, - ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - per_layer_sliding_window=attn_window_size, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - ) - - self.q_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_heads, eps=rms_norm_eps - ) - self.k_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = MiniMaxText01RMSNormTP.forward_qk( - self.q_norm, self.k_norm, q.contiguous(), k.contiguous() - ) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output - - -class MiniMaxM2DecoderLayer(nn.Module): - def 
__init__( - self, - config: PretrainedConfig, - prefix: str, - model_config: ModelConfig, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): - max_position_embeddings = max( - config.max_position_embeddings, config.max_model_len - ) - # DecoderLayers are created with `make_layers` which passes the prefix - # with the layer's index. - layer_idx = int(prefix.split(sep=".")[-1]) - - self.layer_idx = layer_idx - self.self_attn = MiniMaxM2Attention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rotary_dim=config.rotary_dim, - rope_parameters=config.rope_parameters, - max_position_embeddings=max_position_embeddings, - rms_norm_eps=config.rms_norm_eps, - qkv_bias=getattr(config, "attention_bias", False), - head_dim=getattr(config, "head_dim", None), - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - - self.block_sparse_moe = MiniMaxM2MoE( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor | None, - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - - # Fully Connected - hidden_states, residual = 
self.post_attention_layernorm(hidden_states, residual) - - hidden_states = self.block_sparse_moe(hidden_states) - - return hidden_states, residual - - -@support_torch_compile -class MiniMaxM2Model(nn.Module): - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - self.config = config - - self.vocab_size = config.vocab_size - - if get_pp_group().is_first_rank: - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - quant_config=None, - prefix=f"{prefix}.embed_tokens", - ) - else: - self.embed_tokens = PPMissingLayer() - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: MiniMaxM2DecoderLayer( - config, - prefix, - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, - ), - prefix=f"{prefix}.layers", - ) - - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size - ) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None, - inputs_embeds: torch.Tensor | None = None, - ) -> torch.Tensor | IntermediateTensors: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.embed_input_ids(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = 
intermediate_tensors["residual"] - - for layer in self.layers[self.start_layer : self.end_layer]: - hidden_states, residual = layer(positions, hidden_states, residual) - - if not get_pp_group().is_last_rank: - return IntermediateTensors( - {"hidden_states": hidden_states, "residual": residual} - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return FusedMoE.make_expert_params_mapping( - self, - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts, - num_redundant_experts=0, - ) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = self.get_expert_mapping() - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) - if spec_layer is not None: - continue # skip spec decode layers for main model - - for param_name, weight_name, shard_id in stacked_params_mapping: - # Skip non-stacked layers and experts (experts handled below). - if weight_name not in name: - continue - # We have mlp.experts[0].gate_proj in the checkpoint. - # Since we handle the experts below in expert_params_mapping, - # we need to skip here BEFORE we update the name, otherwise - # name will be updated to mlp.experts[0].gate_up_proj, which - # will then be updated below in expert_params_mapping - # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if ("mlp.experts." 
in name) and name not in params_dict: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( - param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id, - ) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class MiniMaxM2MixtureOfExperts(MixtureOfExperts): - """EPLB protocol implementation for MiniMax M2/M2.5.""" - - moe_mlp_layers: list[MiniMaxM2MoE] - - def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): - if example_moe is None: - self.num_moe_layers = 0 - self.num_expert_groups = 0 - self.num_logical_experts = 0 - self.num_physical_experts = 0 - self.num_local_physical_experts = 0 - self.num_routed_experts = 0 - self.num_shared_experts = 0 - self.num_redundant_experts = 0 - logger.warning("MiniMax M2: No MoE layer found in model.layers.") - else: - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = 
example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for moe in self.moe_mlp_layers: - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() - - -class MiniMaxM2ForCausalLM( - nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts -): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - if hasattr(vllm_config.model_config, "max_model_len"): - self.config.max_model_len = vllm_config.model_config.max_model_len - self.model = MiniMaxM2Model( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") - ) - if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead( - config.vocab_size, config.hidden_size, quant_config=None - ) - else: - self.lm_head = PPMissingLayer() - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) - - self.num_moe_layers = config.num_hidden_layers - self._set_moe_parameters() - - def _set_moe_parameters(self): - 
self.expert_weights: list = [] - self.num_expert_groups = 1 - self.moe_layers: list = [] - self.moe_mlp_layers: list[MiniMaxM2MoE] = [] - example_moe = None - for layer in self.model.layers: - if isinstance(layer, PPMissingLayer): - continue - assert isinstance(layer, MiniMaxM2DecoderLayer) - if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): - example_moe = layer.block_sparse_moe - self.moe_mlp_layers.append(layer.block_sparse_moe) - self.moe_layers.append(layer.block_sparse_moe.experts) - self.extract_moe_parameters(example_moe) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.embed_input_ids(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: torch.Tensor | None = None, - **kwargs, - ) -> torch.Tensor | IntermediateTensors: - hidden_states = self.model( - input_ids, positions, intermediate_tensors, inputs_embeds - ) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - ) -> torch.Tensor | None: - logits = self.logits_processor(self.lm_head, hidden_states) - return logits - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return self.model.get_expert_mapping() - - -def get_spec_layer_idx_from_weight_name( - config: PretrainedConfig, weight_name: str -) -> int | None: - if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): - layer_idx = config.num_hidden_layers - for i in range(config.num_mtp_modules): - if weight_name.startswith(f"model.layers.{layer_idx + i}."): - return layer_idx + i - return None diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index c65412bac..3dbc1677f 100644 --- 
a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -835,45 +835,6 @@ except Exception as e: _SETUP_INSTALLED+=("idle-kv-reaper") } -# --------------------------------------------------------------------------- -# 13. Patch MiniMax M2.5 WideEP + MoRI + EPLB support -# Replaces the upstream minimax_m2.py with our patched version that adds -# GateLinear, EP group integration, sequence parallelism, and the -# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched. -# --------------------------------------------------------------------------- -patch_minimax_m2_wideep_mori() { - local patch_file="${WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}/patches/minimax_m2.py" - if [[ ! -f "$patch_file" ]]; then - # Also check the Docker-baked location - patch_file="/opt/vllm_disagg/patches/minimax_m2.py" - fi - if [[ ! -f "$patch_file" ]]; then - echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)" - return 0 - fi - - python3 -c ' -import os, sys, shutil - -try: - import vllm.model_executor.models.minimax_m2 as mmod - target = mmod.__file__ - src = sys.argv[1] - - with open(target) as f: - if "get_ep_group" in f.read(): - print("[SETUP] minimax_m2.py already has WideEP+MoRI support") - sys.exit(0) - - shutil.copy2(src, target) - print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}") - -except Exception as e: - print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr) -' "$patch_file" - _SETUP_INSTALLED+=("minimax-m2-wideep-mori") -} - # ============================================================================= # Run installers # ============================================================================= @@ -891,7 +852,6 @@ patch_moriio_transfer_timeout patch_moriio_load_kv_timeout patch_scheduler_read_mode_fix patch_prefill_idle_kv_reaper -patch_minimax_m2_wideep_mori # ============================================================================= # Export paths 
(persists for server.sh since this file is sourced) From 3e07aea0a7142c7ff6e9316e0c3c1508eef0f6fd Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 16:15:41 +0200 Subject: [PATCH 65/85] fix: remove unneeded commented-out code from setup_deps.sh Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/setup_deps.sh | 217 ------------------ 1 file changed, 217 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 3dbc1677f..1b5c6f45e 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -29,119 +29,6 @@ git_clone_retry() { return 1 } -# --------------------------------------------------------------------------- -# 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl) -# --------------------------------------------------------------------------- -install_ucx() { - if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then - echo "[SETUP] UCX already present at ${UCX_HOME}" - return 0 - fi - - echo "[SETUP] Installing UCX build dependencies..." - apt-get update -q -y && apt-get install -q -y \ - autoconf automake libtool pkg-config \ - librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ - infiniband-diags perftest ethtool rdma-core strace \ - && rm -rf /var/lib/apt/lists/* - - echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..." - ( - set -e - mkdir -p /usr/local/src && cd /usr/local/src - git_clone_retry https://github.com/ROCm/ucx.git ucx && cd ucx - git checkout da3fac2a - ./autogen.sh && mkdir -p build && cd build - ../configure \ - --prefix="${UCX_HOME}" \ - --enable-shared --disable-static \ - --disable-doxygen-doc --enable-optimizations \ - --enable-devel-headers --enable-mt \ - --with-rocm="${ROCM_PATH}" --with-verbs --with-dm - make -j"$(nproc)" && make install - ) - rm -rf /usr/local/src/ucx - - if [[ ! 
-x "${UCX_HOME}/bin/ucx_info" ]]; then - echo "[SETUP] ERROR: UCX build failed"; exit 1 - fi - _SETUP_INSTALLED+=("UCX") -} - -# --------------------------------------------------------------------------- -# 2. RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM) -# --------------------------------------------------------------------------- -install_rixl() { - if python3 -c "import rixl" 2>/dev/null; then - echo "[SETUP] RIXL Python bindings already present" - return 0 - fi - - echo "[SETUP] Installing RIXL build dependencies..." - apt-get update -q -y && apt-get install -q -y \ - libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \ - libcpprest-dev libaio-dev \ - && rm -rf /var/lib/apt/lists/* - pip3 install --quiet meson "pybind11[global]" - - echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." - ( - set -e - git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl - git checkout f33a5599 - meson setup build --prefix="${RIXL_HOME}" \ - -Ducx_path="${UCX_HOME}" \ - -Drocm_path="${ROCM_PATH}" - cd build && ninja && ninja install - cd /opt/rixl - pip install --quiet \ - --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \ - --config-settings=setup-args="-Ducx_path=${UCX_HOME}" . - ) - rm -rf /opt/rixl - - if ! python3 -c "import rixl" 2>/dev/null; then - echo "[SETUP] ERROR: RIXL build failed"; exit 1 - fi - _SETUP_INSTALLED+=("RIXL") -} - -# --------------------------------------------------------------------------- -# 3. etcd (distributed KV store for vLLM disagg service discovery) -# --------------------------------------------------------------------------- -install_etcd() { - if [[ -x /usr/local/bin/etcd/etcd ]]; then - echo "[SETUP] etcd already present" - return 0 - fi - - local version="v3.6.0-rc.5" - echo "[SETUP] Downloading etcd ${version}..." 
- wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \ - -O /tmp/etcd.tar.gz - mkdir -p /usr/local/bin/etcd - tar -xf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 - rm /tmp/etcd.tar.gz - _SETUP_INSTALLED+=("etcd") -} - -# --------------------------------------------------------------------------- -# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer) -# Harmless on non-Pensando nodes (shared lib is simply unused). -# --------------------------------------------------------------------------- -install_libionic() { - if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then - echo "[SETUP] libionic1 already installed" - return 0 - fi - - echo "[SETUP] Downloading and installing libionic1..." - wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \ - -O /tmp/libionic1.deb - dpkg -i /tmp/libionic1.deb || true - rm -f /tmp/libionic1.deb - _SETUP_INSTALLED+=("libionic1") -} # --------------------------------------------------------------------------- # 5. Container RDMA/net tools @@ -166,47 +53,6 @@ install_recipe_deps() { _SETUP_INSTALLED+=("ibverbs-utils+iproute2") } -# --------------------------------------------------------------------------- -# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) -# Required for --all2all-backend mori (Expert Parallelism via RDMA). -# GPU kernels are JIT-compiled on first use; no hipcc needed at install. -# -# v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI -# topology bug (TopoSystemPci::Load assertion failure on Broadcom -# PEX890xx switches). Always rebuild from our target commit b645fc8 -# which includes the dsp2dev subordinate-range fix. 
-# --------------------------------------------------------------------------- -install_mori() { - local MORI_TARGET_COMMIT="b645fc8" - local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}" - - if ls $MORI_MARKER &>/dev/null; then - echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" - return 0 - fi - - echo "[SETUP] Installing MoRI build dependencies..." - apt-get update -q -y && apt-get install -q -y \ - libopenmpi-dev openmpi-bin libpci-dev \ - && rm -rf /var/lib/apt/lists/* - - echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." - echo "[SETUP] (overriding image-provided version to fix PCI topology bug)" - ( - set -e - git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori - git checkout "$MORI_TARGET_COMMIT" - pip install --quiet --force-reinstall . - ) - rm -rf /opt/mori - - if ! python3 -c "import mori" 2>/dev/null; then - echo "[SETUP] ERROR: MoRI build failed"; exit 1 - fi - touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT} - _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") -} - # --------------------------------------------------------------------------- # 6b. amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar) # Required due to ROCm vLLM missing the quark dependency: @@ -228,63 +74,6 @@ install_amd_quark() { _SETUP_INSTALLED+=("amd-quark") } -# --------------------------------------------------------------------------- -# 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0) -# vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel -# uses defer_input_quant=True which MoRI's prepare/finalize rejects. -# Patch: remove both the AITER requirement assertion and the -# defer_input_quant NotImplementedError so non-AITER kernels work. 
-# --------------------------------------------------------------------------- -patch_mori_fp8_compat() { - python3 -c ' -import re, os, sys -patched = [] - -# Patch layer.py: remove AITER requirement assertion(s) for MoRI -try: - import vllm.model_executor.layers.fused_moe.layer as lm - f = lm.__file__ - src = open(f).read() - if "[PATCHED] AITER requirement removed for MoRI-EP + FP8" in src: - print("[SETUP] layer.py MoRI-FP8 patch already applied") - elif "Mori needs to be used with aiter" in src: - # v0.19+: two consecutive assertions inside `if self.moe_config.use_mori_kernels:` - new = re.sub( - r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)\s*" - r"assert not self\.aiter_fmoe_shared_expert_enabled,\s*\([^)]*\)", - "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", - src, flags=re.DOTALL) - if new == src: - # v0.17.1/v0.18.0: only the first assertion existed - new = re.sub( - r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", - "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", - src, flags=re.DOTALL) - if new != src: - open(f, "w").write(new) - patched.append("layer.py") - else: - print("[SETUP] ERROR: layer.py pattern found but regex had no effect", file=sys.stderr) - sys.exit(1) - else: - print("[SETUP] ERROR: layer.py AITER assertion pattern not found — vLLM API may have changed", file=sys.stderr) - sys.exit(1) -except Exception as e: - print(f"[SETUP] ERROR patch layer.py: {e}", file=sys.stderr) - sys.exit(1) - -# prepare_finalize/mori.py (v0.19+) already handles defer_input_quant correctly -# (skips FP8 quant when True). No patch needed for that file. 
-# Added in 0.18.1: https://github.com/vllm-project/vllm/commit/6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209 - -if patched: - print(f"[SETUP] Patched: {chr(44).join(patched)}") -else: - print("[SETUP] No MoRI-FP8 patches needed") -' || exit 1 - _SETUP_INSTALLED+=("MoRI-FP8-patch") -} - # --------------------------------------------------------------------------- # 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) # In WRITE mode, save_kv_layer spins forever waiting for the handshake @@ -839,14 +628,8 @@ except Exception as e: # Run installers # ============================================================================= -# install_ucx -# install_rixl -# install_etcd -# install_libionic -# install_mori install_recipe_deps install_amd_quark -# patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout patch_moriio_load_kv_timeout From 9237eac8d80e47d7198de8d03ac1fb4565d1995b Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 16:16:41 +0200 Subject: [PATCH 66/85] fix: bump to latest nightly vllm image on minimax Signed-off-by: simondanielsson --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index eb3a1de9e..fd82d05cb 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1405,7 +1405,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 + image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg From 4c1520d9a5d6e8593f5b0c64f534607d1cae7a51 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 16:35:36 +0200 Subject: [PATCH 67/85] fix: temporarily mount /coredumps Signed-off-by: simondanielsson --- 
benchmarks/multi_node/amd_utils/job.slurm | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 22b1ebcb3..9d19f3ddc 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -404,6 +404,10 @@ echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\" \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true +# Ensure host coredump dir exists and is world-writable so the GPU runtime +# can drop coredumps from inside the container (mounted at /coredumps below). +mkdir -p /tmp/coredumps && chmod 1777 /tmp/coredumps || true + # Start vLLM external router container on node 0 if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true @@ -462,6 +466,7 @@ fi -v \$HOME/.ssh:/root/.ssh \ --shm-size 128G \ -v /tmp:/run_logs \ + -v /tmp/coredumps:/coredumps \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ ${DOCKER_ENV_COMMON[*]} \ From c2e0377d2466276b3ee53d1c6bf1ecc350389d47 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 17:05:11 +0200 Subject: [PATCH 68/85] tmp: add bette r debugging capabilities Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/job.slurm | 1 + benchmarks/multi_node/amd_utils/models_vllm.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 9d19f3ddc..6b5115eed 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -407,6 +407,7 @@ echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\" # Ensure host coredump dir exists and is world-writable so the GPU runtime # 
can drop coredumps from inside the container (mounted at /coredumps below). mkdir -p /tmp/coredumps && chmod 1777 /tmp/coredumps || true +echo \"[coredump-prep] rank \$SLURM_PROCID on \$(hostname): /tmp/coredumps -> \$(ls -ld /tmp/coredumps 2>&1)\" # Start vLLM external router container on node 0 if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml index c68bb46e3..b2b87a03f 100644 --- a/benchmarks/multi_node/amd_utils/models_vllm.yaml +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -33,7 +33,7 @@ Kimi-K2.5-MXFP4: MiniMax-M2.5: prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600 HIP_LAUNCH_BLOCKING=1 AMD_SERIALIZE_KERNEL=3 AMD_SERIALIZE_COPY=3 AMD_LOG_LEVEL=3" hf_dir: "models--MiniMaxAI--MiniMax-M2.5" gpt-oss-120b: From b172350dbd7da65d26a36fe28ab8395797912e4b Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 18:42:18 +0200 Subject: [PATCH 69/85] fix: disable custom all-reduce for minimax Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/models_vllm.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml index b2b87a03f..a770d1ccd 100644 --- a/benchmarks/multi_node/amd_utils/models_vllm.yaml +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ 
-31,8 +31,8 @@ Kimi-K2.5-MXFP4: hf_dir: "models--amd--Kimi-K2.5-MXFP4" MiniMax-M2.5: - prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32 --disable-custom-all-reduce" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32 --disable-custom-all-reduce" env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600 HIP_LAUNCH_BLOCKING=1 AMD_SERIALIZE_KERNEL=3 AMD_SERIALIZE_COPY=3 AMD_LOG_LEVEL=3" hf_dir: "models--MiniMaxAI--MiniMax-M2.5" From 9eaf5485986513f6de985b88939a7e9a0ae74dd5 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 20:49:07 +0000 Subject: [PATCH 70/85] fix: minimax segfault by avoiding M=8K fmoe kernel shape Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/models_vllm.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml index a770d1ccd..8cc731c04 100644 --- a/benchmarks/multi_node/amd_utils/models_vllm.yaml +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -31,9 +31,11 @@ Kimi-K2.5-MXFP4: hf_dir: "models--amd--Kimi-K2.5-MXFP4" MiniMax-M2.5: - prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32 --disable-custom-all-reduce" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel 
--all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32 --disable-custom-all-reduce" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600 HIP_LAUNCH_BLOCKING=1 AMD_SERIALIZE_KERNEL=3 AMD_SERIALIZE_COPY=3 AMD_LOG_LEVEL=3" + # AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes OOB when run with MiniMax's shapes at M=8K(=num batched tokens), crashing vllm during AITER warmup. + # Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE. + prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--MiniMaxAI--MiniMax-M2.5" gpt-oss-120b: From 2bde2b6b987071cd56454caa64ea2051b1379acb Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 20:58:09 +0000 Subject: [PATCH 71/85] revert: fix: temporarily mount /coredumps Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/job.slurm | 6 ------ 1 file changed, 6 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 6b5115eed..22b1ebcb3 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -404,11 +404,6 @@ echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\" \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true -# Ensure host coredump dir exists and is world-writable so the GPU 
runtime -# can drop coredumps from inside the container (mounted at /coredumps below). -mkdir -p /tmp/coredumps && chmod 1777 /tmp/coredumps || true -echo \"[coredump-prep] rank \$SLURM_PROCID on \$(hostname): /tmp/coredumps -> \$(ls -ld /tmp/coredumps 2>&1)\" - # Start vLLM external router container on node 0 if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true @@ -467,7 +462,6 @@ fi -v \$HOME/.ssh:/root/.ssh \ --shm-size 128G \ -v /tmp:/run_logs \ - -v /tmp/coredumps:/coredumps \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ ${DOCKER_ENV_COMMON[*]} \ From e6d26d762db877fed8c7c31ad1f6d4deede2159e Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 20:59:59 +0000 Subject: [PATCH 72/85] feat: add VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 as in single node example Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/models_vllm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml index 8cc731c04..b051de8d9 100644 --- a/benchmarks/multi_node/amd_utils/models_vllm.yaml +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -35,7 +35,7 @@ MiniMax-M2.5: # Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE. 
prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1" hf_dir: "models--MiniMaxAI--MiniMax-M2.5" gpt-oss-120b: From 102e59fdb3bdb3876754d57199df83b09e64b3ff Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 26 May 2026 03:35:26 +0000 Subject: [PATCH 73/85] fix: use FRAMEWORK arg in collect_latest_results.py to match vllm-disagg log dirs Signed-off-by: Theresa Shan --- runners/launch_mi355x-amds.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index e05572a43..00fd994f3 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -124,16 +124,14 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "${EVAL_ONLY:-false}" != "true" ]]; then cat > collect_latest_results.py <<'PY' import os, sys -job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -prefixes = ["sglang", "vllm"] +job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5] logs_root = f"{job_dir}/logs/" candidates = [] if os.path.isdir(logs_root): for name in os.listdir(logs_root): - for pfx in prefixes: - subdir = f"{logs_root}{name}/{pfx}_isl_{isl}_osl_{osl}" - if os.path.isdir(subdir): - candidates.append(subdir) + subdir = f"{logs_root}{name}/{framework}_isl_{isl}_osl_{osl}" + if os.path.isdir(subdir): + 
candidates.append(subdir) for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY From c60e6af86ac971ccd9ba27612970ff4c3324faf3 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 26 May 2026 08:57:12 +0000 Subject: [PATCH 74/85] remove unused vllm_disagg_utils directory No external references to this folder exist in the codebase. Co-Authored-By: Claude Opus 4 --- .../multi_node/vllm_disagg_utils/bench.sh | 76 -- .../multi_node/vllm_disagg_utils/env.sh | 98 -- .../multi_node/vllm_disagg_utils/job.slurm | 358 ------- .../multi_node/vllm_disagg_utils/models.yaml | 42 - .../vllm_disagg_utils/moriio_proxy.py | 327 ------- .../vllm_disagg_utils/patches/minimax_m2.py | 672 ------------- .../multi_node/vllm_disagg_utils/server.sh | 490 ---------- .../vllm_disagg_utils/setup_deps.sh | 908 ------------------ .../vllm_disagg_utils/start_etcd.sh | 47 - .../multi_node/vllm_disagg_utils/submit.sh | 166 ---- .../multi_node/vllm_disagg_utils/sync.py | 201 ---- 11 files changed, 3385 deletions(-) delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/bench.sh delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/env.sh delete mode 100644 benchmarks/multi_node/vllm_disagg_utils/job.slurm delete mode 100644 benchmarks/multi_node/vllm_disagg_utils/models.yaml delete mode 100644 benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py delete mode 100644 benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/server.sh delete mode 100644 benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/submit.sh delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/sync.py diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh deleted file mode 100755 index 274c5954e..000000000 --- 
a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# vLLM Disaggregated Benchmark Runner -# -# Produces JSON result files via benchmark_serving.py (same as SGLang bench.sh) -# so that the CI pipeline can collect and process results. -# -# Usage: bash bench.sh \ -# \ -# - -n_prefill=$1 -n_decode=$2 -prefill_gpus=$3 -decode_gpus=$4 -model_path=$5 -model_name=$6 -MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" -log_path=$7 - -chosen_isl=${8:-1024} -chosen_osl=${9:-1024} -concurrency_list=${10:-"512x1"} -chosen_req_rate=${11:-inf} -random_range_ratio=${12:-0.8} -num_prompts_multiplier=${13:-10} - -IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" - -ROUTER_PORT="${ROUTER_PORT:-30000}" - -echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" - -profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}" -mkdir -p "$profile_folder" - -source "$(dirname "$0")/../../benchmark_lib.sh" - -REPO_ROOT="$(cd "$(dirname "$0")/../../.." 
&& pwd)" - -for max_concurrency in "${chosen_concurrencies[@]}"; do - - export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" - - num_prompts=$(( max_concurrency * num_prompts_multiplier )) - if [[ "$num_prompts" -lt 16 ]]; then - num_prompts=16 - fi - - echo "profile_folder: $profile_folder" - echo "max_concurrency: $max_concurrency" - echo "chosen_req_rate: $chosen_req_rate" - echo "MODEL_PATH: $MODEL_PATH" - echo "ROUTER_PORT: $ROUTER_PORT" - echo "chosen_isl: $chosen_isl" - echo "chosen_osl: $chosen_osl" - echo "num_prompts: $num_prompts" - echo "export_file: $export_file" - - run_benchmark_serving \ - --bench-serving-dir "$REPO_ROOT" \ - --model "$MODEL_PATH" \ - --port "$ROUTER_PORT" \ - --backend openai \ - --input-len "$chosen_isl" \ - --output-len "$chosen_osl" \ - --random-range-ratio "$random_range_ratio" \ - --num-prompts "$num_prompts" \ - --max-concurrency "$max_concurrency" \ - --result-filename "$export_file" \ - --result-dir /workspace/ \ - --trust-remote-code - - echo "-----------------------------------------" - echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." - sleep 10 -done diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh deleted file mode 100755 index e1cc2f6af..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash -# vLLM/Nixl environment setup for multi-node disaggregated serving. -# -# REQUIRED ENVIRONMENT VARIABLES: -# IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) -# Set by runner or auto-detected from hostname. -# -# UCX and RIXL paths (LD_LIBRARY_PATH, PATH) are set by setup_deps.sh, which is -# sourced at the top of server.sh before this file. 
- -set -x - -# IBDEVICES configuration -# Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) -# Fall back to hostname detection if not set (for direct script execution) -if [[ -z "$IBDEVICES" ]]; then - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 - elif [[ $NODENAME == mia1* ]]; then - export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 - else - DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') - if [[ -n "$DETECTED" ]]; then - export IBDEVICES="$DETECTED" - else - echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2 - fi - fi - echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" -else - echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" -fi - -if [[ -z "$UCX_NET_DEVICES" ]]; then - # Use the first benic interface for UCX TCP transport (maps to ionic RDMA NIC). - # We use TCP device names (benicXp1) instead of IB device names (ionic_X:1) - # because ud_verbs/ionic crashes in ucp_request_memory_dereg (UCX bug with ionic provider). - UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) - if [[ -n "$UCX_NET_DEV" ]]; then - export UCX_NET_DEVICES="$UCX_NET_DEV" - else - FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) - if [[ -n "$FIRST_IB" ]]; then - export UCX_NET_DEVICES="${FIRST_IB}:1" - fi - fi - echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" -else - echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)" -fi - -export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} - -# RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing -export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} - -# QoS/DSCP configuration for lossless RoCEv2 fabric. 
-# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname -if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then - echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" -elif command -v nicctl &> /dev/null; then - ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') - ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' -$1 == "DSCP" && $2 == ":" && $NF == p { - print $3; exit -}') - if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then - export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) - export UCX_IB_SL=$ND_PRIO - echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" - else - echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export UCX_IB_TRAFFIC_CLASS=96 - echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export UCX_IB_TRAFFIC_CLASS=104 - echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" - fi - fi -else - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export UCX_IB_TRAFFIC_CLASS=96 - echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export UCX_IB_TRAFFIC_CLASS=104 - echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" - else - echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." 
- fi -fi - -set +x -echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm deleted file mode 100644 index e1cad0817..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ /dev/null @@ -1,358 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=vllm-pd-bench -#SBATCH -N 3 # Overridden by submit.sh -N flag -#SBATCH -n 3 # Overridden by submit.sh -n flag -#SBATCH --ntasks-per-node=1 -#SBATCH --spread-job -#SBATCH --gres=gpu:8 -#SBATCH --time=24:00:00 -# --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR - -echo "=== Job Start Time ===" -echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" -echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" -echo "=======================" -echo "" - -# ============================================================================= -# Model Validation -# ============================================================================= - -# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ -# at runtime, but the CWD remains the submit-time directory (vllm_disagg_utils/). -MODELS_YAML="$(pwd)/models.yaml" - -if [[ ! -f "$MODELS_YAML" ]]; then - echo "Error: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then - echo "Error: DOCKER_IMAGE_NAME is not set." - exit 1 -fi - -MODEL_NAME="${MODEL_NAME:-None}" -if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" - echo "Available models:" - grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' - exit 1 -fi -echo "Model found: $MODEL_NAME" - -RUN_FILE="server.sh" -echo "Runfile set: $RUN_FILE" - -# DI_REPO_DIR points to the repo root. 
-# $(pwd) is vllm_disagg_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. -export DI_REPO_DIR=$(cd "$(pwd)/../../.." && pwd) - -xP="${xP:-1}" -yD="${yD:-1}" - -# Benchmark configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" - -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - -# ============================================================================= -# Docker privilege detection -# ============================================================================= -# Detect on the batch host (used for post-srun cleanup). -# Per-node detection happens inside the srun inline script below because -# some nodes may require sudo while others do not. -if docker ps &>/dev/null; then - DOCKER_CMD="docker" -else - DOCKER_CMD="sudo docker" -fi -export DOCKER_CMD - -# ============================================================================= -# Model Path Resolution -# ============================================================================= - -# MODEL_DIR detection: prefer env var, fall back to hostname detection -if [[ -z "$MODEL_DIR" ]]; then - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - MODEL_DIR="/nfsdata" - elif [[ $NODENAME == mia1* ]]; then - MODEL_DIR="/it-share/data" - else - MODEL_DIR="/nfsdata" - fi - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" -fi -export MODEL_DIR - -# Extract hf_dir from models.yaml (the line after the model's top-level key) -DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} - found && /^[^ ]/{exit} - found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") -DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" -echo "Looking for model: 
$MODEL_NAME (disk dir: $DISK_DIR_NAME)" - -resolve_hf_cache_path() { - local base_path=$1 - if [[ -d "${base_path}/snapshots" ]]; then - local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) - if [[ -n "$snapshot" ]]; then - echo "${base_path}/snapshots/${snapshot}" - return 0 - fi - fi - echo "$base_path" - return 1 -} - -MODEL_PATH="" -SEARCH_PATHS=( - "${MODEL_DIR}/${DISK_DIR_NAME}" - "${MODEL_DIR}/${MODEL_NAME}" - "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" - "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" -) - -for search_path in "${SEARCH_PATHS[@]}"; do - if [[ -d "$search_path" ]]; then - RESOLVED=$(resolve_hf_cache_path "$search_path") - MODEL_PATH="$RESOLVED" - echo "Found MODEL_PATH: $MODEL_PATH" - break - fi -done - -if [[ -z "$MODEL_PATH" ]]; then - echo "FATAL: Model '$MODEL_NAME' not found. Searched:" - for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done - exit 1 -fi -echo "Final MODEL_PATH: $MODEL_PATH" - -# ============================================================================= -# Node Selection and vLLM-Specific NUM_NODES -# ============================================================================= - -# Router co-located with first prefill: xP + yD nodes total (same as SGLang) -NUM_NODES=$((xP + yD)) -echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD, proxy co-located with first prefill)" - -FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) -SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') - -# Update SLURM environment variables -export SLURM_NNODES=$NUM_NODES -export SLURM_NTASKS=$NUM_NODES -export SLURM_JOB_NUM_NODES=$NUM_NODES -export SLURM_NPROCS=$NUM_NODES -export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" -export SLURM_NODELIST="$SELECTED_NODELIST_STR" -export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" -export SLURM_NTASKS_PER_NODE=1 - -echo "" -echo "Selected nodes: $SELECTED_NODELIST_STR" - -# 
============================================================================= -# IP Resolution -# ============================================================================= - -USER_NAME=$(whoami) -MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) -NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') -NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') - -IPS=() -for NODE in $SELECTED_NODES; do - IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') - IP=$(echo "$IP" | awk '/src/ {print $7}') - IPS+=("$IP") -done - -echo "Node IPs: ${IPS[*]}" - -DOCKER_MOUNT_PATH="/workspace" -VLLM_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/vllm_disagg_utils" - -NNODES=$NUM_NODES - -echo "MASTER_NODE: ${MASTER_NODE}" -echo "NODE0_ADDR: ${NODE0_ADDR}" -echo "NNODES: ${NNODES}" -echo "REPO DIR: ${DI_REPO_DIR}" -echo "USER: ${USER_NAME}" - -# Reduce log spam -export TQDM_MININTERVAL=20 - -# Translate the host-resolved MODEL_PATH to the Docker mount namespace -DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" - -export DI_REPO_DIR=$DI_REPO_DIR -export VLLM_WS_PATH=$VLLM_WS_PATH -export NNODES=$NNODES -export NODE0_ADDR=$NODE0_ADDR -export MODEL_PATH=$MODEL_PATH -export MODEL_DIR=$MODEL_DIR -export xP=$xP -export yD=$yD -export MODEL_NAME=$MODEL_NAME -export USER_NAME=$USER_NAME -export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" -export GPUS_PER_NODE=$GPUS_PER_NODE -export BENCH_INPUT_LEN=$BENCH_INPUT_LEN -export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN -export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO -export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER -export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY -export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE -export DRY_RUN="${DRY_RUN:-0}" -export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" - -# TP / EP / DP (from vllm_disagg_utils/submit.sh; mirrors amd_utils disagg) -export 
PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" -export PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" -export DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" -export DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" -export PREFILL_TP="${PREFILL_TP:-8}" -export DECODE_TP="${DECODE_TP:-8}" - -SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export DOCKER_CONT_NAME="container_vllm_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -export RUN_FILE_FULL="$VLLM_WS_PATH/${RUN_FILE}" - -SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) - -cleanup() { - echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." - rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true - echo "[${SLURM_JOB_ID}] cleanup done." -} - -trap cleanup INT TERM HUP - -# Force NFS cache refresh on all nodes -echo "Refreshing NFS caches on all nodes..." -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' - sync - ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils > /dev/null 2>&1 - stat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 - cat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true - echo "NFS cache refreshed on $(hostname)" -' - -srun \ - --nodelist="$SELECTED_NODELIST_SRUN" \ - --kill-on-bad-exit=1 \ - --signal=TERM@30 \ - --unbuffered \ - bash -lc " -set -euo pipefail - -echo \"Rank \$SLURM_PROCID on \$(hostname)\" - -# Per-node Docker privilege detection (some nodes need sudo, others don't) -if docker ps &>/dev/null; then - _DCMD=docker -else - _DCMD='sudo docker' -fi - -# Pre-clean (idempotent) -\$_DCMD ps -aq --filter \"name=^container_vllm_\" | xargs -r \$_DCMD rm -f || true -\$_DCMD ps -aq | xargs -r \$_DCMD stop || true - -exec \$_DCMD run --rm \ - --init \ - --stop-timeout 10 \ - --device /dev/dri \ - --device /dev/kfd \ - --device /dev/infiniband \ - --device=/dev/infiniband/rdma_cm \ - 
--device=/dev/infiniband/uverbs0 \ - --device=/dev/infiniband/uverbs1 \ - --device=/dev/infiniband/uverbs2 \ - --device=/dev/infiniband/uverbs3 \ - --device=/dev/infiniband/uverbs4 \ - --device=/dev/infiniband/uverbs5 \ - --device=/dev/infiniband/uverbs6 \ - --device=/dev/infiniband/uverbs7 \ - --ulimit memlock=-1 \ - --ulimit stack=67108864 \ - --network host \ - --ipc host \ - --group-add video \ - --cap-add SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --privileged \ - -v /sys:/sys \ - $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ - -v ${MODEL_DIR}:/models \ - -v \$HOME/.ssh:/root/.ssh \ - --shm-size 128G \ - -v /tmp:/run_logs \ - -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ - -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ - -e SLURM_JOB_ID=\$SLURM_JOB_ID \ - -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ - -e NNODES=\$NNODES \ - -e NODE_RANK=\$SLURM_PROCID \ - -e NODE0_ADDR=\$NODE0_ADDR \ - -e MODEL_DIR=/models \ - -e MODEL_NAME=\$MODEL_NAME \ - -e MODEL_PATH=$DOCKER_MODEL_PATH \ - -e VLLM_WS_PATH=${VLLM_WS_PATH} \ - -e GPUS_PER_NODE=\$GPUS_PER_NODE \ - -e xP=\$xP \ - -e yD=\$yD \ - -e IPADDRS=\$IPADDRS \ - -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ - -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ - -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ - -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ - -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ - -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE \ - -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ - -e DRY_RUN=\$DRY_RUN \ - -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma \ - -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ - -e UCX_MEMTYPE_CACHE=y \ - -e UCX_RNDV_SCHEME=get_zcopy \ - -e UCX_RNDV_THRESH=4k \ - -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ - -e UCX_LOG_LEVEL=warn \ - -e HSA_ENABLE_SDMA=1 \ - -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ - -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ - -e 
PYTHONPYCACHEPREFIX=/tmp/pycache \ - -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ - -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ - -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ - -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ - -e PREFILL_TP=\$PREFILL_TP \ - -e DECODE_TP=\$DECODE_TP \ - --name \"$DOCKER_CONT_NAME\" \ - --entrypoint \"\" \ - \"$DOCKER_IMAGE_NAME\" bash -lc ' - mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' - '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log - ' - -DOCKER_EXIT_CODE=\$? -if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then - echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\" - exit \$DOCKER_EXIT_CODE -fi -" - -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'if docker ps &>/dev/null; then D=docker; else D="sudo docker"; fi; $D rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml deleted file mode 100644 index c68bb46e3..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Model-specific vLLM server configurations for disaggregated inference. -# -# Each top-level key is a MODEL_NAME value (must match the model identifier -# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR). -# -# To add a new model: add a new top-level entry following the same schema. -# No script changes are required. -# -# Schema: -# : -# prefill_flags: str # vLLM CLI flags for prefill workers -# decode_flags: str # vLLM CLI flags for decode workers -# env: str # Space-separated KEY=VALUE pairs exported before vllm serve -# hf_dir: str # (optional) On-disk directory name if it differs from the key -# # e.g. 
HF cache layout: models--amd--Kimi-K2.5-MXFP4 - -Llama-3.1-405B-Instruct-FP8-KV: - prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" - decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" - env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" - -amd-Llama-3.3-70B-Instruct-FP8-KV: - prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" - decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" - env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" - -Kimi-K2.5-MXFP4: - prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" - hf_dir: "models--amd--Kimi-K2.5-MXFP4" - -MiniMax-M2.5: - prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching 
--gpu-memory-utilization 0.95 --block-size 32" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" - hf_dir: "models--MiniMaxAI--MiniMax-M2.5" - -gpt-oss-120b: - prefill_flags: "--tensor-parallel-size 8" - decode_flags: "--tensor-parallel-size 8" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py deleted file mode 100644 index 7d1e8454b..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env python3 -# MoRI-IO proxy server for vLLM PD disaggregation. -# -# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py -# with the following adaptations for production multi-node use: -# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars -# - /health endpoint for sync.py barrier readiness checks -# - Uses stdlib `re` instead of `regex` to avoid extra dep -# -# The proxy performs two roles that vllm-router cannot: -# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports -# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params - -import asyncio -import copy -import logging -import os -import re -import socket -import threading -import time -import uuid - -import aiohttp -import msgpack -import zmq -from quart import Quart, make_response, request - -logger = logging.getLogger("moriio_proxy") -logger.setLevel(logging.DEBUG) -handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter( - "%(asctime)s %(levelname)s [%(name)s] %(message)s")) -logger.addHandler(handler) - -prefill_instances: list[dict] = [] -decode_instances: list[dict] = [] -request_nums = 0 -app = Quart(__name__) - -STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) - -IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") - -TRANSFER_TYPE = None - - -def _append_whole_dict_unique(target_list, data_dict): - new_filtered = {k: v for k, v in data_dict.items() if k != "index"} - for existed in target_list: - existed_filtered = {k: v for k, v in existed.items() if k != "index"} - if existed_filtered == new_filtered: - return False - logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", - data_dict.get("role"), data_dict.get("request_address"), - data_dict.get("handshake_port"), data_dict.get("notify_port"), - data_dict.get("dp_size"), data_dict.get("tp_size")) - target_list.append(data_dict) - transfer_mode = data_dict.get("transfer_mode", "unknown") - global TRANSFER_TYPE - - if TRANSFER_TYPE is None: - TRANSFER_TYPE = transfer_mode - logger.info("Transfer mode set to: %s", TRANSFER_TYPE) - elif transfer_mode != TRANSFER_TYPE: - raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") - - return True - - -_list_lock = threading.RLock() - - -def _listen_for_register(hostname, port): - context = zmq.Context() - router_socket = context.socket(zmq.ROUTER) - router_socket.bind(f"tcp://{hostname}:{port}") - poller = zmq.Poller() - 
poller.register(router_socket, zmq.POLLIN) - global prefill_instances - global decode_instances - - while True: - socks = dict(poller.poll()) - if router_socket in socks: - remote_addr, msg = router_socket.recv_multipart() - data = msgpack.loads(msg) - if data["type"] == "HELLO": - pass - elif ( - data["type"] == "register" - and data["role"] == "P" - and data["request_address"] not in prefill_instances - ): - with _list_lock: - _append_whole_dict_unique(prefill_instances, data) - - elif ( - data["type"] == "register" - and data["role"] == "D" - and data["request_address"] not in decode_instances - ): - with _list_lock: - _append_whole_dict_unique(decode_instances, data) - - -def start_service_discovery(hostname, port): - if not hostname: - hostname = socket.gethostname() - if port == 0: - raise ValueError("Port cannot be 0") - - _listener_thread = threading.Thread( - target=_listen_for_register, args=(hostname, port), daemon=True - ) - _listener_thread.start() - logger.info("Service discovery listening on %s:%s", hostname, port) - return _listener_thread - - -async def send_request_to_prefill( - endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank -): - req_data_copy = req_data - - req_data_copy["kv_transfer_params"].update( - { - "do_remote_decode": True, - "do_remote_prefill": False, - "remote_handshake_port": d_endpoint["handshake_port"], - "remote_notify_port": d_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": dip, - "remote_port": dport, - } - ) - req_data_copy["stream"] = False - req_data_copy["max_tokens"] = 1 - if "max_completion_tokens" in req_data_copy: - req_data_copy["max_completion_tokens"] = 1 - if "stream_options" in req_data_copy: - del req_data_copy["stream_options"] - async with aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) as session: - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": 
request_id, - } - if selected_prefill_dp_rank is not None: - headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) - async with session.post( - url=endpoint, json=req_data_copy, headers=headers - ) as response: - if response.status == 200: - return await response.json() - else: - raise RuntimeError( - f"Prefill response status={response.status}" - ) - - -async def start_decode_request(endpoint, req_data, request_id): - session = aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id, - } - response = await session.post(url=endpoint, json=req_data, headers=headers) - return session, response - - -async def stream_decode_response(session, response, request_id): - try: - if response.status == 200: - chunk_iter = response.content.iter_chunked(1024).__aiter__() - while True: - try: - chunk_bytes = await asyncio.wait_for( - chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, - ) - yield chunk_bytes - except StopAsyncIteration: - break - except asyncio.TimeoutError: - logger.error( - "Decode stream %s idle for %ds, aborting", - request_id, STREAM_IDLE_TIMEOUT, - ) - break - else: - raise RuntimeError( - f"Decode response status={response.status}" - ) - finally: - await response.release() - await session.close() - - -@app.route("/health", methods=["GET"]) -async def health_check(): - with _list_lock: - p_count = len(prefill_instances) - d_count = len(decode_instances) - return await make_response( - ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) - ) - - -@app.route("/v1/completions", methods=["POST"]) -@app.route("/v1/chat/completions", methods=["POST"]) -async def handle_request(): - try: - with _list_lock: - global request_nums - request_nums += 1 - - def extract_ip_port_fast(url): - match = IP_PORT_PATTERN.search(url) - if not match: - raise ValueError(f"Invalid URL format: {url}") - return 
match.groups() - - req_data = await request.get_json() - request_id = str(uuid.uuid4()) - - if not prefill_instances or not decode_instances: - return await make_response( - ("Service Unavailable: No prefill or decode instances registered.", 503) - ) - - pid = request_nums % len(prefill_instances) - did = request_nums % len(decode_instances) - prefill_instance_endpoint = prefill_instances[pid] - decode_instance_endpoint = decode_instances[did] - - selected_prefill_dp_rank = None - if prefill_instance_endpoint["dp_size"] > 1: - selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] - - dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) - - req_data_to_prefill = copy.deepcopy(req_data) - req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} - req_data["kv_transfer_params"] = {"transfer_id": request_id} - req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( - decode_instance_endpoint["dp_size"] - ) - req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( - decode_instance_endpoint["tp_size"] - ) - - send_prefill_task = asyncio.create_task( - send_request_to_prefill( - prefill_instance_endpoint["request_address"], - req_data_to_prefill, - request_id, - decode_instance_endpoint, - dip, - dport, - selected_prefill_dp_rank, - ) - ) - ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) - - req_data["max_tokens"] -= 1 - - req_data["kv_transfer_params"] = { - "transfer_id": request_id, - "do_remote_decode": False, - "do_remote_prefill": True, - "remote_handshake_port": prefill_instance_endpoint["handshake_port"], - "remote_notify_port": prefill_instance_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": ip, - "remote_port": port, - } - if TRANSFER_TYPE == "READ": - prefill_response = await send_prefill_task - req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ - "kv_transfer_params" - 
]["remote_engine_id"] - req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ - "kv_transfer_params" - ]["remote_block_ids"] - - req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ - "dp_size" - ] - req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ - "tp_size" - ] - - if selected_prefill_dp_rank is not None: - req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank - - decode_request_task = asyncio.create_task( - start_decode_request( - decode_instance_endpoint["request_address"], req_data, request_id - ) - ) - - session, decode_response = await decode_request_task - stream_generator = stream_decode_response(session, decode_response, request_id) - response = await make_response(stream_generator) - return response - except Exception as e: - logger.exception("Error handling request: %s", e) - return await make_response((f"Internal Server Error: {e!s}", 500)) - - -if __name__ == "__main__": - http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) - ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) - - t = start_service_discovery("0.0.0.0", ping_port) - app.debug = False - app.config["BODY_TIMEOUT"] = 360000 - app.config["RESPONSE_TIMEOUT"] = 360000 - - logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) - app.run(host="0.0.0.0", port=http_port) - t.join() diff --git a/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py b/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py deleted file mode 100644 index 8290276fb..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py +++ /dev/null @@ -1,672 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copyright 2025 The MiniMax AI team. -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
-# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only MiniMaxM2/M2.5 model.""" - -from collections.abc import Iterable -from typing import Any - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm._aiter_ops import rocm_aiter_ops -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config -from vllm.distributed import ( - get_ep_group, - get_pp_group, - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_gather, -) -from vllm.logger import init_logger -from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - QKVParallelLinear, - RowParallelLinear, -) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP -from vllm.model_executor.layers.quantization import QuantizationConfig -from 
vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, - VocabParallelEmbedding, -) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) -from vllm.model_executor.models.utils import sequence_parallel_chunk -from vllm.sequence import IntermediateTensors - -from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP -from .utils import ( - AutoWeightsLoader, - PPMissingLayer, - is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, - make_layers, - maybe_prefix, -) - -logger = init_logger(__name__) - - -class MiniMaxM2MoE(nn.Module): - """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. - - Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with - expert parallelism, EPLB, and sequence parallel awareness. - """ - - def __init__( - self, - config: PretrainedConfig, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ): - super().__init__() - vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config - - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - - self.ep_group = get_ep_group().device_group - self.ep_rank = get_ep_group().rank_in_group - self.ep_size = self.ep_group.size() - - self.n_routed_experts: int = config.num_local_experts - self.n_shared_experts: int = 0 - - self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe - self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) - self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - - eplb_config = parallel_config.eplb_config - self.enable_eplb = parallel_config.enable_eplb - self.n_redundant_experts = eplb_config.num_redundant_experts - self.n_logical_experts = self.n_routed_experts - self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts - 
self.n_local_physical_experts = self.n_physical_experts // self.ep_size - - self.use_routing_bias = getattr(config, "use_routing_bias", False) - if self.use_routing_bias: - self.e_score_correction_bias = nn.Parameter( - torch.empty(config.num_local_experts, dtype=torch.float32) - ) - self.e_score_correction_bias.weight_loader = ( - MiniMaxM2MoE.ebias_weight_loader - ) - else: - self.e_score_correction_bias = None - - self.gate = GateLinear( - config.hidden_size, - config.num_local_experts, - out_dtype=torch.float32, - prefix=f"{prefix}.gate", - ) - - self.experts = FusedMoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - reduce_results=False, - renormalize=True, - scoring_func=getattr(config, "scoring_func", "softmax"), - e_score_correction_bias=self.e_score_correction_bias, - quant_config=quant_config, - prefix=f"{prefix}.experts", - enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts, - is_sequence_parallel=self.is_sequence_parallel, - router_logits_dtype=torch.float32, - gate=self.gate, - routed_scaling_factor=1.0 - if not self.is_rocm_aiter_moe_enabled - else self.routed_scaling_factor, - ) - - @staticmethod - def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: - assert param.size() == loaded_weight.size() - param.data.copy_(loaded_weight.to(torch.float32)) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - - if self.is_sequence_parallel: - hidden_states = sequence_parallel_chunk(hidden_states) - - if self.experts.is_internal_router: - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=hidden_states - ) - else: - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=hidden_states, 
router_logits=router_logits - ) - - if hidden_states.dtype != torch.float16: - if not self.is_rocm_aiter_moe_enabled: - final_hidden_states = final_hidden_states * self.routed_scaling_factor - - if self.is_sequence_parallel: - final_hidden_states = tensor_model_parallel_all_gather( - final_hidden_states, 0 - ) - final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) - - return final_hidden_states.view(num_tokens, hidden_dim) - - -class MiniMaxM2Attention(nn.Module): - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rotary_dim: int, - rope_parameters: dict[str, Any] | None = None, - attn_window_size: int | None = None, - max_position_embeddings: int = 8192, - head_dim: int | None = None, - rms_norm_eps: float = 1e-06, - qkv_bias: bool = False, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = head_dim or (hidden_size // self.total_num_heads) - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - - if ( - rope_parameters is not None - and "partial_rotary_factor" not in rope_parameters - ): - rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim - self.rotary_emb = get_rope( - self.head_dim, - max_position=max_position_embeddings, - rope_parameters=rope_parameters, - ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - per_layer_sliding_window=attn_window_size, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - ) - - self.q_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_heads, eps=rms_norm_eps - ) - self.k_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = MiniMaxText01RMSNormTP.forward_qk( - self.q_norm, self.k_norm, q.contiguous(), k.contiguous() - ) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output - - -class MiniMaxM2DecoderLayer(nn.Module): - def 
__init__( - self, - config: PretrainedConfig, - prefix: str, - model_config: ModelConfig, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): - max_position_embeddings = max( - config.max_position_embeddings, config.max_model_len - ) - # DecoderLayers are created with `make_layers` which passes the prefix - # with the layer's index. - layer_idx = int(prefix.split(sep=".")[-1]) - - self.layer_idx = layer_idx - self.self_attn = MiniMaxM2Attention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rotary_dim=config.rotary_dim, - rope_parameters=config.rope_parameters, - max_position_embeddings=max_position_embeddings, - rms_norm_eps=config.rms_norm_eps, - qkv_bias=getattr(config, "attention_bias", False), - head_dim=getattr(config, "head_dim", None), - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - - self.block_sparse_moe = MiniMaxM2MoE( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor | None, - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - - # Fully Connected - hidden_states, residual = 
self.post_attention_layernorm(hidden_states, residual) - - hidden_states = self.block_sparse_moe(hidden_states) - - return hidden_states, residual - - -@support_torch_compile -class MiniMaxM2Model(nn.Module): - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - self.config = config - - self.vocab_size = config.vocab_size - - if get_pp_group().is_first_rank: - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - quant_config=None, - prefix=f"{prefix}.embed_tokens", - ) - else: - self.embed_tokens = PPMissingLayer() - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: MiniMaxM2DecoderLayer( - config, - prefix, - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, - ), - prefix=f"{prefix}.layers", - ) - - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size - ) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None, - inputs_embeds: torch.Tensor | None = None, - ) -> torch.Tensor | IntermediateTensors: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.embed_input_ids(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = 
intermediate_tensors["residual"] - - for layer in self.layers[self.start_layer : self.end_layer]: - hidden_states, residual = layer(positions, hidden_states, residual) - - if not get_pp_group().is_last_rank: - return IntermediateTensors( - {"hidden_states": hidden_states, "residual": residual} - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return FusedMoE.make_expert_params_mapping( - self, - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts, - num_redundant_experts=0, - ) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = self.get_expert_mapping() - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) - if spec_layer is not None: - continue # skip spec decode layers for main model - - for param_name, weight_name, shard_id in stacked_params_mapping: - # Skip non-stacked layers and experts (experts handled below). - if weight_name not in name: - continue - # We have mlp.experts[0].gate_proj in the checkpoint. - # Since we handle the experts below in expert_params_mapping, - # we need to skip here BEFORE we update the name, otherwise - # name will be updated to mlp.experts[0].gate_up_proj, which - # will then be updated below in expert_params_mapping - # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if ("mlp.experts." 
in name) and name not in params_dict: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( - param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id, - ) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class MiniMaxM2MixtureOfExperts(MixtureOfExperts): - """EPLB protocol implementation for MiniMax M2/M2.5.""" - - moe_mlp_layers: list[MiniMaxM2MoE] - - def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): - if example_moe is None: - self.num_moe_layers = 0 - self.num_expert_groups = 0 - self.num_logical_experts = 0 - self.num_physical_experts = 0 - self.num_local_physical_experts = 0 - self.num_routed_experts = 0 - self.num_shared_experts = 0 - self.num_redundant_experts = 0 - logger.warning("MiniMax M2: No MoE layer found in model.layers.") - else: - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = 
example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for moe in self.moe_mlp_layers: - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() - - -class MiniMaxM2ForCausalLM( - nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts -): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - if hasattr(vllm_config.model_config, "max_model_len"): - self.config.max_model_len = vllm_config.model_config.max_model_len - self.model = MiniMaxM2Model( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") - ) - if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead( - config.vocab_size, config.hidden_size, quant_config=None - ) - else: - self.lm_head = PPMissingLayer() - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) - - self.num_moe_layers = config.num_hidden_layers - self._set_moe_parameters() - - def _set_moe_parameters(self): - 
self.expert_weights: list = [] - self.num_expert_groups = 1 - self.moe_layers: list = [] - self.moe_mlp_layers: list[MiniMaxM2MoE] = [] - example_moe = None - for layer in self.model.layers: - if isinstance(layer, PPMissingLayer): - continue - assert isinstance(layer, MiniMaxM2DecoderLayer) - if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): - example_moe = layer.block_sparse_moe - self.moe_mlp_layers.append(layer.block_sparse_moe) - self.moe_layers.append(layer.block_sparse_moe.experts) - self.extract_moe_parameters(example_moe) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.embed_input_ids(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: torch.Tensor | None = None, - **kwargs, - ) -> torch.Tensor | IntermediateTensors: - hidden_states = self.model( - input_ids, positions, intermediate_tensors, inputs_embeds - ) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - ) -> torch.Tensor | None: - logits = self.logits_processor(self.lm_head, hidden_states) - return logits - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return self.model.get_expert_mapping() - - -def get_spec_layer_idx_from_weight_name( - config: PretrainedConfig, weight_name: str -) -> int | None: - if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): - layer_idx = config.num_hidden_layers - for i in range(config.num_mtp_modules): - if weight_name.startswith(f"model.layers.{layer_idx + i}."): - return layer_idx + i - return None diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh deleted file mode 100755 index 9b0ff2ebb..000000000 --- 
a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ /dev/null @@ -1,490 +0,0 @@ -#!/bin/bash -# vLLM Disaggregated Server Launcher with Model-Specific Configurations -# ============================================================================= -# -# Node role assignment (by NODE_RANK): -# 0 -> Proxy/Router + first Prefill node (kv_producer) -# 1..xP-1 -> Additional Prefill nodes (kv_producer) -# xP..xP+yD-1 -> Decode nodes (kv_consumer) -# -# Total nodes = xP + yD (router co-located with first prefill, like SGLang). - -# ============================================================================= -# Dependency Setup (idempotent; required when using base vLLM image) -# ============================================================================= -source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh" - -# ============================================================================= -# Environment Configuration -# ============================================================================= - -NODE0_ADDR="${NODE0_ADDR:-localhost}" -NODE_RANK="${NODE_RANK:-0}" -MODEL_DIR="${MODEL_DIR:-}" -MODEL_NAME="${MODEL_NAME:-}" - -xP="${xP:-1}" -yD="${yD:-1}" - -IPADDRS="${IPADDRS:-localhost}" - -# Benchmark Configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" - -DRY_RUN="${DRY_RUN:-0}" -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - -ROUTER_PORT="${ROUTER_PORT:-30000}" -SERVER_PORT="${SERVER_PORT:-2584}" -ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}" - -# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution) -MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}" - -# ============================================================================= -# Dependencies and Environment Setup -# 
============================================================================= -source $VLLM_WS_PATH/env.sh - -host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}') -# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available) -rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' | head -1) -rdma_ip="${rdma_ip:-$host_ip}" -host_name=$(hostname) - -echo "[INFO] Management IP (barriers/proxy): $host_ip" -echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip" - -# ============================================================================= -# RDMA / Nixl Workarounds -# ============================================================================= - -setup_rdma_env() { - # Pensando ionic (RoCEv2) point-to-point /31 route fix. - # Each benic interface has a /31 to the TOR switch. Without explicit routes, - # traffic to other nodes' RDMA IPs falls through to the management network. - if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then - local rdma_subnet="${BASH_REMATCH[1]}" - local rdma_host="${BASH_REMATCH[2]}" - local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))" - local rdma_iface - rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1) - if [[ -n "$rdma_iface" ]]; then - ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \ - echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \ - echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24" - fi - fi - - # Patch Nixl UCX backend: set ucx_error_handling_mode=none. - # Required for ALL NIC types under high concurrency (C512+). Without this, - # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error - # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from - # recovering gracefully. This causes the prefill KV cache to fill to 100% - # and deadlock the pipeline. 
On ionic NICs this was already applied (rdmacm - # incompatibility); on mlx5 NICs it was incorrectly skipped. - local nixl_api - nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) - if [[ -n "$nixl_api" ]]; then - if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" - echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})" - else - echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" - fi - fi -} - -setup_rdma_env - -if [[ -z "$UCX_NET_DEVICES" ]]; then - echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2 - exit 1 -fi - -# ============================================================================= -# Model-Specific Configuration from YAML -# ============================================================================= -MODELS_YAML="${VLLM_WS_PATH}/models.yaml" - -if [[ ! -f "$MODELS_YAML" ]]; then - echo "ERROR: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -if [[ -z "$MODEL_NAME" ]]; then - echo "ERROR: MODEL_NAME is not set"; exit 1 -fi - -eval "$(python3 -c " -import yaml, sys - -with open('${MODELS_YAML}') as f: - models = yaml.safe_load(f) - -model_name = '${MODEL_NAME}' -if model_name not in models: - print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') - sys.exit(0) - -m = models[model_name] - -def bash_escape(s): - \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\" - return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`') - -pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8')) -df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8')) -ev = bash_escape(m.get('env', '')) -dev = bash_escape(m.get('decode_env', '')) -print(f'PREFILL_SERVER_CONFIG=\"{pf}\"') -print(f'DECODE_SERVER_CONFIG=\"{df}\"') -print(f'MODEL_ENVS=\"{ev}\"') 
-print(f'DECODE_MODEL_ENVS=\"{dev}\"') -")" - -echo "Loaded model configuration for: $MODEL_NAME" - -# Apply tensor-parallel size and EP/DP flags from submit pipeline (YAML PREFILL_TP / dp-attn / ep). -if [[ -n "${PREFILL_TP:-}" ]]; then - if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then - PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP}/g") - else - PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP}" - fi -fi -if [[ -n "${DECODE_TP:-}" ]]; then - if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then - DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP}/g") - else - DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP}" - fi -fi -if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then - PREFILL_SERVER_CONFIG+=" --enable-expert-parallel" -fi -if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then - PREFILL_SERVER_CONFIG+=" --enable-dp-attention" -fi -if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then - DECODE_SERVER_CONFIG+=" --enable-expert-parallel" -fi -if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && ! 
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then - DECODE_SERVER_CONFIG+=" --enable-dp-attention" -fi - -echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" -echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" - -# ============================================================================= -# Container Synchronization -# ============================================================================= - -echo "Waiting at the container creation barrier on $host_name" -python3 $VLLM_WS_PATH/sync.py barrier \ - --local-ip ${host_ip} \ - --local-port 5000 \ - --enable-port \ - --node-ips ${IPADDRS} \ - --node-ports 5000 \ - --wait-for-all-ports \ - --timeout 600 - -# ============================================================================= -# ETCD Server Setup -# ============================================================================= - -echo "Proceeding to start etcd server on $host_name" -bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null 2>&1 & -etcd_pid=$! - -echo "Waiting at etcd server barrier on $host_name" -python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 2379 \ - --wait-for-all-ports \ - --timeout 300 - -echo "All etcd servers are up : $host_name" -sleep 3 - -echo "etcd endpoint health==================" -etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true -echo "======================================" - -python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 2379 \ - --wait-for-all-ports \ - --timeout 300 - -# ============================================================================= -# Cluster Topology Configuration -# ============================================================================= -IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" - -PREFILL_ARGS="" -DECODE_ARGS="" - -for ((i=0; i "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! 
- sleep 3 - fi - - PREFILL_CMD="vllm serve ${MODEL_PATH} \ - --port $SERVER_PORT \ - --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ - ${PREFILL_SERVER_CONFIG}" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" - set -x - eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & - set +x - prefill_pid=$! - fi - - echo "Waiting for all prefill and decode servers to be up . . ." - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: skipping barrier (wait-for-all-ports)" - else - python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports $SERVER_PORT \ - --wait-for-all-ports \ - --timeout 1800 - fi - - echo "Congratulations!!! All prefill and decode servers are up . . ." 
- - # Wait for proxy /health to confirm it is accepting requests - HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports ${ROUTER_PORT} \ - --wait-for-all-health \ - --health-endpoint /health \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - echo "MoRI-IO proxy is ready for benchmarking" - fi - - echo "Ready for benchmarking on ${host_name}:${host_ip}" - echo "Benchmarking on ${host_name}:${host_ip}" - cd $VLLM_WS_PATH - - export ROUTER_PORT=$ROUTER_PORT - BENCH_CMD="bash $VLLM_WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ - $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ - ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ - ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BENCH_CMD" - else - set -x - eval "$BENCH_CMD" - set +x - fi - - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) - LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" - mkdir -p "$LOGS_OUTPUT" - - if [[ "$DRY_RUN" -eq 0 ]]; then - cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" - echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" - fi - - echo "Killing the proxy server and prefill server" - if [[ "$DRY_RUN" -eq 0 ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true - [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true - sleep 2 - # Fallback: ensure no orphaned processes keep ports open - pkill -f moriio_proxy 2>/dev/null || true - pkill -f "vllm serve" 2>/dev/null || true - fi - -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then - echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - - setup_vllm_env - - PREFILL_CMD="vllm serve ${MODEL_PATH} \ 
- --port $SERVER_PORT \ - --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ - ${PREFILL_SERVER_CONFIG}" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" - set -x - eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & - set +x - prefill_pid=$! - fi - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports ${ROUTER_PORT} \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $VLLM_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port ${ROUTER_PORT}" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the prefill server" - [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true - -else - echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" - echo "Using decode config: $DECODE_SERVER_CONFIG" - - setup_vllm_env - - for env_pair in ${DECODE_MODEL_ENVS}; do - export "$env_pair" - echo "[DECODE_ENV] $env_pair" - done - - DECODE_CMD="vllm serve ${MODEL_PATH} \ - --port $SERVER_PORT \ - --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ - ${DECODE_SERVER_CONFIG}" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $DECODE_CMD" - else - 
DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log" - set -x - eval "$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 & - set +x - decode_pid=$! - fi - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports ${ROUTER_PORT} \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $VLLM_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port ${ROUTER_PORT}" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the decode server" - [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true -fi - -echo "Killing the etcd server" -kill $etcd_pid 2>/dev/null || true -pkill -f etcd 2>/dev/null || true - -echo "Script completed successfully" -exit 0 diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh deleted file mode 100644 index 7f691d141..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ /dev/null @@ -1,908 +0,0 @@ -#!/bin/bash -# ============================================================================= -# setup_deps.sh — Install missing vLLM disagg dependencies at container start. -# -# Base image: vllm/vllm-openai-rocm:v0.18.0 -# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. -# Idempotent: each component is skipped if already present. -# -# Build steps run in subshells to avoid CWD pollution between installers. 
-# ============================================================================= - -ROCM_PATH="${ROCM_PATH:-/opt/rocm}" -UCX_HOME="${UCX_HOME:-/usr/local/ucx}" -RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}" - -_SETUP_START=$(date +%s) -_SETUP_INSTALLED=() - -git_clone_retry() { - local url="$1" dest="$2" max_tries=3 try=1 - while (( try <= max_tries )); do - if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi - echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..." - rm -rf "$dest" - sleep 10 - (( try++ )) - done - echo "[SETUP] git clone failed after $max_tries attempts: $url" - return 1 -} - -# --------------------------------------------------------------------------- -# 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl) -# --------------------------------------------------------------------------- -install_ucx() { - if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then - echo "[SETUP] UCX already present at ${UCX_HOME}" - return 0 - fi - - echo "[SETUP] Installing UCX build dependencies..." - apt-get update -q -y && apt-get install -q -y \ - autoconf automake libtool pkg-config \ - librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ - infiniband-diags perftest ethtool rdma-core strace \ - && rm -rf /var/lib/apt/lists/* - - echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..." - ( - set -e - mkdir -p /usr/local/src && cd /usr/local/src - git_clone_retry https://github.com/ROCm/ucx.git ucx && cd ucx - git checkout da3fac2a - ./autogen.sh && mkdir -p build && cd build - ../configure \ - --prefix="${UCX_HOME}" \ - --enable-shared --disable-static \ - --disable-doxygen-doc --enable-optimizations \ - --enable-devel-headers --enable-mt \ - --with-rocm="${ROCM_PATH}" --with-verbs --with-dm - make -j"$(nproc)" && make install - ) - rm -rf /usr/local/src/ucx - - if [[ ! 
-x "${UCX_HOME}/bin/ucx_info" ]]; then - echo "[SETUP] ERROR: UCX build failed"; exit 1 - fi - _SETUP_INSTALLED+=("UCX") -} - -# --------------------------------------------------------------------------- -# 2. RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM) -# --------------------------------------------------------------------------- -install_rixl() { - if python3 -c "import rixl" 2>/dev/null; then - echo "[SETUP] RIXL Python bindings already present" - return 0 - fi - - echo "[SETUP] Installing RIXL build dependencies..." - apt-get update -q -y && apt-get install -q -y \ - libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \ - libcpprest-dev libaio-dev \ - && rm -rf /var/lib/apt/lists/* - pip3 install --quiet meson "pybind11[global]" - - echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." - ( - set -e - git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl - git checkout f33a5599 - meson setup build --prefix="${RIXL_HOME}" \ - -Ducx_path="${UCX_HOME}" \ - -Drocm_path="${ROCM_PATH}" - cd build && ninja && ninja install - cd /opt/rixl - pip install --quiet \ - --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \ - --config-settings=setup-args="-Ducx_path=${UCX_HOME}" . - ) - rm -rf /opt/rixl - - if ! python3 -c "import rixl" 2>/dev/null; then - echo "[SETUP] ERROR: RIXL build failed"; exit 1 - fi - _SETUP_INSTALLED+=("RIXL") -} - -# --------------------------------------------------------------------------- -# 3. etcd (distributed KV store for vLLM disagg service discovery) -# --------------------------------------------------------------------------- -install_etcd() { - if [[ -x /usr/local/bin/etcd/etcd ]]; then - echo "[SETUP] etcd already present" - return 0 - fi - - local version="v3.6.0-rc.5" - echo "[SETUP] Downloading etcd ${version}..." 
- wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \ - -O /tmp/etcd.tar.gz - mkdir -p /usr/local/bin/etcd - tar -xf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 - rm /tmp/etcd.tar.gz - _SETUP_INSTALLED+=("etcd") -} - -# --------------------------------------------------------------------------- -# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer) -# Harmless on non-Pensando nodes (shared lib is simply unused). -# --------------------------------------------------------------------------- -install_libionic() { - if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then - echo "[SETUP] libionic1 already installed" - return 0 - fi - - echo "[SETUP] Downloading and installing libionic1..." - wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \ - -O /tmp/libionic1.deb - dpkg -i /tmp/libionic1.deb || true - rm -f /tmp/libionic1.deb - _SETUP_INSTALLED+=("libionic1") -} - -# --------------------------------------------------------------------------- -# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) -# The proxy replaces vllm-router: it handles both HTTP routing AND the -# MoRI-IO ZMQ registration/request-enrichment protocol. -# Only needed on NODE_RANK=0 (proxy node). -# --------------------------------------------------------------------------- -install_mori_proxy_deps() { - if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then - echo "[SETUP] MoRI-IO proxy Python deps already present" - return 0 - fi - - echo "[SETUP] Installing MoRI-IO proxy Python deps..." - # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack - # are missing. --ignore-installed blinker avoids pip's distutils uninstall - # error when quart pulls a newer blinker version. - pip install --quiet --ignore-installed blinker - pip install --quiet quart msgpack - - if ! 
python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then - echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 - fi - _SETUP_INSTALLED+=("mori-proxy-deps") -} - -# --------------------------------------------------------------------------- -# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) -# Required for --all2all-backend mori (Expert Parallelism via RDMA). -# GPU kernels are JIT-compiled on first use; no hipcc needed at install. -# -# v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI -# topology bug (TopoSystemPci::Load assertion failure on Broadcom -# PEX890xx switches). Always rebuild from our target commit b645fc8 -# which includes the dsp2dev subordinate-range fix. -# --------------------------------------------------------------------------- -install_mori() { - local MORI_TARGET_COMMIT="b645fc8" - local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}" - - if ls $MORI_MARKER &>/dev/null; then - echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" - return 0 - fi - - echo "[SETUP] Installing MoRI build dependencies..." - apt-get update -q -y && apt-get install -q -y \ - libopenmpi-dev openmpi-bin libpci-dev \ - && rm -rf /var/lib/apt/lists/* - - echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." - echo "[SETUP] (overriding image-provided version to fix PCI topology bug)" - ( - set -e - git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori - git checkout "$MORI_TARGET_COMMIT" - pip install --quiet --force-reinstall . - ) - rm -rf /opt/mori - - if ! 
python3 -c "import mori" 2>/dev/null; then - echo "[SETUP] ERROR: MoRI build failed"; exit 1 - fi - touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT} - _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") -} - -# --------------------------------------------------------------------------- -# 6b. amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar) -# Required due to ROCm vLLM missing the quark dependency: -# https://github.com/vllm-project/vllm/issues/35633 -# --------------------------------------------------------------------------- -install_amd_quark() { - if python3 -c "import quark" 2>/dev/null; then - echo "[SETUP] amd-quark already present" - return 0 - fi - - echo "[SETUP] Installing amd-quark for MXFP4 quantization support..." - pip install --quiet amd-quark - - if ! python3 -c "import quark" 2>/dev/null; then - echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)" - return 0 - fi - _SETUP_INSTALLED+=("amd-quark") -} - -# --------------------------------------------------------------------------- -# 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0) -# vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel -# uses defer_input_quant=True which MoRI's prepare/finalize rejects. -# Patch: remove both the AITER requirement assertion and the -# defer_input_quant NotImplementedError so non-AITER kernels work. -# --------------------------------------------------------------------------- -patch_mori_fp8_compat() { - python3 -c ' -import re, os, sys -patched = [] - -# 1. 
Patch layer.py: remove multi-line AITER assertion for MoRI -try: - import vllm.model_executor.layers.fused_moe.layer as lm - f = lm.__file__ - src = open(f).read() - if "Mori needs to be used with aiter" in src: - new = re.sub( - r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", - "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", - src, flags=re.DOTALL) - if new != src: - open(f, "w").write(new) - patched.append("layer.py") -except Exception as e: - print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) - -# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction -try: - import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm - f = mm.__file__ - src = open(f).read() - if "defer_input_quant" in src: - new = re.sub( - r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", - "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", - src) - if new != src: - open(f, "w").write(new) - patched.append("mori_prepare_finalize.py") -except Exception as e: - print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) - -if patched: - print(f"[SETUP] Patched: {chr(44).join(patched)}") -else: - print("[SETUP] No MoRI-FP8 patches needed") -' - _SETUP_INSTALLED+=("MoRI-FP8-patch") -} - -# --------------------------------------------------------------------------- -# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) -# In WRITE mode, save_kv_layer spins forever waiting for the handshake -# callback to set write_ready_flags. This blocks the model worker thread, -# preventing it from responding to EngineCore shm_broadcast, causing a -# TimeoutError cascade and crash. -# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent -# the model worker from deadlocking. 
-# --------------------------------------------------------------------------- -patch_moriio_save_kv_timeout() { - python3 -c ' -import os, sys - -try: - import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc - f = mc.__file__ - src = open(f).read() - - # Already patched? - if "[PATCHED] save_kv_layer timeout" in src: - print("[SETUP] save_kv_layer timeout patch already applied") - sys.exit(0) - - old = """ while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.write_ready_flags - ): - continue""" - - if old not in src: - print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") - sys.exit(0) - - new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep - if remote_engine_id is None: - return - import time as _time, os as _os - _wait_start = _time.monotonic() - _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) - while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.write_ready_flags - ): - _elapsed = _time.monotonic() - _wait_start - if _elapsed > _SAVE_KV_TIMEOUT: - import logging as _logging - _logging.getLogger("vllm.moriio").warning( - "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " - "write_ready_flags[%s], breaking to unblock model " - "worker", _elapsed, remote_engine_id) - break - _time.sleep(0.001) - continue""" - - new_src = src.replace(old, new) - if new_src == src: - print("[SETUP] WARN: replacement had no effect") - sys.exit(0) - - open(f, "w").write(new_src) - print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") -except Exception as e: - print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") -} - -# --------------------------------------------------------------------------- -# 9. 
Patch MoRIIO waiting_for_transfer_complete with bounded timeout -# The original status.Wait() blocks forever if an RDMA completion never -# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded -# wait with a polling loop using status.Succeeded() + configurable timeout. -# Also adds error handling to the write worker loop so a single failed -# transfer doesn't kill the background thread. -# --------------------------------------------------------------------------- -patch_moriio_transfer_timeout() { - python3 -c ' -import os, sys, textwrap - -try: - import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me - f = me.__file__ - src = open(f).read() - - if "[PATCHED] transfer completion timeout" in src: - print("[SETUP] transfer completion timeout patch already applied") - sys.exit(0) - - # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- - old_wait = """ def waiting_for_transfer_complete(self): - if not self.transfer_status: - return - - transfers_to_wait = [] - with self.lock: - transfers_to_wait = self.transfer_status[:] - self.transfer_status.clear() - - for status in transfers_to_wait: - try: - status.Wait() - if not status.Succeeded(): - logger.error( - "Transfer failed: %s, Code: %s", status.Message(), status.Code() - ) - raise TransferError("MoRIIO transfer failed!") - except Exception as e: - logger.error("Transfer %s failed: %s", status, e) - raise""" - - new_wait = """ def waiting_for_transfer_complete(self): - # [PATCHED] transfer completion timeout — bounded polling loop - import time as _time, os as _os - if not self.transfer_status: - return - - _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) - - transfers_to_wait = [] - with self.lock: - transfers_to_wait = self.transfer_status[:] - self.transfer_status.clear() - - _start = _time.monotonic() - remaining = list(transfers_to_wait) - _polls = 0 - _completed = 0 - - while remaining: - _elapsed = _time.monotonic() 
- _start - if _elapsed > _timeout: - logger.error( - "[HANGFIX] transfer_timeout elapsed=%.1fs " - "pending=%d/%d completed=%d polls=%d " - "action=raise_transfer_error", - _elapsed, len(remaining), len(transfers_to_wait), - _completed, _polls, - ) - raise TransferError( - f"RDMA transfer timeout after {_elapsed:.1f}s, " - f"{len(remaining)}/{len(transfers_to_wait)} pending" - ) - - still_waiting = [] - for status in remaining: - try: - if status.Succeeded(): - _completed += 1 - continue - still_waiting.append(status) - except Exception as e: - logger.error( - "[HANGFIX] transfer_poll_error error=%s", e) - raise TransferError( - f"Transfer failed during poll: {e}" - ) from e - - remaining = still_waiting - if remaining: - _time.sleep(0.005) - _polls += 1 - if _polls % 2000 == 0: - logger.warning( - "[HANGFIX] transfer_wait pending=%d " - "completed=%d elapsed=%.1fs timeout=%.0fs", - len(remaining), _completed, - _time.monotonic() - _start, _timeout, - )""" - - if old_wait not in src: - print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") - sys.exit(0) - - new_src = src.replace(old_wait, new_wait) - - # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- - old_loop = """ self._execute_write_task(task)""" - - new_loop = """ try: - self._execute_write_task(task) - except Exception as _e: - logger.error( - "[HANGFIX] req=%s write_task_failed error=%s " - "action=cleanup_and_mark_done", - task.request_id, _e, - ) - try: - _wr = self.worker.moriio_wrapper - with _wr.lock: - _wr.done_req_ids.append(task.request_id) - _wr.done_remote_allocate_req_dict.pop( - task.request_id, None - ) - except Exception: - pass""" - - if old_loop in new_src: - new_src = new_src.replace(old_loop, new_loop, 1) - else: - print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") - - # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- - old_deferred = """ def _process_deferred_tasks(self) -> None: - \"\"\"Process tasks 
that were previously deferred.\"\"\" - if not self._deferred_tasks: - return - - still_deferred: list[WriteTask] = [] - for task in self._deferred_tasks: - if self._is_remote_ready(task): - self._execute_write_task(task) - else: - still_deferred.append(task) - - self._deferred_tasks = still_deferred""" - - new_deferred = """ def _process_deferred_tasks(self) -> None: - \"\"\"Process tasks that were previously deferred.\"\"\" - # [PATCHED] deferred task timeout — prune stale tasks - import time as _time, os as _os - if not self._deferred_tasks: - return - - _DEFER_TIMEOUT = float( - _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) - - still_deferred: list[WriteTask] = [] - for task in self._deferred_tasks: - _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) - if _age > _DEFER_TIMEOUT: - logger.error( - "[HANGFIX] req=%s deferred_task_expired age=%.1fs " - "action=drop_and_mark_done", - task.request_id, _age, - ) - try: - _wr = self.worker.moriio_wrapper - with _wr.lock: - _wr.done_req_ids.append(task.request_id) - _wr.done_remote_allocate_req_dict.pop( - task.request_id, None) - except Exception: - pass - continue - if self._is_remote_ready(task): - try: - self._execute_write_task(task) - except Exception as _e: - logger.error( - "[HANGFIX] req=%s deferred_write_failed error=%s", - task.request_id, _e, - ) - try: - _wr = self.worker.moriio_wrapper - with _wr.lock: - _wr.done_req_ids.append(task.request_id) - _wr.done_remote_allocate_req_dict.pop( - task.request_id, None) - except Exception: - pass - else: - still_deferred.append(task) - - self._deferred_tasks = still_deferred""" - - if old_deferred in new_src: - new_src = new_src.replace(old_deferred, new_deferred, 1) - else: - print("[SETUP] WARN: _process_deferred_tasks pattern not found") - - # --- Patch 4: Stamp defer time when task is deferred --- - old_defer_add = """ self._deferred_tasks.append(task)""" - new_defer_add = """ import time as _time2 - if not hasattr(task, "_defer_ts"): 
- task._defer_ts = _time2.monotonic() - self._deferred_tasks.append(task)""" - if old_defer_add in new_src: - new_src = new_src.replace(old_defer_add, new_defer_add, 1) - else: - print("[SETUP] WARN: deferred task timestamp patch target not found") - - open(f, "w").write(new_src) - print("[SETUP] Patched: transfer timeout + writer error handling") - -except Exception as e: - print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") -} - -# --------------------------------------------------------------------------- -# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) -# The READ-mode spin loop in start_load_kv has the same unbounded-spin -# issue as save_kv_layer. Add timeout + sleep + null guard. -# --------------------------------------------------------------------------- -patch_moriio_load_kv_timeout() { - python3 -c ' -import os, sys - -try: - import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc - f = mc.__file__ - src = open(f).read() - - if "[PATCHED] start_load_kv timeout" in src: - print("[SETUP] start_load_kv timeout patch already applied") - sys.exit(0) - - old = """ while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.load_ready_flag - and wait_handshake_readd_req - ): - continue""" - - if old not in src: - print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") - sys.exit(0) - - new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock - if remote_engine_id is None and not wait_handshake_readd_req: - self._reqs_to_send.update(metadata.reqs_to_send) - return - import time as _time, os as _os - _wait_start = _time.monotonic() - _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) - while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.load_ready_flag - and wait_handshake_readd_req - ): - if _time.monotonic() - 
_wait_start > _LOAD_KV_TIMEOUT: - import logging as _logging - _logging.getLogger("vllm.moriio").warning( - "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " - "load_ready_flag[%s]", _time.monotonic() - _wait_start, - remote_engine_id) - break - _time.sleep(0.001) - continue""" - - new_src = src.replace(old, new) - if new_src == src: - print("[SETUP] WARN: start_load_kv replacement had no effect") - sys.exit(0) - - open(f, "w").write(new_src) - print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") -except Exception as e: - print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") -} - -# --------------------------------------------------------------------------- -# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished -# vLLM asserts that a request in finished_recving must be either -# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can -# transition to RUNNING before the aggregated recv notification arrives, -# crashing the engine with AssertionError. 
-# (present in v0.17.1 & v0.18.0) -# --------------------------------------------------------------------------- -patch_scheduler_read_mode_fix() { - python3 -c ' -import os, sys - -try: - import vllm.v1.core.sched.scheduler as smod - f = smod.__file__ - src = open(f).read() - - if "[PATCHED] read-mode recv assertion" in src: - print("[SETUP] scheduler read-mode assertion fix already applied") - sys.exit(0) - - old_recv = """ for req_id in kv_connector_output.finished_recving or (): - logger.debug("Finished recving KV transfer for request %s", req_id) - assert req_id in self.requests - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - else: - assert RequestStatus.is_finished(req.status) - self._free_blocks(self.requests[req_id])""" - - new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states - for req_id in kv_connector_output.finished_recving or (): - logger.debug("Finished recving KV transfer for request %s", req_id) - if req_id not in self.requests: - logger.debug("Request %s already removed, skipping recv", req_id) - continue - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - elif RequestStatus.is_finished(req.status): - self._free_blocks(self.requests[req_id]) - else: - logger.debug( - "Request %s recv finished but status=%s (not " - "WAITING_FOR_REMOTE_KVS or finished), skipping " - "block free — will be freed on request completion", - req_id, req.status.name)""" - - if old_recv not in src: - print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping") - sys.exit(0) - - new_src = src.replace(old_recv, new_recv, 1) - - old_send = """ for req_id in kv_connector_output.finished_sending or (): - logger.debug("Finished sending KV transfer for request %s", req_id) - assert req_id in self.requests - self._free_blocks(self.requests[req_id])""" - - new_send = 
""" for req_id in kv_connector_output.finished_sending or (): - logger.debug("Finished sending KV transfer for request %s", req_id) - if req_id not in self.requests: - logger.debug("Request %s already removed, skipping send", req_id) - continue - self._free_blocks(self.requests[req_id])""" - - if old_send in new_src: - new_src = new_src.replace(old_send, new_send, 1) - else: - print("[SETUP] WARN: scheduler finished_sending pattern not found") - - open(f, "w").write(new_src) - print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") - -except Exception as e: - print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("scheduler-read-mode-fix") -} - -# --------------------------------------------------------------------------- -# 12. Idle KV block reaper for disaggregated prefill (READ mode) -# The RIXL notification path can lose `finished_sending` signals under -# high concurrency with ibv_post_send failures. This leaves KV blocks -# permanently allocated on the prefill engine even after the decode has -# finished reading. Over multiple benchmark rounds, leaked blocks -# accumulate and eventually saturate the prefill KV cache. -# -# Fix: instrument the scheduler's `schedule()` method to detect idle -# periods (0 running, 0 waiting for >5s) and force-free blocks for -# any remaining requests whose status is finished. -# --------------------------------------------------------------------------- -patch_prefill_idle_kv_reaper() { - python3 -c ' -import os, sys - -try: - import vllm.v1.core.sched.scheduler as smod - f = smod.__file__ - src = open(f).read() - - if "[PATCHED] idle-kv-reaper" in src: - print("[SETUP] idle KV block reaper already applied") - sys.exit(0) - - # Find the _update_from_kv_xfer_finished method end and add reaper logic - # We inject into the method that processes KV transfer completions. 
- marker = "[PATCHED] read-mode recv assertion" - if marker not in src: - print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") - sys.exit(0) - - # Add reaper state initialization to __init__ - old_init_marker = "self.finished_recving_kv_req_ids" - if old_init_marker not in src: - print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") - sys.exit(0) - - # Find the first occurrence to insert reaper state - init_pos = src.find(old_init_marker) - # Find the line containing it - line_end = src.find("\n", init_pos) - init_line = src[init_pos:line_end] - - # Add reaper state after this line - reaper_init = init_line + """ - # [PATCHED] idle-kv-reaper state - self._idle_kv_reaper_ts = 0.0 - self._idle_kv_reaper_active = False""" - - src = src.replace(init_line, reaper_init, 1) - - # Now add the reaper logic at the end of _update_from_kv_xfer_finished - # Find the finished_sending handler we patched - send_handler = """ for req_id in kv_connector_output.finished_sending or (): - logger.debug("Finished sending KV transfer for request %s", req_id) - if req_id not in self.requests: - logger.debug("Request %s already removed, skipping send", req_id) - continue - self._free_blocks(self.requests[req_id])""" - - reaper_logic = send_handler + """ - - # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks - import time as _time - _REAPER_IDLE_SECS = 5.0 - _num_running = sum(1 for r in self.requests.values() - if r.status == RequestStatus.RUNNING) - _should_reap = (_num_running == 0) - - if _should_reap: - if not self._idle_kv_reaper_active: - self._idle_kv_reaper_active = True - self._idle_kv_reaper_ts = _time.monotonic() - elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: - _reaped = 0 - _reap_ids = [] - for _rid, _req in list(self.requests.items()): - if RequestStatus.is_finished(_req.status): - _reap_ids.append(_rid) - for _rid in _reap_ids: - try: - _req = self.requests[_rid] - self._free_blocks(_req) - 
_reaped += 1 - except Exception as _e: - logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) - if _reaped > 0: - logger.warning( - "[KV-REAPER] Force-freed blocks for %d finished " - "requests after %.1fs idle", - _reaped, _time.monotonic() - self._idle_kv_reaper_ts) - self._idle_kv_reaper_ts = _time.monotonic() - else: - self._idle_kv_reaper_active = False""" - - if send_handler in src: - src = src.replace(send_handler, reaper_logic, 1) - else: - print("[SETUP] WARN: send handler not found for reaper injection") - sys.exit(0) - - open(f, "w").write(src) - print("[SETUP] Patched: idle KV block reaper for prefill") - -except Exception as e: - print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("idle-kv-reaper") -} - -# --------------------------------------------------------------------------- -# 13. Patch MiniMax M2.5 WideEP + MoRI + EPLB support -# Replaces the upstream minimax_m2.py with our patched version that adds -# GateLinear, EP group integration, sequence parallelism, and the -# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched. -# --------------------------------------------------------------------------- -patch_minimax_m2_wideep_mori() { - local patch_file="${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}/patches/minimax_m2.py" - if [[ ! -f "$patch_file" ]]; then - # Also check the Docker-baked location - patch_file="/opt/vllm_disagg/patches/minimax_m2.py" - fi - if [[ ! 
-f "$patch_file" ]]; then - echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)" - return 0 - fi - - python3 -c ' -import os, sys, shutil - -try: - import vllm.model_executor.models.minimax_m2 as mmod - target = mmod.__file__ - src = sys.argv[1] - - with open(target) as f: - if "get_ep_group" in f.read(): - print("[SETUP] minimax_m2.py already has WideEP+MoRI support") - sys.exit(0) - - shutil.copy2(src, target) - print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}") - -except Exception as e: - print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr) -' "$patch_file" - _SETUP_INSTALLED+=("minimax-m2-wideep-mori") -} - -# ============================================================================= -# Run installers -# ============================================================================= - -install_ucx -install_rixl -install_etcd -install_libionic -install_mori -install_amd_quark -install_mori_proxy_deps -patch_mori_fp8_compat -patch_moriio_save_kv_timeout -patch_moriio_transfer_timeout -patch_moriio_load_kv_timeout -patch_scheduler_read_mode_fix -patch_prefill_idle_kv_reaper -patch_minimax_m2_wideep_mori - -# ============================================================================= -# Export paths (persists for server.sh since this file is sourced) -# ============================================================================= - -export ROCM_PATH="${ROCM_PATH}" -export UCX_HOME="${UCX_HOME}" -export RIXL_HOME="${RIXL_HOME}" -export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}" -export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" - -_SETUP_END=$(date +%s) -if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then - echo "[SETUP] All dependencies already present (${_SETUP_END}s wallclock)" -else - echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s" -fi diff --git 
a/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh b/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh deleted file mode 100755 index 46bbd2964..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -x - -IPADDRS="${IPADDRS:-localhost}" - -# Use management network IP (matching what the Slurm script resolved) -host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p') -if [[ -z "$host_ip" ]]; then - host_ip=$(hostname -I | awk '{print $1}') -fi - -IFS=',' read -ra ADDR <<< "$IPADDRS" - -# Determine node name based on position in the IPADDRS list -index=0 -for ip in "${ADDR[@]}"; do - if [[ "$ip" == "$host_ip" ]]; then - break - fi - index=$((index + 1)) -done -node_name="etcd-$((index+1))" - -# Build initial cluster string -initial_cluster="" -for i in "${!ADDR[@]}"; do - peer_name="etcd-$((i+1))" - initial_cluster+="$peer_name=http://${ADDR[i]}:2380" - if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then - initial_cluster+="," - fi -done - -mkdir -p /var/lib/etcd -rm -rf /var/lib/etcd/* - -/usr/local/bin/etcd/etcd \ - --name "$node_name" \ - --data-dir /var/lib/etcd \ - --initial-advertise-peer-urls http://$host_ip:2380 \ - --listen-peer-urls http://0.0.0.0:2380 \ - --listen-client-urls http://0.0.0.0:2379 \ - --advertise-client-urls http://$host_ip:2379 \ - --initial-cluster-token etcd-cluster-1 \ - --initial-cluster "$initial_cluster" \ - --initial-cluster-state new \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh deleted file mode 100755 index ecb5a9876..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash -# -# Cluster Configuration Template for Multi-Node vLLM Disaggregated Serving -# -# This script submits a multi-node vLLM disaggregated benchmark job to SLURM. 
-# It must be configured for your specific cluster before use. -# -# Router is co-located with the first prefill node (same as SGLang), so -# NUM_NODES = PREFILL_NODES + DECODE_NODES. - -usage() { - cat << 'USAGE' -Usage: - bash submit.sh \ - \ - \ - \ - \ - [NODE_LIST] - -Arguments: - PREFILL_NODES Number of prefill nodes - PREFILL_WORKERS Number of prefill workers (usually 1) - DECODE_NODES Number of decode nodes - DECODE_WORKERS Number of decode workers (usually 1) - ISL Input sequence length - OSL Output sequence length - CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") - REQUEST_RATE Request rate ("inf" for max throughput) - PREFILL_ENABLE_EP true/false (from PREFILL_EP in YAML; false when EP==1) - PREFILL_ENABLE_DP true/false (data-parallel attention on prefill) - DECODE_ENABLE_EP true/false (from DECODE_EP in YAML) - DECODE_ENABLE_DP true/false (data-parallel attention on decode) - PREFILL_TP Tensor parallel size per prefill node - DECODE_TP Tensor parallel size per decode node - RANDOM_RANGE_RATIO Random range ratio for benchmark client - NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) - -Required environment variables: - SLURM_ACCOUNT SLURM account name - SLURM_PARTITION SLURM partition - TIME_LIMIT Job time limit (e.g., "08:00:00") - MODEL_PATH Path to model directory (e.g., /nfsdata) - MODEL_NAME Model name directory - CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) - RUNNER_NAME Runner identifier (for job name) -USAGE -} - -check_env() { - local name="$1" - if [[ -z "${!name:-}" ]]; then - echo "Error: ${name} not specified" >&2 - usage >&2 - exit 1 - fi -} - -check_env SLURM_ACCOUNT -check_env SLURM_PARTITION -check_env TIME_LIMIT - -check_env MODEL_PATH -check_env MODEL_NAME -check_env CONTAINER_IMAGE -check_env RUNNER_NAME - -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - -# COMMAND_LINE ARGS (aligned with benchmarks/multi_node/amd_utils/submit.sh) -PREFILL_NODES=$1 -PREFILL_WORKERS=${2:-1} 
-DECODE_NODES=$3 -DECODE_WORKERS=${4:-1} -ISL=$5 -OSL=$6 -CONCURRENCIES=$7 -REQUEST_RATE=$8 -PREFILL_ENABLE_EP=${9:-false} -PREFILL_ENABLE_DP=${10:-false} -DECODE_ENABLE_EP=${11:-false} -DECODE_ENABLE_DP=${12:-false} -PREFILL_TP=${13:-8} -DECODE_TP=${14:-8} -RANDOM_RANGE_RATIO=${15:-0.8} -NODE_LIST=${16} - -# Router co-located with first prefill: xP + yD nodes total -NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) -profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" - -# Export variables for the SLURM job -export MODEL_DIR=$MODEL_PATH -export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE -export PROFILER_ARGS=$profiler_args - -# For vLLM, each worker = 1 node (TP=8 per node). -# xP/yD must match the node counts so NUM_NODES = xP+yD is correct. -export xP=$PREFILL_NODES -export yD=$DECODE_NODES -export NUM_NODES=$NUM_NODES -export GPUS_PER_NODE=$GPUS_PER_NODE -export MODEL_NAME=$MODEL_NAME -export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} -export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} -export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} -export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} -export PREFILL_TP=${PREFILL_TP} -export DECODE_TP=${DECODE_TP} -export BENCH_INPUT_LEN=${ISL} -export BENCH_OUTPUT_LEN=${OSL} -export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} -export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} -export BENCH_REQUEST_RATE=${REQUEST_RATE} -export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} - -export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} -export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} - -# Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. -export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" -mkdir -p "$BENCHMARK_LOGS_DIR" - -# Optional: pass an explicit node list to sbatch. 
-NODELIST_OPT=() -if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then - IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" - if [[ "${#NODE_ARR[@]}" -ne "$NUM_NODES" ]]; then - echo "Error: NODE_LIST has ${#NODE_ARR[@]} nodes but NUM_NODES=${NUM_NODES}" >&2 - echo "Error: NODE_LIST='${NODE_LIST}'" >&2 - exit 1 - fi - NODELIST_CSV="$(IFS=,; echo "${NODE_ARR[*]}")" - NODELIST_OPT=(--nodelist "$NODELIST_CSV") -fi - -# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). -# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. -EXCLUDE_OPT=() -if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then - EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") -fi - -# Construct the sbatch command -sbatch_cmd=( - sbatch - --parsable - -N "$NUM_NODES" - -n "$NUM_NODES" - "${NODELIST_OPT[@]}" - "${EXCLUDE_OPT[@]}" - --time "$TIME_LIMIT" - --partition "$SLURM_PARTITION" - --account "$SLURM_ACCOUNT" - --job-name "$RUNNER_NAME" - --output "${BENCHMARK_LOGS_DIR}/slurm_job-%j.out" - --error "${BENCHMARK_LOGS_DIR}/slurm_job-%j.err" - "$(dirname "$0")/job.slurm" -) - -JOB_ID=$("${sbatch_cmd[@]}") -if [[ $? -ne 0 ]]; then - echo "Error: Failed to submit job with sbatch" >&2 - exit 1 -fi -echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/sync.py b/benchmarks/multi_node/vllm_disagg_utils/sync.py deleted file mode 100755 index 3678e7614..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/sync.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python3 -""" -Multi-node synchronization utilities for disaggregated inference. 
- -Subcommands: - barrier - Wait until all specified nodes have opened their ports (TCP barrier) - Optionally wait for HTTP health endpoints to return 200 - wait - Block until a remote port closes (shutdown coordination) -""" - -import socket -import time -import threading -import argparse -import sys -import urllib.request -import urllib.error - - -def is_port_open(ip, port, timeout=2): - """Check if a given IP and port are accessible.""" - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.settimeout(timeout) - return s.connect_ex((ip, port)) == 0 - - -def check_health(ip, port, path="/health", timeout=2): - """Return True if http://ip:port/path returns HTTP 200.""" - try: - url = f"http://{ip}:{port}{path}" - req = urllib.request.Request(url) - with urllib.request.urlopen(req, timeout=timeout) as resp: - return getattr(resp, "status", 200) == 200 - except (urllib.error.URLError, urllib.error.HTTPError, OSError): - return False - - -# ============================================================================= -# barrier subcommand -# ============================================================================= - -def cmd_barrier(args): - """Wait until all nodes have opened the specified ports.""" - NODE_IPS = [ip.strip() for ip in args.node_ips.split(",") if ip.strip()] - NODE_PORTS = [int(p.strip()) for p in args.node_ports.split(",") if p.strip()] - - if not NODE_IPS: - print("Error: NODE_IPS argument is empty or not set.") - sys.exit(1) - - if len(NODE_PORTS) == 1: - NODE_PORTS *= len(NODE_IPS) - elif len(NODE_PORTS) != len(NODE_IPS): - print("Error: Number of ports must match number of node IPs or only one port should be given for all.") - sys.exit(1) - - server_socket = None - - def open_port(): - nonlocal server_socket - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind((args.local_ip, args.local_port)) - server_socket.listen(5) - 
print(f"Port {args.local_port} is now open on {args.local_ip}.") - while True: - conn, addr = server_socket.accept() - conn.close() - - def close_port(): - nonlocal server_socket - if server_socket: - server_socket.close() - print(f"Port {args.local_port} has been closed on {args.local_ip}.") - - if args.enable_port: - threading.Thread(target=open_port, daemon=True).start() - - # Wait for all ports (TCP check) - if args.wait_for_all_ports: - start_time = time.time() - timeout = args.timeout - - while True: - if timeout > 0: - elapsed = time.time() - start_time - if elapsed >= timeout: - not_open = [(ip, port) for ip, port in zip(NODE_IPS, NODE_PORTS) - if not is_port_open(ip, port)] - print(f"ERROR: Timeout after {timeout} seconds waiting for ports to open.", flush=True) - print("The following nodes/ports are still not responding:", flush=True) - for ip, port in not_open: - print(f" - {ip}:{port}", flush=True) - sys.exit(1) - - all_open = all(is_port_open(ip, port) for ip, port in zip(NODE_IPS, NODE_PORTS)) - if all_open: - break - - if timeout > 0: - remaining = timeout - (time.time() - start_time) - print(f"Waiting for nodes.{NODE_PORTS},{NODE_IPS} . . ({remaining:.0f}s remaining)", flush=True) - else: - print(f"Waiting for nodes.{NODE_PORTS},{NODE_IPS} . 
.", flush=True) - time.sleep(5) - - # Wait for all health endpoints (HTTP check) - if args.wait_for_all_health: - health_path = args.health_endpoint - start_time = time.time() - timeout = args.timeout - - while True: - if timeout > 0: - elapsed = time.time() - start_time - if elapsed >= timeout: - not_ready = [ - (ip, port) - for ip, port in zip(NODE_IPS, NODE_PORTS) - if not check_health(ip, port, health_path) - ] - print(f"ERROR: Timeout after {timeout} seconds waiting for health endpoints.", flush=True) - print(f"The following (http://ip:port{health_path}) are still not responding:", flush=True) - for ip, port in not_ready: - print(f" - http://{ip}:{port}{health_path}", flush=True) - sys.exit(1) - - all_ready = all( - check_health(ip, port, health_path) - for ip, port in zip(NODE_IPS, NODE_PORTS) - ) - if all_ready: - break - - if timeout > 0: - remaining = timeout - (time.time() - start_time) - print( - f"Waiting for health on {list(zip(NODE_IPS, NODE_PORTS))} ({health_path}) .. ({remaining:.0f}s remaining)", - flush=True, - ) - else: - print(f"Waiting for health on {list(zip(NODE_IPS, NODE_PORTS))} ({health_path}) ..", flush=True) - time.sleep(30) - - if args.enable_port: - # Keep the port open long enough for slow nodes to pass their barrier. - # The previous 30s was too short when setup times vary by minutes. 
- grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 - time.sleep(grace) - close_port() - - -# ============================================================================= -# wait subcommand -# ============================================================================= - -def cmd_wait(args): - """Wait while a remote port remains open, exit when it closes.""" - print(f"Waiting while port {args.remote_port} on {args.remote_ip} is open...") - while is_port_open(args.remote_ip, args.remote_port): - time.sleep(5) - print(f"Port {args.remote_port} on {args.remote_ip} is now closed.") - - -# ============================================================================= -# CLI -# ============================================================================= - -def main(): - parser = argparse.ArgumentParser(description="Multi-node synchronization utilities.") - subparsers = parser.add_subparsers(dest="command", required=True) - - # barrier subcommand - bp = subparsers.add_parser("barrier", help="Wait for all nodes to open specified ports.") - bp.add_argument("--local-ip", required=False, help="Local IP address to bind the server.") - bp.add_argument("--local-port", type=int, required=False, help="Port number to bind the server.") - bp.add_argument("--enable-port", action="store_true", help="Enable opening and closing of local port.") - bp.add_argument("--node-ips", required=True, help="Comma-separated list of node IPs.") - bp.add_argument("--node-ports", required=True, help="Comma-separated list of ports to check.") - bp.add_argument("--timeout", type=int, default=600, - help="Timeout in seconds (default: 600). 
Set to 0 for no timeout.") - bp.add_argument("--wait-for-all-ports", action="store_true", - help="Wait until all node ports are open (TCP).") - bp.add_argument("--wait-for-all-health", action="store_true", - help="Wait until http://ip:port/health returns 200 for all nodes.") - bp.add_argument("--health-endpoint", default="/health", - help="Path for health check (default: /health).") - bp.set_defaults(func=cmd_barrier) - - # wait subcommand - wp = subparsers.add_parser("wait", help="Wait while a remote port remains open.") - wp.add_argument("--remote-ip", required=True, help="Remote server IP address.") - wp.add_argument("--remote-port", type=int, required=True, help="Remote port number.") - wp.set_defaults(func=cmd_wait) - - args = parser.parse_args() - args.func(args) - - -if __name__ == "__main__": - main() From 106a4e4dc2ddcd3a5f65ffcfd8d0b1febdb7fd9c Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 26 May 2026 09:08:01 +0000 Subject: [PATCH 75/85] revert: restore backend_request_func.py to match main Co-Authored-By: Claude Opus 4 --- utils/bench_serving/backend_request_func.py | 270 ++++++++------------ 1 file changed, 107 insertions(+), 163 deletions(-) diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 1b22b1b91..7f4a93284 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -14,7 +14,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=30 * 60) +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @dataclass @@ -49,16 +49,12 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, - session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - _own_session = session is None - if _own_session: 
- session = aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) - try: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -66,6 +62,7 @@ async def async_request_tgi( "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. "truncate": request_func_input.prompt_len, + # TGI does not accept ignore_eos flag. } payload = { "inputs": request_func_input.prompt, @@ -116,28 +113,21 @@ async def async_request_tgi( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) - finally: - if _own_session: - await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_trt_llm( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, - session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - _own_session = session is None - if _own_session: - session = aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) - try: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -191,25 +181,18 @@ async def async_request_trt_llm( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) - finally: - if _own_session: - await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, - session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: - _own_session = session is None - if _own_session: - session = 
aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) - try: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { @@ -242,30 +225,23 @@ async def async_request_deepspeed_mii( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) - finally: - if _own_session: - await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_openai_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, - session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." - _own_session = session is None - if _own_session: - session = aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) - try: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: payload = { "model": request_func_input.model_name \ if request_func_input.model_name else request_func_input.model, @@ -305,35 +281,33 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - break - - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. 
for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += text or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + if chunk != "[DONE]": + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -350,9 +324,6 @@ async def async_request_openai_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) - finally: - if _own_session: - await session.close() if pbar: pbar.update(1) @@ -362,19 +333,15 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, - session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
- _own_session = session is None - if _own_session: - session = aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) - try: - content = [{"type": "text", "text": request_func_input.prompt}] + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: + content = request_func_input.prompt if request_func_input.multi_modal_content: content = [{"type": "text", "text": request_func_input.prompt}] content.append(request_func_input.multi_modal_content) @@ -421,30 +388,28 @@ async def async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - break - - timestamp = time.perf_counter() - data = json.loads(chunk) + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") - most_recent_timestamp = timestamp + most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True @@ -456,13 +421,10 @@ async def async_request_openai_chat_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) - finally: - if _own_session: - await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + 
return output def get_model(pretrained_model_name_or_path: str) -> str: @@ -504,64 +466,46 @@ def _fix_tokenizer_for_sglang(tokenizer, model_path): import json from pathlib import Path - def _resolve(filename): - """Return a filesystem path for `filename`, whether `model_path` is a - local directory or an HF Hub repo id. Returns None and logs a warning - on failure so we don't silently fail to apply the v5 fix.""" - local = Path(model_path) / filename - if local.is_file(): - return str(local) - try: - from huggingface_hub import hf_hub_download - return hf_hub_download(repo_id=model_path, filename=filename) - except Exception as e: - print( - f"v5 tokenizer fix: cannot resolve {filename} for {model_path!r} " - f"({type(e).__name__}: {e}); fix will not apply.", - flush=True, - ) - return None - backend = getattr(tokenizer, "_tokenizer", None) if backend is not None: - tok_file = _resolve("tokenizer.json") - if tok_file is not None: + try: from tokenizers import Tokenizer as RawTokenizer - raw = RawTokenizer.from_file(tok_file) - raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None - loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None - if raw_pre and loaded_pre and raw_pre != loaded_pre: - print( - f"v5 tokenizer fix: {model_path} pre_tokenizer {loaded_pre} -> {raw_pre}, " - f"decoder {type(backend.decoder).__name__ if backend.decoder else None} -> " - f"{type(raw.decoder).__name__ if raw.decoder else None}", - flush=True, - ) - backend.pre_tokenizer = raw.pre_tokenizer - backend.decoder = raw.decoder - - config_file = _resolve("tokenizer_config.json") - if config_file is not None: - with open(config_file) as f: - config = json.load(f) - tok_class = config.get("tokenizer_class", "") - bos_eos_classes = { - "LlamaTokenizer", "LlamaTokenizerFast", - "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", - "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", - } - if tok_class in bos_eos_classes: - defaults = 
{"add_bos_token": True, "add_eos_token": False} - changed = False - for attr in ("add_bos_token", "add_eos_token"): - val = config.get(attr) - if val is None: - val = defaults.get(attr, False) - if getattr(tokenizer, attr, None) != val: - setattr(tokenizer, f"_{attr}", val) - changed = True - if changed and hasattr(tokenizer, "update_post_processor"): - tokenizer.update_post_processor() + tok_file = Path(model_path) / "tokenizer.json" + if tok_file.is_file(): + raw = RawTokenizer.from_file(str(tok_file)) + raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None + loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None + if raw_pre and loaded_pre and raw_pre != loaded_pre: + backend.pre_tokenizer = raw.pre_tokenizer + backend.decoder = raw.decoder + except Exception: + pass + + try: + config_file = Path(model_path) / "tokenizer_config.json" + if config_file.is_file(): + with open(config_file) as f: + config = json.load(f) + tok_class = config.get("tokenizer_class", "") + bos_eos_classes = { + "LlamaTokenizer", "LlamaTokenizerFast", + "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", + "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", + } + if tok_class in bos_eos_classes: + defaults = {"add_bos_token": True, "add_eos_token": False} + changed = False + for attr in ("add_bos_token", "add_eos_token"): + val = config.get(attr) + if val is None: + val = defaults.get(attr, False) + if getattr(tokenizer, attr, None) != val: + setattr(tokenizer, f"_{attr}", val) + changed = True + if changed and hasattr(tokenizer, "update_post_processor"): + tokenizer.update_post_processor() + except Exception: + pass return tokenizer From 8ccd28aa8c105dd16dc5fcb9f36ef41d3abf4c02 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 26 May 2026 09:08:43 +0000 Subject: [PATCH 76/85] revert: restore benchmark_serving.py to match main Co-Authored-By: Claude Opus 4 --- utils/bench_serving/benchmark_serving.py | 67 
+++++------------------- 1 file changed, 13 insertions(+), 54 deletions(-) diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 0e491384c..1412a8925 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -39,17 +39,16 @@ from multiprocessing import Pool, cpu_count from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple -import aiohttp import numpy as np -from backend_request_func import (AIOHTTP_TIMEOUT, ASYNC_REQUEST_FUNCS, - RequestFuncInput, RequestFuncOutput) +from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, + RequestFuncOutput) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase try: - from backend_request_func import get_tokenizer -except ImportError: from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer try: from vllm.utils import FlexibleArgumentParser @@ -519,14 +518,11 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") - connector = aiohttp.TCPConnector(limit=0, enable_cleanup_closed=True) - shared_session = aiohttp.ClientSession( - trust_env=True, timeout=AIOHTTP_TIMEOUT, connector=connector) - print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len, test_mm_content = ( input_requests[0]) if backend != "openai-chat" and test_mm_content is not None: + # multi-modal benchmark is only available on OpenAI Chat backend. 
raise ValueError( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( @@ -545,15 +541,13 @@ async def benchmark( if num_warmups > 0: print(f"Warming up with {num_warmups} requests...") warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups) - warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else asyncio.Semaphore(num_warmups) + warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None async def warmup_limited_req_fn(): if warmup_semaphore is None: return await request_func(request_func_input=test_input, pbar=warmup_pbar) async with warmup_semaphore: - return await request_func( - request_func_input=test_input, pbar=warmup_pbar, - session=shared_session) + return await request_func(request_func_input=test_input, pbar=warmup_pbar) warmup_tasks = [] for _ in range(num_warmups): @@ -566,6 +560,7 @@ async def warmup_limited_req_fn(): print("Warmup completed.") if lora_modules: + # For each input request, choose a LoRA module at random. 
lora_modules = iter( [random.choice(lora_modules) for _ in range(len(input_requests))]) @@ -582,8 +577,7 @@ async def warmup_limited_req_fn(): best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) - profile_output = await request_func( - request_func_input=profile_input, session=shared_session) + profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler started") @@ -604,10 +598,10 @@ async def warmup_limited_req_fn(): async def limited_request_func(request_func_input, pbar): if semaphore is None: return await request_func(request_func_input=request_func_input, - pbar=pbar, session=shared_session) + pbar=pbar) async with semaphore: return await request_func(request_func_input=request_func_input, - pbar=pbar, session=shared_session) + pbar=pbar) print("Starting main benchmark run...") @@ -635,28 +629,7 @@ async def limited_request_func(request_func_input, pbar): asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - gather_timeout = max(7200, len(input_requests) * 30) - try: - outputs: List[RequestFuncOutput] = await asyncio.wait_for( - asyncio.gather(*tasks), timeout=gather_timeout) - except asyncio.TimeoutError: - completed = pbar.n if pbar else "?" - print(f"\n[WARNING] Benchmark timed out after {gather_timeout}s " - f"({completed}/{len(tasks)} requests completed). 
" - "Collecting partial results...") - for task in tasks: - if not task.done(): - task.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - outputs = [] - for task in tasks: - if task.done() and not task.cancelled(): - try: - outputs.append(task.result()) - except Exception: - outputs.append(RequestFuncOutput()) - else: - outputs.append(RequestFuncOutput()) + outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: print("Stopping profiler...") @@ -669,14 +642,10 @@ async def limited_request_func(request_func_input, pbar): logprobs=logprobs, best_of=best_of, ) - profile_output = await request_func( - request_func_input=profile_input, session=shared_session) + profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler stopped") - await shared_session.close() - await connector.close() - if pbar is not None: pbar.close() @@ -971,16 +940,6 @@ def main(args: argparse.Namespace): json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) - max_failure_rate = 0.05 - completed = benchmark_result["completed"] - failure_rate = 1 - completed / args.num_prompts - if failure_rate > max_failure_rate: - raise SystemExit( - f"FAIL: request failure rate {failure_rate:.1%} exceeds " - f"{max_failure_rate:.0%} threshold " - f"({completed}/{args.num_prompts} completed)" - ) - if __name__ == "__main__": parser = FlexibleArgumentParser( From 93da023c574fc93dfd5bca240d3187d273ea997d Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 26 May 2026 09:11:06 +0000 Subject: [PATCH 77/85] revert: fully restore benchmark_serving.py to match main Restores import order and failure-rate check block. 
Co-Authored-By: Claude Opus 4 --- utils/bench_serving/benchmark_serving.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 1412a8925..741e44236 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -46,9 +46,9 @@ from transformers import PreTrainedTokenizerBase try: - from vllm.transformers_utils.tokenizer import get_tokenizer -except ImportError: from backend_request_func import get_tokenizer +except ImportError: + from vllm.transformers_utils.tokenizer import get_tokenizer try: from vllm.utils import FlexibleArgumentParser @@ -940,6 +940,16 @@ def main(args: argparse.Namespace): json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) + max_failure_rate = 0.05 + completed = benchmark_result["completed"] + failure_rate = 1 - completed / args.num_prompts + if failure_rate > max_failure_rate: + raise SystemExit( + f"FAIL: request failure rate {failure_rate:.1%} exceeds " + f"{max_failure_rate:.0%} threshold " + f"({completed}/{args.num_prompts} completed)" + ) + if __name__ == "__main__": parser = FlexibleArgumentParser( From f242ee5b8fd0dc811ff7696898d6fb8a6cbaa22a Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 26 May 2026 09:12:28 +0000 Subject: [PATCH 78/85] revert: fully restore backend_request_func.py to match main Restores _resolve helper and tokenizer fix logic. 
Co-Authored-By: Claude Opus 4 --- utils/bench_serving/backend_request_func.py | 92 ++++++++++++--------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 7f4a93284..4c8820f8d 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -466,46 +466,64 @@ def _fix_tokenizer_for_sglang(tokenizer, model_path): import json from pathlib import Path + def _resolve(filename): + """Return a filesystem path for `filename`, whether `model_path` is a + local directory or an HF Hub repo id. Returns None and logs a warning + on failure so we don't silently fail to apply the v5 fix.""" + local = Path(model_path) / filename + if local.is_file(): + return str(local) + try: + from huggingface_hub import hf_hub_download + return hf_hub_download(repo_id=model_path, filename=filename) + except Exception as e: + print( + f"v5 tokenizer fix: cannot resolve {filename} for {model_path!r} " + f"({type(e).__name__}: {e}); fix will not apply.", + flush=True, + ) + return None + backend = getattr(tokenizer, "_tokenizer", None) if backend is not None: - try: + tok_file = _resolve("tokenizer.json") + if tok_file is not None: from tokenizers import Tokenizer as RawTokenizer - tok_file = Path(model_path) / "tokenizer.json" - if tok_file.is_file(): - raw = RawTokenizer.from_file(str(tok_file)) - raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None - loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None - if raw_pre and loaded_pre and raw_pre != loaded_pre: - backend.pre_tokenizer = raw.pre_tokenizer - backend.decoder = raw.decoder - except Exception: - pass - - try: - config_file = Path(model_path) / "tokenizer_config.json" - if config_file.is_file(): - with open(config_file) as f: - config = json.load(f) - tok_class = config.get("tokenizer_class", "") - bos_eos_classes = { - 
"LlamaTokenizer", "LlamaTokenizerFast", - "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", - "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", - } - if tok_class in bos_eos_classes: - defaults = {"add_bos_token": True, "add_eos_token": False} - changed = False - for attr in ("add_bos_token", "add_eos_token"): - val = config.get(attr) - if val is None: - val = defaults.get(attr, False) - if getattr(tokenizer, attr, None) != val: - setattr(tokenizer, f"_{attr}", val) - changed = True - if changed and hasattr(tokenizer, "update_post_processor"): - tokenizer.update_post_processor() - except Exception: - pass + raw = RawTokenizer.from_file(tok_file) + raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None + loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None + if raw_pre and loaded_pre and raw_pre != loaded_pre: + print( + f"v5 tokenizer fix: {model_path} pre_tokenizer {loaded_pre} -> {raw_pre}, " + f"decoder {type(backend.decoder).__name__ if backend.decoder else None} -> " + f"{type(raw.decoder).__name__ if raw.decoder else None}", + flush=True, + ) + backend.pre_tokenizer = raw.pre_tokenizer + backend.decoder = raw.decoder + + config_file = _resolve("tokenizer_config.json") + if config_file is not None: + with open(config_file) as f: + config = json.load(f) + tok_class = config.get("tokenizer_class", "") + bos_eos_classes = { + "LlamaTokenizer", "LlamaTokenizerFast", + "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", + "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", + } + if tok_class in bos_eos_classes: + defaults = {"add_bos_token": True, "add_eos_token": False} + changed = False + for attr in ("add_bos_token", "add_eos_token"): + val = config.get(attr) + if val is None: + val = defaults.get(attr, False) + if getattr(tokenizer, attr, None) != val: + setattr(tokenizer, f"_{attr}", val) + changed = True + if changed and hasattr(tokenizer, "update_post_processor"): + 
tokenizer.update_post_processor() return tokenizer From b133e5fbd93f985e81afdfec047c4af8c31943bf Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 27 May 2026 06:22:53 +0000 Subject: [PATCH 79/85] add pr-link to vllm-disagg changelog entries Co-Authored-By: Claude Opus 4 --- perf-changelog.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 821f0454b..1d347f93a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2978,11 +2978,13 @@ - kimik2.5-fp4-mi355x-vllm-disagg description: - "Add vLLM disaggregated prefill-decode benchmark for Kimi-K2.5-MXFP4 on MI355X" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569 - config-keys: - minimaxm2.5-fp8-mi355x-vllm-disagg description: - "Add vLLM disaggregated prefill-decode benchmark for MiniMax-M2.5 on MI355X" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569 - config-keys: - dsv4-fp4-mi355x-vllm From b53a95b83c036d346b4dc366488dcf29d731873e Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 27 May 2026 06:43:20 +0000 Subject: [PATCH 80/85] fix: sync env.sh with upstream main - Fix IBDEVICES detection log: move info message inside success branch, exit 1 on failure instead of silently propagating empty strings - Add missing SGLANG_USE_AITER=1 - Set SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0 to match upstream Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/env.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index aa69d0e46..5b31dc7d9 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -22,10 +22,11 @@ if [[ -z "$IBDEVICES" ]]; then DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') if [[ -n "$DETECTED" ]]; then export IBDEVICES="$DETECTED" + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)" 
else - echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2 + echo "ERROR: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2 + exit 1 fi - echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" else echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" fi @@ -140,7 +141,7 @@ else # Enable spec v2 export SGLANG_ENABLE_SPEC_V2=1 - export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 + export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0 export SGLANG_LOG_MS=true export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 From 8de53c8b8aa4a4b325577d55a8eb0f79fc55c4d8 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 27 May 2026 06:51:56 +0000 Subject: [PATCH 81/85] fix: restore SGLANG_MORI_COMBINE_DTYPE in server launch commands The refactored server_sglang.sh dropped the per-role COMBINE_DTYPE mapping that the old server.sh had. SGLang reads SGLANG_MORI_COMBINE_DTYPE internally, so map it from MORI_COMBINE_DTYPE_PREFILL (fp8_direct_cast) on prefill and MORI_COMBINE_DTYPE_DECODE (fp8) on decode. 
Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/server_sglang.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index b410bc978..9fa5b0af5 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -398,7 +398,7 @@ if [ "$NODE_RANK" -eq 0 ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -630,7 +630,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -698,7 +698,7 @@ else DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" fi set +x - DECODE_CMD="${DECODE_MORI_MOE_ENV} 
SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ From 9fe9b24ba8b1758687c1825447c24e4d2da92178 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 27 May 2026 06:52:23 +0000 Subject: [PATCH 82/85] refactor: move static vLLM env vars to env.sh, remove dead etcd code Move VLLM_USE_V1, VLLM_SERVER_DEV_MODE, VLLM_DISABLE_REQUEST_ID_RANDOMIZATION to env.sh alongside other engine-specific config. Remove commented-out etcd setup block that is no longer used. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/env.sh | 4 +++ .../multi_node/amd_utils/server_vllm.sh | 35 +------------------ 2 files changed, 5 insertions(+), 34 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 5b31dc7d9..58c1f6c83 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -53,6 +53,10 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # ========================================================================= # vLLM/Nixl-specific environment # ========================================================================= + export VLLM_USE_V1=1 + export VLLM_SERVER_DEV_MODE=0 + export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 + set -x # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index ecab81656..d61fe0359 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -195,34 +195,6 @@ python3 $WS_PATH/sync.py barrier \ 
--wait-for-all-ports \ --timeout 600 -# ============================================================================= -# ETCD Server Setup -# ============================================================================= - -# echo "Proceeding to start etcd server on $host_name" -# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & -# etcd_pid=$! - -# echo "Waiting at etcd server barrier on $host_name" -# python3 $WS_PATH/sync.py barrier \ -# --node-ips ${IPADDRS} \ -# --node-ports 2379 \ -# --wait-for-all-ports \ -# --timeout 300 - -# echo "All etcd servers are up : $host_name" -# sleep 3 - -# echo "etcd endpoint health==================" -# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true -# echo "======================================" - -# python3 $WS_PATH/sync.py barrier \ -# --node-ips ${IPADDRS} \ -# --node-ports 2379 \ -# --wait-for-all-ports \ -# --timeout 300 - # ============================================================================= # Cluster Topology Configuration # ============================================================================= @@ -245,15 +217,10 @@ echo "Decode node IPs: ${DECODE_ARGS}" # MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address) PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" -# vLLM environment (UCX transport vars are set at the Docker level in job.slurm) +# vLLM runtime environment (static vars moved to env.sh; these depend on per-node state) setup_vllm_env() { - export VLLM_USE_V1=1 - export VLLM_SERVER_DEV_MODE=0 export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip} export VLLM_NIXL_SIDE_CHANNEL_PORT=5600 - # Workaround: disable request-ID randomization so MoRI-IO connector can - # match completion IDs between prefill and decode without PR #34907 patch. 
- export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 for env_pair in ${MODEL_ENVS}; do export "$env_pair" done From 6286f441d53e7cad1663fd67c1d7024455435d9d Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 27 May 2026 06:59:28 +0000 Subject: [PATCH 83/85] fix: pass IS_MULTINODE into Docker container The refactored DOCKER_ENV_COMMON array dropped -e IS_MULTINODE that the old job.slurm had. Without it, eval metadata tagging inside the container sees an empty value. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 22b1ebcb3..a0dd81bb9 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -358,6 +358,7 @@ DOCKER_ENV_COMMON=( -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE + -e IS_MULTINODE=\$IS_MULTINODE ) # Engine-specific env vars From 37733fb0bb6e2f2ba107382f029210ee2b0fc6dc Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 27 May 2026 07:04:42 +0000 Subject: [PATCH 84/85] fix: improve vllm-disagg changelog descriptions Co-Authored-By: Claude Opus 4 --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1d347f93a..def63fd87 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2977,13 +2977,13 @@ - config-keys: - kimik2.5-fp4-mi355x-vllm-disagg description: - - "Add vLLM disaggregated prefill-decode benchmark for Kimi-K2.5-MXFP4 on MI355X" + - "Add Kimi-K2.5-MXFP4 FP4 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569 - config-keys: - minimaxm2.5-fp8-mi355x-vllm-disagg description: - - "Add vLLM disaggregated prefill-decode benchmark for MiniMax-M2.5 on MI355X" + - "Add 
MiniMax-M2.5 FP8 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569 - config-keys: From b1ae7810171ab5589bd3fda5b94d1d6e17d76ee9 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 27 May 2026 07:20:12 +0000 Subject: [PATCH 85/85] fix: restore DP+EP override blocks and trailing newline in server_sglang.sh Add BENCH_MAX_CONC_VALUE extraction and the two DP+EP override blocks that the refactor from server.sh dropped. These adjust max-running-requests, dispatch tokens, and MOE input tokens when both DP and EP are enabled. Also add trailing newline for POSIX compliance. server_sglang.sh now matches upstream server.sh exactly. Co-Authored-By: Claude Opus 4 --- .../multi_node/amd_utils/server_sglang.sh | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 9fa5b0af5..7eb7414a6 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -33,6 +33,9 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +# Extract the maximum concurrency from the x-delimited list +BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + # Dry Run for debugging purpose DRY_RUN="${DRY_RUN:-0}" @@ -184,6 +187,15 @@ else prefill_enable_two_batch_overlap="false" fi +# When both DP and EP are enabled, override max-running-requests with max bench concurrency +if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; then + prefill_max_running_requests=$BENCH_MAX_CONC_VALUE + prefill_dp_ranks=$PREFILL_TP_SIZE + # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change) + 
MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) + echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" +fi + # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) if [[ "$DECODE_ENABLE_DP" == "true" ]]; then decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) @@ -196,6 +208,18 @@ else decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP fi +# When both DP and EP are enabled, override max-running-requests and dispatch tokens +if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_max_running_requests=$BENCH_MAX_CONC_VALUE + decode_dp_ranks=$DECODE_TP_SIZE + MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks)) + MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) + # Update derived variable + SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD + echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD" +fi + # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then @@ -343,11 +367,6 @@ if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]] DECODE_SERVER_CONFIG=$(echo 
"$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL unset MORI_MOE_MAX_INPUT_TOKENS_DECODE - # NOTE: that currently with fp8_combine set, the evals do not pass on InferenceX eval harness - # or on SGLang native harness for high concurrency 4k and gets no where near the golden score of - # 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD - # and as confirmed by @Oseltamivir. This was initally merged with @billishyahao promising - # that an fast follow PR to fix the evals via having quant correction in the fp8 combine fi # ============================================================================= @@ -758,4 +777,4 @@ else fi echo "Script completed successfully" -exit 0 \ No newline at end of file +exit 0