diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a3afb2f6b..fd82d05cb 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1350,6 +1350,115 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" +kimik2.5-fp4-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x-disagg + precision: fp4 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + +minimaxm2.5-fp8-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, + # TP8 shards to 192 which is not divisible by FP8 block_n=128. 
+ - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f5e39b4cf..7dbbaaaa8 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -210,6 +210,7 @@ run_benchmark_serving() { local dsv4=false local trust_remote_code=false local server_pid="" + local tokenizer="" while [[ $# -gt 0 ]]; do case $1 in @@ -278,6 +279,10 @@ run_benchmark_serving() { server_pid="$2" shift 2 ;; + --tokenizer) + tokenizer="$2" + shift 2 + ;; *) echo "Unknown parameter: $1" return 1 @@ -385,6 +390,10 @@ run_benchmark_serving() { benchmark_cmd+=(--trust-remote-code) fi + if [[ -n "$tokenizer" ]]; then + benchmark_cmd+=(--tokenizer "$tokenizer") + fi + # Run benchmark with optional server monitoring set -x if [[ -n "$server_pid" ]]; then diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index ac996c5a9..05384f435 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -1,4 +1,17 @@ #!/bin/bash +# Dual-Engine Disaggregated Benchmark Runner +# +# ENGINE=sglang-disagg (default): SGLang benchmark +# ENGINE=vllm-disagg: vLLM benchmark +# +# Produces JSON result files via benchmark_serving.py so that the CI pipeline 
+# can collect and process results. +# +# Usage: bash bench.sh n_prefill n_decode prefill_gpus decode_gpus model_path model_name log_path \ +# [isl=1024] [osl=1024] [concurrency_list=512x1, 'x'-separated] \ +# [req_rate] [random_range_ratio=0.8] [num_prompts_multiplier=10] + +ENGINE="${ENGINE:-sglang-disagg}" n_prefill=$1 n_decode=$2 @@ -6,58 +19,90 @@ prefill_gpus=$3 decode_gpus=$4 model_path=$5 model_name=$6 -MODEL_PATH="${model_path}/${model_name}" +MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" +# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH +if [[ "$ENGINE" == "vllm-disagg" ]]; then + BENCH_MODEL="${MODEL_NAME:-${MODEL_PATH}}" +else + BENCH_MODEL="${MODEL_PATH}" +fi log_path=$7 chosen_isl=${8:-1024} chosen_osl=${9:-1024} concurrency_list=${10:-"512x1"} -chosen_req_rate=${11:-1} +if [[ "$ENGINE" == "vllm-disagg" ]]; then + chosen_req_rate=${11:-inf} +else + chosen_req_rate=${11:-1} +fi random_range_ratio=${12:-0.8} num_prompts_multiplier=${13:-10} IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" -echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" +ROUTER_PORT="${ROUTER_PORT:-30000}" -head_node="localhost" -head_port="30000" +export TRANSFORMERS_VERBOSITY=error +export TOKENIZERS_PARALLELISM=false +echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" -profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}" -mkdir -p $profile_folder +profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}" +mkdir -p "$profile_folder" source "$(dirname "$0")/../../benchmark_lib.sh" -# Repo root inside the container (3 levels up from this script's directory) REPO_ROOT="$(cd "$(dirname "$0")/../../.." 
&& pwd)" -for max_concurrency in ${chosen_concurrencies[@]}; do +for max_concurrency in "${chosen_concurrencies[@]}"; do export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" + num_prompts=$(( max_concurrency * num_prompts_multiplier )) + if [[ "$num_prompts" -lt 16 ]]; then + num_prompts=16 + fi + echo "profile_folder: $profile_folder" echo "max_concurrency: $max_concurrency" echo "chosen_req_rate: $chosen_req_rate" echo "MODEL_PATH: $MODEL_PATH" - echo "head_port: $head_port" + echo "ROUTER_PORT: $ROUTER_PORT" echo "chosen_isl: $chosen_isl" echo "chosen_osl: $chosen_osl" + echo "num_prompts: $num_prompts" echo "export_file: $export_file" + # Engine-specific extra flags, kept in an array so a MODEL_PATH with spaces or glob characters stays one argument + extra_flags=() + if [[ "$ENGINE" == "vllm-disagg" ]]; then + extra_flags=(--trust-remote-code --tokenizer "$MODEL_PATH") + else + if [ "$IS_MTP" = "true" ]; then + extra_flags=(--use-chat-template) + fi + fi + run_benchmark_serving \ --bench-serving-dir "$REPO_ROOT" \ - --model ${MODEL_PATH} \ - --port ${head_port} \ + --model "$BENCH_MODEL" \ + --port "$ROUTER_PORT" \ --backend openai \ - --input-len ${chosen_isl} \ - --output-len ${chosen_osl} \ - --random-range-ratio ${random_range_ratio} \ - --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \ + --input-len "$chosen_isl" \ + --output-len "$chosen_osl" \ + --random-range-ratio "$random_range_ratio" \ + --num-prompts "$num_prompts" \ --max-concurrency "$max_concurrency" \ --result-filename "$export_file" \ --result-dir /workspace/ \ - $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" ) + "${extra_flags[@]}" echo "-----------------------------------------" + + # vLLM: cooldown between rounds for idle KV block reaper + if [[ "$ENGINE" == "vllm-disagg" ]]; then + echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." 
+ sleep 10 + fi done diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 904576003..58c1f6c83 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -1,110 +1,209 @@ #!/bin/bash -# SGLang/MoRI environment setup for multi-node disaggregated serving. +# Dual-engine environment setup for multi-node disaggregated serving. +# +# ENGINE=sglang-disagg (default): SGLang/MoRI environment +# ENGINE=vllm-disagg: vLLM/Nixl environment # # REQUIRED ENVIRONMENT VARIABLES: # IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) -# This must be set by the runner script (runners/launch_mi355x-amds.sh) -# -# OPTIONAL ENVIRONMENT VARIABLES: -# MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by runner if cluster uses QoS. - +# Set by runner or auto-detected via ibv_devinfo. set -x + +ENGINE="${ENGINE:-sglang-disagg}" export PYTHONDONTWRITEBYTECODE=1 -# IBDEVICES configuration +# ============================================================================= +# Shared: IBDEVICES detection +# ============================================================================= + # Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) # Fall back to hostname detection if not set (for direct script execution) if [[ -z "$IBDEVICES" ]]; then - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 - elif [[ $NODENAME == mia1* ]]; then - export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') + if [[ -n "$DETECTED" ]]; then + export IBDEVICES="$DETECTED" + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)" else - echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2 + echo "ERROR: Unable to 
detect RDMA devices. Set IBDEVICES explicitly." >&2 exit 1 fi - echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME" else echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" fi export IBDEVICES -# Auto-detect default network interface (portable across clusters) -export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +# Shared: Auto-detect default network interface (portable across clusters) +# Only auto-detect if not already set by the runner/environment +if [[ -z "$GLOO_SOCKET_IFNAME" ]]; then + export GLOO_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi +if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then + export NCCL_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi +set +x -export NCCL_IB_HCA=$IBDEVICES +export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} -export SGLANG_USE_AITER=1 +# ============================================================================= +# Engine-specific environment +# ============================================================================= -export SGLANG_MORI_DISPATCH_DTYPE=auto -export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast -export MORI_COMBINE_DTYPE_DECODE=fp8 -export SGLANG_MORI_QP_PER_TRANSFER=4 -export SGLANG_MORI_NUM_WORKERS=4 -export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 +if [[ "$ENGINE" == "vllm-disagg" ]]; then + # ========================================================================= + # vLLM/Nixl-specific environment + # ========================================================================= + export VLLM_USE_V1=1 + export VLLM_SERVER_DEV_MODE=0 + export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 -export MORI_IO_QP_MAX_SEND_WR=16384 -export MORI_IO_QP_MAX_CQE=32768 -export MORI_IO_QP_MAX_SGE=4 + set -x -export MORI_IO_TC_DISABLE=0 + # UCX_NET_DEVICES: Use the first tw-eth interface for UCX 
TCP transport + if [[ -z "$UCX_NET_DEVICES" ]]; then + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) + if [[ -n "$UCX_NET_DEV" ]]; then + export UCX_NET_DEVICES="$UCX_NET_DEV" + else + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi + fi + echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" + else + echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)" + fi -export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 -export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 + # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing + export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} -# Disable allocating memory in one pass -export MORI_SHMEM_MODE=ISOLATION + # QoS/DSCP configuration for lossless RoCEv2 fabric. + if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then + echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' +$1 == "DSCP" && $2 == ":" && $NF == p { + print $3; exit +}') + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) + export UCX_IB_SL=$ND_PRIO + echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." 
+ NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + fi + fi + else + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + else + echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." + fi + fi + + set +x + echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" + +else + # ========================================================================= + # SGLang/MoRI-specific environment + # ========================================================================= + + export SGLANG_USE_AITER=1 + + export SGLANG_MORI_DISPATCH_DTYPE=auto + export SGLANG_MORI_FP8_COMB=true + export SGLANG_MORI_QP_PER_TRANSFER=4 + export SGLANG_MORI_NUM_WORKERS=4 + export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 + + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 + + export MORI_IO_TC_DISABLE=0 -# Enable spec v2 -export SGLANG_ENABLE_SPEC_V2=1 -export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0 + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 + export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 -export SGLANG_LOG_MS=true -export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 + # Disable allocating 
memory in one pass + export MORI_SHMEM_MODE=ISOLATION -export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 -export MORI_MAX_DISPATCH_TOKENS_DECODE=512 + # Enable spec v2 + export SGLANG_ENABLE_SPEC_V2=1 + export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0 -export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768 -export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 + export SGLANG_LOG_MS=true + export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 -# set MTP size=1 when EP16 -export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 + export MORI_MAX_DISPATCH_TOKENS_DECODE=512 -export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768 + export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 + # set MTP size=1 when EP16 + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) -export MORI_APP_LOG_LEVEL=INFO + export MORI_EP_LAUNCH_CONFIG_MODE=AUTO -# Router logging control: -# 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. -# 1 mirrors router logs to stdout via tee (useful for live debugging). -export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" + export MORI_APP_LOG_LEVEL=INFO -# QoS/DSCP configuration -# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname -if [[ -n "$MORI_RDMA_TC" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" -elif command -v nicctl &> /dev/null; then - ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') - ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' + # Router logging control: + # 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. + # 1 mirrors router logs to stdout via tee (useful for live debugging). 
+ export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" + + # QoS/DSCP configuration + # Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname + if [[ -n "$MORI_RDMA_TC" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' $1 == "DSCP" && $2 == ":" && $NF == p { print $3; exit }') - if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then - TC=$(( 4 * ND_DSCP )) - export MORI_RDMA_SL=$ND_PRIO - export MORI_IO_SL=$ND_PRIO - export MORI_RDMA_TC=$TC - export MORI_IO_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + TC=$(( 4 * ND_DSCP )) + export MORI_RDMA_SL=$ND_PRIO + export MORI_IO_SL=$ND_PRIO + export MORI_RDMA_TC=$TC + export MORI_IO_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + # Fall back to hostname-based detection + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export MORI_RDMA_TC=96 + export MORI_IO_TC=96 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export MORI_RDMA_TC=104 + export MORI_IO_TC=104 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + else + echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + fi + fi else - echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." 
- # Fall back to hostname-based detection + # nicctl not available, try hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 @@ -115,28 +214,12 @@ $1 == "DSCP" && $2 == ":" && $NF == p { export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else - echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." + echo " This is normal for clusters without QoS or outside Docker containers." fi fi -else - # nicctl not available, try hostname-based detection - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export MORI_RDMA_TC=96 - export MORI_IO_TC=96 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export MORI_RDMA_TC=104 - export MORI_IO_TC=104 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - else - echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." - echo " This is normal for clusters without QoS or outside Docker containers." 
- fi -fi - -# FIXME: WA for latest upstream 0305 image -export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} + # FIXME: WA for latest upstream 0305 image + export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} -set +x +fi diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 824605c46..a0dd81bb9 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -1,265 +1,258 @@ #!/bin/bash -#SBATCH --job-name=1p2d_bench-serving # Specify a custom string for your slurm batch job -#SBATCH -N 3 # CHECK this to be right in batch jobs -#SBATCH -n 3 # CHECK this to be right in batch jobs +#SBATCH --job-name=disagg-bench +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job -#SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed) -#SBATCH --time=24:00:00 # Set a time limit for the job (HH:MM:SS) +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR +ENGINE="${ENGINE:-sglang-disagg}" -# ------------------------ -# Print current time in UTC and PST formats -# ------------------------ echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "ENGINE: $ENGINE" echo "=======================" echo "" # ============================================================================= -# Model validation from models.yaml (replaces hardcoded VALID_MODELS array) +# Model Validation # ============================================================================= -# DI_REPO_DIR is set below from $(pwd); use the submit-time working directory -# because sbatch copies this script to /var/spool/slurmd/ at runtime. 
-MODELS_YAML="$(pwd)/models.yaml" + +# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ +# at runtime, but the CWD remains the submit-time directory (amd_utils/). +if [[ "$ENGINE" == "vllm-disagg" ]]; then + MODELS_YAML="$(pwd)/models_vllm.yaml" +else + MODELS_YAML="$(pwd)/models.yaml" +fi if [[ ! -f "$MODELS_YAML" ]]; then - echo "Error: models.yaml not found at $MODELS_YAML" + echo "Error: models YAML not found at $MODELS_YAML" exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." + exit 1 +fi + +MODEL_NAME="${MODEL_NAME:-None}" if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Error: Model '$MODEL_NAME' not found in $MODELS_YAML" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" -# All models use server.sh as the entrypoint RUN_FILE="server.sh" echo "Runfile set: $RUN_FILE" -if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then - echo "Error: DOCKER_IMAGE_NAME is not set." - exit 1 -fi - -# DI_REPO_DIR points to the repo root so Docker can access both benchmarks/ and utils/. +# DI_REPO_DIR points to the repo root. # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. export DI_REPO_DIR=$(cd "$(pwd)/../../.." 
&& pwd) -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers +xP="${xP:-1}" +yD="${yD:-1}" -# Parallelism Configuration with defaults -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP - -# Benchmark Configuration with defaults +# Benchmark configuration BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -MODEL_NAME="${MODEL_NAME:-None}" +# Engine-specific defaults +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} + +# Router selection: "vllm-router" (external container) or "moriio" (in-container proxy) +ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" +ROUTER_PORT="${ROUTER_PORT:-30000}" +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + +# ============================================================================= +# Model Path Resolution +# ============================================================================= # MODEL_DIR detection: prefer env var, fall back to hostname detection if [[ -z "$MODEL_DIR" ]]; then NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then MODEL_DIR="/nfsdata" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" elif 
[[ $NODENAME == mia1* ]]; then MODEL_DIR="/it-share/data" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" else - MODEL_DIR="/nfsdata" # Default fallback - echo "[INFO] Using default MODEL_DIR=$MODEL_DIR (hostname $NODENAME not recognized)" + MODEL_DIR="/nfsdata" fi + echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" fi export MODEL_DIR -# ------------------------ -# Model path validation and selection across all nodes -# ------------------------ -echo "Looking for model: $MODEL_NAME" -echo "Checking model availability across all allocated nodes..." - -# Get all allocated nodes -ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) - -echo "Total allocated nodes: $TOTAL_NODES" -echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" - -# Function to check model path on all nodes -check_model_path() { - local path=$1 - local check_name=$2 - - echo "Checking $check_name: $path" +if [[ "$ENGINE" == "vllm-disagg" ]]; then + # vLLM: look up hf_dir for MODEL_NAME in $MODELS_YAML (literal key match via awk -v, so '.' or '/' in the name is not regex syntax), then search multiple paths and resolve HF cache snapshots + DISK_DIR_NAME=$(awk -v key="${MODEL_NAME}:" 'index($0, key) == 1 {found=1; next} + found && /^[^ ]/{exit} + found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") + DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" + echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" + + resolve_hf_cache_path() { + local base_path=$1 + if [[ -d "${base_path}/snapshots" ]]; then + local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) + if [[ -n "$snapshot" ]]; then + echo "${base_path}/snapshots/${snapshot}" + return 0 + fi + fi + echo "$base_path" + return 1 + } + + MODEL_PATH="" + SEARCH_PATHS=( + "${MODEL_DIR}/${DISK_DIR_NAME}" + "${MODEL_DIR}/${MODEL_NAME}" + "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" + "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" + ) + + for search_path in "${SEARCH_PATHS[@]}"; do + if [[ -d "$search_path" ]]; then + 
RESOLVED=$(resolve_hf_cache_path "$search_path") + MODEL_PATH="$RESOLVED" + echo "Found MODEL_PATH: $MODEL_PATH" + break + fi + done - # Run check on all nodes in parallel - srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " - if [ -d '$path' ]; then - echo \"\$(hostname): ✓ Found $path\" - exit 0 + if [[ -z "$MODEL_PATH" ]]; then + echo "FATAL: Model '$MODEL_NAME' not found. Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 + fi + echo "Final MODEL_PATH: $MODEL_PATH" +else + # SGLang: Validate model path across all allocated nodes + echo "Looking for model: $MODEL_NAME" + echo "Checking model availability across all allocated nodes..." + + ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") + TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) + echo "Total allocated nodes: $TOTAL_NODES" + echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" + + check_model_path() { + local path=$1 + local check_name=$2 + echo "Checking $check_name: $path" + srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " + if [ -d '$path' ]; then + echo \"\$(hostname): Found $path\" + exit 0 + else + echo \"\$(hostname): Missing $path\" + exit 1 + fi + " + local exit_code=$? + if [ $exit_code -eq 0 ]; then + echo "$check_name available on ALL nodes" + return 0 else - echo \"\$(hostname): ✗ Missing $path\" - exit 1 + echo "$check_name NOT available on all nodes" + return 1 fi - " + } - # Check if all nodes succeeded (exit code 0) - local exit_code=$? 
- if [ $exit_code -eq 0 ]; then - echo "✓ $check_name available on ALL nodes" - return 0 + if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then + MODEL_PATH="$MODEL_DIR/$MODEL_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" else - echo "✗ $check_name NOT available on all nodes" - return 1 + echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:" + echo " - $MODEL_DIR/$MODEL_NAME" + exit 1 fi -} - -# Check model weights exist on "$MODEL_DIR/$MODEL_NAME" -if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then - MODEL_PATH="$MODEL_DIR/$MODEL_NAME" - echo "" - echo "✓ Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" -else - echo "" - echo "✗ FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in the following:" - echo " - $MODEL_DIR/$MODEL_NAME" - echo "" - echo "Model must be accessible from all nodes for distributed execution." - echo "Please ensure the model is available on all allocated nodes." - exit 1 + echo "Final MODEL_PATH: $MODEL_PATH" fi -echo "Final MODEL_PATH: $MODEL_PATH" -echo "" - -NUM_NODES="${NUM_NODES}" +# ============================================================================= +# Node Selection +# ============================================================================= -# ------------------------ -# Extract first NUM_NODES from SLURM allocation and update SLURM variables -# ------------------------ -echo "Original SLURM allocation:" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" +NUM_NODES=$((xP + yD)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD)" -# Get the full nodelist and extract first NUM_NODES FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Create new nodelist in SLURM format -# This is a simplified approach - for 
complex ranges, you might need more sophisticated parsing -NEW_SLURM_NODELIST=$(echo "$SELECTED_NODES" | paste -sd, | sed 's/,/,/g') +# Docker privilege detection — evaluated per-node since group membership varies. +# Exported as a snippet so every srun participant resolves it locally. +export DOCKER_CMD_DETECT='if docker ps &>/dev/null 2>&1; then DOCKER_CMD=docker; else DOCKER_CMD="sudo docker"; fi' # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES export SLURM_JOB_NUM_NODES=$NUM_NODES export SLURM_NPROCS=$NUM_NODES -export SLURM_JOB_NODELIST="$NEW_SLURM_NODELIST" -export SLURM_NODELIST="$NEW_SLURM_NODELIST" - -# Keep other SLURM variables as they were or set defaults +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" -export SLURM_SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$HOME}" -export SLURM_CLUSTER_NAME="${SLURM_CLUSTER_NAME}" # Let SLURM set this automatically -export SLURM_JOB_CPUS_PER_NODE="${SLURM_JOB_CPUS_PER_NODE}" -export SLURM_JOB_PARTITION="${SLURM_JOB_PARTITION}" # Should be set by sbatch/runner -export SLURM_JOBID="${SLURM_JOBID:-$SLURM_JOB_ID}" -export SLURM_JOB_QOS="${SLURM_JOB_QOS}" # Should be set by sbatch/runner if needed -export SLURM_JOB_ACCOUNT="${SLURM_JOB_ACCOUNT}" # Should be set by sbatch/runner export SLURM_NTASKS_PER_NODE=1 -export SLURM_SUBMIT_HOST="${SLURM_SUBMIT_HOST}" -export SLURM_JOB_ID="${SLURM_JOB_ID}" -# SLURM_CONF is auto-set by SLURM, no need to override -export SLURM_JOB_NAME="${SLURM_JOB_NAME:-1p1d_bench-serving}" echo "" -echo "Updated SLURM Environment Variables:" -echo "SLURM_JOB_ID: $SLURM_JOB_ID" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" -echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "SLURM_JOB_CPUS_PER_NODE: $SLURM_JOB_CPUS_PER_NODE" -echo "SLURM_JOB_PARTITION: $SLURM_JOB_PARTITION" -echo 
"SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES" -echo "SLURM_JOBID: $SLURM_JOBID" -echo "SLURM_JOB_QOS: $SLURM_JOB_QOS" -echo "SLURM_NODELIST: $SLURM_NODELIST" -echo "SLURM_JOB_ACCOUNT: $SLURM_JOB_ACCOUNT" -echo "SLURM_NPROCS: $SLURM_NPROCS" -echo "SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "SLURM_CONF: $SLURM_CONF" -echo "SLURM_JOB_NAME: $SLURM_JOB_NAME" -echo "SLURM_NTASKS_PER_NODE: $SLURM_NTASKS_PER_NODE" -echo "SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "SLURM_CLUSTER_NAME: $SLURM_CLUSTER_NAME" -echo "ulimit: $(ulimit -a)" -echo "" -echo "Selected nodes for execution:" -echo "$SELECTED_NODES" -echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# ============================================================================= -# Node information USER_NAME=$(whoami) MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') IPS=() - -GW_NIC=$(ip route | awk '/^default/ {print $5; exit}') for NODE in $SELECTED_NODES; do IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') IP=$(echo "$IP" | awk '/src/ {print $7}') IPS+=("$IP") done -echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g' +echo "Node IPs: ${IPS[*]}" DOCKER_MOUNT_PATH="/workspace" -SGLANG_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" -timestamp=$(date +"%Y-%m-%d_%H-%M-%S") +WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" NNODES=$NUM_NODES -echo "MASTER_NODE is ${MASTER_NODE}" -echo "NODE0_ADDR is ${NODE0_ADDR}" -echo "NNODES is ${NNODES}" -echo "REPO Directory is ${DI_REPO_DIR}" -echo "USER_NAME is ${USER_NAME}" - -# Get the RDMA priority and DSCP value from the NIC -if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. 
Please ensure nicctl is installed and available." >&2 - exit 1 -fi +echo "MASTER_NODE: ${MASTER_NODE}" +echo "NODE0_ADDR: ${NODE0_ADDR}" +echo "NNODES: ${NNODES}" +echo "REPO DIR: ${DI_REPO_DIR}" +echo "USER: ${USER_NAME}" # Reduce log spam export TQDM_MININTERVAL=20 +# Translate the host-resolved MODEL_PATH to the Docker mount namespace +DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + export DI_REPO_DIR=$DI_REPO_DIR -export SGLANG_WS_PATH=$SGLANG_WS_PATH +export WS_PATH=$WS_PATH export NNODES=$NNODES export NODE0_ADDR=$NODE0_ADDR export MODEL_PATH=$MODEL_PATH @@ -269,21 +262,17 @@ export yD=$yD export MODEL_NAME=$MODEL_NAME export USER_NAME=$USER_NAME export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" -export PREFILL_TP_SIZE=$PREFILL_TP_SIZE -export PREFILL_ENABLE_EP=$PREFILL_ENABLE_EP -export PREFILL_ENABLE_DP=$PREFILL_ENABLE_DP -export DECODE_TP_SIZE=$DECODE_TP_SIZE -export DECODE_ENABLE_EP=$DECODE_ENABLE_EP -export DECODE_ENABLE_DP=$DECODE_ENABLE_DP -export DECODE_MTP_SIZE=$DECODE_MTP_SIZE export GPUS_PER_NODE=$GPUS_PER_NODE export BENCH_INPUT_LEN=$BENCH_INPUT_LEN export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY +export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}" +export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" @@ -298,38 +287,106 @@ export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" +export 
DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +# vLLM external router container +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}" +ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" +export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" -# Use only the selected nodes for srun execution SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) - cleanup() { - echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." - # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true - + echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } trap cleanup INT TERM HUP - -# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors +# Force NFS cache refresh on all nodes echo "Refreshing NFS caches on all nodes..." 
srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' sync - # Force re-stat of the mounted directory to refresh NFS handles ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1 stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 - # Drop caches if we have permission (optional, requires root) echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Build engine-specific Docker environment variables +# ============================================================================= + +# Common env vars (always passed) +DOCKER_ENV_COMMON=( + -e SLURM_JOB_ID=\$SLURM_JOB_ID + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST + -e NNODES=\$NNODES + -e NODE_RANK=\$SLURM_PROCID + -e NODE0_ADDR=\$NODE0_ADDR + -e MODEL_DIR=/models + -e MODEL_NAME=\$MODEL_NAME + -e GPUS_PER_NODE=\$GPUS_PER_NODE + -e xP=\$xP + -e yD=\$yD + -e IPADDRS=\$IPADDRS + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER + -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e TQDM_MININTERVAL=\$TQDM_MININTERVAL + -e DRY_RUN=\$DRY_RUN + -e BENCHMARK_LOGS_DIR=/benchmark_logs + -e ENGINE=\$ENGINE + -e WS_PATH=${WS_PATH} + -e RUN_EVAL=\$RUN_EVAL + -e EVAL_ONLY=\$EVAL_ONLY + -e EVAL_CONC=\$EVAL_CONC + -e FRAMEWORK=\$FRAMEWORK + -e PRECISION=\$PRECISION + -e MODEL_PREFIX=\$MODEL_PREFIX + -e RUNNER_TYPE=\$RUNNER_TYPE + -e RESULT_FILENAME=\$RESULT_FILENAME + -e SPEC_DECODING=\$SPEC_DECODING + -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE + -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP + -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP + -e DECODE_TP_SIZE=\$DECODE_TP_SIZE + -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP + -e 
DECODE_ENABLE_DP=\$DECODE_ENABLE_DP + -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE + -e IS_MULTINODE=\$IS_MULTINODE +) + +# Engine-specific env vars +if [[ "$ENGINE" == "vllm-disagg" ]]; then + DOCKER_ENV_ENGINE=( + -e VLLM_WS_PATH=${WS_PATH} + -e MODEL_PATH=$DOCKER_MODEL_PATH + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma + -e UCX_SOCKADDR_TLS_PRIORITY=tcp + -e UCX_MEMTYPE_CACHE=y + -e UCX_RNDV_SCHEME=get_zcopy + -e UCX_RNDV_THRESH=4k + -e UCX_ROCM_IPC_MIN_ZCOPY=0 + -e UCX_LOG_LEVEL=warn + -e HSA_ENABLE_SDMA=1 + -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} + -e PYTHONPYCACHEPREFIX=/tmp/pycache + ) +else + DOCKER_ENV_ENGINE=( + -e SGLANG_WS_PATH=${WS_PATH} + ) +fi + +# Engine-specific container filter for pre-clean +CONT_FILTER="name=^container_${ENGINE}_" + srun \ --nodelist="$SELECTED_NODELIST_SRUN" \ --kill-on-bad-exit=1 \ @@ -340,11 +397,44 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" +# Per-node docker privilege detection +eval \"\$DOCKER_CMD_DETECT\" +echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\" + # Pre-clean (idempotent) -sudo docker ps -aq --filter \"name=^container_sbatch_\" | xargs -r sudo docker rm -f || true -sudo docker ps -aq | xargs -r sudo docker stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true + +# Start vLLM external router container on node 0 +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true + \$DOCKER_CMD run -d \ + --name \"$ROUTER_CONT_NAME\" \ + --network host \ + -v /tmp:/run_logs \ + \"$VLLM_ROUTER_IMAGE\" \ + bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ + --vllm-pd-disaggregation \ + --kv-connector moriio \ + --vllm-discovery-address 
0.0.0.0:${PROXY_PING_PORT} \ + --port ${ROUTER_PORT} \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" +fi + +# Skip exec on vllm-disagg rank 0 so we can stop the router after the main +# container exits. Without this, decode nodes block forever waiting for the +# router port to close (the router is a separate container). +MAYBE_EXEC=exec +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + MAYBE_EXEC= + set +e +fi -exec sudo docker run --rm \ +\$MAYBE_EXEC \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -367,62 +457,38 @@ exec sudo docker run --rm \ --cap-add SYS_PTRACE \ --security-opt seccomp=unconfined \ --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ - -e SLURM_JOB_ID=\$SLURM_JOB_ID \ - -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ - -e NNODES=\$NNODES \ - -e NODE_RANK=\$SLURM_PROCID \ - -e NODE0_ADDR=\$NODE0_ADDR \ - -e MODEL_DIR=/models \ - -e SGLANG_WS_PATH=${SGLANG_WS_PATH} \ - -e GPUS_PER_NODE=\$GPUS_PER_NODE \ - -e xP=\$xP \ - -e yD=\$yD \ - -e MODEL_NAME=\$MODEL_NAME \ - -e IPADDRS=\$IPADDRS \ - -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ - -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ - -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ - -e DECODE_TP_SIZE=\$DECODE_TP_SIZE \ - -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ - -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ - -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \ - -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ - -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ - -e 
BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ - -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ - -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ - -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ - -e DRY_RUN=\$DRY_RUN \ - -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e RUN_EVAL=\$RUN_EVAL \ - -e EVAL_ONLY=\$EVAL_ONLY \ - -e EVAL_CONC=\$EVAL_CONC \ - -e FRAMEWORK=\$FRAMEWORK \ - -e PRECISION=\$PRECISION \ - -e MODEL_PREFIX=\$MODEL_PREFIX \ - -e RUNNER_TYPE=\$RUNNER_TYPE \ - -e RESULT_FILENAME=\$RESULT_FILENAME \ - -e SPEC_DECODING=\$SPEC_DECODING \ - -e IS_MULTINODE=\$IS_MULTINODE \ + ${DOCKER_ENV_COMMON[*]} \ + ${DOCKER_ENV_ENGINE[*]} \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' set -o pipefail mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log ' +# Only reached when exec was skipped (vllm-disagg rank 0) DOCKER_EXIT_CODE=\$? -if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then - echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\" - exit \$DOCKER_EXIT_CODE -fi +echo \"[rank 0] Main container exited (rc=\$DOCKER_EXIT_CODE). 
Stopping vllm-router...\" +\$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true +exit \$DOCKER_EXIT_CODE " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +if [[ "${KEEP_CONTAINERS}" != "1" ]]; then + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + + # Clean up vLLM external router container on node 0 + if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' + fi +fi diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml new file mode 100644 index 000000000..b051de8d9 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -0,0 +1,44 @@ +# Model-specific vLLM server configurations for disaggregated inference. +# +# Each top-level key is a MODEL_NAME value (must match the model identifier +# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR). +# +# To add a new model: add a new top-level entry following the same schema. +# No script changes are required. +# +# Schema: +# : +# prefill_flags: str # vLLM CLI flags for prefill workers +# decode_flags: str # vLLM CLI flags for decode workers +# env: str # Space-separated KEY=VALUE pairs exported before vllm serve +# hf_dir: str # (optional) On-disk directory name if it differs from the key +# # e.g. 
HF cache layout: models--amd--Kimi-K2.5-MXFP4 + +Llama-3.1-405B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +amd-Llama-3.3-70B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +Kimi-K2.5-MXFP4: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--amd--Kimi-K2.5-MXFP4" + +MiniMax-M2.5: + # AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes OOB when run with MiniMax's shapes at M=8K(=num batched tokens), crashing vllm during AITER warmup. + # Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE. 
+ prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1" + hf_dir: "models--MiniMaxAI--MiniMax-M2.5" + +gpt-oss-120b: + prefill_flags: "--tensor-parallel-size 8" + decode_flags: "--tensor-parallel-size 8" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 7eb7414a6..5c441a793 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -1,780 +1,19 @@ #!/bin/bash -# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# Dual-Engine Disaggregated Server Dispatcher # ============================================================================= - -# ============================================================================= -# Environment Configuration -# ============================================================================= - -NODE0_ADDR="${NODE0_ADDR:-localhost}" -NODE_RANK="${NODE_RANK:-0}" -MODEL_DIR="${MODEL_DIR:-}" -MODEL_NAME="${MODEL_NAME:-}" - -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers - -IPADDRS="${IPADDRS:-localhost}" -HEADNODE_PORT="${HEADNODE_PORT:-20000}" -# Parallelism Configuration -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" 
-DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" - -# Benchmark Configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" - -# Extract the maximum concurrency from the x-delimited list -BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) - -# Dry Run for debugging purpose -DRY_RUN="${DRY_RUN:-0}" - -# GPU count (expandable for different hardware) -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - - -# ============================================================================= -# Dependencies and Environment Setup -# ============================================================================= -source $SGLANG_WS_PATH/env.sh - -host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') -host_name=$(hostname) - -# MORI_RDMA_TC configuration (optional) -# If set by runner, use it for RDMA traffic class configuration -# If not set, RDMA operations will proceed without QoS/traffic class settings -if [[ -n "${MORI_RDMA_TC}" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" - echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" -else - echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." - echo "[INFO] This is normal for clusters without QoS requirements." -fi - -# ============================================================================= -# Model-Specific Configuration from YAML +# Dispatches to the engine-specific server launcher based on ENGINE env var. 
+# ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm-disagg -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" -if [[ ! -f "$MODELS_YAML" ]]; then - echo "ERROR: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -# Load model config via inline Python (PyYAML is available in SGLang containers) -# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") -# is done here in Python to avoid bash glob-expanding the * characters. -eval "$(python3 -c " -import yaml, sys, os - -config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' - -with open(config_path) as f: - models = yaml.safe_load(f) - -if model_name not in models: - print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') - sys.exit(0) - -m = models[model_name] - -def eval_formula(val): - \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" - if isinstance(val, (int, float)): - return int(val) - s = str(val) - # Build a namespace from env vars (convert numeric values to int) - ns = {} - for k, v in os.environ.items(): - try: - ns[k] = int(v) - except (ValueError, TypeError): - pass - try: - return int(eval(s, {'__builtins__': {}}, ns)) - except Exception as e: - print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) - return val - -def parse_range(cuda_range, default_start, default_end): - if '-' in str(cuda_range): - s, e = str(cuda_range).split('-') - return s, e - return str(default_start), str(default_end) - -# Output shell variables -print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') -print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') -print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') - -prefill = m.get('prefill', {}) -decode = m.get('decode', {}) +ENGINE="${ENGINE:-sglang-disagg}" 
+WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" +export WS_PATH ENGINE -print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') -print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') +echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -dp = prefill.get('dp', {}) -no_dp = prefill.get('no_dp', {}) -print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') -print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"') -print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') -print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') -print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') - -print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') -print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') - -dp = decode.get('dp', {}) -ep_only = decode.get('ep_only', {}) -no_dp = decode.get('no_dp', {}) - -# Decode DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) -print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') - -# Decode EP-only config (EP 
enabled but DP disabled) -print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') - -# Decode no-DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') -")" - -echo "Loaded model configuration for: $MODEL_NAME" - -# Compute DP-dependent prefill parameters -if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then - prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP - prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP - prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP - prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP +if [[ "$ENGINE" == "vllm-disagg" ]]; then + source "$WS_PATH/server_vllm.sh" else - prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP - prefill_context_length="" - prefill_max_total_tokens="" - prefill_enable_two_batch_overlap="false" + source "$WS_PATH/server_sglang.sh" fi - -# When both DP and EP are enabled, override max-running-requests with max bench concurrency -if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; then - 
prefill_max_running_requests=$BENCH_MAX_CONC_VALUE - prefill_dp_ranks=$PREFILL_TP_SIZE - # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change) - MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) - echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" -fi - -# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) -if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) - decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) -elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY -else - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP -fi - -# When both DP and EP are enabled, override max-running-requests and dispatch tokens -if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_max_running_requests=$BENCH_MAX_CONC_VALUE - decode_dp_ranks=$DECODE_TP_SIZE - MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks)) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) - # Update derived variable - SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) - export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD - echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD" -fi - -# Build the composed config strings (equivalent 
to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " -if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" -fi -if [[ -n "$prefill_context_length" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}" -fi -if [[ -n "$prefill_max_total_tokens" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}" -fi -if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" - PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true" -fi - -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " - -if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then - DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" -fi - -if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then - MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) -fi - -# ============================================================================= -# Cluster Topology Configuration -# ============================================================================= -IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" - -# Ceiling division by GPUS_PER_NODE for nodes-per-worker -PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) -DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 
7) / GPUS_PER_NODE)) -NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) - -# Build prefill arguments dynamically based on xP -PREFILL_HEADNODE_URLS=() -PREFILL_ARGS="" -for i in $(seq 0 $((xP - 1))); do - prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) - PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" - PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" -done - -# Build decode arguments dynamically based on yD -DECODE_HEADNODE_URLS=() -DECODE_ARGS="" -for i in $(seq 0 $((yD - 1))); do - decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) - DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" - DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" -done - -echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" -echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" - -# ============================================================================= -# Configuration Builder Functions -# ============================================================================= - -build_server_config() { - local mode="$1" - local model_name="$2" - local tp_size="$3" - local enable_ep="$4" - local enable_dp="$5" - local decode_mtp_size="$6" - - # Calculate EP and DP sizes based on enable flags - local ep_size=1 - local dp_size=1 - - if [[ "$enable_ep" == "true" ]]; then - ep_size=$tp_size - fi - - if [[ "$enable_dp" == "true" ]]; then - dp_size=$tp_size - fi - - # Build parallelism arguments - local parallel_args="--tp-size ${tp_size}" - - if [[ "$enable_ep" == "true" ]]; then - parallel_args="$parallel_args --ep-size ${ep_size}" - fi - - if [[ "$enable_dp" == "true" ]]; then - parallel_args="$parallel_args --dp-size ${dp_size}" - fi - - # Get model-specific configuration from YAML-loaded variables - local base_config="$MODEL_BASE_FLAGS" - local mtp_config="" - local dp_config="" - local specific_config="" - - # MTP config (only if MTP is enabled and mode is decode) - if [ 
"$decode_mtp_size" -gt 0 ]; then - mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" - fi - - # DP config (only if DP is enabled) - if [[ "$enable_dp" == "true" ]]; then - dp_config="$MODEL_DP_FLAGS" - fi - - # Mode-specific config - if [[ "$mode" == "prefill" ]]; then - specific_config="$PREFILL_MODE_FLAGS" - elif [[ "$mode" == "decode" ]]; then - specific_config="$DECODE_MODE_FLAGS" - fi - - # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config - local full_config="$parallel_args" - if [[ -n "$base_config" ]]; then - full_config="$full_config $base_config" - fi - if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then - full_config="$full_config $mtp_config" - fi - if [[ -n "$dp_config" ]]; then - full_config="$full_config $dp_config" - fi - if [[ -n "$specific_config" ]]; then - full_config="$full_config $specific_config" - fi - - echo "$full_config" -} - -# Build complete server configurations -PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") -DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") - -if [[ -n "$MODEL_NAME" ]]; then - echo "Using model-specific configuration for: $MODEL_NAME" -fi - -if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then - PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') - DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') - unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL - unset MORI_MOE_MAX_INPUT_TOKENS_DECODE -fi - -# ============================================================================= -# Container Synchronization -# ============================================================================= 
- -echo "Waiting at the container creation barrier on $host_name" -python3 $SGLANG_WS_PATH/sync.py barrier \ - --local-ip ${host_ip} \ - --local-port 5000 \ - --enable-port \ - --node-ips ${IPADDRS} \ - --node-ports 5000 \ - --wait-for-all-ports \ - --timeout 300 - - -# ============================================================================= -# Node Role Assignment and Server Launch -# ============================================================================= - -if [ "$NODE_RANK" -eq 0 ]; then - echo "NODE INFO =======================================" - echo "================================================" - echo "Node List : ${SLURM_JOB_NODELIST}" - echo "Node IPs : ${IPADDRS}" - echo "Model Name : ${MODEL_NAME:-'Not specified'}" - echo "================================================" - - echo "CLUSTER INFO ====================================" - echo "================================================" - echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" - echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " - echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} " - - echo "================================================" - - # start the head prefill server - PREFILL_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then - 
PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" - fi - set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/$MODEL_NAME \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} " - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" - fi - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill0_pid=$! - fi - - - echo "Waiting for all prefill and decode servers to be up . . ." - - - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 8000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - echo "Congratulations!!! All prefill and decode servers are up . . ." - - ROUTER_CMD="python -m sglang_router.launch_router \ - --pd-disaggregation \ - --port 30000 \ - --policy random \ - --prefill-policy random \ - --decode-policy random \ - ${PREFILL_ARGS} \ - ${DECODE_ARGS}" - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $ROUTER_CMD" - else - ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" - set -x - if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then - eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & - else - eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & - fi - set +x - proxy_pid=$! 
- - # Wait for router to be ready via health endpoint - HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-health \ - --health-endpoint /readiness \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - fi - - echo "Router is ready for benchmarking" - fi - - - echo "Ready for benchmarking on ${host_name}:${host_ip}" - - echo "Benchmarking on ${host_name}:${host_ip}" - cd $SGLANG_WS_PATH - - # Export IS_MTP based on whether MTP is enabled - if [ "$DECODE_MTP_SIZE" -gt 0 ]; then - export IS_MTP=true - else - export IS_MTP=false - fi - - # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ - $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ - ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ - ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - - if [[ "${EVAL_ONLY:-false}" == "true" ]]; then - echo "EVAL_ONLY mode: skipping throughput benchmark" - elif [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BENCH_CMD" - else - set -x - eval "$BENCH_CMD" - set +x - fi - - # Run evaluation if requested (before killing router) - if [[ "${RUN_EVAL:-false}" == "true" ]]; then - echo "Running lm-eval evaluation on Node 0..." - - # Health check: verify the router is still serving before running eval. - # The throughput benchmark may have crashed/exhausted decode workers. - EVAL_HEALTH_OK=false - for _attempt in 1 2 3; do - if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then - EVAL_HEALTH_OK=true - break - fi - echo "Eval health check attempt $_attempt failed, retrying in 10s..." 
- sleep 10 - done - - if [[ "$EVAL_HEALTH_OK" != "true" ]]; then - echo "WARNING: Router health check failed after 3 attempts. Skipping eval." - else - # Must run from repo root so utils/evals/${task}.yaml resolves - pushd /workspace - - # Source eval functions from benchmark_lib.sh - source /workspace/benchmarks/benchmark_lib.sh - - # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list - if [[ -n "${EVAL_CONC:-}" ]]; then - export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" - else - export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) - fi - - # Override eval context length with model's configured context_length - if [[ -n "$prefill_context_length" ]]; then - export EVAL_MAX_MODEL_LEN="$prefill_context_length" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" - else - # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 - eval_rc=$? 
- - if [[ $eval_rc -ne 0 ]]; then - echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 - EVAL_FAILED=1 - else - # Set metadata env vars for append_lm_eval_summary - export TP="${PREFILL_TP_SIZE}" - export CONC="${EVAL_CONCURRENT_REQUESTS}" - export EP_SIZE=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" - export PREFILL_TP="${PREFILL_TP_SIZE}" - export PREFILL_EP=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" - export PREFILL_NUM_WORKERS="${xP}" - export DECODE_TP="${DECODE_TP_SIZE}" - export DECODE_EP=1 - [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" - export DECODE_NUM_WORKERS="${yD}" - export DP_ATTENTION="${PREFILL_ENABLE_DP}" - export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" - export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" - export ISL="${BENCH_INPUT_LEN}" - export OSL="${BENCH_OUTPUT_LEN}" - # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, - # RESULT_FILENAME are already set via Docker -e flags from job.slurm - - append_lm_eval_summary - # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace - - # Copy eval artifacts to run_logs for NFS extraction by runner - EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" - mkdir -p "$EVAL_COPY_DIR" - for f in meta_env.json; do - [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" - done - # Use find for glob patterns to avoid "no match" errors - find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; - find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; - - echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" - fi - fi - - popd - fi - fi - - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) - LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" - mkdir -p "$LOGS_OUTPUT" - - if [[ "$DRY_RUN" -eq 0 ]]; then - cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" - echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" - fi - - echo "Killing the proxy server and prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $proxy_pid - kill $prefill0_pid - fi - - if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then - echo "ERROR: eval failed; exiting node-0 with rc=1" - exit 1 - fi - -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then - echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - - PREFILL_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then - PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" - fi - set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/${MODEL_NAME} \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} " - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) - prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval 
"$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill_pid=$! - fi - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $NODE_RANK prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $prefill_pid - fi - -else - RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) - echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" - echo "Using decode config: $DECODE_SERVER_CONFIG" - echo "Decode node rank: $RANK" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - - DECODE_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then - DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" - fi - set +x - DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ - --model-path ${MODEL_DIR}/${MODEL_NAME} \ - --disaggregation-mode decode \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${DECODE_SERVER_CONFIG} " - - if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then - rank=$((RANK % DECODE_NODES_PER_WORKER)) - decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) - DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes 
${DECODE_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $DECODE_CMD" - else - set -x - eval "$DECODE_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & - - set +x - decode_pid=$! - fi - - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $RANK decode server" - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $decode_pid - fi - -fi - -echo "Script completed successfully" -exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh new file mode 100755 index 000000000..7eb7414a6 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -0,0 +1,780 @@ +#!/bin/bash +# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" #-> Number of Prefill Workers +yD="${yD:-1}" #-> Number of Decode Workers + +IPADDRS="${IPADDRS:-localhost}" +HEADNODE_PORT="${HEADNODE_PORT:-20000}" +# Parallelism Configuration +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" 
+PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +# Extract the maximum concurrency from the x-delimited list +BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + +# Dry Run for debugging purpose +DRY_RUN="${DRY_RUN:-0}" + +# GPU count (expandable for different hardware) +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $SGLANG_WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') +host_name=$(hostname) + +# MORI_RDMA_TC configuration (optional) +# If set by runner, use it for RDMA traffic class configuration +# If not set, RDMA operations will proceed without QoS/traffic class settings +if [[ -n "${MORI_RDMA_TC}" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" + echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +else + echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." + echo "[INFO] This is normal for clusters without QoS requirements." +fi + +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" + +if [[ ! 
-f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi + +# Load model config via inline Python (PyYAML is available in SGLang containers) +# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") +# is done here in Python to avoid bash glob-expanding the * characters. +eval "$(python3 -c " +import yaml, sys, os + +config_path = '${MODELS_YAML}' +model_name = '${MODEL_NAME}' + +with open(config_path) as f: + models = yaml.safe_load(f) + +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def eval_formula(val): + \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" + if isinstance(val, (int, float)): + return int(val) + s = str(val) + # Build a namespace from env vars (convert numeric values to int) + ns = {} + for k, v in os.environ.items(): + try: + ns[k] = int(v) + except (ValueError, TypeError): + pass + try: + return int(eval(s, {'__builtins__': {}}, ns)) + except Exception as e: + print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) + return val + +def parse_range(cuda_range, default_start, default_end): + if '-' in str(cuda_range): + s, e = str(cuda_range).split('-') + return s, e + return str(default_start), str(default_end) + +# Output shell variables +print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') +print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') +print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') + +prefill = m.get('prefill', {}) +decode = m.get('decode', {}) + +print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') +print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') + +dp = prefill.get('dp', {}) +no_dp = prefill.get('no_dp', {}) +print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') 
+print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"') +print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') +print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') +print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') + +print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') +print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') + +dp = decode.get('dp', {}) +ep_only = decode.get('ep_only', {}) +no_dp = decode.get('no_dp', {}) + +# Decode DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) +print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') + +# Decode EP-only config (EP enabled but DP disabled) +print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') + +# Decode no-DP config 
+print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" + +# Compute DP-dependent prefill parameters +if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then + prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP + prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP + prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP + prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP +else + prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP + prefill_context_length="" + prefill_max_total_tokens="" + prefill_enable_two_batch_overlap="false" +fi + +# When both DP and EP are enabled, override max-running-requests with max bench concurrency +if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; then + prefill_max_running_requests=$BENCH_MAX_CONC_VALUE + prefill_dp_ranks=$PREFILL_TP_SIZE + # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change) + MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) + echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" +fi + +# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) +if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START 
$DECODE_CUDA_GRAPH_BS_DP_END)) + decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) +elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY +else + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP +fi + +# When both DP and EP are enabled, override max-running-requests and dispatch tokens +if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_max_running_requests=$BENCH_MAX_CONC_VALUE + decode_dp_ranks=$DECODE_TP_SIZE + MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks)) + MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) + # Update derived variable + SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD + echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD" +fi + +# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " +if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" +fi +if [[ -n "$prefill_context_length" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length 
${prefill_context_length}" +fi +if [[ -n "$prefill_max_total_tokens" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}" +fi +if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" + PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true" +fi + +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " + +if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then + DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" +fi + +if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then + MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) + MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) +fi + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +# Nodes per worker: ceil(TP size / GPUS_PER_NODE), valid for any GPUS_PER_NODE (not only 8) +PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) + +# Build prefill arguments dynamically based on xP +PREFILL_HEADNODE_URLS=() +PREFILL_ARGS="" +for i in $(seq 0 $((xP - 1))); do + prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) + PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" + PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" +done + +# Build decode arguments dynamically based on yD +DECODE_HEADNODE_URLS=() +DECODE_ARGS="" +for i in $(seq 0 $((yD - 1))); do + decode_idx=$((i * 
DECODE_NODES_PER_WORKER + NODE_OFFSET)) + DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" + DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" +done + +echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" +echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" + +# ============================================================================= +# Configuration Builder Functions +# ============================================================================= + +build_server_config() { + local mode="$1" + local model_name="$2" + local tp_size="$3" + local enable_ep="$4" + local enable_dp="$5" + local decode_mtp_size="$6" + + # Calculate EP and DP sizes based on enable flags + local ep_size=1 + local dp_size=1 + + if [[ "$enable_ep" == "true" ]]; then + ep_size=$tp_size + fi + + if [[ "$enable_dp" == "true" ]]; then + dp_size=$tp_size + fi + + # Build parallelism arguments + local parallel_args="--tp-size ${tp_size}" + + if [[ "$enable_ep" == "true" ]]; then + parallel_args="$parallel_args --ep-size ${ep_size}" + fi + + if [[ "$enable_dp" == "true" ]]; then + parallel_args="$parallel_args --dp-size ${dp_size}" + fi + + # Get model-specific configuration from YAML-loaded variables + local base_config="$MODEL_BASE_FLAGS" + local mtp_config="" + local dp_config="" + local specific_config="" + + # MTP config (only if MTP is enabled and mode is decode) + if [ "$decode_mtp_size" -gt 0 ]; then + mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" + fi + + # DP config (only if DP is enabled) + if [[ "$enable_dp" == "true" ]]; then + dp_config="$MODEL_DP_FLAGS" + fi + + # Mode-specific config + if [[ "$mode" == "prefill" ]]; then + specific_config="$PREFILL_MODE_FLAGS" + elif [[ "$mode" == "decode" ]]; then + specific_config="$DECODE_MODE_FLAGS" + fi + + # Combine: parallel args + base config + mtp config (decode only) + dp config + 
specific config + local full_config="$parallel_args" + if [[ -n "$base_config" ]]; then + full_config="$full_config $base_config" + fi + if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then + full_config="$full_config $mtp_config" + fi + if [[ -n "$dp_config" ]]; then + full_config="$full_config $dp_config" + fi + if [[ -n "$specific_config" ]]; then + full_config="$full_config $specific_config" + fi + + echo "$full_config" +} + +# Build complete server configurations +PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") +DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") + +if [[ -n "$MODEL_NAME" ]]; then + echo "Using model-specific configuration for: $MODEL_NAME" +fi + +if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') + unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL + unset MORI_MOE_MAX_INPUT_TOKENS_DECODE +fi + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $SGLANG_WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 300 + + +# ============================================================================= +# Node Role Assignment and Server Launch +# ============================================================================= + +if [ "$NODE_RANK" -eq 0 ]; then + echo "NODE INFO 
=======================================" + echo "================================================" + echo "Node List : ${SLURM_JOB_NODELIST}" + echo "Node IPs : ${IPADDRS}" + echo "Model Name : ${MODEL_NAME:-'Not specified'}" + echo "================================================" + + echo "CLUSTER INFO ====================================" + echo "================================================" + echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" + echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" + echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" + echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " + echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} " + + echo "================================================" + + # start the head prefill server + PREFILL_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then + PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" + fi + set +x + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/$MODEL_NAME \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + 
${PREFILL_SERVER_CONFIG} " + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" + fi + + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill0_pid=$! + fi + + + echo "Waiting for all prefill and decode servers to be up . . ." + + + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 8000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + echo "Congratulations!!! All prefill and decode servers are up . . ." + + ROUTER_CMD="python -m sglang_router.launch_router \ + --pd-disaggregation \ + --port 30000 \ + --policy random \ + --prefill-policy random \ + --decode-policy random \ + ${PREFILL_ARGS} \ + ${DECODE_ARGS}" + + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $ROUTER_CMD" + else + ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" + set -x + if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then + eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & + else + eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & + fi + set +x + proxy_pid=$! 
+ + # Wait for router to be ready via health endpoint + HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-health \ + --health-endpoint /readiness \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + fi + + echo "Router is ready for benchmarking" + fi + + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + + echo "Benchmarking on ${host_name}:${host_ip}" + cd $SGLANG_WS_PATH + + # Export IS_MTP based on whether MTP is enabled + if [ "$DECODE_MTP_SIZE" -gt 0 ]; then + export IS_MTP=true + else + export IS_MTP=false + fi + + # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier + BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "${EVAL_ONLY:-false}" == "true" ]]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." + + # Health check: verify the router is still serving before running eval. + # The throughput benchmark may have crashed/exhausted decode workers. + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." 
+ sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." + else + # Must run from repo root so utils/evals/${task}.yaml resolves + pushd /workspace + + # Source eval functions from benchmark_lib.sh + source /workspace/benchmarks/benchmark_lib.sh + + # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list + if [[ -n "${EVAL_CONC:-}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi + + # Override eval context length with model's configured context_length + if [[ -n "$prefill_context_length" ]]; then + export EVAL_MAX_MODEL_LEN="$prefill_context_length" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" + else + # Run lm-eval against the router on port 30000 + run_eval --framework lm-eval --port 30000 + eval_rc=$? 
+ + if [[ $eval_rc -ne 0 ]]; then + echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 + EVAL_FAILED=1 + else + # Set metadata env vars for append_lm_eval_summary + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONCURRENT_REQUESTS}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export PREFILL_TP="${PREFILL_TP_SIZE}" + export PREFILL_EP=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" + export PREFILL_NUM_WORKERS="${xP}" + export DECODE_TP="${DECODE_TP_SIZE}" + export DECODE_EP=1 + [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" + export DECODE_NUM_WORKERS="${yD}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" + export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, + # RESULT_FILENAME are already set via Docker -e flags from job.slurm + + append_lm_eval_summary + # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace + + # Copy eval artifacts to run_logs for NFS extraction by runner + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + # Use find for glob patterns to avoid "no match" errors + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + fi + + popd + fi + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server and prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $proxy_pid + kill $prefill0_pid + fi + + if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then + echo "ERROR: eval failed; exiting node-0 with rc=1" + exit 1 + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then + echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" + + PREFILL_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then + PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" + fi + set +x + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/${MODEL_NAME} \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${PREFILL_SERVER_CONFIG} " + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) + prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval 
"$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $NODE_RANK prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $prefill_pid + fi + +else + RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + echo "Decode node rank: $RANK" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" + + DECODE_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then + DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" + fi + set +x + DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + --model-path ${MODEL_DIR}/${MODEL_NAME} \ + --disaggregation-mode decode \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${DECODE_SERVER_CONFIG} " + + if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then + rank=$((RANK % DECODE_NODES_PER_WORKER)) + decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) + DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes 
${DECODE_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + + set +x + decode_pid=$! + fi + + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $RANK decode server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $decode_pid + fi + +fi + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh new file mode 100755 index 000000000..d61fe0359 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -0,0 +1,527 @@ +#!/bin/bash +# vLLM Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= +# +# Node role assignment (by NODE_RANK): +# 0 -> Proxy/Router + first Prefill node (kv_producer) +# 1..xP-1 -> Additional Prefill nodes (kv_producer) +# xP..xP+yD-1 -> Decode nodes (kv_consumer) +# +# Total nodes = xP + yD (router co-located with first prefill, like SGLang). 

# =============================================================================
# Dependency Setup (idempotent; required when using base vLLM image)
# =============================================================================
source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh"

# =============================================================================
# Environment Configuration
# =============================================================================
# Every setting below keeps a value already present in the environment and
# only falls back to the default shown here.

NODE0_ADDR="${NODE0_ADDR:-localhost}"
NODE_RANK="${NODE_RANK:-0}"
MODEL_DIR="${MODEL_DIR:-}"
MODEL_NAME="${MODEL_NAME:-}"

xP="${xP:-1}"
yD="${yD:-1}"

IPADDRS="${IPADDRS:-localhost}"

# Benchmark Configuration
BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"

DRY_RUN="${DRY_RUN:-0}"
GPUS_PER_NODE="${GPUS_PER_NODE:-8}"

ROUTER_PORT="${ROUTER_PORT:-30000}"
SERVER_PORT="${SERVER_PORT:-2584}"
ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}"

# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution)
MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}"

# =============================================================================
# Dependencies and Environment Setup
# =============================================================================
source "${WS_PATH}/env.sh"

# Locate the token following "src" instead of relying on a fixed field number:
# the position shifts depending on whether the route has a "via" hop.
host_ip=$(ip route get 1.1.1.1 2>/dev/null \
    | awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }')
# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available)
rdma_ip=$(hostname -I | tr ' ' '\n' | grep -m1 '^192\.168\.')
rdma_ip="${rdma_ip:-$host_ip}"
host_name=$(hostname)

echo "[INFO] Management IP (barriers/proxy): $host_ip"
echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip"

# =============================================================================
# RDMA / Nixl Workarounds
# =============================================================================

#######################################
# Apply RDMA routing and Nixl/UCX workarounds on this node.
# Globals:
#   rdma_ip   (read) address selected above for KV transfer
#   IBDEVICES (read) only echoed in the patch log line
# Outputs:
#   [RDMA-ROUTE] / [PATCH] progress lines on stdout
# Returns:
#   0 in all cases; both workarounds are best-effort.
#######################################
setup_rdma_env() {
    # Pensando ionic (RoCEv2) point-to-point /31 route fix.
    # Each benic interface has a /31 to the TOR switch. Without explicit routes,
    # traffic to other nodes' RDMA IPs falls through to the management network.
    if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then
        local rdma_subnet="${BASH_REMATCH[1]}"
        local rdma_host="${BASH_REMATCH[2]}"
        # NOTE(review): "| 1" assumes the switch side owns the odd address of
        # each /31 pair — confirm against the fabric addressing plan.
        local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))"
        local rdma_iface
        # Compare the address part exactly. A regex match on the raw IP would
        # treat dots as wildcards and also accept longer addresses that merely
        # start with it (192.168.1.2 vs 192.168.1.20/31).
        rdma_iface=$(ip -o addr show \
            | awk -v ip="$rdma_ip" '{ split($4, a, "/"); if (a[1] == ip) { print $2; exit } }')
        if [[ -n "$rdma_iface" ]]; then
            if ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null; then
                echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface"
            else
                echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24"
            fi
        fi
    fi

    # Patch Nixl UCX backend: set ucx_error_handling_mode=none.
    # Required for ALL NIC types under high concurrency (C512+). Without this,
    # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error
    # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from
    # recovering gracefully. This causes the prefill KV cache to fill to 100%
    # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm
    # incompatibility); on mlx5 NICs it was incorrectly skipped.
    local nixl_api
    nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null)
    if [[ -n "$nixl_api" ]]; then
        if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then
            # Insert the assignment directly above the create_backend() call,
            # reusing that line's own leading whitespace so the result is valid
            # Python whatever the nesting depth is.
            sed -i -E \
                's/^([[:space:]]*)(.*self\.create_backend\(bknd, init\).*)$/\1init["ucx_error_handling_mode"] = "none"\n\1\2/' \
                "$nixl_api"
            echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})"
        else
            echo "[PATCH] ucx_error_handling_mode already set in $nixl_api"
        fi
    fi
}

setup_rdma_env

if [[ -z "${UCX_NET_DEVICES:-}" ]]; then
    echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2
    exit 1
fi

# =============================================================================
# Model-Specific Configuration from YAML
# =============================================================================
MODELS_YAML="${WS_PATH}/models_vllm.yaml"

if [[ ! -f "$MODELS_YAML" ]]; then
    echo "ERROR: models.yaml not found at $MODELS_YAML" >&2
    exit 1
fi

if [[ -z "$MODEL_NAME" ]]; then
    echo "ERROR: MODEL_NAME is not set" >&2
    exit 1
fi

# The YAML path and model name are passed as argv, never spliced into the
# Python source, so quotes or shell metacharacters in either cannot alter the
# program. The helper prints one escaped VAR="value" assignment per setting;
# a non-zero exit (unknown model, unreadable or invalid YAML) aborts here
# rather than letting the script continue with empty server configs.
model_config_assignments=$(python3 - "$MODELS_YAML" "$MODEL_NAME" <<'PY'
import sys

import yaml

models_yaml, model_name = sys.argv[1], sys.argv[2]

with open(models_yaml) as f:
    models = yaml.safe_load(f)

if model_name not in models:
    print(f"ERROR: Model {model_name} not in models.yaml", file=sys.stderr)
    sys.exit(1)

m = models[model_name]


def bash_escape(s):
    """Escape a value for safe embedding in a bash double-quoted assignment."""
    return (
        s.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("$", "\\$")
        .replace("`", "\\`")
    )


pf = bash_escape(m.get("prefill_flags", "--tensor-parallel-size 8"))
df = bash_escape(m.get("decode_flags", "--tensor-parallel-size 8"))
ev = bash_escape(m.get("env", ""))
dev = bash_escape(m.get("decode_env", ""))
print(f'PREFILL_SERVER_CONFIG="{pf}"')
print(f'DECODE_SERVER_CONFIG="{df}"')
print(f'MODEL_ENVS="{ev}"')
print(f'DECODE_MODEL_ENVS="{dev}"')
PY
) || {
    echo "ERROR: failed to load configuration for model '${MODEL_NAME}' from ${MODELS_YAML}" >&2
    exit 1
}
eval "$model_config_assignments"

echo "Loaded model configuration for: $MODEL_NAME"

# Apply tensor-parallel size and EP/DP flags from submit pipeline.
+if [[ -n "${PREFILL_TP_SIZE:-}" ]]; then + if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP_SIZE}/g") + else + PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP_SIZE}" + fi +fi +if [[ -n "${DECODE_TP_SIZE:-}" ]]; then + if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP_SIZE}/g") + else + DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP_SIZE}" + fi +fi +if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + PREFILL_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + PREFILL_SERVER_CONFIG+=" --enable-dp-attention" +fi +if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + DECODE_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && ! 
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + DECODE_SERVER_CONFIG+=" --enable-dp-attention" +fi + +echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" +echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 600 + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_ARGS="" +DECODE_ARGS="" + +for ((i=0; i "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for all prefill and decode servers to be up . . ." + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: skipping barrier (wait-for-all-ports)" + else + python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + fi + + echo "Congratulations!!! All prefill and decode servers are up . . ." 
+ + # Wait for proxy /health to confirm it is accepting requests + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + echo "MoRI-IO proxy is ready for benchmarking" + fi + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + echo "Benchmarking on ${host_name}:${host_ip}" + cd $WS_PATH + + export ROUTER_PORT=$ROUTER_PORT + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "${EVAL_ONLY:-false}" == "true" ]]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." + + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:${ROUTER_PORT}/health" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." 
+ else + pushd /workspace + + source /workspace/benchmarks/benchmark_lib.sh + + if [[ -n "${EVAL_CONC:-}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" + else + run_eval --framework lm-eval --port "$ROUTER_PORT" + eval_rc=$? + + if [[ $eval_rc -ne 0 ]]; then + echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 + EVAL_FAILED=1 + else + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONCURRENT_REQUESTS}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export PREFILL_TP="${PREFILL_TP_SIZE}" + export PREFILL_EP=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" + export PREFILL_NUM_WORKERS="${xP}" + export DECODE_TP="${DECODE_TP_SIZE}" + export DECODE_EP=1 + [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" + export DECODE_NUM_WORKERS="${yD}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" + export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + + append_lm_eval_summary + + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + fi + + popd + fi + fi + + # Copy benchmark/eval results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true + sleep 2 + pkill -f "vllm serve" 2>/dev/null || true + fi + + if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then + echo "ERROR: eval failed; exiting node-0 with rc=1" + exit 1 + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + + setup_vllm_env + + SERVED_MODEL="${MODEL_NAME}" + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." 
+ WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the prefill server" + [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true + +else + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + + setup_vllm_env + + for env_pair in ${DECODE_MODEL_ENVS}; do + export "$env_pair" + echo "[DECODE_ENV] $env_pair" + done + + SERVED_MODEL="${MODEL_NAME}" + DECODE_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${DECODE_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log" + set -x + eval "$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 & + set +x + decode_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." 
+ WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the decode server" + [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true +fi + +# echo "Killing the etcd server" +# kill $etcd_pid 2>/dev/null || true +# pkill -f etcd 2>/dev/null || true + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh new file mode 100644 index 000000000..1b5c6f45e --- /dev/null +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -0,0 +1,654 @@ +#!/bin/bash +# ============================================================================= +# setup_deps.sh — Install missing vLLM disagg dependencies at container start. +# +# Base image: vllm/vllm-openai-rocm:v0.18.0 +# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. +# Idempotent: each component is skipped if already present. +# +# Build steps run in subshells to avoid CWD pollution between installers. +# ============================================================================= + +ROCM_PATH="${ROCM_PATH:-/opt/rocm}" +UCX_HOME="${UCX_HOME:-/usr/local/ucx}" +RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}" + +_SETUP_START=$(date +%s) +_SETUP_INSTALLED=() + +git_clone_retry() { + local url="$1" dest="$2" max_tries=3 try=1 + while (( try <= max_tries )); do + if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi + echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..." + rm -rf "$dest" + sleep 10 + (( try++ )) + done + echo "[SETUP] git clone failed after $max_tries attempts: $url" + return 1 +} + + +# --------------------------------------------------------------------------- +# 5. 
# Container RDMA/net tools
#    - ibv_devinfo comes from ibverbs-utils
#    - iproute2 provides the `ip` command
#    Used for in-container NIC/RDMA validation and routing checks.
#
#    Globals: _SETUP_INSTALLED (appended to when packages are installed)
#    Returns: 0 when both tools are available. Calls `exit 1` when they are
#             still missing after the install attempt; because this file is
#             sourced, that terminates the sourcing script as well.
# ---------------------------------------------------------------------------
install_recipe_deps() {
    if command -v ibv_devinfo >/dev/null 2>&1 && command -v ip >/dev/null 2>&1; then
        echo "[SETUP] Container RDMA/net tools already present"
        return 0
    fi

    echo "[SETUP] Installing ibv_devinfo + iproute2 in container..."
    if apt-get update -q -y && apt-get install -q -y ibverbs-utils iproute2; then
        # Drop the package index only after a successful install.
        rm -rf /var/lib/apt/lists/*
    fi

    # The presence check below is the source of truth, whatever apt reported.
    if ! command -v ibv_devinfo >/dev/null 2>&1 || ! command -v ip >/dev/null 2>&1; then
        echo "[SETUP] ERROR: Failed to install ibv_devinfo/iproute2" >&2
        exit 1
    fi
    _SETUP_INSTALLED+=("ibverbs-utils+iproute2")
}

# ---------------------------------------------------------------------------
# 6b. amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar)
#     Required due to ROCm vLLM missing the quark dependency:
#     https://github.com/vllm-project/vllm/issues/35633
#
#     Globals: _SETUP_INSTALLED (appended to when the package is installed)
#     Returns: always 0; a failed install only produces a warning.
# ---------------------------------------------------------------------------
install_amd_quark() {
    if python3 -c "import quark" 2>/dev/null; then
        echo "[SETUP] amd-quark already present"
        return 0
    fi

    echo "[SETUP] Installing amd-quark for MXFP4 quantization support..."
    # Install through the same interpreter that performs the import check; a
    # bare `pip` on PATH may belong to a different Python. pip's exit status is
    # not inspected because the import check below decides success.
    python3 -m pip install --quiet amd-quark

    if ! python3 -c "import quark" 2>/dev/null; then
        echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)"
        return 0
    fi
    _SETUP_INSTALLED+=("amd-quark")
}

# ---------------------------------------------------------------------------
# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock)
#    In WRITE mode, save_kv_layer spins forever waiting for the handshake
#    callback to set write_ready_flags.
This blocks the model worker thread, +# preventing it from responding to EngineCore shm_broadcast, causing a +# TimeoutError cascade and crash. +# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent +# the model worker from deadlocking. +# --------------------------------------------------------------------------- +patch_moriio_save_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + # Already patched? + if "[PATCHED] save_kv_layer timeout" in src: + print("[SETUP] save_kv_layer timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + continue""" + + if old not in src: + print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") + sys.exit(0) + + new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep + if remote_engine_id is None: + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + _elapsed = _time.monotonic() - _wait_start + if _elapsed > _SAVE_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " + "write_ready_flags[%s], breaking to unblock model " + "worker", _elapsed, remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) +' + 
_SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 9. Patch MoRIIO waiting_for_transfer_complete with bounded timeout +# The original status.Wait() blocks forever if an RDMA completion never +# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded +# wait with a polling loop using status.Succeeded() + configurable timeout. +# Also adds error handling to the write worker loop so a single failed +# transfer doesn't kill the background thread. +# --------------------------------------------------------------------------- +patch_moriio_transfer_timeout() { + python3 -c ' +import os, sys, textwrap + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me + f = me.__file__ + src = open(f).read() + + if "[PATCHED] transfer completion timeout" in src: + print("[SETUP] transfer completion timeout patch already applied") + sys.exit(0) + + # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- + old_wait = """ def waiting_for_transfer_complete(self): + if not self.transfer_status: + return + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + for status in transfers_to_wait: + try: + status.Wait() + if not status.Succeeded(): + logger.error( + "Transfer failed: %s, Code: %s", status.Message(), status.Code() + ) + raise TransferError("MoRIIO transfer failed!") + except Exception as e: + logger.error("Transfer %s failed: %s", status, e) + raise""" + + new_wait = """ def waiting_for_transfer_complete(self): + # [PATCHED] transfer completion timeout — bounded polling loop + import time as _time, os as _os + if not self.transfer_status: + return + + _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + 
_start = _time.monotonic() + remaining = list(transfers_to_wait) + _polls = 0 + _completed = 0 + + while remaining: + _elapsed = _time.monotonic() - _start + if _elapsed > _timeout: + logger.error( + "[HANGFIX] transfer_timeout elapsed=%.1fs " + "pending=%d/%d completed=%d polls=%d " + "action=raise_transfer_error", + _elapsed, len(remaining), len(transfers_to_wait), + _completed, _polls, + ) + raise TransferError( + f"RDMA transfer timeout after {_elapsed:.1f}s, " + f"{len(remaining)}/{len(transfers_to_wait)} pending" + ) + + still_waiting = [] + for status in remaining: + try: + if status.Succeeded(): + _completed += 1 + continue + still_waiting.append(status) + except Exception as e: + logger.error( + "[HANGFIX] transfer_poll_error error=%s", e) + raise TransferError( + f"Transfer failed during poll: {e}" + ) from e + + remaining = still_waiting + if remaining: + _time.sleep(0.005) + _polls += 1 + if _polls % 2000 == 0: + logger.warning( + "[HANGFIX] transfer_wait pending=%d " + "completed=%d elapsed=%.1fs timeout=%.0fs", + len(remaining), _completed, + _time.monotonic() - _start, _timeout, + )""" + + if old_wait not in src: + print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") + sys.exit(0) + + new_src = src.replace(old_wait, new_wait) + + # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- + old_loop = """ self._execute_write_task(task)""" + + new_loop = """ try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s write_task_failed error=%s " + "action=cleanup_and_mark_done", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None + ) + except Exception: + pass""" + + if old_loop in new_src: + new_src = new_src.replace(old_loop, new_loop, 1) + else: + print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") + + # --- Patch 
3: Add deferred task timeout to _process_deferred_tasks --- + old_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + if not self._deferred_tasks: + return + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + if self._is_remote_ready(task): + self._execute_write_task(task) + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + new_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + # [PATCHED] deferred task timeout — prune stale tasks + import time as _time, os as _os + if not self._deferred_tasks: + return + + _DEFER_TIMEOUT = float( + _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) + if _age > _DEFER_TIMEOUT: + logger.error( + "[HANGFIX] req=%s deferred_task_expired age=%.1fs " + "action=drop_and_mark_done", + task.request_id, _age, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + continue + if self._is_remote_ready(task): + try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s deferred_write_failed error=%s", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + if old_deferred in new_src: + new_src = new_src.replace(old_deferred, new_deferred, 1) + else: + print("[SETUP] WARN: _process_deferred_tasks pattern not found") + + # --- Patch 4: Stamp defer time when task is 
deferred --- + old_defer_add = """ self._deferred_tasks.append(task)""" + new_defer_add = """ import time as _time2 + if not hasattr(task, "_defer_ts"): + task._defer_ts = _time2.monotonic() + self._deferred_tasks.append(task)""" + if old_defer_add in new_src: + new_src = new_src.replace(old_defer_add, new_defer_add, 1) + else: + print("[SETUP] WARN: deferred task timestamp patch target not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: transfer timeout + writer error handling") + +except Exception as e: + print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) +# The READ-mode spin loop in start_load_kv has the same unbounded-spin +# issue as save_kv_layer. Add timeout + sleep + null guard. +# --------------------------------------------------------------------------- +patch_moriio_load_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + if "[PATCHED] start_load_kv timeout" in src: + print("[SETUP] start_load_kv timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + continue""" + + if old not in src: + print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") + sys.exit(0) + + new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock + if remote_engine_id is None and not wait_handshake_readd_req: + self._reqs_to_send.update(metadata.reqs_to_send) + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: 
+ if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " + "load_ready_flag[%s]", _time.monotonic() - _wait_start, + remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: start_load_kv replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished +# vLLM asserts that a request in finished_recving must be either +# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can +# transition to RUNNING before the aggregated recv notification arrives, +# crashing the engine with AssertionError. 
+# (present in v0.17.1 & v0.18.0) +# --------------------------------------------------------------------------- +patch_scheduler_read_mode_fix() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] read-mode recv assertion" in src: + print("[SETUP] scheduler read-mode assertion fix already applied") + sys.exit(0) + + old_recv = """ for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + assert req_id in self.requests + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + else: + assert RequestStatus.is_finished(req.status) + self._free_blocks(self.requests[req_id])""" + + new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states + for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping recv", req_id) + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + self._free_blocks(self.requests[req_id]) + else: + logger.debug( + "Request %s recv finished but status=%s (not " + "WAITING_FOR_REMOTE_KVS or finished), skipping " + "block free — will be freed on request completion", + req_id, req.status.name)""" + + if old_recv not in src: + print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping") + sys.exit(0) + + new_src = src.replace(old_recv, new_recv, 1) + + old_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + assert req_id in self.requests + self._free_blocks(self.requests[req_id])""" + + new_send = 
""" for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + if old_send in new_src: + new_src = new_src.replace(old_send, new_send, 1) + else: + print("[SETUP] WARN: scheduler finished_sending pattern not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") + +except Exception as e: + print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("scheduler-read-mode-fix") +} + +# --------------------------------------------------------------------------- +# 12. Idle KV block reaper for disaggregated prefill (READ mode) +# The RIXL notification path can lose `finished_sending` signals under +# high concurrency with ibv_post_send failures. This leaves KV blocks +# permanently allocated on the prefill engine even after the decode has +# finished reading. Over multiple benchmark rounds, leaked blocks +# accumulate and eventually saturate the prefill KV cache. +# +# Fix: instrument the scheduler's `schedule()` method to detect idle +# periods (0 running, 0 waiting for >5s) and force-free blocks for +# any remaining requests whose status is finished. +# --------------------------------------------------------------------------- +patch_prefill_idle_kv_reaper() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] idle-kv-reaper" in src: + print("[SETUP] idle KV block reaper already applied") + sys.exit(0) + + # Find the _update_from_kv_xfer_finished method end and add reaper logic + # We inject into the method that processes KV transfer completions. 
+ marker = "[PATCHED] read-mode recv assertion" + if marker not in src: + print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") + sys.exit(0) + + # Add reaper state initialization to __init__ + old_init_marker = "self.finished_recving_kv_req_ids" + if old_init_marker not in src: + print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") + sys.exit(0) + + # Find the first occurrence to insert reaper state + init_pos = src.find(old_init_marker) + # Find the line containing it + line_end = src.find("\n", init_pos) + init_line = src[init_pos:line_end] + + # Add reaper state after this line + reaper_init = init_line + """ + # [PATCHED] idle-kv-reaper state + self._idle_kv_reaper_ts = 0.0 + self._idle_kv_reaper_active = False""" + + src = src.replace(init_line, reaper_init, 1) + + # Now add the reaper logic at the end of _update_from_kv_xfer_finished + # Find the finished_sending handler we patched + send_handler = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + reaper_logic = send_handler + """ + + # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks + import time as _time + _REAPER_IDLE_SECS = 5.0 + _num_running = sum(1 for r in self.requests.values() + if r.status == RequestStatus.RUNNING) + _should_reap = (_num_running == 0) + + if _should_reap: + if not self._idle_kv_reaper_active: + self._idle_kv_reaper_active = True + self._idle_kv_reaper_ts = _time.monotonic() + elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: + _reaped = 0 + _reap_ids = [] + for _rid, _req in list(self.requests.items()): + if RequestStatus.is_finished(_req.status): + _reap_ids.append(_rid) + for _rid in _reap_ids: + try: + _req = self.requests[_rid] + self._free_blocks(_req) + 
_reaped += 1 + except Exception as _e: + logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) + if _reaped > 0: + logger.warning( + "[KV-REAPER] Force-freed blocks for %d finished " + "requests after %.1fs idle", + _reaped, _time.monotonic() - self._idle_kv_reaper_ts) + self._idle_kv_reaper_ts = _time.monotonic() + else: + self._idle_kv_reaper_active = False""" + + if send_handler in src: + src = src.replace(send_handler, reaper_logic, 1) + else: + print("[SETUP] WARN: send handler not found for reaper injection") + sys.exit(0) + + open(f, "w").write(src) + print("[SETUP] Patched: idle KV block reaper for prefill") + +except Exception as e: + print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("idle-kv-reaper") +} + +# ============================================================================= +# Run installers +# ============================================================================= + +install_recipe_deps +install_amd_quark +patch_moriio_save_kv_timeout +patch_moriio_transfer_timeout +patch_moriio_load_kv_timeout +patch_scheduler_read_mode_fix +patch_prefill_idle_kv_reaper + +# ============================================================================= +# Export paths (persists for server.sh since this file is sourced) +# ============================================================================= + +export ROCM_PATH="${ROCM_PATH}" +export UCX_HOME="${UCX_HOME}" +export RIXL_HOME="${RIXL_HOME}" +export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}" +export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" + +_SETUP_END=$(date +%s) +if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then + echo "[SETUP] All dependencies already present (${_SETUP_END}s wallclock)" +else + echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s" +fi diff --git a/benchmarks/multi_node/amd_utils/submit.sh 
b/benchmarks/multi_node/amd_utils/submit.sh index d2c49bc9e..fa3d65418 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -2,37 +2,51 @@ # # Cluster Configuration Template for Multi-Node Disaggregated Serving # -# This script submits a multi-node SGLang disaggregated benchmark job to SLURM. +# This script submits a multi-node disaggregated benchmark job to SLURM. # It must be configured for your specific cluster before use. +# +# ENGINE=sglang (default): SGLang disaggregated serving +# ENGINE=vllm: vLLM disaggregated serving +# +# Router is co-located with the first prefill node (same for both engines), +# so NUM_NODES = PREFILL_NODES + DECODE_NODES. usage() { cat << 'USAGE' -This script aims to provide a one-liner call to the submit_job_script.py, -so that the deployment process can be further simplified. - -To use this script, fill in the following script and run it under your `slurm_jobs` directory: -======== begin script area ======== -# REQUIRED: Cluster-specific configuration -export SLURM_ACCOUNT= # Your SLURM account name -export SLURM_PARTITION= # SLURM partition to submit to -export TIME_LIMIT= # Job time limit (e.g., "08:00:00") - -# REQUIRED: Model and container paths -export MODEL_PATH= # Path to model directory (e.g., /mnt/models, /nfsdata) -export CONTAINER_IMAGE= # Path to container squash file - -# REQUIRED: Hardware configuration -export GPUS_PER_NODE= # GPUs per node (e.g., 8 for MI355X, 4 for MI325X) - -# OPTIONAL: RDMA/Network configuration (set in runners/launch_mi355x-amds.sh for AMD) -# export IBDEVICES= # RDMA device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) 
-# export MORI_RDMA_TC= # RDMA traffic class (e.g., 96, 104) - -bash submit.sh \ -$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \ -$ADDITIONAL_FRONTENDS \ -$ISL $OSL $CONCURRENCIES $REQUEST_RATE -======== end script area ======== +Usage: + bash submit.sh \ + \ + \ + \ + \ + [NODE_LIST] + +Arguments: + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + PREFILL_ENABLE_EP true/false or 1/0 (expert parallelism on prefill) + PREFILL_ENABLE_DP true/false or 1/0 (data-parallel attention on prefill) + DECODE_ENABLE_EP true/false or 1/0 (expert parallelism on decode) + DECODE_ENABLE_DP true/false or 1/0 (data-parallel attention on decode) + PREFILL_TP Tensor parallel size per prefill node + DECODE_TP Tensor parallel size per decode node + RANDOM_RANGE_RATIO Random range ratio for benchmark client + NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) + +Required environment variables: + SLURM_ACCOUNT SLURM account name + SLURM_PARTITION SLURM partition + TIME_LIMIT Job time limit (e.g., "08:00:00") + MODEL_PATH Path to model directory (e.g., /nfsdata) + MODEL_NAME Model name directory + CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) + RUNNER_NAME Runner identifier (for job name) USAGE } @@ -53,6 +67,7 @@ check_env MODEL_PATH check_env MODEL_NAME check_env CONTAINER_IMAGE check_env RUNNER_NAME +check_env FRAMEWORK # GPUS_PER_NODE defaults to 8 (MI355X). Set to 4 for MI325X if needed. 
GPUS_PER_NODE="${GPUS_PER_NODE:-8}" @@ -66,31 +81,32 @@ ISL=$5 OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 -PREFILL_ENABLE_EP=${9:-1} -PREFILL_ENABLE_DP=${10:-1} -DECODE_ENABLE_EP=${11:-1} -DECODE_ENABLE_DP=${12:-1} +PREFILL_ENABLE_EP=${9:-true} +PREFILL_ENABLE_DP=${10:-true} +DECODE_ENABLE_EP=${11:-true} +DECODE_ENABLE_DP=${12:-true} PREFILL_TP=${13:-8} DECODE_TP=${14:-8} -RANDOM_RANGE_RATIO=${15} +RANDOM_RANGE_RATIO=${15:-0.8} NODE_LIST=${16} - NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" # Export variables for the SLURM job +export ENGINE="${FRAMEWORK:-sglang}" export MODEL_DIR=$MODEL_PATH export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args - - +# Engine-specific xP/yD semantics and TP exports +if [[ "$ENGINE" == "vllm-disagg" ]]; then + export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} + export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} +fi +# xP = prefill workers, yD = decode workers (may span multiple nodes) export xP=$PREFILL_WORKERS export yD=$DECODE_WORKERS -export NUM_NODES=$NUM_NODES -export GPUS_PER_NODE=$GPUS_PER_NODE -export MODEL_NAME=$MODEL_NAME export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS )) export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} @@ -98,12 +114,16 @@ export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS )) export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} export DECODE_MTP_SIZE=${DECODE_MTP_SIZE} + +export NUM_NODES=$NUM_NODES +export GPUS_PER_NODE=$GPUS_PER_NODE +export MODEL_NAME=$MODEL_NAME export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} -export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO} -export BENCH_NUM_PROMPTS_MULTIPLIER=10 +export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export 
BENCH_REQUEST_RATE=${REQUEST_RATE} +export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) export RUN_EVAL="${RUN_EVAL:-false}" @@ -118,13 +138,10 @@ export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. -# SLURM writes output files on the batch node, so /tmp won't work (node-local). -# Defaults to a sibling directory of the submit working directory. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" # Optional: pass an explicit node list to sbatch. -# NODE_LIST is expected to be comma-separated hostnames. NODELIST_OPT=() if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" @@ -137,6 +154,63 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then NODELIST_OPT=(--nodelist "$NODELIST_CSV") fi +# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). +# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. +EXCLUDE_OPT=() +SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-mia1-p01-g11,mia1-p01-g12,mia1-p01-g15}" +if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") +fi + +# ============================================================================= +# Reuse existing allocation (skip sbatch) +# ============================================================================= +# When SLURM_REUSE_JOBID is set, run job.slurm directly in the current shell, +# attaching to the existing allocation. Inner `srun` calls pick up the +# allocation via SLURM_JOB_ID; SLURM_OVERLAP=1 lets them share task slots with +# the interactive shell already holding the allocation. 
+if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then + REUSE_JID="$SLURM_REUSE_JOBID" + echo "Reusing existing Slurm allocation ${REUSE_JID} (skipping sbatch)" >&2 + + # Resolve allocation's nodelist if not already provided. + ALLOC_NODELIST="${SLURM_JOB_NODELIST:-$(squeue -h -j "$REUSE_JID" -o '%N' 2>/dev/null)}" + if [[ -z "$ALLOC_NODELIST" ]]; then + echo "Error: could not resolve nodelist for job ${REUSE_JID}" >&2 + exit 1 + fi + ALLOC_NNODES=$(scontrol show hostnames "$ALLOC_NODELIST" | wc -l) + if [[ "$ALLOC_NNODES" -lt "$NUM_NODES" ]]; then + echo "Error: allocation ${REUSE_JID} has ${ALLOC_NNODES} nodes, need ${NUM_NODES}" >&2 + exit 1 + fi + + export SLURM_JOB_ID="$REUSE_JID" + export SLURM_JOBID="$REUSE_JID" + export SLURM_JOB_NODELIST="$ALLOC_NODELIST" + export SLURM_NODELIST="$ALLOC_NODELIST" + export SLURM_NNODES="$ALLOC_NNODES" + export SLURM_JOB_NUM_NODES="$ALLOC_NNODES" + export SLURM_NTASKS="$ALLOC_NNODES" + export SLURM_NPROCS="$ALLOC_NNODES" + export SLURM_NTASKS_PER_NODE=1 + export SLURM_TASKS_PER_NODE="1(x${ALLOC_NNODES})" + export SLURM_OVERLAP=1 + export SLURM_SUBMIT_DIR="$(pwd)" + + STDOUT_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.out" + STDERR_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.err" + rm -f "$STDOUT_LOG" "$STDERR_LOG" + + nohup bash "$(dirname "$0")/job.slurm" >"$STDOUT_LOG" 2>"$STDERR_LOG" & + INLINE_PID=$! + echo "$INLINE_PID" > "${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.pid" + echo "Started job.slurm (pid=${INLINE_PID}); logs: ${STDOUT_LOG}" >&2 + + echo "$REUSE_JID" + exit 0 +fi + # Construct the sbatch command sbatch_cmd=( sbatch @@ -145,6 +219,7 @@ sbatch_cmd=( -N "$NUM_NODES" -n "$NUM_NODES" "${NODELIST_OPT[@]}" + "${EXCLUDE_OPT[@]}" --time "$TIME_LIMIT" --partition "$SLURM_PARTITION" --account "$SLURM_ACCOUNT" @@ -154,7 +229,6 @@ sbatch_cmd=( "$(dirname "$0")/job.slurm" ) -# todo: --parsable outputs only the jobid and cluster name, test if jobid;clustername is correct JOB_ID=$("${sbatch_cmd[@]}") if [[ $? 
-ne 0 ]]; then echo "Error: Failed to submit job with sbatch" >&2 diff --git a/benchmarks/multi_node/amd_utils/sync.py b/benchmarks/multi_node/amd_utils/sync.py index 140951519..3678e7614 100755 --- a/benchmarks/multi_node/amd_utils/sync.py +++ b/benchmarks/multi_node/amd_utils/sync.py @@ -143,7 +143,10 @@ def close_port(): time.sleep(30) if args.enable_port: - time.sleep(30) + # Keep the port open long enough for slow nodes to pass their barrier. + # The previous 30s was too short when setup times vary by minutes. + grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index 6a7314ab4..d17d1a323 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 0124d4b4d..a8c0d2743 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..d7995fb25 --- /dev/null +++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + 
CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh new file mode 100644 index 000000000..a9a28d889 --- /dev/null +++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ad37e0c27..def63fd87 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2974,6 +2974,18 @@ - "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440 +- config-keys: + - kimik2.5-fp4-mi355x-vllm-disagg + description: + - "Add Kimi-K2.5-MXFP4 FP4 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569 + +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm-disagg + description: + - "Add MiniMax-M2.5 FP8 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569 + - config-keys: - dsv4-fp4-mi355x-vllm description: diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index a8033847e..00fd994f3 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -52,11 +52,27 @@ if [[ "$IS_MULTINODE" == "true" ]]; then sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true # Ensure root-owned files are cleaned up even on early exit to prevent - # EACCES errors when the next GH Actions job checks out on this runner - trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT + # EACCES errors when the next GH Actions job checks out on this runner. + # Always preserve slurm logs as CI artifacts for debugging. 
#######################################
# EXIT-trap handler: keep the Slurm job logs as CI artifacts, show the job's
# stderr in the CI output, then delete the benchmark log directory so that
# root-owned files do not break the next checkout on this runner.
# Every step is best-effort; a failure in one step never prevents the final
# removal of the log directory.
# Globals:
#   GITHUB_ACTIONS      (read) non-empty when running under GitHub Actions
#   GITHUB_WORKSPACE    (read) artifacts go to <workspace>/benchmark_artifacts
#   JOB_ID              (read) Slurm job id; may be unset on an early exit
#   BENCHMARK_LOGS_DIR  (read) directory holding slurm_job-<id>.{out,err}
# Outputs:
#   The last 100 lines of the job's .err file on stdout, framed by marker
#   lines, when that file exists and is non-empty.
# Returns:
#   0 always.
#######################################
cleanup_and_save_logs() {
    local logs_dir="${BENCHMARK_LOGS_DIR:-}"
    local job_id="${JOB_ID:-}"
    local art_dir ext err_file

    # Without a log directory there is nothing to save or delete; this also
    # guarantees the rm -rf below never runs on an empty path.
    if [[ -z "$logs_dir" ]]; then
        return 0
    fi

    if [[ -n "${GITHUB_ACTIONS:-}" && -n "$job_id" ]]; then
        art_dir="${GITHUB_WORKSPACE:-}/benchmark_artifacts"
        # mkdir is tested explicitly so a failure here (even under errexit)
        # cannot abort the handler before the cleanup at the bottom.
        if mkdir -p "$art_dir"; then
            for ext in out err; do
                # Either file may be missing on an early failure; ignore that.
                cp -r -- "$logs_dir/slurm_job-${job_id}.${ext}" "$art_dir/" \
                    2>/dev/null || true
            done
        fi
    fi

    # Print .err inline so failures are visible in CI output
    err_file="$logs_dir/slurm_job-${job_id:-unknown}.err"
    if [[ -s "$err_file" ]]; then
        echo "=== Slurm job stderr ==="
        tail -n 100 "$err_file" || true
        echo "========================"
    fi

    sudo rm -rf -- "$logs_dir" 2>/dev/null || true
    return 0
}
"$OSL" 1 "$FRAMEWORK") if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 @@ -162,16 +185,7 @@ PY sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true - # Upload logs as artifact if running in GitHub Actions - if [[ -n "${GITHUB_ACTIONS:-}" ]]; then - ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" - mkdir -p "$ARTIFACT_DIR" - cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true - echo "Logs copied to $ARTIFACT_DIR for artifact upload" - fi - - # Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup - sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + # Log preservation and cleanup handled by EXIT trap (cleanup_and_save_logs) else