diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a3afb2f6b..fd82d05cb 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1350,6 +1350,115 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=2"
 
+kimik2.5-fp4-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x-disagg
+  precision: fp4
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+minimaxm2.5-fp8-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+      # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536,
+      # TP8 shards to 192 which is not divisible by FP8 block_n=128.
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
   image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index f5e39b4cf..7dbbaaaa8 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -210,6 +210,7 @@ run_benchmark_serving() {
     local dsv4=false
     local trust_remote_code=false
     local server_pid=""
+    local tokenizer=""
 
     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -278,6 +279,10 @@ run_benchmark_serving() {
                 server_pid="$2"
                 shift 2
                 ;;
+            --tokenizer)
+                tokenizer="$2"
+                shift 2
+                ;;
             *)
                 echo "Unknown parameter: $1"
                 return 1
@@ -385,6 +390,10 @@ run_benchmark_serving() {
         benchmark_cmd+=(--trust-remote-code)
     fi
 
+    if [[ -n "$tokenizer" ]]; then
+        benchmark_cmd+=(--tokenizer "$tokenizer")
+    fi
+
     # Run benchmark with optional server monitoring
     set -x
     if [[ -n "$server_pid" ]]; then
diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh
index ac996c5a9..05384f435 100755
--- a/benchmarks/multi_node/amd_utils/bench.sh
+++ b/benchmarks/multi_node/amd_utils/bench.sh
@@ -1,4 +1,17 @@
 #!/bin/bash
+# Dual-Engine Disaggregated Benchmark Runner
+#
+# ENGINE=sglang-disagg (default): SGLang benchmark
+# ENGINE=vllm-disagg:             vLLM benchmark
+#
+# Produces JSON result files via benchmark_serving.py so that the CI pipeline
+# can collect and process results.
+#
+# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
+#            <model_dir> <model_name> <log_path> <isl> <osl> \
+#            <concurrency_list> <req_rate> <random_range_ratio> <num_prompts_multiplier>
+
+ENGINE="${ENGINE:-sglang-disagg}"
 
 n_prefill=$1
 n_decode=$2
@@ -6,58 +19,90 @@ prefill_gpus=$3
 decode_gpus=$4
 model_path=$5
 model_name=$6
-MODEL_PATH="${model_path}/${model_name}"
+MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
+# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    BENCH_MODEL="${MODEL_NAME:-${MODEL_PATH}}"
+else
+    BENCH_MODEL="${MODEL_PATH}"
+fi
 log_path=$7
 
 chosen_isl=${8:-1024}
 chosen_osl=${9:-1024}
 concurrency_list=${10:-"512x1"}
-chosen_req_rate=${11:-1}
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    chosen_req_rate=${11:-inf}
+else
+    chosen_req_rate=${11:-1}
+fi
 random_range_ratio=${12:-0.8}
 num_prompts_multiplier=${13:-10}
 
 IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
 
-echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
+ROUTER_PORT="${ROUTER_PORT:-30000}"
 
-head_node="localhost"
-head_port="30000"
+export TRANSFORMERS_VERBOSITY=error
+export TOKENIZERS_PARALLELISM=false
 
+echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
 
-profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}"
-mkdir -p $profile_folder
+profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}"
+mkdir -p "$profile_folder"
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-# Repo root inside the container (3 levels up from this script's directory)
 REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
 
-for max_concurrency in ${chosen_concurrencies[@]}; do
+for max_concurrency in "${chosen_concurrencies[@]}"; do
 
     export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"
 
+    num_prompts=$(( max_concurrency * num_prompts_multiplier ))
+    if [[ "$num_prompts" -lt 16 ]]; then
+        num_prompts=16
+    fi
+
     echo "profile_folder: $profile_folder"
     echo "max_concurrency: $max_concurrency"
     echo "chosen_req_rate: $chosen_req_rate"
     echo "MODEL_PATH: $MODEL_PATH"
-    echo "head_port: $head_port"
+    echo "ROUTER_PORT: $ROUTER_PORT"
     echo "chosen_isl: $chosen_isl"
     echo "chosen_osl: $chosen_osl"
+    echo "num_prompts: $num_prompts"
     echo "export_file: $export_file"
 
+    # Engine-specific extra flags (an array, so a MODEL_PATH with spaces/globs stays one argument)
+    extra_flags=()
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        extra_flags=(--trust-remote-code --tokenizer "$MODEL_PATH")
+    else
+        if [ "$IS_MTP" = "true" ]; then
+            extra_flags=(--use-chat-template)
+        fi
+    fi
+
     run_benchmark_serving \
         --bench-serving-dir "$REPO_ROOT" \
-        --model  ${MODEL_PATH} \
-        --port ${head_port} \
+        --model "$BENCH_MODEL" \
+        --port "$ROUTER_PORT" \
         --backend openai \
-        --input-len ${chosen_isl} \
-        --output-len ${chosen_osl} \
-        --random-range-ratio ${random_range_ratio} \
-        --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \
+        --input-len "$chosen_isl" \
+        --output-len "$chosen_osl" \
+        --random-range-ratio "$random_range_ratio" \
+        --num-prompts "$num_prompts" \
         --max-concurrency "$max_concurrency" \
         --result-filename "$export_file" \
         --result-dir /workspace/ \
-        $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" )
+        "${extra_flags[@]}"
 
     echo "-----------------------------------------"
+
+    # vLLM: cooldown between rounds for idle KV block reaper
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
+        sleep 10
+    fi
 done
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 904576003..58c1f6c83 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -1,110 +1,209 @@
 #!/bin/bash
-# SGLang/MoRI environment setup for multi-node disaggregated serving.
+# Dual-engine environment setup for multi-node disaggregated serving.
+#
+# ENGINE=sglang-disagg (default): SGLang/MoRI environment
+# ENGINE=vllm-disagg:             vLLM/Nixl environment
 #
 # REQUIRED ENVIRONMENT VARIABLES:
 #   IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
-#               This must be set by the runner script (runners/launch_mi355x-amds.sh)
-#
-# OPTIONAL ENVIRONMENT VARIABLES:
-#   MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by runner if cluster uses QoS.
-
+#               Set by the runner; when unset, auto-detected via ibv_devinfo.
 set -x
+
+ENGINE="${ENGINE:-sglang-disagg}"
 export PYTHONDONTWRITEBYTECODE=1
 
-# IBDEVICES configuration
+# =============================================================================
+# Shared: IBDEVICES detection
+# =============================================================================
+
 # Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh)
 # Fall back to hostname detection if not set (for direct script execution)
 if [[ -z "$IBDEVICES" ]]; then
-    NODENAME=$(hostname -s)
-    if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
-        export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
-    elif [[ $NODENAME == mia1* ]]; then
-        export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+    DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',')
+    if [[ -n "$DETECTED" ]]; then
+        export IBDEVICES="$DETECTED"
+        echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)"
     else
-        echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
+        echo "ERROR: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
         exit 1
     fi
-    echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME"
 else
     echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
 fi
 export IBDEVICES
 
-# Auto-detect default network interface (portable across clusters)
-export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
-export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
+# Shared: Auto-detect default network interface (portable across clusters)
+# Only auto-detect if not already set by the runner/environment
+if [[ -z "$GLOO_SOCKET_IFNAME" ]]; then
+    export GLOO_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1)
+fi
+if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then
+    export NCCL_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1)
+fi
 
+set +x
 
-export NCCL_IB_HCA=$IBDEVICES
+export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES}
 
-export SGLANG_USE_AITER=1
+# =============================================================================
+# Engine-specific environment
+# =============================================================================
 
-export SGLANG_MORI_DISPATCH_DTYPE=auto
-export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast
-export MORI_COMBINE_DTYPE_DECODE=fp8
-export SGLANG_MORI_QP_PER_TRANSFER=4
-export SGLANG_MORI_NUM_WORKERS=4
-export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    # =========================================================================
+    # vLLM/Nixl-specific environment
+    # =========================================================================
+    export VLLM_USE_V1=1
+    export VLLM_SERVER_DEV_MODE=0
+    export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1
 
-export MORI_IO_QP_MAX_SEND_WR=16384
-export MORI_IO_QP_MAX_CQE=32768 
-export MORI_IO_QP_MAX_SGE=4
+    set -x
 
-export MORI_IO_TC_DISABLE=0
+    # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport
+    if [[ -z "$UCX_NET_DEVICES" ]]; then
+        UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1)
+        if [[ -n "$UCX_NET_DEV" ]]; then
+            export UCX_NET_DEVICES="$UCX_NET_DEV"
+        else
+            FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1)
+            if [[ -n "$FIRST_IB" ]]; then
+                export UCX_NET_DEVICES="${FIRST_IB}:1"
+            fi
+        fi
+        echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES"
+    else
+        echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)"
+    fi
 
-export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
-export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
+    # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing
+    export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1}
 
-# Disable allocating memory in one pass
-export MORI_SHMEM_MODE=ISOLATION
+    # QoS/DSCP configuration for lossless RoCEv2 fabric.
+    if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then
+        echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)"
+    elif command -v nicctl &> /dev/null; then
+        ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}')
+        ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" '
+$1 == "DSCP" && $2 == ":" && $NF == p {
+    print $3; exit
+}')
+        if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
+            export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP ))
+            export UCX_IB_SL=$ND_PRIO
+            echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL"
+        else
+            echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
+            NODENAME=$(hostname -s)
+            if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
+                export UCX_IB_TRAFFIC_CLASS=96
+                echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
+            elif [[ $NODENAME == mia1* ]]; then
+                export UCX_IB_TRAFFIC_CLASS=104
+                echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
+            fi
+        fi
+    else
+        NODENAME=$(hostname -s)
+        if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
+            export UCX_IB_TRAFFIC_CLASS=96
+            echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
+        elif [[ $NODENAME == mia1* ]]; then
+            export UCX_IB_TRAFFIC_CLASS=104
+            echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
+        else
+            echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration."
+        fi
+    fi
+
+    set +x
+    echo "[INFO] IBDEVICES=$IBDEVICES  UCX_NET_DEVICES=$UCX_NET_DEVICES  NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME  UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX  UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}"
+
+else
+    # =========================================================================
+    # SGLang/MoRI-specific environment
+    # =========================================================================
+
+    export SGLANG_USE_AITER=1
+
+    export SGLANG_MORI_DISPATCH_DTYPE=auto
+    export SGLANG_MORI_FP8_COMB=true
+    export SGLANG_MORI_QP_PER_TRANSFER=4
+    export SGLANG_MORI_NUM_WORKERS=4
+    export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
+
+    export MORI_IO_QP_MAX_SEND_WR=16384
+    export MORI_IO_QP_MAX_CQE=32768
+    export MORI_IO_QP_MAX_SGE=4
+
+    export MORI_IO_TC_DISABLE=0
 
-# Enable spec v2 
-export SGLANG_ENABLE_SPEC_V2=1
-export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0
+    export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
+    export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
 
-export SGLANG_LOG_MS=true
-export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
+    # Disable allocating memory in one pass
+    export MORI_SHMEM_MODE=ISOLATION
 
-export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
-export MORI_MAX_DISPATCH_TOKENS_DECODE=512
+    # Enable spec v2
+    export SGLANG_ENABLE_SPEC_V2=1
+    export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0
 
-export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768
-export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703
+    export SGLANG_LOG_MS=true
+    export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 
-# set MTP size=1 when EP16
-export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
+    export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
+    export MORI_MAX_DISPATCH_TOKENS_DECODE=512
 
-export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
+    export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768
+    export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703
 
+    # set MTP size=1 when EP16
+    export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
 
-export MORI_APP_LOG_LEVEL=INFO
+    export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
 
-# Router logging control:
-# 0 (default) keeps noisy per-request access logs out of stdout while still logging to file.
-# 1 mirrors router logs to stdout via tee (useful for live debugging).
-export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}"
+    export MORI_APP_LOG_LEVEL=INFO
 
-# QoS/DSCP configuration
-# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname
-if [[ -n "$MORI_RDMA_TC" ]]; then
-    echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)"
-elif command -v nicctl &> /dev/null; then
-    ND_PRIO=$(nicctl show qos  2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}')
-    ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" '
+    # Router logging control:
+    # 0 (default) keeps noisy per-request access logs out of stdout while still logging to file.
+    # 1 mirrors router logs to stdout via tee (useful for live debugging).
+    export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}"
+
+    # QoS/DSCP configuration
+    # Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname
+    if [[ -n "$MORI_RDMA_TC" ]]; then
+        echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)"
+    elif command -v nicctl &> /dev/null; then
+        ND_PRIO=$(nicctl show qos  2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}')
+        ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" '
 $1 == "DSCP" && $2 == ":" && $NF == p {
     print $3; exit
 }')
 
-    if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
-        TC=$(( 4 * ND_DSCP ))
-        export MORI_RDMA_SL=$ND_PRIO
-        export MORI_IO_SL=$ND_PRIO
-        export MORI_RDMA_TC=$TC
-        export MORI_IO_TC=$TC
-        echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
+        if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
+            TC=$(( 4 * ND_DSCP ))
+            export MORI_RDMA_SL=$ND_PRIO
+            export MORI_IO_SL=$ND_PRIO
+            export MORI_RDMA_TC=$TC
+            export MORI_IO_TC=$TC
+            echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
+        else
+            echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
+            # Fall back to hostname-based detection
+            NODENAME=$(hostname -s)
+            if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
+                export MORI_RDMA_TC=96
+                export MORI_IO_TC=96
+                echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+            elif [[ $NODENAME == mia1* ]]; then
+                export MORI_RDMA_TC=104
+                export MORI_IO_TC=104
+                echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+            else
+                echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
+            fi
+        fi
     else
-        echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
-        # Fall back to hostname-based detection
+        # nicctl not available, try hostname-based detection
         NODENAME=$(hostname -s)
         if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
             export MORI_RDMA_TC=96
@@ -115,28 +214,12 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
             export MORI_IO_TC=104
             echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
         else
-            echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
+            echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
+            echo "       This is normal for clusters without QoS or outside Docker containers."
         fi
     fi
-else
-    # nicctl not available, try hostname-based detection
-    NODENAME=$(hostname -s)
-    if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
-        export MORI_RDMA_TC=96
-        export MORI_IO_TC=96
-        echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
-    elif [[ $NODENAME == mia1* ]]; then
-        export MORI_RDMA_TC=104
-        export MORI_IO_TC=104
-        echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
-    else
-        echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
-        echo "       This is normal for clusters without QoS or outside Docker containers."
-    fi
-fi
-
-# FIXME: WA for latest upstream 0305 image
-export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
+    # FIXME: WA for latest upstream 0305 image
+    export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
-set +x
+fi
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 824605c46..a0dd81bb9 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -1,265 +1,258 @@
 #!/bin/bash
-#SBATCH --job-name=1p2d_bench-serving    # Specify a custom string for your slurm batch job
-#SBATCH -N 3            # CHECK this to be right in batch jobs
-#SBATCH -n 3          # CHECK this to be right in batch jobs
+#SBATCH --job-name=disagg-bench
+#SBATCH -N 3            # Overridden by submit.sh -N flag
+#SBATCH -n 3            # Overridden by submit.sh -n flag
 #SBATCH --ntasks-per-node=1
 #SBATCH --spread-job
-#SBATCH --gres=gpu:8      # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed)
-#SBATCH --time=24:00:00         # Set a time limit for the job (HH:MM:SS)
+#SBATCH --gres=gpu:8
+#SBATCH --time=24:00:00
 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR
 
+ENGINE="${ENGINE:-sglang-disagg}"
 
-# ------------------------
-# Print current time in UTC and PST formats
-# ------------------------
 echo "=== Job Start Time ==="
 echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')"
 echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')"
+echo "ENGINE: $ENGINE"
 echo "======================="
 echo ""
 
 # =============================================================================
-# Model validation from models.yaml (replaces hardcoded VALID_MODELS array)
+# Model Validation
 # =============================================================================
-# DI_REPO_DIR is set below from $(pwd); use the submit-time working directory
-# because sbatch copies this script to /var/spool/slurmd/ at runtime.
-MODELS_YAML="$(pwd)/models.yaml"
+
+# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/
+# at runtime, but the CWD remains the submit-time directory (amd_utils/).
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    MODELS_YAML="$(pwd)/models_vllm.yaml"
+else
+    MODELS_YAML="$(pwd)/models.yaml"
+fi
 
 if [[ ! -f "$MODELS_YAML" ]]; then
-    echo "Error: models.yaml not found at $MODELS_YAML"
+    echo "Error: models YAML not found at $MODELS_YAML"
     exit 1
 fi
 
-# Validate MODEL_NAME exists as a top-level key in models.yaml
+if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then
+    echo "Error: DOCKER_IMAGE_NAME is not set."
+    exit 1
+fi
+
+MODEL_NAME="${MODEL_NAME:-None}"
 if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then
-    echo "Error: Model '$MODEL_NAME' not found in models.yaml"
+    echo "Error: Model '$MODEL_NAME' not found in $MODELS_YAML"
     echo "Available models:"
     grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/  - /'
     exit 1
 fi
 echo "Model found: $MODEL_NAME"
 
-# All models use server.sh as the entrypoint
 RUN_FILE="server.sh"
 echo "Runfile set: $RUN_FILE"
 
-if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then
-    echo "Error: DOCKER_IMAGE_NAME is not set."
-    exit 1
-fi
-
-# DI_REPO_DIR points to the repo root so Docker can access both benchmarks/ and utils/.
+# DI_REPO_DIR points to the repo root.
 # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root.
 export DI_REPO_DIR=$(cd "$(pwd)/../../.." && pwd)
 
-xP="${xP:-1}" #-> Number of Prefill Workers
-yD="${yD:-1}" #-> Number of Decode Workers
+xP="${xP:-1}"
+yD="${yD:-1}"
 
-# Parallelism Configuration with defaults
-PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
-PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}"
-PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}"
-DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
-DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}"
-DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}"
-DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP
-
-# Benchmark Configuration with defaults
+# Benchmark configuration
 BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
 BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
 BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
 BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
 BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
+BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
 
 GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
 
-MODEL_NAME="${MODEL_NAME:-None}"
+# Parallelism defaults (same for both engines): EP/DP stay off unless explicitly enabled
+PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}"
+PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}"
+DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}"
+DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}"
+PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
+DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
+DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0}
+
+# Router selection: "vllm-router" (external container) or "moriio" (in-container proxy)
+ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}"
+ROUTER_PORT="${ROUTER_PORT:-30000}"
+PROXY_PING_PORT="${PROXY_PING_PORT:-36367}"
+
+# =============================================================================
+# Model Path Resolution
+# =============================================================================
 
 # MODEL_DIR detection: prefer env var, fall back to hostname detection
 if [[ -z "$MODEL_DIR" ]]; then
     NODENAME=$(hostname -s)
     if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
         MODEL_DIR="/nfsdata"
-        echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME"
     elif [[ $NODENAME == mia1* ]]; then
         MODEL_DIR="/it-share/data"
-        echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME"
     else
-        MODEL_DIR="/nfsdata"  # Default fallback
-        echo "[INFO] Using default MODEL_DIR=$MODEL_DIR (hostname $NODENAME not recognized)"
+        MODEL_DIR="/nfsdata"
     fi
+    echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)"
 fi
 export MODEL_DIR
 
-# ------------------------
-# Model path validation and selection across all nodes
-# ------------------------
-echo "Looking for model: $MODEL_NAME"
-echo "Checking model availability across all allocated nodes..."
-
-# Get all allocated nodes
-ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
-TOTAL_NODES=$(echo "$ALL_NODES" | wc -l)
-
-echo "Total allocated nodes: $TOTAL_NODES"
-echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')"
-
-# Function to check model path on all nodes
-check_model_path() {
-    local path=$1
-    local check_name=$2
-
-    echo "Checking $check_name: $path"
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    # vLLM: read hf_dir for MODEL_NAME from $MODELS_YAML (literal key match, not a regex), then search paths and resolve HF cache snapshots
+    DISK_DIR_NAME=$(awk -v key="${MODEL_NAME}:" 'index($0, key) == 1 {found=1; next}
+        found && /^[^ ]/{exit}
+        found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML")
+    DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}"
+    echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)"
+
+    resolve_hf_cache_path() {  # prints first snapshot dir under $1 (status 0), else $1 itself (status 1)
+        local cache_root="$1" first_snap=""
+        local snap_dir="${cache_root}/snapshots"
+        # Take the first entry ls lists inside snapshots/, if that directory exists
+        [[ -d "$snap_dir" ]] && first_snap=$(ls -1 "$snap_dir" 2>/dev/null | head -1)
+        if [[ -z "$first_snap" ]]; then
+            echo "$cache_root"
+            return 1
+        fi
+        echo "${snap_dir}/${first_snap}"
+        return 0
+    }
+
+    MODEL_PATH=""
+    SEARCH_PATHS=(
+        "${MODEL_DIR}/${DISK_DIR_NAME}"
+        "${MODEL_DIR}/${MODEL_NAME}"
+        "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}"
+        "/nfsdata/hf_hub_cache-0/${MODEL_NAME}"
+    )
+
+    for search_path in "${SEARCH_PATHS[@]}"; do
+        if [[ -d "$search_path" ]]; then
+            RESOLVED=$(resolve_hf_cache_path "$search_path")
+            MODEL_PATH="$RESOLVED"
+            echo "Found MODEL_PATH: $MODEL_PATH"
+            break
+        fi
+    done
 
-    # Run check on all nodes in parallel
-    srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c "
-        if [ -d '$path' ]; then
-            echo \"\$(hostname): ✓ Found $path\"
-            exit 0
+    if [[ -z "$MODEL_PATH" ]]; then
+        echo "FATAL: Model '$MODEL_NAME' not found. Searched:"
+        for p in "${SEARCH_PATHS[@]}"; do echo "  - $p"; done
+        exit 1
+    fi
+    echo "Final MODEL_PATH: $MODEL_PATH"
+else
+    # SGLang: Validate model path across all allocated nodes
+    echo "Looking for model: $MODEL_NAME"
+    echo "Checking model availability across all allocated nodes..."
+
+    ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+    TOTAL_NODES=$(echo "$ALL_NODES" | wc -l)
+    echo "Total allocated nodes: $TOTAL_NODES"
+    echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')"
+
+    check_model_path() {  # Test via srun that directory $1 exists; $2 is the label used in log lines. Returns 0/1.
+        local path=$1  # directory to test
+        local check_name=$2  # label printed in the messages below
+        echo "Checking $check_name: $path"  # the inline script that follows is launched with $SLURM_NNODES nodes/tasks
+        srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c "
+            if [ -d '$path' ]; then
+                echo \"\$(hostname): Found $path\"
+                exit 0
+            else
+                echo \"\$(hostname): Missing $path\"
+                exit 1
+            fi
+        "
+        local exit_code=$?  # srun status; presumably non-zero if any task exited 1 -- confirm against srun semantics
+        if [ $exit_code -eq 0 ]; then  # only a zero srun status counts as success
+            echo "$check_name available on ALL nodes"
+            return 0
         else
-            echo \"\$(hostname): ✗ Missing $path\"
-            exit 1
+            echo "$check_name NOT available on all nodes"
+            return 1
         fi
-    "
+    }
 
-    # Check if all nodes succeeded (exit code 0)
-    local exit_code=$?
-    if [ $exit_code -eq 0 ]; then
-        echo "✓ $check_name available on ALL nodes"
-        return 0
+    if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
+        MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
+        echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
     else
-        echo "✗ $check_name NOT available on all nodes"
-        return 1
+        echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:"
+        echo "  - $MODEL_DIR/$MODEL_NAME"
+        exit 1
     fi
-}
-
-# Check model weights exist on "$MODEL_DIR/$MODEL_NAME"
-if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
-    MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
-    echo ""
-    echo "✓ Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
-else
-    echo ""
-    echo "✗ FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in the following:"
-    echo "  - $MODEL_DIR/$MODEL_NAME"
-    echo ""
-    echo "Model must be accessible from all nodes for distributed execution."
-    echo "Please ensure the model is available on all allocated nodes."
-    exit 1
+    echo "Final MODEL_PATH: $MODEL_PATH"
 fi
 
-echo "Final MODEL_PATH: $MODEL_PATH"
-echo ""
-
-NUM_NODES="${NUM_NODES}"
+# =============================================================================
+# Node Selection
+# =============================================================================
 
-# ------------------------
-# Extract first NUM_NODES from SLURM allocation and update SLURM variables
-# ------------------------
-echo "Original SLURM allocation:"
-echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
-echo "SLURM_NNODES: $SLURM_NNODES"
-echo "SLURM_NTASKS: $SLURM_NTASKS"
+NUM_NODES=$((xP + yD))
+echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD)"
 
-# Get the full nodelist and extract first NUM_NODES
 FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
 SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES)
 SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//')
 
-# Create new nodelist in SLURM format
-# This is a simplified approach - for complex ranges, you might need more sophisticated parsing
-NEW_SLURM_NODELIST=$(echo "$SELECTED_NODES" | paste -sd, | sed 's/,/,/g')
+# Docker privilege detection — evaluated per-node since group membership varies.
+# Exported as a snippet so every srun participant resolves it locally via eval:
+# DOCKER_CMD becomes "docker" when `docker ps` succeeds, otherwise "sudo docker".
+export DOCKER_CMD_DETECT='if docker ps >/dev/null 2>&1; then DOCKER_CMD=docker; else DOCKER_CMD="sudo docker"; fi'
 
 # Update SLURM environment variables
 export SLURM_NNODES=$NUM_NODES
 export SLURM_NTASKS=$NUM_NODES
 export SLURM_JOB_NUM_NODES=$NUM_NODES
 export SLURM_NPROCS=$NUM_NODES
-export SLURM_JOB_NODELIST="$NEW_SLURM_NODELIST"
-export SLURM_NODELIST="$NEW_SLURM_NODELIST"
-
-# Keep other SLURM variables as they were or set defaults
+export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR"
+export SLURM_NODELIST="$SELECTED_NODELIST_STR"
 export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)"
-export SLURM_SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$HOME}"
-export SLURM_CLUSTER_NAME="${SLURM_CLUSTER_NAME}"  # Let SLURM set this automatically
-export SLURM_JOB_CPUS_PER_NODE="${SLURM_JOB_CPUS_PER_NODE}"
-export SLURM_JOB_PARTITION="${SLURM_JOB_PARTITION}"  # Should be set by sbatch/runner
-export SLURM_JOBID="${SLURM_JOBID:-$SLURM_JOB_ID}"
-export SLURM_JOB_QOS="${SLURM_JOB_QOS}"  # Should be set by sbatch/runner if needed
-export SLURM_JOB_ACCOUNT="${SLURM_JOB_ACCOUNT}"  # Should be set by sbatch/runner
 export SLURM_NTASKS_PER_NODE=1
-export SLURM_SUBMIT_HOST="${SLURM_SUBMIT_HOST}"
-export SLURM_JOB_ID="${SLURM_JOB_ID}"
-# SLURM_CONF is auto-set by SLURM, no need to override
-export SLURM_JOB_NAME="${SLURM_JOB_NAME:-1p1d_bench-serving}"
 
 echo ""
-echo "Updated SLURM Environment Variables:"
-echo "SLURM_JOB_ID: $SLURM_JOB_ID"
-echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
-echo "SLURM_NNODES: $SLURM_NNODES"
-echo "SLURM_NTASKS: $SLURM_NTASKS"
-echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
-echo "SLURM_JOB_CPUS_PER_NODE: $SLURM_JOB_CPUS_PER_NODE"
-echo "SLURM_JOB_PARTITION: $SLURM_JOB_PARTITION"
-echo "SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES"
-echo "SLURM_JOBID: $SLURM_JOBID"
-echo "SLURM_JOB_QOS: $SLURM_JOB_QOS"
-echo "SLURM_NODELIST: $SLURM_NODELIST"
-echo "SLURM_JOB_ACCOUNT: $SLURM_JOB_ACCOUNT"
-echo "SLURM_NPROCS: $SLURM_NPROCS"
-echo "SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
-echo "SLURM_CONF: $SLURM_CONF"
-echo "SLURM_JOB_NAME: $SLURM_JOB_NAME"
-echo "SLURM_NTASKS_PER_NODE: $SLURM_NTASKS_PER_NODE"
-echo "SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR"
-echo "SLURM_CLUSTER_NAME: $SLURM_CLUSTER_NAME"
-echo "ulimit: $(ulimit -a)"
-echo ""
-echo "Selected nodes for execution:"
-echo "$SELECTED_NODES"
-echo ""
+echo "Selected nodes: $SELECTED_NODELIST_STR"
+
+# =============================================================================
+# IP Resolution
+# =============================================================================
 
-# Node information
 USER_NAME=$(whoami)
 MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1)
 NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1')
 NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}')
 
 IPS=()
-
-GW_NIC=$(ip route | awk '/^default/ {print $5; exit}')
 for NODE in $SELECTED_NODES; do
     IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1')
     IP=$(echo "$IP" | awk '/src/ {print $7}')
     IPS+=("$IP")
 done
 
-echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g'
+echo "Node IPs: ${IPS[*]}"
 
 DOCKER_MOUNT_PATH="/workspace"
-SGLANG_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils"
-timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
+WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils"
 
 NNODES=$NUM_NODES
 
-echo "MASTER_NODE is ${MASTER_NODE}"
-echo "NODE0_ADDR is ${NODE0_ADDR}"
-echo "NNODES is ${NNODES}"
-echo "REPO Directory is ${DI_REPO_DIR}"
-echo "USER_NAME is ${USER_NAME}"
-
-# Get the RDMA priority and DSCP value from the NIC
-if ! command -v nicctl >/dev/null 2>&1; then
-    echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2
-    exit 1
-fi
+echo "MASTER_NODE: ${MASTER_NODE}"
+echo "NODE0_ADDR:  ${NODE0_ADDR}"
+echo "NNODES:      ${NNODES}"
+echo "REPO DIR:    ${DI_REPO_DIR}"
+echo "USER:        ${USER_NAME}"
 
 # Reduce log spam
 export TQDM_MININTERVAL=20
 
+# Translate the host-resolved MODEL_PATH to the Docker mount namespace.
+# MODEL_DIR is quoted inside the expansion so it is matched as a literal
+# prefix rather than being interpreted as a glob pattern.
+DOCKER_MODEL_PATH="${MODEL_PATH/#"$MODEL_DIR"//models}"
+if [[ "$MODEL_PATH" != "$MODEL_DIR"* ]]; then
+    # The model search list contains locations outside MODEL_DIR, while the
+    # container is started with MODEL_DIR mounted at /models. In that case no
+    # prefix was rewritten above and the path is passed through unchanged.
+    echo "WARNING: MODEL_PATH '$MODEL_PATH' is not under MODEL_DIR '$MODEL_DIR'; it may not be readable inside the container." >&2
+fi
+
 export DI_REPO_DIR=$DI_REPO_DIR
-export SGLANG_WS_PATH=$SGLANG_WS_PATH
+export WS_PATH=$WS_PATH
 export NNODES=$NNODES
 export NODE0_ADDR=$NODE0_ADDR
 export MODEL_PATH=$MODEL_PATH
@@ -269,21 +262,17 @@ export yD=$yD
 export MODEL_NAME=$MODEL_NAME
 export USER_NAME=$USER_NAME
 export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')"
-export PREFILL_TP_SIZE=$PREFILL_TP_SIZE
-export PREFILL_ENABLE_EP=$PREFILL_ENABLE_EP
-export PREFILL_ENABLE_DP=$PREFILL_ENABLE_DP
-export DECODE_TP_SIZE=$DECODE_TP_SIZE
-export DECODE_ENABLE_EP=$DECODE_ENABLE_EP
-export DECODE_ENABLE_DP=$DECODE_ENABLE_DP
-export DECODE_MTP_SIZE=$DECODE_MTP_SIZE
 export GPUS_PER_NODE=$GPUS_PER_NODE
 export BENCH_INPUT_LEN=$BENCH_INPUT_LEN
 export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN
 export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO
 export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER
 export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY
+export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE
 export DRY_RUN="${DRY_RUN:-0}"
 export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
+export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}"
+export ENGINE=$ENGINE
 
 # Eval-related env vars (threaded from submit.sh)
 export RUN_EVAL="${RUN_EVAL:-false}"
@@ -298,38 +287,106 @@ export SPEC_DECODING="${SPEC_DECODING:-}"
 export IS_MULTINODE="${IS_MULTINODE:-false}"
 
 SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
-export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
-export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}"
+export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
 
+# vLLM external router container
+VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}"
+ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}"
+export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}"
 
-# Use only the selected nodes for srun execution
 SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,)
 
-
 cleanup() {
-  echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..."
-  # clean up the logs folder
-  sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true
-
+  # Signal handler (installed for INT/TERM/HUP): removes the job's logs folder.
+  echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..."
+  # Only delete when SLURM_SUBMIT_DIR is non-empty; an empty value would turn
+  # the target into "/logs". The path is quoted and "--" ends option parsing.
+  if [[ -n "${SLURM_SUBMIT_DIR:-}" ]]; then
+    rm -rf -- "${SLURM_SUBMIT_DIR}/logs" 2>/dev/null || true
+  fi
   echo "[${SLURM_JOB_ID}] cleanup done."
 }
 
 trap cleanup INT TERM HUP
 
-
-# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors
+# Force NFS cache refresh on all nodes
 echo "Refreshing NFS caches on all nodes..."
 srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
     sync
-    # Force re-stat of the mounted directory to refresh NFS handles
     ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1
     stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
     cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
-    # Drop caches if we have permission (optional, requires root)
     echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true
     echo "NFS cache refreshed on $(hostname)"
 '
 
+# =============================================================================
+# Build engine-specific Docker environment variables
+# =============================================================================
+
+# Common env vars (always passed to the container, whatever the engine).
+# Entries written as \$VAR are stored unexpanded and resolved later by the
+# per-node shell that runs docker; ${WS_PATH} and literal values are fixed here.
+# BENCH_REQUEST_RATE is exported by this script but was not forwarded by this
+# list; it is passed through with "inf" as the value when unset or empty.
+DOCKER_ENV_COMMON=(
+    -e SLURM_JOB_ID=\$SLURM_JOB_ID
+    -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST
+    -e NNODES=\$NNODES
+    -e NODE_RANK=\$SLURM_PROCID
+    -e NODE0_ADDR=\$NODE0_ADDR
+    -e MODEL_DIR=/models
+    -e MODEL_NAME=\$MODEL_NAME
+    -e GPUS_PER_NODE=\$GPUS_PER_NODE
+    -e xP=\$xP
+    -e yD=\$yD
+    -e IPADDRS=\$IPADDRS
+    -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN
+    -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN
+    -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO
+    -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER
+    -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY
+    -e BENCH_REQUEST_RATE=\${BENCH_REQUEST_RATE:-inf}
+    -e TQDM_MININTERVAL=\$TQDM_MININTERVAL
+    -e DRY_RUN=\$DRY_RUN
+    -e BENCHMARK_LOGS_DIR=/benchmark_logs
+    -e ENGINE=\$ENGINE
+    -e WS_PATH=${WS_PATH}
+    -e RUN_EVAL=\$RUN_EVAL
+    -e EVAL_ONLY=\$EVAL_ONLY
+    -e EVAL_CONC=\$EVAL_CONC
+    -e FRAMEWORK=\$FRAMEWORK
+    -e PRECISION=\$PRECISION
+    -e MODEL_PREFIX=\$MODEL_PREFIX
+    -e RUNNER_TYPE=\$RUNNER_TYPE
+    -e RESULT_FILENAME=\$RESULT_FILENAME
+    -e SPEC_DECODING=\$SPEC_DECODING
+    -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE
+    -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP
+    -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP
+    -e DECODE_TP_SIZE=\$DECODE_TP_SIZE
+    -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP
+    -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP
+    -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE
+    -e IS_MULTINODE=\$IS_MULTINODE
+)
+
+# Per-engine additions to the container environment, selected by ENGINE.
+case "$ENGINE" in
+    vllm-disagg)
+        # vLLM: workspace and container-side model path, plus UCX/HSA settings.
+        # The \${...:-default} entries are resolved later by the per-node shell.
+        DOCKER_ENV_ENGINE=(
+            -e VLLM_WS_PATH=${WS_PATH}
+            -e MODEL_PATH=$DOCKER_MODEL_PATH
+            -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma
+            -e UCX_SOCKADDR_TLS_PRIORITY=tcp
+            -e UCX_MEMTYPE_CACHE=y
+            -e UCX_RNDV_SCHEME=get_zcopy
+            -e UCX_RNDV_THRESH=4k
+            -e UCX_ROCM_IPC_MIN_ZCOPY=0
+            -e UCX_LOG_LEVEL=warn
+            -e HSA_ENABLE_SDMA=1
+            -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300}
+            -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
+            -e PYTHONPYCACHEPREFIX=/tmp/pycache
+        )
+        ;;
+    *)
+        # Every other ENGINE value only receives the SGLang workspace path.
+        DOCKER_ENV_ENGINE=(
+            -e SGLANG_WS_PATH=${WS_PATH}
+        )
+        ;;
+esac
+
+# Engine-specific container filter for pre-clean: a `docker ps --filter`
+# expression matching names that start with "container_<ENGINE>_", i.e. the
+# prefix used when DOCKER_CONT_NAME is built.
+CONT_FILTER="name=^container_${ENGINE}_"
+
 srun \
   --nodelist="$SELECTED_NODELIST_SRUN" \
   --kill-on-bad-exit=1 \
@@ -340,11 +397,44 @@ set -euo pipefail
 
 echo \"Rank \$SLURM_PROCID on \$(hostname)\"
 
+# Per-node docker privilege detection
+eval \"\$DOCKER_CMD_DETECT\"
+echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\"
+
 # Pre-clean (idempotent)
-sudo docker ps -aq --filter \"name=^container_sbatch_\" | xargs -r sudo docker rm -f || true
-sudo docker ps -aq | xargs -r sudo docker stop || true
+\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true
+\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true
+
+# Start vLLM external router container on node 0
+if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then
+    \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true
+    \$DOCKER_CMD run -d \
+        --name \"$ROUTER_CONT_NAME\" \
+        --network host \
+        -v /tmp:/run_logs \
+        \"$VLLM_ROUTER_IMAGE\" \
+        bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \
+            --vllm-pd-disaggregation \
+            --kv-connector moriio \
+            --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \
+            --port ${ROUTER_PORT} \
+            --host 0.0.0.0 \
+            --policy consistent_hash \
+            --prefill-policy consistent_hash \
+            --decode-policy consistent_hash \
+            --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \"
+fi
+
+# Skip exec on vllm-disagg rank 0 so we can stop the router after the main
+# container exits.  Without this, decode nodes block forever waiting for the
+# router port to close (the router is a separate container).
+MAYBE_EXEC=exec
+if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then
+    MAYBE_EXEC=
+    set +e
+fi
 
-exec sudo docker run --rm \
+\$MAYBE_EXEC \$DOCKER_CMD run \
     --init \
     --stop-timeout 10 \
     --device /dev/dri \
@@ -367,62 +457,38 @@ exec sudo docker run --rm \
     --cap-add SYS_PTRACE \
     --security-opt seccomp=unconfined \
     --privileged \
+    -v /sys:/sys \
+    $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \
     -v ${MODEL_DIR}:/models \
     -v \$HOME/.ssh:/root/.ssh \
-    -v $(which nicctl):/usr/sbin/nicctl \
     --shm-size 128G \
     -v /tmp:/run_logs \
     -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
     -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \
-    -e SLURM_JOB_ID=\$SLURM_JOB_ID \
-    -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \
-    -e NNODES=\$NNODES \
-    -e NODE_RANK=\$SLURM_PROCID \
-    -e NODE0_ADDR=\$NODE0_ADDR \
-    -e MODEL_DIR=/models \
-    -e SGLANG_WS_PATH=${SGLANG_WS_PATH} \
-    -e GPUS_PER_NODE=\$GPUS_PER_NODE \
-    -e xP=\$xP \
-    -e yD=\$yD \
-    -e MODEL_NAME=\$MODEL_NAME \
-    -e IPADDRS=\$IPADDRS \
-    -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \
-    -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \
-    -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \
-    -e DECODE_TP_SIZE=\$DECODE_TP_SIZE \
-    -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \
-    -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \
-    -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \
-    -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \
-    -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \
-    -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \
-    -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \
-    -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \
-    -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \
-    -e DRY_RUN=\$DRY_RUN \
-    -e BENCHMARK_LOGS_DIR=/benchmark_logs \
-    -e RUN_EVAL=\$RUN_EVAL \
-    -e EVAL_ONLY=\$EVAL_ONLY \
-    -e EVAL_CONC=\$EVAL_CONC \
-    -e FRAMEWORK=\$FRAMEWORK \
-    -e PRECISION=\$PRECISION \
-    -e MODEL_PREFIX=\$MODEL_PREFIX \
-    -e RUNNER_TYPE=\$RUNNER_TYPE \
-    -e RESULT_FILENAME=\$RESULT_FILENAME \
-    -e SPEC_DECODING=\$SPEC_DECODING \
-    -e IS_MULTINODE=\$IS_MULTINODE \
+    ${DOCKER_ENV_COMMON[*]} \
+    ${DOCKER_ENV_ENGINE[*]} \
     --name \"$DOCKER_CONT_NAME\" \
+    --entrypoint \"\" \
     \"$DOCKER_IMAGE_NAME\" bash -lc '
         set -o pipefail
         mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'
         '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log
     '
 
+# Only reached when exec was skipped (vllm-disagg rank 0 with ROUTER_TYPE=vllm-router)
 DOCKER_EXIT_CODE=\$?
-if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then
-  echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\"
-  exit \$DOCKER_EXIT_CODE
-fi
+echo \"[rank 0] Main container exited (rc=\$DOCKER_EXIT_CODE). Stopping vllm-router...\"
+\$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true
+exit \$DOCKER_EXIT_CODE
 "
 
-srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true'
+# Post-run container cleanup; skipped entirely when KEEP_CONTAINERS is "1".
+if [[ "${KEEP_CONTAINERS}" != "1" ]]; then
+    # The container name is spliced in by this shell; DOCKER_CMD_DETECT is
+    # evaluated on each node so DOCKER_CMD is resolved per node.
+    srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true'
+
+    # Clean up vLLM external router container on node 0
+    if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then
+        # Single task pinned to MASTER_NODE, where the router container was started.
+        srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c '
+            eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true
+        '
+    fi
+fi
diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml
new file mode 100644
index 000000000..b051de8d9
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml
@@ -0,0 +1,44 @@
+# Model-specific vLLM server configurations for disaggregated inference.
+#
+# Each top-level key is a MODEL_NAME value (must match the model identifier
+# used in amd-master.yaml and, unless `hf_dir` is set, the directory name under MODEL_DIR).
+#
+# To add a new model: add a new top-level entry following the same schema.
+# No script changes are required.
+#
+# Schema:
+#   <model-name>:
+#     prefill_flags: str       # vLLM CLI flags for prefill workers
+#     decode_flags: str        # vLLM CLI flags for decode workers
+#     env: str                 # Space-separated KEY=VALUE pairs exported before vllm serve
+#     hf_dir: str              # (optional) On-disk directory name if it differs from the key
+#                              #   e.g. HF cache layout: models--amd--Kimi-K2.5-MXFP4
+
+Llama-3.1-405B-Instruct-FP8-KV:
+  prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8"
+  decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8"
+  env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1"
+
+amd-Llama-3.3-70B-Instruct-FP8-KV:
+  prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8"
+  decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8"
+  env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1"
+
+Kimi-K2.5-MXFP4:
+  prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
+  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
+  hf_dir: "models--amd--Kimi-K2.5-MXFP4"
+
+MiniMax-M2.5:
+  # The AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes out of bounds when run with MiniMax's shapes at M=8K (M = number of batched tokens), crashing vLLM during AITER warmup.
+  # Cap the token budget at 4K to avoid that shape instead of disabling AITER_MOE.
+  prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1"
+  hf_dir: "models--MiniMaxAI--MiniMax-M2.5"
+
+gpt-oss-120b:
+  prefill_flags: "--tensor-parallel-size 8"
+  decode_flags: "--tensor-parallel-size 8"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0"
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 7eb7414a6..5c441a793 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -1,780 +1,19 @@
 #!/bin/bash
-# SGLang Disaggregated Server Launcher with Model-Specific Configurations
+# Dual-Engine Disaggregated Server Dispatcher
 # =============================================================================
-
-# =============================================================================
-# Environment Configuration
-# =============================================================================
-
-NODE0_ADDR="${NODE0_ADDR:-localhost}"
-NODE_RANK="${NODE_RANK:-0}"
-MODEL_DIR="${MODEL_DIR:-}"
-MODEL_NAME="${MODEL_NAME:-}"
-
-xP="${xP:-1}" #-> Number of Prefill Workers
-yD="${yD:-1}" #-> Number of Decode Workers
-
-IPADDRS="${IPADDRS:-localhost}"
-HEADNODE_PORT="${HEADNODE_PORT:-20000}"
-# Parallelism Configuration
-PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
-PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}"
-PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}"
-DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
-DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}"
-DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}"
-DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}"
-
-# Benchmark Configuration
-BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
-BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
-BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
-BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
-BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
-BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
-
-# Extract the maximum concurrency from the x-delimited list
-BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
-
-# Dry Run for debugging purpose
-DRY_RUN="${DRY_RUN:-0}"
-
-# GPU count (expandable for different hardware)
-GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
-
-
-# =============================================================================
-# Dependencies and Environment Setup
-# =============================================================================
-source $SGLANG_WS_PATH/env.sh
-
-host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}')
-host_name=$(hostname)
-
-# MORI_RDMA_TC configuration (optional)
-# If set by runner, use it for RDMA traffic class configuration
-# If not set, RDMA operations will proceed without QoS/traffic class settings
-if [[ -n "${MORI_RDMA_TC}" ]]; then
-    echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration"
-    echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC"
-else
-    echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration."
-    echo "[INFO] This is normal for clusters without QoS requirements."
-fi
-
-# =============================================================================
-# Model-Specific Configuration from YAML
+# Dispatches to the engine-specific server launcher based on ENGINE env var.
+#   ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI)
+#   ENGINE=vllm-disagg             -> server_vllm.sh  (vLLM + Nixl/MoRI-IO)
 # =============================================================================
-MODELS_YAML="${SGLANG_WS_PATH}/models.yaml"
 
-if [[ ! -f "$MODELS_YAML" ]]; then
-    echo "ERROR: models.yaml not found at $MODELS_YAML"
-    exit 1
-fi
-
-# Load model config via inline Python (PyYAML is available in SGLang containers)
-# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP")
-# is done here in Python to avoid bash glob-expanding the * characters.
-eval "$(python3 -c "
-import yaml, sys, os
-
-config_path = '${MODELS_YAML}'
-model_name = '${MODEL_NAME}'
-
-with open(config_path) as f:
-    models = yaml.safe_load(f)
-
-if model_name not in models:
-    print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1')
-    sys.exit(0)
-
-m = models[model_name]
-
-def eval_formula(val):
-    \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\"
-    if isinstance(val, (int, float)):
-        return int(val)
-    s = str(val)
-    # Build a namespace from env vars (convert numeric values to int)
-    ns = {}
-    for k, v in os.environ.items():
-        try:
-            ns[k] = int(v)
-        except (ValueError, TypeError):
-            pass
-    try:
-        return int(eval(s, {'__builtins__': {}}, ns))
-    except Exception as e:
-        print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr)
-        return val
-
-def parse_range(cuda_range, default_start, default_end):
-    if '-' in str(cuda_range):
-        s, e = str(cuda_range).split('-')
-        return s, e
-    return str(default_start), str(default_end)
-
-# Output shell variables
-print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"')
-print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"')
-print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"')
-
-prefill = m.get('prefill', {})
-decode = m.get('decode', {})
+# ENGINE defaults to sglang-disagg. WS_PATH falls back, in order, to
+# SGLANG_WS_PATH, then VLLM_WS_PATH, then the directory containing this script.
+# Both are exported so the sourced engine launcher sees them.
+ENGINE="${ENGINE:-sglang-disagg}"
+WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}"
+export WS_PATH ENGINE
 
-print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"')
-print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"')
+echo "[DISPATCHER] ENGINE=$ENGINE  WS_PATH=$WS_PATH"
 
-dp = prefill.get('dp', {})
-no_dp = prefill.get('no_dp', {})
-print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"')
-print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
-print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"')
-print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"')
-print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"')
-print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"')
-print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
-print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
-s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
-print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"')
-print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"')
-
-print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"')
-print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"')
-
-dp = decode.get('dp', {})
-ep_only = decode.get('ep_only', {})
-no_dp = decode.get('no_dp', {})
-
-# Decode DP config
-print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"')
-print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
-s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160)
-print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"')
-print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"')
-
-# Decode EP-only config (EP enabled but DP disabled)
-print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"')
-print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"')
-s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256)
-print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"')
-print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"')
-
-# Decode no-DP config
-print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
-print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
-s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
-print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"')
-print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"')
-")"
-
-echo "Loaded model configuration for: $MODEL_NAME"
-
-# Compute DP-dependent prefill parameters
-if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then
-    prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP)
-    prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP
-    prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP
-    prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP
-    prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP
-    prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    source "$WS_PATH/server_vllm.sh"
 else
-    prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END))
-    prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP
-    prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP
-    prefill_context_length=""
-    prefill_max_total_tokens=""
-    prefill_enable_two_batch_overlap="false"
+    source "$WS_PATH/server_sglang.sh"
 fi
-
-# When both DP and EP are enabled, override max-running-requests with max bench concurrency
-if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; then
-    prefill_max_running_requests=$BENCH_MAX_CONC_VALUE
-    prefill_dp_ranks=$PREFILL_TP_SIZE
-    # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change)
-    MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2))
-    echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL"
-fi
-
-# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
-if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
-    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END))
-    decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE))
-elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then
-    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END))
-    decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY
-else
-    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END))
-    decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
-fi
-
-# When both DP and EP are enabled, override max-running-requests and dispatch tokens
-if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then
-    decode_max_running_requests=$BENCH_MAX_CONC_VALUE
-    decode_dp_ranks=$DECODE_TP_SIZE
-    MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks))
-    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10))
-    # Update derived variable
-    SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
-    export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD
-    echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD"
-fi
-
-# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
-if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
-    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
-fi
-if [[ -n "$prefill_context_length" ]]; then
-    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}"
-fi
-if [[ -n "$prefill_max_total_tokens" ]]; then
-    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}"
-fi
-if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then
-    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap"
-    PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true"
-fi
-
-DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "
-
-if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
-    DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
-fi
-
-if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
-    MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
-    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
-fi
-
-# =============================================================================
-# Cluster Topology Configuration
-# =============================================================================
-IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
-
-# Ceiling division by GPUS_PER_NODE for nodes-per-worker
-PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE))
-DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE))
-NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))
-
-# Build prefill arguments dynamically based on xP
-PREFILL_HEADNODE_URLS=()
-PREFILL_ARGS=""
-for i in $(seq 0 $((xP - 1))); do
-    prefill_idx=$((i * PREFILL_NODES_PER_WORKER))
-    PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}"
-    PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000"
-done
-
-# Build decode arguments dynamically based on yD
-DECODE_HEADNODE_URLS=()
-DECODE_ARGS=""
-for i in $(seq 0 $((yD - 1))); do
-    decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET))
-    DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}"
-    DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000"
-done
-
-echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}"
-echo "Decode  worker headnode list: ${DECODE_HEADNODE_URLS[@]}"
-
-# =============================================================================
-# Configuration Builder Functions
-# =============================================================================
-
-build_server_config() {
-    local mode="$1"
-    local model_name="$2"
-    local tp_size="$3"
-    local enable_ep="$4"
-    local enable_dp="$5"
-    local decode_mtp_size="$6"
-
-    # Calculate EP and DP sizes based on enable flags
-    local ep_size=1
-    local dp_size=1
-
-    if [[ "$enable_ep" == "true" ]]; then
-        ep_size=$tp_size
-    fi
-
-    if [[ "$enable_dp" == "true" ]]; then
-        dp_size=$tp_size
-    fi
-
-    # Build parallelism arguments
-    local parallel_args="--tp-size ${tp_size}"
-
-    if [[ "$enable_ep" == "true" ]]; then
-        parallel_args="$parallel_args --ep-size ${ep_size}"
-    fi
-
-    if [[ "$enable_dp" == "true" ]]; then
-        parallel_args="$parallel_args --dp-size ${dp_size}"
-    fi
-
-    # Get model-specific configuration from YAML-loaded variables
-    local base_config="$MODEL_BASE_FLAGS"
-    local mtp_config=""
-    local dp_config=""
-    local specific_config=""
-
-    # MTP config (only if MTP is enabled and mode is decode)
-    if [ "$decode_mtp_size" -gt 0 ]; then
-        mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))"
-    fi
-
-    # DP config (only if DP is enabled)
-    if [[ "$enable_dp" == "true" ]]; then
-        dp_config="$MODEL_DP_FLAGS"
-    fi
-
-    # Mode-specific config
-    if [[ "$mode" == "prefill" ]]; then
-        specific_config="$PREFILL_MODE_FLAGS"
-    elif [[ "$mode" == "decode" ]]; then
-        specific_config="$DECODE_MODE_FLAGS"
-    fi
-
-    # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config
-    local full_config="$parallel_args"
-    if [[ -n "$base_config" ]]; then
-        full_config="$full_config $base_config"
-    fi
-    if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then
-        full_config="$full_config $mtp_config"
-    fi
-    if [[ -n "$dp_config" ]]; then
-        full_config="$full_config $dp_config"
-    fi
-    if [[ -n "$specific_config" ]]; then
-        full_config="$full_config $specific_config"
-    fi
-
-    echo "$full_config"
-}
-
-# Build complete server configurations
-PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE")
-DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE")
-
-if [[ -n "$MODEL_NAME" ]]; then
-    echo "Using model-specific configuration for: $MODEL_NAME"
-fi
-
-if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then
-    PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
-    DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
-    unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
-    unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
-fi
-
-# =============================================================================
-# Container Synchronization
-# =============================================================================
-
-echo "Waiting at the container creation barrier on $host_name"
-python3 $SGLANG_WS_PATH/sync.py barrier \
-    --local-ip ${host_ip} \
-    --local-port 5000 \
-    --enable-port \
-    --node-ips ${IPADDRS} \
-    --node-ports 5000 \
-    --wait-for-all-ports \
-    --timeout 300
-
-
-# =============================================================================
-# Node Role Assignment and Server Launch
-# =============================================================================
-
-if [ "$NODE_RANK" -eq 0 ]; then
-    echo "NODE INFO ======================================="
-    echo "================================================"
-    echo "Node List : ${SLURM_JOB_NODELIST}"
-    echo "Node IPs : ${IPADDRS}"
-    echo "Model Name : ${MODEL_NAME:-'Not specified'}"
-    echo "================================================"
-
-    echo "CLUSTER INFO ===================================="
-    echo "================================================"
-    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
-    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
-    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
-    echo "Decode  parallelism: TP=${DECODE_TP_SIZE},  EP enabled: ${DECODE_ENABLE_EP},  DP enabled: ${DECODE_ENABLE_DP},  MTP size=${DECODE_MTP_SIZE}"
-    echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}"
-    echo "Decode servers  ($((DECODE_TP_SIZE/GPUS_PER_NODE))  nodes): ${DECODE_ARGS}"
-    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
-    echo "Decode  env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
-    echo "Decode  env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} "
-
-    echo "================================================"
-
-    # start the head prefill server
-    PREFILL_MORI_MOE_ENV=""
-    set -x
-    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then
-        PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
-    fi
-    set +x
-    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
-        --model-path $MODEL_DIR/$MODEL_NAME \
-        --disaggregation-mode prefill \
-        --disaggregation-ib-device ${IBDEVICES} \
-        --host 0.0.0.0 \
-        --port 8000 \
-        --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} "
-
-    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
-        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
-    fi
-
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $PREFILL_CMD"
-    else
-        set -x
-        eval "$PREFILL_CMD" \
-            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
-        set +x
-        prefill0_pid=$!
-    fi
-
-
-    echo "Waiting for all prefill and decode servers to be up . . ."
-
-
-    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-        --node-ips ${IPADDRS} \
-        --node-ports 8000 \
-        --wait-for-all-ports \
-        --timeout 1800"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BARRIER_CMD"
-    else
-        eval "$BARRIER_CMD"
-    fi
-    echo "Congratulations!!! All prefill and decode servers are up . . ."
-
-    ROUTER_CMD="python -m sglang_router.launch_router \
-        --pd-disaggregation \
-        --port 30000 \
-        --policy random \
-        --prefill-policy random \
-        --decode-policy random \
-        ${PREFILL_ARGS} \
-        ${DECODE_ARGS}"
-
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $ROUTER_CMD"
-    else
-        ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log"
-        set -x
-        if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then
-            eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" &
-        else
-            eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 &
-        fi
-        set +x
-        proxy_pid=$!
-
-        # Wait for router to be ready via health endpoint
-        HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-            --node-ips ${NODE0_ADDR} \
-            --node-ports 30000 \
-            --wait-for-all-health \
-            --health-endpoint /readiness \
-            --timeout 1800"
-
-        if [[ "$DRY_RUN" -eq 1 ]]; then
-            echo "DRY RUN: $HEALTH_BARRIER_CMD"
-        else
-            eval "$HEALTH_BARRIER_CMD"
-        fi
-
-        echo "Router is ready for benchmarking"
-    fi
-
-
-    echo "Ready for benchmarking on ${host_name}:${host_ip}"
-
-    echo "Benchmarking on ${host_name}:${host_ip}"
-    cd $SGLANG_WS_PATH
-
-    # Export IS_MTP based on whether MTP is enabled
-    if [ "$DECODE_MTP_SIZE" -gt 0 ]; then
-        export IS_MTP=true
-    else
-        export IS_MTP=false
-    fi
-
-    # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
-    BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
-        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
-        ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \
-        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
-
-    if [[ "${EVAL_ONLY:-false}" == "true" ]]; then
-        echo "EVAL_ONLY mode: skipping throughput benchmark"
-    elif [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BENCH_CMD"
-    else
-        set -x
-        eval "$BENCH_CMD"
-        set +x
-    fi
-
-    # Run evaluation if requested (before killing router)
-    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
-        echo "Running lm-eval evaluation on Node 0..."
-
-        # Health check: verify the router is still serving before running eval.
-        # The throughput benchmark may have crashed/exhausted decode workers.
-        EVAL_HEALTH_OK=false
-        for _attempt in 1 2 3; do
-            if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then
-                EVAL_HEALTH_OK=true
-                break
-            fi
-            echo "Eval health check attempt $_attempt failed, retrying in 10s..."
-            sleep 10
-        done
-
-        if [[ "$EVAL_HEALTH_OK" != "true" ]]; then
-            echo "WARNING: Router health check failed after 3 attempts. Skipping eval."
-        else
-            # Must run from repo root so utils/evals/${task}.yaml resolves
-            pushd /workspace
-
-            # Source eval functions from benchmark_lib.sh
-            source /workspace/benchmarks/benchmark_lib.sh
-
-            # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list
-            if [[ -n "${EVAL_CONC:-}" ]]; then
-                export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}"
-            else
-                export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
-            fi
-
-            # Override eval context length with model's configured context_length
-            if [[ -n "$prefill_context_length" ]]; then
-                export EVAL_MAX_MODEL_LEN="$prefill_context_length"
-            fi
-
-            if [[ "$DRY_RUN" -eq 1 ]]; then
-                echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
-            else
-                # Run lm-eval against the router on port 30000
-                run_eval --framework lm-eval --port 30000
-                eval_rc=$?
-
-                if [[ $eval_rc -ne 0 ]]; then
-                    echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2
-                    EVAL_FAILED=1
-                else
-                    # Set metadata env vars for append_lm_eval_summary
-                    export TP="${PREFILL_TP_SIZE}"
-                    export CONC="${EVAL_CONCURRENT_REQUESTS}"
-                    export EP_SIZE=1
-                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}"
-                    export PREFILL_TP="${PREFILL_TP_SIZE}"
-                    export PREFILL_EP=1
-                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}"
-                    export PREFILL_NUM_WORKERS="${xP}"
-                    export DECODE_TP="${DECODE_TP_SIZE}"
-                    export DECODE_EP=1
-                    [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}"
-                    export DECODE_NUM_WORKERS="${yD}"
-                    export DP_ATTENTION="${PREFILL_ENABLE_DP}"
-                    export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}"
-                    export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}"
-                    export ISL="${BENCH_INPUT_LEN}"
-                    export OSL="${BENCH_OUTPUT_LEN}"
-                    # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE,
-                    # RESULT_FILENAME are already set via Docker -e flags from job.slurm
-
-                    append_lm_eval_summary
-                    # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace
-
-                    # Copy eval artifacts to run_logs for NFS extraction by runner
-                    EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
-                    mkdir -p "$EVAL_COPY_DIR"
-                    for f in meta_env.json; do
-                        [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
-                    done
-                    # Use find for glob patterns to avoid "no match" errors
-                    find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-                    find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-
-                    echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
-                fi
-            fi
-
-            popd
-        fi
-    fi
-
-    # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
-    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
-    mkdir -p "$LOGS_OUTPUT"
-
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/"
-        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
-    fi
-
-    echo "Killing the proxy server and prefill server"
-
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        kill $proxy_pid
-        kill $prefill0_pid
-    fi
-
-    if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then
-        echo "ERROR: eval failed; exiting node-0 with rc=1"
-        exit 1
-    fi
-
-elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
-    echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})"
-    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
-    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
-
-    PREFILL_MORI_MOE_ENV=""
-    set -x
-    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then
-        PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
-    fi
-    set +x
-    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
-        --model-path $MODEL_DIR/${MODEL_NAME} \
-        --disaggregation-mode prefill \
-        --disaggregation-ib-device ${IBDEVICES} \
-        --host 0.0.0.0 \
-        --port 8000 \
-        --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} "
-
-    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
-        rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
-        prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER))
-        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank"
-    fi
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $PREFILL_CMD"
-    else
-        set -x
-        eval "$PREFILL_CMD" \
-            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
-        set +x
-        prefill_pid=$!
-    fi
-
-    echo "Waiting for proxy server to be up..."
-    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-        --node-ips ${NODE0_ADDR} \
-        --node-ports 30000 \
-        --wait-for-all-ports \
-        --timeout 1800"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BARRIER_CMD"
-    else
-        eval "$BARRIER_CMD"
-    fi
-
-    echo "Waiting until proxy server closes..."
-    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
-        --remote-ip ${NODE0_ADDR} \
-        --remote-port 30000"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $WAIT_CMD"
-    else
-        eval "$WAIT_CMD"
-    fi
-
-    echo "Killing the rank $NODE_RANK prefill server"
-
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        kill $prefill_pid
-    fi
-
-else
-    RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER))
-    echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})"
-    echo "Using decode config: $DECODE_SERVER_CONFIG"
-    echo "Decode node rank: $RANK"
-    echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
-
-    DECODE_MORI_MOE_ENV=""
-    set -x
-    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then
-        DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}"
-    fi
-    set +x
-    DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
-        --model-path ${MODEL_DIR}/${MODEL_NAME} \
-        --disaggregation-mode decode \
-        --disaggregation-ib-device ${IBDEVICES} \
-        --host 0.0.0.0 \
-        --port 8000 \
-        --trust-remote-code \
-        ${DECODE_SERVER_CONFIG} "
-
-    if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then
-        rank=$((RANK % DECODE_NODES_PER_WORKER))
-        decode_idx=$((RANK / DECODE_NODES_PER_WORKER))
-        DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank"
-    fi
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $DECODE_CMD"
-    else
-        set -x
-        eval "$DECODE_CMD" \
-            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log &
-
-        set +x
-        decode_pid=$!
-    fi
-
-
-    echo "Waiting for proxy server to be up..."
-    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-        --node-ips ${NODE0_ADDR} \
-        --node-ports 30000 \
-        --wait-for-all-ports \
-        --timeout 1800"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BARRIER_CMD"
-    else
-        eval "$BARRIER_CMD"
-    fi
-
-
-    echo "Waiting until proxy server closes..."
-    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
-        --remote-ip ${NODE0_ADDR} \
-        --remote-port 30000"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $WAIT_CMD"
-    else
-        eval "$WAIT_CMD"
-    fi
-
-    echo "Killing the rank $RANK decode server"
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        kill $decode_pid
-    fi
-
-fi
-
-echo "Script completed successfully"
-exit 0
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
new file mode 100755
index 000000000..7eb7414a6
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -0,0 +1,780 @@
+#!/bin/bash
+# SGLang Disaggregated Server Launcher with Model-Specific Configurations
+# =============================================================================
+
+# =============================================================================
+# Environment Configuration
+# =============================================================================
+
+NODE0_ADDR="${NODE0_ADDR:-localhost}"
+NODE_RANK="${NODE_RANK:-0}"
+MODEL_DIR="${MODEL_DIR:-}"
+MODEL_NAME="${MODEL_NAME:-}"
+
+xP="${xP:-1}" #-> Number of Prefill Workers
+yD="${yD:-1}" #-> Number of Decode Workers
+
+IPADDRS="${IPADDRS:-localhost}"
+HEADNODE_PORT="${HEADNODE_PORT:-20000}"
+# Parallelism Configuration
+PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}"
+PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}"
+PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}"
+DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
+DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}"
+DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}"
+DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}"
+
+# Benchmark Configuration
+BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
+BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
+BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
+BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
+BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
+BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
+
+# Extract the maximum concurrency from the x-delimited list
+BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
+
+# Dry Run for debugging purpose
+DRY_RUN="${DRY_RUN:-0}"
+
+# GPU count (expandable for different hardware)
+GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+
+
+# =============================================================================
+# Dependencies and Environment Setup
+# =============================================================================
+source "$SGLANG_WS_PATH/env.sh"
+# Local address = the token right after "src"; its column shifts when the route has no "via <gw>" hop.
+host_ip=$(ip route get 1.1.1.1 | awk '{for (i = 1; i < NF; i++) if ($i == "src") {print $(i + 1); exit}}')
+host_name=$(hostname)
+
+# MORI_RDMA_TC configuration (optional)
+# If set by runner, use it for RDMA traffic class configuration
+# If not set, RDMA operations will proceed without QoS/traffic class settings
+if [[ -n "${MORI_RDMA_TC}" ]]; then
+    echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration"
+    echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC"
+else
+    echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration."
+    echo "[INFO] This is normal for clusters without QoS requirements."
+fi
+
+# =============================================================================
+# Model-Specific Configuration from YAML
+# =============================================================================
+MODELS_YAML="${SGLANG_WS_PATH}/models.yaml"
+
+if [[ ! -f "$MODELS_YAML" ]]; then
+    echo "ERROR: models.yaml not found at $MODELS_YAML"
+    exit 1
+fi
+
+# Load model config via inline Python (PyYAML is available in SGLang containers)
+# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP")
+# is done here in Python to avoid bash glob-expanding the * characters.
+eval "$(python3 -c "
+import yaml, sys, os
+
+config_path = '${MODELS_YAML}'
+model_name = '${MODEL_NAME}'
+
+with open(config_path) as f:
+    models = yaml.safe_load(f)
+
+if model_name not in models:
+    print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1')
+    sys.exit(0)
+
+m = models[model_name]
+
+def eval_formula(val):
+    \"\"\"Coerce a number to int, or evaluate a formula string over int-valued env vars (input returned unchanged on failure).\"\"\"
+    if isinstance(val, (int, float)):
+        return int(val)
+    s = str(val)
+    # Formula names resolve to env vars whose values parse as int. NOTE(review): eval() below is not a sandbox; keep models.yaml trusted.
+    ns = {}
+    for k, v in os.environ.items():
+        try:
+            ns[k] = int(v)
+        except (ValueError, TypeError):
+            pass
+    try:
+        return int(eval(s, {'__builtins__': {}}, ns))
+    except Exception as e:
+        print(f'WARNING: Cannot evaluate formula: {s} ({e})', file=sys.stderr)
+        return val
+
+def parse_range(cuda_range, default_start, default_end):
+    # 'START-END' -> (START, END) as strings; a malformed or dash-less value falls back to the given defaults.
+    lo, sep, hi = (part.strip() for part in str(cuda_range).partition('-'))
+    ok = bool(sep) and lo.isdigit() and hi.isdigit()
+    return (lo, hi) if ok else (str(default_start), str(default_end))
+
+# Output shell variables
+print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"')
+print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"')
+print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"')
+
+prefill = m.get('prefill', {})
+decode = m.get('decode', {})
+
+print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"')
+print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"')
+
+dp = prefill.get('dp', {})
+no_dp = prefill.get('no_dp', {})
+print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"')
+print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
+print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"')
+print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"')
+print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"')
+print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"')
+print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
+print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
+s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
+print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"')
+print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"')
+
+print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"')
+print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"')
+
+dp = decode.get('dp', {})
+ep_only = decode.get('ep_only', {})
+no_dp = decode.get('no_dp', {})
+
+# Decode DP config
+print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"')
+print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
+s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160)
+print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"')
+print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"')
+
+# Decode EP-only config (EP enabled but DP disabled)
+print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"')
+print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"')
+s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256)
+print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"')
+print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"')
+
+# Decode no-DP config
+print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
+print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
+s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
+print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"')
+print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"')
+")"
+
+echo "Loaded model configuration for: $MODEL_NAME"
+
+# Pick the prefill tuning set. With DP attention the *_DP values apply,
+# including a ready-made cuda-graph batch-size list; without it the *_NO_DP
+# values apply and the DP-only knobs below keep their "off" defaults.
+prefill_context_length=""
+prefill_max_total_tokens=""
+prefill_enable_two_batch_overlap="false"
+if [[ "$PREFILL_ENABLE_DP" != "true" ]]; then
+    prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END))
+    prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP
+    prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP
+else
+    prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP)
+    prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP
+    prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP
+    prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP
+    prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP
+    prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP
+    if [[ "$PREFILL_ENABLE_EP" == "true" ]]; then
+        # DP+EP: running requests follow the max bench concurrency; MORI_MAX_DISPATCH_TOKENS_PREFILL is left as-is.
+        prefill_max_running_requests=$BENCH_MAX_CONC_VALUE
+        prefill_dp_ranks=$PREFILL_TP_SIZE
+        MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2))
+        echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL"
+    fi
+fi
+
+# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
+if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
+    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END))
+    decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE))
+elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then
+    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END))
+    decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY
+else
+    decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END))
+    decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
+fi
+
+# When both DP and EP are enabled, override max-running-requests and dispatch tokens
+if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then
+    decode_max_running_requests=$BENCH_MAX_CONC_VALUE
+    decode_dp_ranks=$DECODE_TP_SIZE
+    MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks))
+    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10))
+    # Update derived variable
+    SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
+    export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD
+    echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD"
+fi
+
+# Assemble the per-role flag strings (these replace the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS).
+PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
+case "$PREFILL_DISABLE_RADIX_CACHE" in
+    True|true) PREFILL_MODE_FLAGS+=" --disable-radix-cache" ;;
+esac
+# Context length and the total-token cap are passed only when non-empty.
+[[ -z "$prefill_context_length" ]] || PREFILL_MODE_FLAGS+=" --context-length ${prefill_context_length}"
+[[ -z "$prefill_max_total_tokens" ]] || PREFILL_MODE_FLAGS+=" --max-total-tokens ${prefill_max_total_tokens}"
+case "$prefill_enable_two_batch_overlap" in
+    True|true)
+        PREFILL_MODE_FLAGS+=" --enable-two-batch-overlap"
+        PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true"
+        ;;
+esac
+
+# Decode flags: memory fraction, request cap and cuda-graph batch sizes.
+DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "
+case "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" in
+    True|true) DECODE_MODE_FLAGS+=" --prefill-round-robin-balance" ;;
+esac
+
+# With MTP enabled, both decode MoRI token limits are multiplied by (DECODE_MTP_SIZE + 1).
+if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
+    MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
+    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
+fi
+
+# =============================================================================
+# Cluster Topology Configuration
+# =============================================================================
+IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
+
+# Ceiling division by GPUS_PER_NODE for nodes-per-worker
+PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE))
+DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE))
+NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))
+
+# Build prefill arguments dynamically based on xP
+PREFILL_HEADNODE_URLS=()
+PREFILL_ARGS=""
+for i in $(seq 0 $((xP - 1))); do
+    prefill_idx=$((i * PREFILL_NODES_PER_WORKER))
+    PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}"
+    PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000"
+done
+
+# Build decode arguments dynamically based on yD
+DECODE_HEADNODE_URLS=()
+DECODE_ARGS=""
+for i in $(seq 0 $((yD - 1))); do
+    decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET))
+    DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}"
+    DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000"
+done
+
+echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}"
+echo "Decode  worker headnode list: ${DECODE_HEADNODE_URLS[@]}"
+
+# =============================================================================
+# Configuration Builder Functions
+# =============================================================================
+
+build_server_config() {
+    # Print the launch-flag string for one server role on stdout.
+    #   $1  mode: "prefill" or "decode" (any other value adds no mode-specific flags)
+    #   $2  model name (accepted for the call signature; not used in the body)
+    #   $3  TP size; also used as the EP and DP size when those are enabled
+    #   $4  "true" to add --ep-size
+    #   $5  "true" to add --dp-size and MODEL_DP_FLAGS
+    #   $6  MTP size; values > 0 add the speculative flags, decode mode only
+    # Reads MODEL_BASE_FLAGS, MODEL_MTP_FLAGS, MODEL_DP_FLAGS, PREFILL_MODE_FLAGS
+    # and DECODE_MODE_FLAGS. Flag order: parallelism, base, MTP, DP, mode-specific.
+    local mode="$1"
+    local tp_size="$3"
+    local enable_ep="$4"
+    local enable_dp="$5"
+    local decode_mtp_size="$6"
+    local role_flags=""
+    local cfg="--tp-size ${tp_size}"
+
+    # Parallelism: EP and DP sizes always equal the TP size when enabled.
+    [[ "$enable_ep" == "true" ]] && cfg+=" --ep-size ${tp_size}"
+    [[ "$enable_dp" == "true" ]] && cfg+=" --dp-size ${tp_size}"
+
+    # Model-wide base flags, skipped when empty.
+    cfg+="${MODEL_BASE_FLAGS:+ ${MODEL_BASE_FLAGS}}"
+
+    # MTP / speculative decoding flags apply to decode servers only.
+    if [ "$decode_mtp_size" -gt 0 ] && [[ "$mode" == "decode" ]]; then
+        cfg+=" ${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))"
+    fi
+
+    # DP-specific model flags, skipped when empty.
+    if [[ "$enable_dp" == "true" ]]; then
+        cfg+="${MODEL_DP_FLAGS:+ ${MODEL_DP_FLAGS}}"
+    fi
+
+    # Role-specific flags assembled earlier in the script.
+    case "$mode" in
+        prefill) role_flags="$PREFILL_MODE_FLAGS" ;;
+        decode)  role_flags="$DECODE_MODE_FLAGS" ;;
+    esac
+    cfg+="${role_flags:+ ${role_flags}}"
+
+    echo "$cfg"
+}
+
+# Build complete server configurations
+PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE")
+DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE")
+
+if [[ -n "$MODEL_NAME" ]]; then
+    echo "Using model-specific configuration for: $MODEL_NAME"
+fi
+
+if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then
+    PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
+    DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
+    unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
+    unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
+fi
+
+# =============================================================================
+# Container Synchronization
+# =============================================================================
+
+echo "Waiting at the container creation barrier on $host_name"
+python3 $SGLANG_WS_PATH/sync.py barrier \
+    --local-ip ${host_ip} \
+    --local-port 5000 \
+    --enable-port \
+    --node-ips ${IPADDRS} \
+    --node-ports 5000 \
+    --wait-for-all-ports \
+    --timeout 300
+
+
+# =============================================================================
+# Node Role Assignment and Server Launch
+# =============================================================================
+
+if [ "$NODE_RANK" -eq 0 ]; then
+    # Startup banner printed by node 0 only: job/node identity, the resolved
+    # prefill config, parallelism settings, router targets and MoRI env values.
+    echo "NODE INFO ======================================="
+    echo "================================================"
+    echo "Node List : ${SLURM_JOB_NODELIST}"
+    echo "Node IPs : ${IPADDRS}"
+    echo "Model Name : ${MODEL_NAME:-'Not specified'}"
+    echo "================================================"
+
+    echo "CLUSTER INFO ===================================="
+    echo "================================================"
+    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
+    echo "Decode  parallelism: TP=${DECODE_TP_SIZE},  EP enabled: ${DECODE_ENABLE_EP},  DP enabled: ${DECODE_ENABLE_DP},  MTP size=${DECODE_MTP_SIZE}"
+    # Report the nodes-per-worker values that drive role assignment and --nnodes.
+    # A plain TP/GPUS_PER_NODE division floors to 0 when TP is below one node.
+    echo "Prefill servers (${PREFILL_NODES_PER_WORKER} nodes): ${PREFILL_ARGS}"
+    echo "Decode servers  (${DECODE_NODES_PER_WORKER}  nodes): ${DECODE_ARGS}"
+    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
+    echo "Decode  env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
+    echo "Decode  env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} "
+
+    echo "================================================"
+
+    # start the head prefill server
+    # The MoE max-input-token env assignment is added only when a value was computed.
+    PREFILL_MORI_MOE_ENV=""
+    set -x
+    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then
+        PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
+    fi
+    set +x
+    # The command is kept as a string (env assignments first) so it can be echoed in dry runs and eval'ed otherwise.
+    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+        --model-path $MODEL_DIR/$MODEL_NAME \
+        --disaggregation-mode prefill \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${PREFILL_SERVER_CONFIG} "
+
+    # A worker spanning several nodes needs the distributed-init arguments; node 0 is rank 0 of the first worker.
+    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
+        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
+    fi
+
+    # NOTE(review): for a backgrounded pipeline, $! is the PID of its last command
+    # (tee here), not of the server process -- confirm the later kill does what is intended.
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill0_pid=$!
+    fi
+
+
+    echo "Waiting for all prefill and decode servers to be up . . ."
+
+    # Barrier on port 8000 (the server port used above) for every node in IPADDRS, 1800 s timeout.
+    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
+        --node-ips ${IPADDRS} \
+        --node-ports 8000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+    echo "Congratulations!!! All prefill and decode servers are up . . ."
+
+    # PD-disaggregation router on port 30000; PREFILL_ARGS / DECODE_ARGS carry the --prefill / --decode URLs.
+    ROUTER_CMD="python -m sglang_router.launch_router \
+        --pd-disaggregation \
+        --port 30000 \
+        --policy random \
+        --prefill-policy random \
+        --decode-policy random \
+        ${PREFILL_ARGS} \
+        ${DECODE_ARGS}"
+
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $ROUTER_CMD"
+    else
+        # Router output goes to a log under /tmp; it is also echoed to stdout when SGLANG_ROUTER_STDOUT_LOGS=1.
+        ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log"
+        set -x
+        if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then
+            eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" &
+        else
+            eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 &
+        fi
+        set +x
+        # NOTE(review): in the tee variant $! is tee's PID rather than the router's -- confirm.
+        proxy_pid=$!
+
+        # Wait for router to be ready via health endpoint
+        HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
+            --node-ips ${NODE0_ADDR} \
+            --node-ports 30000 \
+            --wait-for-all-health \
+            --health-endpoint /readiness \
+            --timeout 1800"
+
+        # This check sits inside the non-dry-run branch, so its DRY RUN arm is not expected to execute.
+        if [[ "$DRY_RUN" -eq 1 ]]; then
+            echo "DRY RUN: $HEALTH_BARRIER_CMD"
+        else
+            eval "$HEALTH_BARRIER_CMD"
+        fi
+
+        echo "Router is ready for benchmarking"
+    fi
+
+
+    echo "Ready for benchmarking on ${host_name}:${host_ip}"
+
+    echo "Benchmarking on ${host_name}:${host_ip}"
+    cd $SGLANG_WS_PATH
+
+    # Export IS_MTP based on whether MTP is enabled
+    if [ "$DECODE_MTP_SIZE" -gt 0 ]; then
+        export IS_MTP=true
+    else
+        export IS_MTP=false
+    fi
+
+    # Positional arguments passed to bench.sh, in order:
+    # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
+    # The quotes around BENCH_MAX_CONCURRENCY close and reopen the outer string, so they do not end up in BENCH_CMD.
+    BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
+        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
+        ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \
+        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+
+    # EVAL_ONLY skips the throughput run entirely; DRY_RUN only prints the command.
+    if [[ "${EVAL_ONLY:-false}" == "true" ]]; then
+        echo "EVAL_ONLY mode: skipping throughput benchmark"
+    elif [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BENCH_CMD"
+    else
+        set -x
+        eval "$BENCH_CMD"
+        set +x
+    fi
+
+    # Run evaluation if requested (before killing router)
+    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+        echo "Running lm-eval evaluation on Node 0..."
+
+        # Health check: verify the router is still serving before running eval.
+        # The throughput benchmark may have crashed/exhausted decode workers.
+        # Up to three probes of /readiness, 10 s apart.
+        EVAL_HEALTH_OK=false
+        for _attempt in 1 2 3; do
+            if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then
+                EVAL_HEALTH_OK=true
+                break
+            fi
+            echo "Eval health check attempt $_attempt failed, retrying in 10s..."
+            sleep 10
+        done
+
+        if [[ "$EVAL_HEALTH_OK" != "true" ]]; then
+            echo "WARNING: Router health check failed after 3 attempts. Skipping eval."
+        else
+            # Must run from repo root so utils/evals/${task}.yaml resolves
+            pushd /workspace
+
+            # Source eval functions from benchmark_lib.sh
+            source /workspace/benchmarks/benchmark_lib.sh
+
+            # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list
+            # (BENCH_MAX_CONCURRENCY is split on 'x' and the numeric maximum is taken).
+            if [[ -n "${EVAL_CONC:-}" ]]; then
+                export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}"
+            else
+                export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
+            fi
+
+            # Override eval context length with model's configured context_length
+            if [[ -n "$prefill_context_length" ]]; then
+                export EVAL_MAX_MODEL_LEN="$prefill_context_length"
+            fi
+
+            if [[ "$DRY_RUN" -eq 1 ]]; then
+                echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
+            else
+                # Run lm-eval against the router on port 30000
+                run_eval --framework lm-eval --port 30000
+                eval_rc=$?
+
+                # A failure is only recorded here (EVAL_FAILED); the script exits non-zero after cleanup.
+                if [[ $eval_rc -ne 0 ]]; then
+                    echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2
+                    EVAL_FAILED=1
+                else
+                    # Set metadata env vars for append_lm_eval_summary
+                    # (EP values default to 1 and are raised to the TP size when EP is enabled;
+                    # assigning to an already-exported variable keeps it exported).
+                    export TP="${PREFILL_TP_SIZE}"
+                    export CONC="${EVAL_CONCURRENT_REQUESTS}"
+                    export EP_SIZE=1
+                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}"
+                    export PREFILL_TP="${PREFILL_TP_SIZE}"
+                    export PREFILL_EP=1
+                    [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}"
+                    export PREFILL_NUM_WORKERS="${xP}"
+                    export DECODE_TP="${DECODE_TP_SIZE}"
+                    export DECODE_EP=1
+                    [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}"
+                    export DECODE_NUM_WORKERS="${yD}"
+                    export DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                    export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}"
+                    export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}"
+                    export ISL="${BENCH_INPUT_LEN}"
+                    export OSL="${BENCH_OUTPUT_LEN}"
+                    # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE,
+                    # RESULT_FILENAME are already set via Docker -e flags from job.slurm
+
+                    append_lm_eval_summary
+                    # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace
+
+                    # Copy eval artifacts to run_logs for NFS extraction by runner
+                    EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
+                    mkdir -p "$EVAL_COPY_DIR"
+                    for f in meta_env.json; do
+                        [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
+                    done
+                    # Use find for glob patterns to avoid "no match" errors
+                    find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+                    find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+
+                    echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                fi
+            fi
+
+            popd
+        fi
+    fi
+
+    # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
+    # Falls back to /run_logs when BENCHMARK_LOGS_DIR is unset.
+    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
+    mkdir -p "$LOGS_OUTPUT"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/"
+        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
+    fi
+
+    echo "Killing the proxy server and prefill server"
+
+    # The PIDs are only set on real runs, hence the DRY_RUN guard.
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $proxy_pid
+        kill $prefill0_pid
+    fi
+
+    # Surface an earlier eval failure as the node-0 exit status, after logs were copied and servers signalled.
+    if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then
+        echo "ERROR: eval failed; exiting node-0 with rc=1"
+        exit 1
+    fi
+
+elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
+    # Non-zero ranks below NODE_OFFSET: additional prefill nodes.
+    echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
+
+    # The MoE max-input-token env assignment is added only when a value was computed.
+    PREFILL_MORI_MOE_ENV=""
+    set -x
+    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then
+        PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
+    fi
+    set +x
+    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+        --model-path $MODEL_DIR/${MODEL_NAME} \
+        --disaggregation-mode prefill \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${PREFILL_SERVER_CONFIG} "
+
+    # Multi-node workers: rank is this node's position inside its worker group,
+    # prefill_idx selects that worker's head-node URL.
+    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
+        rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
+        prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER))
+        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank"
+    fi
+
+    # NOTE(review): $! after a backgrounded pipeline is the PID of tee, not the server -- confirm.
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill_pid=$!
+    fi
+
+    # Stay alive for the lifetime of the router on node 0: first wait for port 30000 to come up ...
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports 30000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    # ... then block until it goes away again (presumably when node 0 kills the router; see sync.py wait).
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port 30000"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the rank $NODE_RANK prefill server"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $prefill_pid
+    fi
+
+else
+    # Remaining ranks are decode nodes; RANK is the index among decode nodes
+    # (NODE_RANK minus the number of prefill nodes).
+    RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER))
+    echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})"
+    echo "Using decode config: $DECODE_SERVER_CONFIG"
+    echo "Decode node rank: $RANK"
+    echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
+
+    # The MoE max-input-token env assignment is added only when a value was computed.
+    DECODE_MORI_MOE_ENV=""
+    set -x
+    if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then
+        DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}"
+    fi
+    set +x
+    DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
+        --model-path ${MODEL_DIR}/${MODEL_NAME} \
+        --disaggregation-mode decode \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${DECODE_SERVER_CONFIG} "
+
+    # Multi-node workers: rank is this node's position inside its worker group,
+    # decode_idx selects that worker's head-node URL.
+    if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then
+        rank=$((RANK % DECODE_NODES_PER_WORKER))
+        decode_idx=$((RANK / DECODE_NODES_PER_WORKER))
+        DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank"
+    fi
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $DECODE_CMD"
+    else
+        set -x
+        eval "$DECODE_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log &
+        # NOTE(review): $! after a backgrounded pipeline is the PID of tee, not the server -- confirm.
+        set +x
+        decode_pid=$!
+    fi
+
+    # Stay alive for the lifetime of the router on node 0: first wait for port 30000 to come up ...
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports 30000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    # ... then block until it goes away again (presumably when node 0 kills the router; see sync.py wait).
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port 30000"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the rank $RANK decode server"
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $decode_pid
+    fi
+
+fi
+
+echo "Script completed successfully"
+exit 0
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
new file mode 100755
index 000000000..d61fe0359
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -0,0 +1,527 @@
+#!/bin/bash
+# vLLM Disaggregated Server Launcher with Model-Specific Configurations
+# =============================================================================
+#
+# Node role assignment (by NODE_RANK):
+#   0           -> Proxy/Router + first Prefill node  (kv_producer)
+#   1..xP-1     -> Additional Prefill nodes            (kv_producer)
+#   xP..xP+yD-1 -> Decode nodes                        (kv_consumer)
+#
+# Total nodes = xP + yD (router co-located with first prefill, like SGLang).
+
+# =============================================================================
+# Dependency Setup (idempotent; required when using base vLLM image)
+# =============================================================================
+source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh"
+
+# =============================================================================
+# Environment Configuration
+# =============================================================================
+
+# Every setting below keeps a non-empty value from the environment and
+# otherwise takes the default shown.
+: "${NODE0_ADDR:=localhost}"
+: "${NODE_RANK:=0}"
+: "${MODEL_DIR:=}"
+: "${MODEL_NAME:=}"
+
+# Worker counts: xP prefill nodes, yD decode nodes.
+: "${xP:=1}"
+: "${yD:=1}"
+
+: "${IPADDRS:=localhost}"
+
+# Benchmark Configuration
+: "${BENCH_INPUT_LEN:=1024}"
+: "${BENCH_OUTPUT_LEN:=1024}"
+: "${BENCH_RANDOM_RANGE_RATIO:=1}"
+: "${BENCH_REQUEST_RATE:=inf}"
+: "${BENCH_NUM_PROMPTS_MULTIPLIER:=10}"
+: "${BENCH_MAX_CONCURRENCY:=512}"
+
+: "${DRY_RUN:=0}"
+: "${GPUS_PER_NODE:=8}"
+
+: "${ROUTER_PORT:=30000}"
+: "${SERVER_PORT:=2584}"
+# The default engine id is derived from the model name resolved above.
+: "${ENGINE_ID:=${MODEL_NAME}-pd-run}"
+
+# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution)
+: "${MODEL_PATH:=${MODEL_DIR}/${MODEL_NAME}}"
+
+# =============================================================================
+# Dependencies and Environment Setup
+# =============================================================================
+# Quoted so a workspace path containing spaces still resolves.
+source "$WS_PATH/env.sh"
+
+# Management IP: the source address the kernel would use for an outbound route.
+# Take the word that follows "src" instead of a fixed column, because the field
+# position shifts when the route has no "via <gateway>" part.
+host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i < NF; i++) if ($i == "src") {print $(i + 1); exit}}')
+# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available)
+rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' | head -1)
+# Fall back to the management IP when no 192.168.x.x address exists.
+rdma_ip="${rdma_ip:-$host_ip}"
+host_name=$(hostname)
+
+echo "[INFO] Management IP (barriers/proxy): $host_ip"
+echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip"
+
+# =============================================================================
+# RDMA / Nixl Workarounds
+# =============================================================================
+
+setup_rdma_env() {
+    # Apply two per-node RDMA workarounds. Reads the global rdma_ip; takes no
+    # arguments. Both steps are best-effort and only log their outcome.
+
+    # Pensando ionic (RoCEv2) point-to-point /31 route fix.
+    # Each benic interface has a /31 to the TOR switch. Without explicit routes,
+    # traffic to other nodes' RDMA IPs falls through to the management network.
+    if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then
+        local rdma_subnet="${BASH_REMATCH[1]}"
+        local rdma_host="${BASH_REMATCH[2]}"
+        # The two addresses of a /31 differ only in the lowest bit, so the peer
+        # is found by flipping it. OR-ing with 1 would yield this node's own
+        # address whenever its last octet is odd.
+        local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host ^ 1 ))"
+        local rdma_iface
+        # Compare the address literally (bare, or followed by "/prefix"). A regex
+        # match treats dots as wildcards and is unanchored, so 192.168.1.1 would
+        # also select an interface holding 192.168.1.10/31.
+        rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 == ip || index($4, ip "/") == 1 {print $2}' | head -1)
+        if [[ -n "$rdma_iface" ]]; then
+            ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \
+                echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \
+                echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24"
+        fi
+    fi
+
+    # Patch Nixl UCX backend: set ucx_error_handling_mode=none.
+    # Required for ALL NIC types under high concurrency (C512+). Without this,
+    # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error
+    # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from
+    # recovering gracefully. This causes the prefill KV cache to fill to 100%
+    # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm
+    # incompatibility); on mlx5 NICs it was incorrectly skipped.
+    local nixl_api
+    nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null)
+    if [[ -n "$nixl_api" ]]; then
+        # The grep guard keeps the in-place edit from being applied twice.
+        if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then
+            sed -i '/self\.create_backend(bknd, init)/i\                init["ucx_error_handling_mode"] = "none"' "$nixl_api"
+            echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})"
+        else
+            echo "[PATCH] ucx_error_handling_mode already set in $nixl_api"
+        fi
+    fi
+}
+
+setup_rdma_env
+
+# Preflight checks: abort early, reporting on stderr, when a required input is missing.
+if [[ -z "$UCX_NET_DEVICES" ]]; then
+    echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2
+    exit 1
+fi
+
+# =============================================================================
+# Model-Specific Configuration from YAML
+# =============================================================================
+MODELS_YAML="${WS_PATH}/models_vllm.yaml"
+
+if [[ ! -f "$MODELS_YAML" ]]; then
+    # Name the file that is actually looked up (models_vllm.yaml).
+    echo "ERROR: models_vllm.yaml not found at $MODELS_YAML" >&2
+    exit 1
+fi
+
+if [[ -z "$MODEL_NAME" ]]; then
+    echo "ERROR: MODEL_NAME is not set" >&2; exit 1
+fi
+
+eval "$(python3 -c "
+import yaml, sys
+
+with open('${MODELS_YAML}') as f:
+    models = yaml.safe_load(f)
+
+model_name = '${MODEL_NAME}'
+if model_name not in models:
+    print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1')
+    sys.exit(0)
+
+m = models[model_name]
+
+def bash_escape(s):
+    \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\"
+    return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`')
+
+pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8'))
+df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8'))
+ev = bash_escape(m.get('env', ''))
+dev = bash_escape(m.get('decode_env', ''))
+print(f'PREFILL_SERVER_CONFIG=\"{pf}\"')
+print(f'DECODE_SERVER_CONFIG=\"{df}\"')
+print(f'MODEL_ENVS=\"{ev}\"')
+print(f'DECODE_MODEL_ENVS=\"{dev}\"')
+")"
+
+echo "Loaded model configuration for: $MODEL_NAME"
+
+# Apply tensor-parallel size and EP/DP flags from submit pipeline.
+
+# Print config $1 with its --tensor-parallel-size value forced to $2.
+# Both spellings, "--tensor-parallel-size N" and "--tensor-parallel-size=N",
+# are rewritten; the flag is appended when the config does not contain it.
+_override_tp_size() {
+    local cfg="$1" tp="$2"
+    if grep -q -- '--tensor-parallel-size' <<< "$cfg"; then
+        sed -E "s/--tensor-parallel-size([[:space:]]+|=)[0-9]+/--tensor-parallel-size ${tp}/g" <<< "$cfg"
+    else
+        printf '%s\n' "$cfg --tensor-parallel-size ${tp}"
+    fi
+}
+
+# Print config $1 with flag $2 appended, unless the flag is already present.
+_append_flag_once() {
+    local cfg="$1" flag="$2"
+    if grep -qF -- "$flag" <<< "$cfg"; then
+        printf '%s\n' "$cfg"
+    else
+        printf '%s\n' "$cfg $flag"
+    fi
+}
+
+# TP overrides apply only when the submit pipeline provided a size.
+if [[ -n "${PREFILL_TP_SIZE:-}" ]]; then
+    PREFILL_SERVER_CONFIG=$(_override_tp_size "$PREFILL_SERVER_CONFIG" "$PREFILL_TP_SIZE")
+fi
+if [[ -n "${DECODE_TP_SIZE:-}" ]]; then
+    DECODE_SERVER_CONFIG=$(_override_tp_size "$DECODE_SERVER_CONFIG" "$DECODE_TP_SIZE")
+fi
+
+# EP/DP switches default to "false" when unset.
+# NOTE(review): confirm --enable-dp-attention is accepted by the vLLM build in use.
+if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]]; then
+    PREFILL_SERVER_CONFIG=$(_append_flag_once "$PREFILL_SERVER_CONFIG" "--enable-expert-parallel")
+fi
+if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]]; then
+    PREFILL_SERVER_CONFIG=$(_append_flag_once "$PREFILL_SERVER_CONFIG" "--enable-dp-attention")
+fi
+if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]]; then
+    DECODE_SERVER_CONFIG=$(_append_flag_once "$DECODE_SERVER_CONFIG" "--enable-expert-parallel")
+fi
+if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]]; then
+    DECODE_SERVER_CONFIG=$(_append_flag_once "$DECODE_SERVER_CONFIG" "--enable-dp-attention")
+fi
+
+echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG"
+echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG"
+
+# =============================================================================
+# Container Synchronization
+# =============================================================================
+
+# Rendezvous on port 5000 across every address in IPADDRS, giving up after
+# 600 seconds (the exact flag semantics are defined by sync.py).
+echo "Waiting at the container creation barrier on $host_name"
+python3 $WS_PATH/sync.py barrier \
+    --local-ip ${host_ip} --local-port 5000 --enable-port \
+    --node-ips ${IPADDRS} --node-ports 5000 \
+    --wait-for-all-ports --timeout 600
+
+# =============================================================================
+# Cluster Topology Configuration
+# =============================================================================
+# IPADDRS is a comma-separated list: the first xP entries become prefill
+# nodes, every remaining entry becomes a decode node.
+IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
+
+PREFILL_ARGS="" DECODE_ARGS=""
+for ((i = 0; i < ${#IP_ARRAY[@]}; i++)); do
+    # Addresses are accumulated space-terminated.
+    if ((i < xP)); then
+        PREFILL_ARGS+="${IP_ARRAY[i]} "
+    else
+        DECODE_ARGS+="${IP_ARRAY[i]} "
+    fi
+done
+
+echo "Prefill node IPs: ${PREFILL_ARGS}"
+echo "Decode  node IPs: ${DECODE_ARGS}"
+
+# ZMQ registration port of the MoRI-IO proxy; it must match the vllm-router
+# --vllm-discovery-address setting. A value already in the environment wins.
+: "${PROXY_PING_PORT:=36367}"
+
+# Per-node vLLM runtime environment (static variables live in env.sh; these depend on per-node state).
+setup_vllm_env() {
+    # Side-channel host is taken from rdma_ip; the port is fixed at 5600.
+    export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip} VLLM_NIXL_SIDE_CHANNEL_PORT=5600
+    # MODEL_ENVS is expanded unquoted on purpose: each whitespace-separated word is exported as-is.
+    local kv
+    for kv in ${MODEL_ENVS}; do export "$kv"; done
+}
+
+# =============================================================================
+# Node Role Assignment and Server Launch
+# =============================================================================
+
+if [ "$NODE_RANK" -eq 0 ]; then  # rank 0: first prefill server, benchmark/eval driver, log collection
+    echo "NODE INFO ======================================="
+    echo "================================================"
+    echo "Node List : ${SLURM_JOB_NODELIST}"
+    echo "Node IPs  : ${IPADDRS}"
+    echo "Model     : ${MODEL_NAME:-'Not specified'}"
+    echo "================================================"
+
+    echo "CLUSTER INFO ===================================="
+    echo "================================================"
+    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill servers: ${PREFILL_ARGS}"
+    echo "Decode  servers: ${DECODE_ARGS}"
+    echo "================================================"
+
+    setup_vllm_env
+
+    # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE)
+    echo "Using external vllm-router container (started by job.slurm on this node)"
+
+    SERVED_MODEL="${MODEL_NAME}"
+    PREFILL_CMD="vllm serve ${MODEL_PATH} \
+        --served-model-name ${SERVED_MODEL} \
+        --port $SERVER_PORT \
+        --trust-remote-code \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        ${PREFILL_SERVER_CONFIG}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log"
+        set -x
+        eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 &
+        set +x
+        prefill_pid=$!
+    fi
+
+    echo "Waiting for all prefill and decode servers to be up . . ."
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: skipping barrier (wait-for-all-ports)"
+    else
+        python3 $WS_PATH/sync.py barrier \
+            --node-ips ${IPADDRS} \
+            --node-ports $SERVER_PORT \
+            --wait-for-all-ports \
+            --timeout 1800
+    fi
+
+    echo "Congratulations!!! All prefill and decode servers are up . . ."
+
+    # Wait for proxy /health to confirm it is accepting requests
+    HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-health \
+        --health-endpoint /health \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $HEALTH_BARRIER_CMD"
+    else
+        eval "$HEALTH_BARRIER_CMD"
+        echo "MoRI-IO proxy is ready for benchmarking"
+    fi
+
+    echo "Ready for benchmarking on ${host_name}:${host_ip}"
+    echo "Benchmarking on ${host_name}:${host_ip}"
+    cd $WS_PATH
+
+    export ROUTER_PORT=$ROUTER_PORT
+    BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \
+        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
+        ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
+        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+
+    if [[ "${EVAL_ONLY:-false}" == "true" ]]; then
+        echo "EVAL_ONLY mode: skipping throughput benchmark"
+    elif [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BENCH_CMD"
+    else
+        set -x
+        eval "$BENCH_CMD"
+        set +x
+    fi
+
+    # Run evaluation if requested (before killing router)
+    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+        echo "Running lm-eval evaluation on Node 0..."
+
+        EVAL_HEALTH_OK=false
+        for _attempt in 1 2 3; do
+            if curl -sf --max-time 10 "http://0.0.0.0:${ROUTER_PORT}/health" >/dev/null 2>&1; then
+                EVAL_HEALTH_OK=true
+                break
+            fi
+            echo "Eval health check attempt $_attempt failed, retrying in 10s..."
+            sleep 10
+        done
+
+        if [[ "$EVAL_HEALTH_OK" != "true" ]]; then
+            echo "WARNING: Router health check failed after 3 attempts. Skipping eval."
+        else
+            pushd /workspace
+
+            source /workspace/benchmarks/benchmark_lib.sh
+
+            if [[ -n "${EVAL_CONC:-}" ]]; then
+                export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}"
+            else  # no explicit EVAL_CONC: use the largest entry of the x-separated concurrency list
+                export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
+            fi
+
+            if [[ "$DRY_RUN" -eq 1 ]]; then
+                echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
+            else
+                run_eval --framework lm-eval --port "$ROUTER_PORT"
+                eval_rc=$?
+
+                if [[ $eval_rc -ne 0 ]]; then
+                    echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2
+                    EVAL_FAILED=1
+                else  # eval succeeded: export run metadata; optional pipeline vars default to empty/false when unset
+                    export TP="${PREFILL_TP_SIZE:-}"
+                    export CONC="${EVAL_CONCURRENT_REQUESTS}"
+                    export EP_SIZE=1
+                    [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE:-}"
+                    export PREFILL_TP="${PREFILL_TP_SIZE:-}"
+                    export PREFILL_EP=1
+                    [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE:-}"
+                    export PREFILL_NUM_WORKERS="${xP}"
+                    export DECODE_TP="${DECODE_TP_SIZE:-}"
+                    export DECODE_EP=1
+                    [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE:-}"
+                    export DECODE_NUM_WORKERS="${yD}"
+                    export DP_ATTENTION="${PREFILL_ENABLE_DP:-false}"
+                    export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP:-false}"
+                    export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP:-false}"
+                    export ISL="${BENCH_INPUT_LEN}"
+                    export OSL="${BENCH_OUTPUT_LEN}"
+
+                    append_lm_eval_summary
+
+                    EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
+                    mkdir -p "$EVAL_COPY_DIR"
+                    for f in meta_env.json; do
+                        [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
+                    done
+                    find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+                    find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
+
+                    echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
+                fi
+            fi
+
+            popd
+        fi
+    fi
+
+    # Copy benchmark/eval results to BENCHMARK_LOGS_DIR (mounted from host)
+    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
+    mkdir -p "$LOGS_OUTPUT"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/"
+        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
+    fi
+
+    echo "Killing the prefill server"
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true
+        sleep 2
+        pkill -f "vllm serve" 2>/dev/null || true
+    fi
+
+    if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then  # surface the eval failure only after logs are copied and the server is stopped
+        echo "ERROR: eval failed; exiting node-0 with rc=1"
+        exit 1
+    fi
+
+elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then  # ranks 1..xP-1: additional prefill servers
+    echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+
+    setup_vllm_env
+
+    SERVED_MODEL="${MODEL_NAME}"
+    PREFILL_CMD="vllm serve ${MODEL_PATH} \
+        --served-model-name ${SERVED_MODEL} \
+        --port $SERVER_PORT \
+        --trust-remote-code \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        ${PREFILL_SERVER_CONFIG}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log"
+        set -x
+        eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 &
+        set +x
+        prefill_pid=$!
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the prefill server"
+    [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true
+
+else  # every remaining rank: decode server
+    echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})"
+    echo "Using decode config: $DECODE_SERVER_CONFIG"
+
+    setup_vllm_env
+
+    for env_pair in ${DECODE_MODEL_ENVS}; do  # decode-only KEY=VALUE pairs, exported after the shared MODEL_ENVS
+        export "$env_pair"
+        echo "[DECODE_ENV] $env_pair"
+    done
+
+    SERVED_MODEL="${MODEL_NAME}"
+    DECODE_CMD="vllm serve ${MODEL_PATH} \
+        --served-model-name ${SERVED_MODEL} \
+        --port $SERVER_PORT \
+        --trust-remote-code \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        ${DECODE_SERVER_CONFIG}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $DECODE_CMD"
+    else
+        DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log"
+        set -x
+        eval "$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 &
+        set +x
+        decode_pid=$!
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the decode server"
+    [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true
+fi
+
+# echo "Killing the etcd server"
+# kill $etcd_pid 2>/dev/null || true
+# pkill -f etcd 2>/dev/null || true
+
+echo "Script completed successfully"
+exit 0
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
new file mode 100644
index 000000000..1b5c6f45e
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -0,0 +1,654 @@
+#!/bin/bash
+# =============================================================================
+# setup_deps.sh — Install missing vLLM disagg dependencies at container start.
+#
+# Base image: vllm/vllm-openai-rocm:v0.18.0
+# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist.
+# Idempotent: each component is skipped if already present.
+#
+# Build steps run in subshells to avoid CWD pollution between installers.
+# =============================================================================
+
+ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
+UCX_HOME="${UCX_HOME:-/usr/local/ucx}"
+RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}"
+
+_SETUP_START=$(date +%s)
+_SETUP_INSTALLED=()
+
+git_clone_retry() {
+    # Usage: git_clone_retry <url> <dest>. Returns 0 on success, 1 once max_tries attempts have failed.
+    local url="$1" dest="$2" max_tries=3 try
+    for (( try = 1; try <= max_tries; try++ )); do
+        if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi
+        rm -rf "$dest"  # remove any partial checkout so the next attempt starts clean
+        (( try == max_tries )) && break  # final attempt: no retry follows, so skip the message and the wait
+        echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..."
+        sleep 10
+    done
+    echo "[SETUP] git clone failed after $max_tries attempts: $url"
+    return 1
+}
+
+# ---------------------------------------------------------------------------
+# 5. Container RDMA/net tools
+#    - ibverbs-utils supplies ibv_devinfo
+#    - iproute2 supplies the `ip` command
+#    Both are used for in-container NIC/RDMA validation and routing checks.
+# ---------------------------------------------------------------------------
+install_recipe_deps() {
+    # Fast path: nothing to install when both commands already resolve.
+    if command -v ibv_devinfo &>/dev/null && command -v ip &>/dev/null; then
+        echo "[SETUP] Container RDMA/net tools already present"
+        return 0
+    fi
+    echo "[SETUP] Installing ibv_devinfo + iproute2 in container..."
+    apt-get update -q -y \
+        && apt-get install -q -y ibverbs-utils iproute2 \
+        && rm -rf /var/lib/apt/lists/*
+    # Re-probe the commands rather than trusting the apt exit status; a missing tool exits the shell.
+    if ! { command -v ibv_devinfo && command -v ip; } &>/dev/null; then
+        echo "[SETUP] ERROR: Failed to install ibv_devinfo/iproute2"; exit 1
+    fi
+    _SETUP_INSTALLED+=("ibverbs-utils+iproute2")
+}
+
+# ---------------------------------------------------------------------------
+# 6b. amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar)
+#     Installed here because ROCm vLLM does not ship the quark dependency:
+#     https://github.com/vllm-project/vllm/issues/35633
+# ---------------------------------------------------------------------------
+install_amd_quark() {
+    # Presence is probed by importing the module; stderr is discarded so a miss stays quiet.
+    if python3 -c "import quark" 2>/dev/null; then
+        echo "[SETUP] amd-quark already present"
+        return 0
+    fi
+    echo "[SETUP] Installing amd-quark for MXFP4 quantization support..."
+    pip install --quiet amd-quark
+    # Import again to confirm; a failed install only warns and the function still returns 0.
+    if python3 -c "import quark" 2>/dev/null; then
+        _SETUP_INSTALLED+=("amd-quark")
+    else
+        echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)"
+    fi
+}
+
+# ---------------------------------------------------------------------------
+# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock)
+#    In WRITE mode, save_kv_layer spins forever waiting for the handshake
+#    callback to set write_ready_flags. This blocks the model worker thread,
+#    preventing it from responding to EngineCore shm_broadcast, causing a
+#    TimeoutError cascade and crash.
+#    Patch: add time.sleep(0.001) and a timeout (VLLM_MORIIO_HANDSHAKE_TIMEOUT,
+#    default 30s) so the loop yields the CPU and cannot spin forever.
+# ---------------------------------------------------------------------------
+patch_moriio_save_kv_timeout() {
+    python3 -c '
+import sys
+
+try:
+    import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc
+    f = mc.__file__
+    src = open(f).read()
+
+    # Already patched? Match the marker prefix that the replacement text below actually inserts.
+    if "[PATCHED] save_kv_layer" in src:
+        print("[SETUP] save_kv_layer timeout patch already applied")
+        sys.exit(0)
+
+    old = """        while True:
+            if (
+                self._ready_requests.empty()
+                and remote_engine_id not in self.write_ready_flags
+            ):
+                continue"""
+
+    if old not in src:
+        print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch")
+        sys.exit(0)
+
+    new = """        # [PATCHED] save_kv_layer — null guard + timeout + sleep
+        if remote_engine_id is None:
+            return
+        import time as _time, os as _os
+        _wait_start = _time.monotonic()
+        _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30"))
+        while True:
+            if (
+                self._ready_requests.empty()
+                and remote_engine_id not in self.write_ready_flags
+            ):
+                _elapsed = _time.monotonic() - _wait_start
+                if _elapsed > _SAVE_KV_TIMEOUT:
+                    import logging as _logging
+                    _logging.getLogger("vllm.moriio").warning(
+                        "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for "
+                        "write_ready_flags[%s], breaking to unblock model "
+                        "worker", _elapsed, remote_engine_id)
+                    break
+                _time.sleep(0.001)
+                continue"""
+
+    new_src = src.replace(old, new)
+    if new_src == src:
+        print("[SETUP] WARN: replacement had no effect")
+        sys.exit(0)
+
+    open(f, "w").write(new_src)
+    print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep")
+except Exception as e:
+    print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr)
+'
+    _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch")  # NOTE: recorded even when the Python step skipped or warned
+}
+
+# ---------------------------------------------------------------------------
+# 9. Patch MoRIIO waiting_for_transfer_complete with bounded timeout
+#    status.Wait() blocks forever if an RDMA completion never arrives (e.g.,
+#    NIC queue saturation at C256), so it is replaced by a polling loop on
+#    status.Succeeded() under VLLM_MORIIO_TRANSFER_TIMEOUT (default 120s).
+#    Best-effort extras: write-task errors are caught in the writer loop, and
+#    deferred tasks expire after VLLM_MORIIO_DEFER_TIMEOUT (default 60s).
+# ---------------------------------------------------------------------------
+patch_moriio_transfer_timeout() {  # NOTE(review): the new loop polls Succeeded() only; a failed transfer presumably surfaces via the timeout — confirm
+    python3 -c '
+import os, sys, textwrap
+
+try:
+    import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me
+    f = me.__file__
+    src = open(f).read()
+
+    if "[PATCHED] transfer completion timeout" in src:
+        print("[SETUP] transfer completion timeout patch already applied")
+        sys.exit(0)
+
+    # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout ---
+    old_wait = """    def waiting_for_transfer_complete(self):
+        if not self.transfer_status:
+            return
+
+        transfers_to_wait = []
+        with self.lock:
+            transfers_to_wait = self.transfer_status[:]
+            self.transfer_status.clear()
+
+        for status in transfers_to_wait:
+            try:
+                status.Wait()
+                if not status.Succeeded():
+                    logger.error(
+                        "Transfer failed: %s, Code: %s", status.Message(), status.Code()
+                    )
+                    raise TransferError("MoRIIO transfer failed!")
+            except Exception as e:
+                logger.error("Transfer %s failed: %s", status, e)
+                raise"""
+
+    new_wait = """    def waiting_for_transfer_complete(self):
+        # [PATCHED] transfer completion timeout — bounded polling loop
+        import time as _time, os as _os
+        if not self.transfer_status:
+            return
+
+        _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120"))
+
+        transfers_to_wait = []
+        with self.lock:
+            transfers_to_wait = self.transfer_status[:]
+            self.transfer_status.clear()
+
+        _start = _time.monotonic()
+        remaining = list(transfers_to_wait)
+        _polls = 0
+        _completed = 0
+
+        while remaining:
+            _elapsed = _time.monotonic() - _start
+            if _elapsed > _timeout:
+                logger.error(
+                    "[HANGFIX] transfer_timeout elapsed=%.1fs "
+                    "pending=%d/%d completed=%d polls=%d "
+                    "action=raise_transfer_error",
+                    _elapsed, len(remaining), len(transfers_to_wait),
+                    _completed, _polls,
+                )
+                raise TransferError(
+                    f"RDMA transfer timeout after {_elapsed:.1f}s, "
+                    f"{len(remaining)}/{len(transfers_to_wait)} pending"
+                )
+
+            still_waiting = []
+            for status in remaining:
+                try:
+                    if status.Succeeded():
+                        _completed += 1
+                        continue
+                    still_waiting.append(status)
+                except Exception as e:
+                    logger.error(
+                        "[HANGFIX] transfer_poll_error error=%s", e)
+                    raise TransferError(
+                        f"Transfer failed during poll: {e}"
+                    ) from e
+
+            remaining = still_waiting
+            if remaining:
+                _time.sleep(0.005)
+                _polls += 1
+                if _polls % 2000 == 0:
+                    logger.warning(
+                        "[HANGFIX] transfer_wait pending=%d "
+                        "completed=%d elapsed=%.1fs timeout=%.0fs",
+                        len(remaining), _completed,
+                        _time.monotonic() - _start, _timeout,
+                    )"""
+
+    if old_wait not in src:
+        print("[SETUP] WARN: waiting_for_transfer_complete pattern not found")
+        sys.exit(0)
+
+    new_src = src.replace(old_wait, new_wait)
+
+    # --- Patch 2: Add error handling + cleanup to _write_worker_loop ---
+    old_loop = """            self._execute_write_task(task)"""
+
+    new_loop = """            try:
+                self._execute_write_task(task)
+            except Exception as _e:
+                logger.error(
+                    "[HANGFIX] req=%s write_task_failed error=%s "
+                    "action=cleanup_and_mark_done",
+                    task.request_id, _e,
+                )
+                try:
+                    _wr = self.worker.moriio_wrapper
+                    with _wr.lock:
+                        _wr.done_req_ids.append(task.request_id)
+                    _wr.done_remote_allocate_req_dict.pop(
+                        task.request_id, None
+                    )
+                except Exception:
+                    pass"""
+
+    if old_loop in new_src:
+        new_src = new_src.replace(old_loop, new_loop, 1)
+    else:
+        print("[SETUP] WARN: _write_worker_loop pattern not found for error handling")
+
+    # --- Patch 3: Add deferred task timeout to _process_deferred_tasks ---
+    old_deferred = """    def _process_deferred_tasks(self) -> None:
+        \"\"\"Process tasks that were previously deferred.\"\"\"
+        if not self._deferred_tasks:
+            return
+
+        still_deferred: list[WriteTask] = []
+        for task in self._deferred_tasks:
+            if self._is_remote_ready(task):
+                self._execute_write_task(task)
+            else:
+                still_deferred.append(task)
+
+        self._deferred_tasks = still_deferred"""
+
+    new_deferred = """    def _process_deferred_tasks(self) -> None:
+        \"\"\"Process tasks that were previously deferred.\"\"\"
+        # [PATCHED] deferred task timeout — prune stale tasks
+        import time as _time, os as _os
+        if not self._deferred_tasks:
+            return
+
+        _DEFER_TIMEOUT = float(
+            _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60"))
+
+        still_deferred: list[WriteTask] = []
+        for task in self._deferred_tasks:
+            _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic())
+            if _age > _DEFER_TIMEOUT:
+                logger.error(
+                    "[HANGFIX] req=%s deferred_task_expired age=%.1fs "
+                    "action=drop_and_mark_done",
+                    task.request_id, _age,
+                )
+                try:
+                    _wr = self.worker.moriio_wrapper
+                    with _wr.lock:
+                        _wr.done_req_ids.append(task.request_id)
+                    _wr.done_remote_allocate_req_dict.pop(
+                        task.request_id, None)
+                except Exception:
+                    pass
+                continue
+            if self._is_remote_ready(task):
+                try:
+                    self._execute_write_task(task)
+                except Exception as _e:
+                    logger.error(
+                        "[HANGFIX] req=%s deferred_write_failed error=%s",
+                        task.request_id, _e,
+                    )
+                    try:
+                        _wr = self.worker.moriio_wrapper
+                        with _wr.lock:
+                            _wr.done_req_ids.append(task.request_id)
+                        _wr.done_remote_allocate_req_dict.pop(
+                            task.request_id, None)
+                    except Exception:
+                        pass
+            else:
+                still_deferred.append(task)
+
+        self._deferred_tasks = still_deferred"""
+
+    if old_deferred in new_src:
+        new_src = new_src.replace(old_deferred, new_deferred, 1)
+    else:
+        print("[SETUP] WARN: _process_deferred_tasks pattern not found")
+
+    # --- Patch 4: Stamp defer time when task is deferred ---
+    old_defer_add = """                self._deferred_tasks.append(task)"""
+    new_defer_add = """                import time as _time2
+                if not hasattr(task, "_defer_ts"):
+                    task._defer_ts = _time2.monotonic()
+                self._deferred_tasks.append(task)"""
+    if old_defer_add in new_src:
+        new_src = new_src.replace(old_defer_add, new_defer_add, 1)
+    else:
+        print("[SETUP] WARN: deferred task timestamp patch target not found")
+
+    open(f, "w").write(new_src)
+    print("[SETUP] Patched: transfer timeout + writer error handling")
+
+except Exception as e:
+    print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr)
+'
+    _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch")  # NOTE(review): appended unconditionally, even when the Python step skipped or warned
+}
+
+# ---------------------------------------------------------------------------
+# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer)
+#     The READ-mode wait loop in start_load_kv spins without sleeping or
+#     timing out. The patch adds a timeout, a 1 ms sleep and a null guard.
+# ---------------------------------------------------------------------------
+patch_moriio_load_kv_timeout() {
+    python3 -c '
+import os, sys
+
+try:
+    import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as connector
+    with open(connector.__file__) as fh:
+        original = fh.read()
+
+    if "[PATCHED] start_load_kv timeout" in original:
+        print("[SETUP] start_load_kv timeout patch already applied")
+        sys.exit(0)
+
+    needle = """        while True:
+            if (
+                self._ready_requests.empty()
+                and remote_engine_id not in self.load_ready_flag
+                and wait_handshake_readd_req
+            ):
+                continue"""
+
+    if needle not in original:
+        print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping")
+        sys.exit(0)
+
+    replacement = """        # [PATCHED] start_load_kv timeout — prevent model worker deadlock
+        if remote_engine_id is None and not wait_handshake_readd_req:
+            self._reqs_to_send.update(metadata.reqs_to_send)
+            return
+        import time as _time, os as _os
+        _wait_start = _time.monotonic()
+        _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30"))
+        while True:
+            if (
+                self._ready_requests.empty()
+                and remote_engine_id not in self.load_ready_flag
+                and wait_handshake_readd_req
+            ):
+                if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT:
+                    import logging as _logging
+                    _logging.getLogger("vllm.moriio").warning(
+                        "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for "
+                        "load_ready_flag[%s]", _time.monotonic() - _wait_start,
+                        remote_engine_id)
+                    break
+                _time.sleep(0.001)
+                continue"""
+
+    patched = original.replace(needle, replacement)
+    if patched == original:
+        print("[SETUP] WARN: start_load_kv replacement had no effect")
+        sys.exit(0)
+    with open(connector.__file__, "w") as fh:
+        fh.write(patched)
+    print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep")
+except Exception as exc:
+    print(f"[SETUP] WARN patch start_load_kv: {exc}", file=sys.stderr)
+'
+    _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch")
+}
+
+# ---------------------------------------------------------------------------
+# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished
+#     vLLM asserts that a request in finished_recving must be either
+#     WAITING_FOR_REMOTE_KVS or finished.  In READ mode the request can
+#     transition to RUNNING before the aggregated recv notification arrives,
+#     crashing the engine with AssertionError.
+#     (present in v0.17.1 & v0.18.0)
+# ---------------------------------------------------------------------------
+patch_scheduler_read_mode_fix() {
+    python3 -c '
+import os, sys
+
+try:
+    import vllm.v1.core.sched.scheduler as smod
+    f = smod.__file__
+    src = open(f).read()
+
+    if "[PATCHED] read-mode recv assertion" in src:
+        print("[SETUP] scheduler read-mode assertion fix already applied")
+        sys.exit(0)
+
+    old_recv = """        for req_id in kv_connector_output.finished_recving or ():
+            logger.debug("Finished recving KV transfer for request %s", req_id)
+            assert req_id in self.requests
+            req = self.requests[req_id]
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                self.finished_recving_kv_req_ids.add(req_id)
+            else:
+                assert RequestStatus.is_finished(req.status)
+                self._free_blocks(self.requests[req_id])"""
+
+    new_recv = """        # [PATCHED] read-mode recv assertion — handle intermediate states
+        for req_id in kv_connector_output.finished_recving or ():
+            logger.debug("Finished recving KV transfer for request %s", req_id)
+            if req_id not in self.requests:
+                logger.debug("Request %s already removed, skipping recv", req_id)
+                continue
+            req = self.requests[req_id]
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                self.finished_recving_kv_req_ids.add(req_id)
+            elif RequestStatus.is_finished(req.status):
+                self._free_blocks(self.requests[req_id])
+            else:
+                logger.debug(
+                    "Request %s recv finished but status=%s (not "
+                    "WAITING_FOR_REMOTE_KVS or finished), skipping "
+                    "block free — will be freed on request completion",
+                    req_id, req.status.name)"""
+
+    if old_recv not in src:
+        print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping")
+        sys.exit(0)
+
+    new_src = src.replace(old_recv, new_recv, 1)
+
+    old_send = """        for req_id in kv_connector_output.finished_sending or ():
+            logger.debug("Finished sending KV transfer for request %s", req_id)
+            assert req_id in self.requests
+            self._free_blocks(self.requests[req_id])"""
+
+    new_send = """        for req_id in kv_connector_output.finished_sending or ():
+            logger.debug("Finished sending KV transfer for request %s", req_id)
+            if req_id not in self.requests:
+                logger.debug("Request %s already removed, skipping send", req_id)
+                continue
+            self._free_blocks(self.requests[req_id])"""
+
+    if old_send in new_src:
+        new_src = new_src.replace(old_send, new_send, 1)
+    else:
+        print("[SETUP] WARN: scheduler finished_sending pattern not found")
+
+    open(f, "w").write(new_src)
+    print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix")
+
+except Exception as e:
+    print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr)
+'
+    _SETUP_INSTALLED+=("scheduler-read-mode-fix")
+}
+
+# ---------------------------------------------------------------------------
+# 12. Idle KV block reaper for disaggregated prefill (READ mode)
+#     The RIXL notification path can lose `finished_sending` signals under
+#     high concurrency with ibv_post_send failures. This leaves KV blocks
+#     permanently allocated on the prefill engine even after the decode has
+#     finished reading. Over multiple benchmark rounds, leaked blocks
+#     accumulate and eventually saturate the prefill KV cache.
+#
+#     Fix: extend the scheduler's `_update_from_kv_xfer_finished()` so that,
+#     once no request has been RUNNING for >5s, it force-frees blocks for
+#     any remaining requests whose status is finished.
+# ---------------------------------------------------------------------------
+patch_prefill_idle_kv_reaper() {
+    python3 -c '
+import os, sys
+
+try:
+    import vllm.v1.core.sched.scheduler as smod
+    f = smod.__file__
+    src = open(f).read()
+
+    if "[PATCHED] idle-kv-reaper" in src:
+        print("[SETUP] idle KV block reaper already applied")
+        sys.exit(0)
+
+    # Find the _update_from_kv_xfer_finished method end and add reaper logic
+    # We inject into the method that processes KV transfer completions.
+    marker = "[PATCHED] read-mode recv assertion"
+    if marker not in src:
+        print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper")
+        sys.exit(0)
+
+    # Add reaper state initialization to __init__
+    old_init_marker = "self.finished_recving_kv_req_ids"
+    if old_init_marker not in src:
+        print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler")
+        sys.exit(0)
+
+    # Find the first occurrence to insert reaper state
+    init_pos = src.find(old_init_marker)
+    # Find the line containing it
+    line_end = src.find("\n", init_pos)
+    init_line = src[init_pos:line_end]
+
+    # Add reaper state after this line
+    reaper_init = init_line + """
+        # [PATCHED] idle-kv-reaper state
+        self._idle_kv_reaper_ts = 0.0
+        self._idle_kv_reaper_active = False"""
+
+    src = src.replace(init_line, reaper_init, 1)
+
+    # Now add the reaper logic at the end of _update_from_kv_xfer_finished
+    # Find the finished_sending handler we patched
+    send_handler = """        for req_id in kv_connector_output.finished_sending or ():
+            logger.debug("Finished sending KV transfer for request %s", req_id)
+            if req_id not in self.requests:
+                logger.debug("Request %s already removed, skipping send", req_id)
+                continue
+            self._free_blocks(self.requests[req_id])"""
+
+    reaper_logic = send_handler + """
+
+        # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks
+        import time as _time
+        _REAPER_IDLE_SECS = 5.0
+        _num_running = sum(1 for r in self.requests.values()
+                          if r.status == RequestStatus.RUNNING)
+        _should_reap = (_num_running == 0)
+
+        if _should_reap:
+            if not self._idle_kv_reaper_active:
+                self._idle_kv_reaper_active = True
+                self._idle_kv_reaper_ts = _time.monotonic()
+            elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS:
+                _reaped = 0
+                _reap_ids = []
+                for _rid, _req in list(self.requests.items()):
+                    if RequestStatus.is_finished(_req.status):
+                        _reap_ids.append(_rid)
+                for _rid in _reap_ids:
+                    try:
+                        _req = self.requests[_rid]
+                        self._free_blocks(_req)
+                        _reaped += 1
+                    except Exception as _e:
+                        logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e)
+                if _reaped > 0:
+                    logger.warning(
+                        "[KV-REAPER] Force-freed blocks for %d finished "
+                        "requests after %.1fs idle",
+                        _reaped, _time.monotonic() - self._idle_kv_reaper_ts)
+                self._idle_kv_reaper_ts = _time.monotonic()
+        else:
+            self._idle_kv_reaper_active = False"""
+
+    if send_handler in src:
+        src = src.replace(send_handler, reaper_logic, 1)
+    else:
+        print("[SETUP] WARN: send handler not found for reaper injection")
+        sys.exit(0)
+
+    open(f, "w").write(src)
+    print("[SETUP] Patched: idle KV block reaper for prefill")
+
+except Exception as e:
+    print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr)
+'
+    _SETUP_INSTALLED+=("idle-kv-reaper")
+}
+
+# =============================================================================
+# Run installers
+# =============================================================================
+
+install_recipe_deps
+install_amd_quark
+patch_moriio_save_kv_timeout
+patch_moriio_transfer_timeout
+patch_moriio_load_kv_timeout
+patch_scheduler_read_mode_fix
+patch_prefill_idle_kv_reaper
+
+# =============================================================================
+# Export paths (persists for server.sh since this file is sourced)
+# =============================================================================
+
+export ROCM_PATH="${ROCM_PATH}"
+export UCX_HOME="${UCX_HOME}"
+export RIXL_HOME="${RIXL_HOME}"
+export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}"
+export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}"
+
+_SETUP_END=$(date +%s)  # epoch seconds; elapsed time is END - START
+if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then
+    echo "[SETUP] All dependencies already present ($(( _SETUP_END - _SETUP_START ))s wallclock)"
+else
+    echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s"
+fi
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index d2c49bc9e..fa3d65418 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -2,37 +2,51 @@
 #
 # Cluster Configuration Template for Multi-Node Disaggregated Serving
 #
-# This script submits a multi-node SGLang disaggregated benchmark job to SLURM.
+# This script submits a multi-node disaggregated benchmark job to SLURM.
 # It must be configured for your specific cluster before use.
+#
+# FRAMEWORK (required) is exported to the job as ENGINE, for example:
+#   sglang-disagg = SGLang disaggregated serving, vllm-disagg = vLLM
+#
+# Router is co-located with the first prefill node (same for both engines),
+# so NUM_NODES = PREFILL_NODES + DECODE_NODES.
 
 usage() {
     cat << 'USAGE'
-This script aims to provide a one-liner call to the submit_job_script.py,
-so that the deployment process can be further simplified.
-
-To use this script, fill in the following script and run it under your `slurm_jobs` directory:
-======== begin script area ========
-# REQUIRED: Cluster-specific configuration
-export SLURM_ACCOUNT=              # Your SLURM account name
-export SLURM_PARTITION=            # SLURM partition to submit to
-export TIME_LIMIT=                 # Job time limit (e.g., "08:00:00")
-
-# REQUIRED: Model and container paths
-export MODEL_PATH=                 # Path to model directory (e.g., /mnt/models, /nfsdata)
-export CONTAINER_IMAGE=            # Path to container squash file
-
-# REQUIRED: Hardware configuration
-export GPUS_PER_NODE=              # GPUs per node (e.g., 8 for MI355X, 4 for MI325X)
-
-# OPTIONAL: RDMA/Network configuration (set in runners/launch_mi355x-amds.sh for AMD)
-# export IBDEVICES=                # RDMA device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
-# export MORI_RDMA_TC=             # RDMA traffic class (e.g., 96, 104)
-
-bash submit.sh \
-$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \
-$ADDITIONAL_FRONTENDS \
-$ISL $OSL $CONCURRENCIES $REQUEST_RATE
-======== end script area ========
+Usage:
+  bash submit.sh <PREFILL_NODES> <PREFILL_WORKERS> <DECODE_NODES> <DECODE_WORKERS> \
+                 <ISL> <OSL> <CONCURRENCIES> <REQUEST_RATE> \
+                 <PREFILL_ENABLE_EP> <PREFILL_ENABLE_DP> \
+                 <DECODE_ENABLE_EP> <DECODE_ENABLE_DP> \
+                 <PREFILL_TP> <DECODE_TP> <RANDOM_RANGE_RATIO> [NODE_LIST]
+
+Arguments:
+  PREFILL_NODES        Number of prefill nodes
+  PREFILL_WORKERS      Number of prefill workers (usually 1)
+  DECODE_NODES         Number of decode nodes
+  DECODE_WORKERS       Number of decode workers (usually 1)
+  ISL                  Input sequence length
+  OSL                  Output sequence length
+  CONCURRENCIES        Concurrency levels, delimited by 'x' (e.g., "8x16x32")
+  REQUEST_RATE         Request rate ("inf" for max throughput)
+  PREFILL_ENABLE_EP    true/false or 1/0 (expert parallelism on prefill)
+  PREFILL_ENABLE_DP    true/false or 1/0 (data-parallel attention on prefill)
+  DECODE_ENABLE_EP     true/false or 1/0 (expert parallelism on decode)
+  DECODE_ENABLE_DP     true/false or 1/0 (data-parallel attention on decode)
+  PREFILL_TP           Tensor parallel size per prefill node
+  DECODE_TP            Tensor parallel size per decode node
+  RANDOM_RANGE_RATIO   Random range ratio for benchmark client
+  NODE_LIST            Optional: comma-separated hostnames (must match NUM_NODES)
+
+Required environment variables:
+  SLURM_ACCOUNT    SLURM account name
+  SLURM_PARTITION  SLURM partition
+  TIME_LIMIT       Job time limit (e.g., "08:00:00")
+  MODEL_PATH       Path to model directory (e.g., /nfsdata)
+  MODEL_NAME       Model name directory
+  CONTAINER_IMAGE  Docker image name (e.g., vllm_disagg_pd:latest)
+  RUNNER_NAME      Runner identifier (for job name)
+  FRAMEWORK        Serving framework (e.g., sglang-disagg, vllm-disagg)
 USAGE
 }
 
@@ -53,6 +67,7 @@ check_env MODEL_PATH
 check_env MODEL_NAME
 check_env CONTAINER_IMAGE
 check_env RUNNER_NAME
+check_env FRAMEWORK
 
 # GPUS_PER_NODE defaults to 8 (MI355X). Set to 4 for MI325X if needed.
 GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
@@ -66,31 +81,32 @@ ISL=$5
 OSL=$6
 CONCURRENCIES=$7
 REQUEST_RATE=$8
-PREFILL_ENABLE_EP=${9:-1}
-PREFILL_ENABLE_DP=${10:-1}
-DECODE_ENABLE_EP=${11:-1}
-DECODE_ENABLE_DP=${12:-1}
+PREFILL_ENABLE_EP=${9:-true}
+PREFILL_ENABLE_DP=${10:-true}
+DECODE_ENABLE_EP=${11:-true}
+DECODE_ENABLE_DP=${12:-true}
 PREFILL_TP=${13:-8}
 DECODE_TP=${14:-8}
-RANDOM_RANGE_RATIO=${15}
+RANDOM_RANGE_RATIO=${15:-0.8}
 NODE_LIST=${16}
 
-
 NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
 profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}"
 
 # Export variables for the SLURM job
+export ENGINE="${FRAMEWORK:-sglang}"
 export MODEL_DIR=$MODEL_PATH
 export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE
 export PROFILER_ARGS=$profiler_args
 
-
-
+# Engine-specific xP/yD semantics and TP exports
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300}
+    export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
+fi
+# xP = prefill workers, yD = decode workers (may span multiple nodes)
 export xP=$PREFILL_WORKERS
 export yD=$DECODE_WORKERS
-export NUM_NODES=$NUM_NODES
-export GPUS_PER_NODE=$GPUS_PER_NODE
-export MODEL_NAME=$MODEL_NAME
 export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS ))
 export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP}
 export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP}
@@ -98,12 +114,16 @@ export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS ))
 export DECODE_ENABLE_EP=${DECODE_ENABLE_EP}
 export DECODE_ENABLE_DP=${DECODE_ENABLE_DP}
 export DECODE_MTP_SIZE=${DECODE_MTP_SIZE}
+
+export NUM_NODES=$NUM_NODES
+export GPUS_PER_NODE=$GPUS_PER_NODE
+export MODEL_NAME=$MODEL_NAME
 export BENCH_INPUT_LEN=${ISL}
 export BENCH_OUTPUT_LEN=${OSL}
-export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO}
-export BENCH_NUM_PROMPTS_MULTIPLIER=10
+export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10}
 export BENCH_MAX_CONCURRENCY=${CONCURRENCIES}
 export BENCH_REQUEST_RATE=${REQUEST_RATE}
+export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8}
 
 # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker)
 export RUN_EVAL="${RUN_EVAL:-false}"
@@ -118,13 +138,10 @@ export SPEC_DECODING="${SPEC_DECODING:-}"
 export IS_MULTINODE="${IS_MULTINODE:-false}"
 
 # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output.
-# SLURM writes output files on the batch node, so /tmp won't work (node-local).
-# Defaults to a sibling directory of the submit working directory.
 export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
 mkdir -p "$BENCHMARK_LOGS_DIR"
 
 # Optional: pass an explicit node list to sbatch.
-# NODE_LIST is expected to be comma-separated hostnames.
 NODELIST_OPT=()
 if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then
     IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST"
@@ -137,6 +154,63 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then
     NODELIST_OPT=(--nodelist "$NODELIST_CSV")
 fi
 
+# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets).
+# SLURM_EXCLUDE_NODES is a comma-separated host list; export it as "" to disable.
+EXCLUDE_OPT=()
+SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES-mia1-p01-g11,mia1-p01-g12,mia1-p01-g15}"
+if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then
+    EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES")
+fi
+
+# =============================================================================
+# Reuse existing allocation (skip sbatch)
+# =============================================================================
+# When SLURM_REUSE_JOBID is set, launch job.slurm directly (as a detached
+# background process) instead of submitting it. Inner `srun` calls pick up the
+# allocation via SLURM_JOB_ID; SLURM_OVERLAP=1 lets them share task slots with
+# the interactive shell already holding the allocation.
+if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then
+    REUSE_JID="$SLURM_REUSE_JOBID"
+    echo "Reusing existing Slurm allocation ${REUSE_JID} (skipping sbatch)" >&2
+
+    # Use the inherited SLURM_JOB_NODELIST if set; otherwise ask squeue for the job's nodes.
+    ALLOC_NODELIST="${SLURM_JOB_NODELIST:-$(squeue -h -j "$REUSE_JID" -o '%N' 2>/dev/null)}"
+    if [[ -z "$ALLOC_NODELIST" ]]; then
+        echo "Error: could not resolve nodelist for job ${REUSE_JID}" >&2
+        exit 1
+    fi
+    ALLOC_NNODES=$(scontrol show hostnames "$ALLOC_NODELIST" | wc -l)  # count hosts in the nodelist
+    if [[ "$ALLOC_NNODES" -lt "$NUM_NODES" ]]; then  # extra nodes are accepted; too few is fatal
+        echo "Error: allocation ${REUSE_JID} has ${ALLOC_NNODES} nodes, need ${NUM_NODES}" >&2
+        exit 1
+    fi
+
+    export SLURM_JOB_ID="$REUSE_JID"  # SLURM_* variables are set by hand because sbatch is skipped
+    export SLURM_JOBID="$REUSE_JID"
+    export SLURM_JOB_NODELIST="$ALLOC_NODELIST"
+    export SLURM_NODELIST="$ALLOC_NODELIST"
+    export SLURM_NNODES="$ALLOC_NNODES"
+    export SLURM_JOB_NUM_NODES="$ALLOC_NNODES"
+    export SLURM_NTASKS="$ALLOC_NNODES"
+    export SLURM_NPROCS="$ALLOC_NNODES"
+    export SLURM_NTASKS_PER_NODE=1
+    export SLURM_TASKS_PER_NODE="1(x${ALLOC_NNODES})"
+    export SLURM_OVERLAP=1
+    export SLURM_SUBMIT_DIR="$(pwd)"
+
+    STDOUT_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.out"
+    STDERR_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.err"
+    rm -f "$STDOUT_LOG" "$STDERR_LOG"  # drop logs left by an earlier run under this job id
+
+    nohup bash "$(dirname "$0")/job.slurm" >"$STDOUT_LOG" 2>"$STDERR_LOG" &  # detached; not waited on
+    INLINE_PID=$!
+    echo "$INLINE_PID" > "${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.pid"
+    echo "Started job.slurm (pid=${INLINE_PID}); logs: ${STDOUT_LOG}" >&2
+
+    echo "$REUSE_JID"  # stdout carries only the job id; status messages above go to stderr
+    exit 0
+fi
+
 # Construct the sbatch command
 sbatch_cmd=(
     sbatch
@@ -145,6 +219,7 @@ sbatch_cmd=(
     -N "$NUM_NODES"
     -n "$NUM_NODES"
     "${NODELIST_OPT[@]}"
+    "${EXCLUDE_OPT[@]}"
     --time "$TIME_LIMIT"
     --partition "$SLURM_PARTITION"
     --account "$SLURM_ACCOUNT"
@@ -154,7 +229,6 @@ sbatch_cmd=(
     "$(dirname "$0")/job.slurm"
 )
 
-# todo: --parsable outputs only the jobid and cluster name, test if jobid;clustername is correct
 JOB_ID=$("${sbatch_cmd[@]}")
 if [[ $? -ne 0 ]]; then
     echo "Error: Failed to submit job with sbatch" >&2
diff --git a/benchmarks/multi_node/amd_utils/sync.py b/benchmarks/multi_node/amd_utils/sync.py
index 140951519..3678e7614 100755
--- a/benchmarks/multi_node/amd_utils/sync.py
+++ b/benchmarks/multi_node/amd_utils/sync.py
@@ -143,7 +143,10 @@ def close_port():
             time.sleep(30)
 
     if args.enable_port:
-        time.sleep(30)
+        # Keep the port open long enough for slow nodes to pass their barrier.
+        # The previous 30s was too short when setup times vary by minutes.
+        grace = max(60, args.timeout // 2) if args.timeout > 0 else 300
+        time.sleep(grace)
         close_port()
 
 
diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
index 6a7314ab4..d17d1a323 100644
--- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
@@ -19,7 +19,8 @@ check_env_vars \
     DECODE_DP_ATTN \
     PREFILL_NODES \
     DECODE_NODES \
-    RANDOM_RANGE_RATIO
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
index 0124d4b4d..a8c0d2743 100644
--- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
@@ -19,7 +19,8 @@ check_env_vars \
     DECODE_DP_ATTN \
     PREFILL_NODES \
     DECODE_NODES \
-    RANDOM_RANGE_RATIO
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh
new file mode 100755
index 000000000..d7995fb25
--- /dev/null
+++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO} \
+    "${NODELIST:-}")
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh
new file mode 100644
index 000000000..a9a28d889
--- /dev/null
+++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+JOB_ID=$(bash ./submit.sh "$PREFILL_NODES" \
+    "$PREFILL_NUM_WORKERS" \
+    "$DECODE_NODES" \
+    "$DECODE_NUM_WORKERS" \
+    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
+    "${PREFILL_ENABLE_EP}" "${PREFILL_ENABLE_DP}" \
+    "${DECODE_ENABLE_EP}" "${DECODE_ENABLE_DP}" \
+    "${PREFILL_TP}" "${DECODE_TP}" \
+    "${RANDOM_RANGE_RATIO}" \
+    "${NODELIST:-}")  # every arg quoted so an empty value cannot shift submit.sh's positional order
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ad37e0c27..def63fd87 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2974,6 +2974,18 @@
     - "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440
 
+- config-keys:
+    - kimik2.5-fp4-mi355x-vllm-disagg
+  description:
+    - "Add Kimi-K2.5-MXFP4 FP4 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569
+
+- config-keys:
+    - minimaxm2.5-fp8-mi355x-vllm-disagg
+  description:
+    - "Add MiniMax-M2.5 FP8 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569
+
 - config-keys:
     - dsv4-fp4-mi355x-vllm
   description:
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index a8033847e..00fd994f3 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -52,11 +52,27 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
 
     # Ensure root-owned files are cleaned up even on early exit to prevent
-    # EACCES errors when the next GH Actions job checks out on this runner
-    trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT
+    # EACCES errors when the next GH Actions job checks out on this runner.
+    # Always preserve slurm logs as CI artifacts for debugging.
+    cleanup_and_save_logs() {
+        if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then  # only under GitHub Actions with a known job id
+            local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts"
+            mkdir -p "$art_dir"
+            cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true  # best effort
+        fi
+        # Echo the last 100 lines of the job's .err file (when non-empty) so failures are visible in CI output
+        local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err"
+        if [[ -s "$err_file" ]]; then
+            echo "=== Slurm job stderr ==="
+            tail -100 "$err_file"
+            echo "========================"
+        fi
+        sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true  # runs last, after the logs were copied and echoed
+    }
+    trap cleanup_and_save_logs EXIT
 
     SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh"
-    if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then
+    if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then
         BENCHMARK_SUBDIR="multi_node"
     else
         BENCHMARK_SUBDIR="single_node"
@@ -108,12 +124,19 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
         cat > collect_latest_results.py <<'PY'
 import os, sys
-sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
-for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
+job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5]
+logs_root = f"{job_dir}/logs/"
+candidates = []
+if os.path.isdir(logs_root):
+    for name in os.listdir(logs_root):
+        subdir = f"{logs_root}{name}/{framework}_isl_{isl}_osl_{osl}"
+        if os.path.isdir(subdir):
+            candidates.append(subdir)
+for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]:
     print(path)
 PY
 
-        LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1)
+        LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1 "$FRAMEWORK")
         if [ -z "$LOGS_DIR" ]; then
             echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
             exit 1
@@ -162,16 +185,7 @@ PY
 
     sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
 
-    # Upload logs as artifact if running in GitHub Actions
-    if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
-        ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts"
-        mkdir -p "$ARTIFACT_DIR"
-        cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true
-        echo "Logs copied to $ARTIFACT_DIR for artifact upload"
-    fi
-
-    # Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup
-    sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true
+    # Log preservation and cleanup handled by EXIT trap (cleanup_and_save_logs)
 
 else