Add DeepSeek-V4-Flash-FP8 single-node H200 SGLang benchmark (TP4)

Adds the dsv4-fp8-h200-sglang config entry, its benchmark script, and a
perf-changelog record. Both H200 launchers bind-mount the workspace at /ix
instead of /workspace when the image name contains "deepseek-v4-hopper".

Note: line breaks and indentation in this patch were rebuilt from a
whitespace-collapsed copy, so blank context lines and YAML indentation are
best-effort, and the blob ids on the index lines predate the quoting and
comment changes made to the added lines.

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ec9cbc11e..a758c7df0 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2363,6 +2363,24 @@ dsr1-fp8-h200-sglang:
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
+dsv4-fp8-h200-sglang:
+  image: lmsysorg/sglang:deepseek-v4-hopper
+  model: sgl-project/DeepSeek-V4-Flash-FP8
+  model-prefix: dsv4
+  runner: h200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 4, conc-start: 4, conc-end: 64 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 4, conc-start: 4, conc-end: 32 }
+
 qwen3.5-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu129-amd64
   model: Qwen/Qwen3.5-397B-A17B-FP8
diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh
new file mode 100755
index 000000000..45220dc3d
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp8_h200.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+
+# Benchmark entry point for the dsv4 / fp8 / h200 SGLang configuration.
+# Starts sglang.launch_server in the background, waits for it to come up, runs
+# the serving benchmark, and runs lm-eval afterwards when RUN_EVAL=true.
+# Helpers such as check_env_vars are expected to come from benchmark_lib.sh.
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+hf download "$MODEL"
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+export SGLANG_DSV4_FP4_EXPERTS=0
+
+# TODO(Cam): the lmsysorg/sglang:deepseek-v4-hopper image installs sglang editable
+# at /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
+# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install. The
+# runner mounts at /ix for this image; paths here are $PWD-relative to be agnostic.
+# Drop once lmsys moves sglang back out of /workspace.
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+# Kept as a plain string on purpose: it is expanded unquoted below so that it
+# contributes either no arguments or a flag plus its value to the server command.
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+set -x
+# shellcheck disable=SC2086  # EVAL_CONTEXT_ARGS must word-split
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" --host 0.0.0.0 --port "$PORT" --trust-remote-code \
+--tp "$TP" \
+--moe-runner-backend flashinfer_mxfp4 \
+--chunked-prefill-size 4096 \
+--disable-flashinfer-autotune \
+--disable-radix-cache $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $((CONC * 10)) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir "$PWD/"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ddc6409c2..6ff2c8c35 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,13 @@
+- config-keys:
+    - dsv4-fp8-h200-sglang
+  description:
+    - "Add DeepSeek-V4-Flash-FP8 single-node H200 SGLang benchmark (TP4)"
+    - "Container: lmsysorg/sglang:deepseek-v4-hopper"
+    - "Model: sgl-project/DeepSeek-V4-Flash-FP8"
+    - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+    - "Prefix caching and speculative decoding disabled for baseline numbers"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD
+
 - config-keys:
     - dsr1-fp8-h100-dynamo-trt
     - dsr1-fp8-h100-dynamo-sglang
diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh
index 657f84792..5e971e659 100644
--- a/runners/launch_h200-cw.sh
+++ b/runners/launch_h200-cw.sh
@@ -11,6 +11,17 @@ PARTITION="h200"
 SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 LOCK_FILE="${SQUASH_FILE}.lock"
 
+# TODO(Cam): lmsysorg/sglang:deepseek-v4-hopper installs sglang editable at
+# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
+# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
+# breaks `import sglang`. Mount this one image at /ix instead; drop the
+# conditional once the image stops installing editable under /workspace.
+if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then
+    CONTAINER_MOUNT_DIR=/ix
+else
+    CONTAINER_MOUNT_DIR=/workspace
+fi
+
 set -x
 JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
 
@@ -40,9 +51,9 @@ fi
 
 srun --jobid=$JOB_ID \
 --container-image=$CONTAINER_IMAGE \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+--container-mounts="$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
 --container-mount-home \
---container-workdir=/workspace/ \
+--container-workdir="$CONTAINER_MOUNT_DIR" \
 --no-container-entrypoint --export=ALL \
 bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 
diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh
index 9d157a858..fa8a9f7bb 100644
--- a/runners/launch_h200-nb.sh
+++ b/runners/launch_h200-nb.sh
@@ -9,14 +9,25 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="main"
 
+# TODO(Cam): lmsysorg/sglang:deepseek-v4-hopper installs sglang editable at
+# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
+# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
+# breaks `import sglang`. Mount this one image at /ix instead; drop the
+# conditional once the image stops installing editable under /workspace.
+if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then
+    CONTAINER_MOUNT_DIR=/ix
+else
+    CONTAINER_MOUNT_DIR=/workspace
+fi
+
 set -x
 srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \
 --container-image=$IMAGE \
 --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+--container-mounts="$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
 --container-remap-root \
 --container-writable \
 --container-mount-home \
---container-workdir=/workspace/ \
+--container-workdir="$CONTAINER_MOUNT_DIR" \
 --no-container-entrypoint --export=ALL \
 bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh