diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 917136739..85025aba7 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,5 +1,5 @@ dsr1-fp4-b200-sglang: - image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + image: lmsysorg/sglang:v0.5.5-cu129-amd64 model: nvidia/DeepSeek-R1-0528-FP4-V2 model-prefix: dsr1 runner: b200 @@ -9,18 +9,18 @@ dsr1-fp4-b200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - isl: 1024 osl: 8192 search-space: - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } dsr1-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 @@ -73,7 +73,7 @@ dsr1-fp4-b200-trt: - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } dsr1-fp8-b200-sglang: - image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + image: lmsysorg/sglang:v0.5.5-cu129-amd64 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: b200 @@ -83,15 +83,15 @@ dsr1-fp8-b200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 @@ -120,7 +120,7 @@ dsr1-fp8-b200-trt: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-h200-sglang: - image: lmsysorg/sglang:v0.5.2rc2-cu126 + image: lmsysorg/sglang:v0.5.5-cu129-amd64 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: h200 diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index ac3e0e889..3c8232072 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -21,6 +21,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --tensor-parallel-size=$TP --data-parallel-size=1 \ --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ ---enable-ep-moe --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 10 +--ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 8cc26a72f..361b6f1f6 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -34,4 +34,4 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs 128 --max-running-requests 128 \ --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --enable-flashinfer-trtllm-moe --quantization fp8 +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index cda48abb3..21a10d48f 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -30,7 +30,7 @@ docker run --rm -d --init --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh index b37154925..9f313396c 100644 --- a/runners/launch_b200-tg.sh +++ b/runners/launch_b200-tg.sh @@ -12,7 +12,7 @@ docker run --rm -d --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \