From fdba04197191b62787d063f87ffad12de5e63aa3 Mon Sep 17 00:00:00 2001 From: gongshaotian Date: Tue, 21 Oct 2025 14:55:31 +0800 Subject: [PATCH 1/2] Turn on the CUDAGraph + RL switch --- fastdeploy/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 738f563efd6..943525e4dcf 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1513,7 +1513,6 @@ def postprocess(self): if ( (self.speculative_config is not None and self.speculative_config.method is not None) or (self.model_config is not None and self.model_config.enable_mm is True) - or (self.load_config is not None and self.load_config.dynamic_load_weight is True) or (self.scheduler_config.splitwise_role != "mixed") ): self.graph_opt_config.use_cudagraph = False @@ -1642,11 +1641,12 @@ def check(self): assert ( self.model_config.enable_mm is not True ), "CUDAGraph cannot be applied to multimodal model temporarily" - if self.graph_opt_config.graph_opt_level > 0 or self.graph_opt_config.use_cudagraph: + if self.graph_opt_config.graph_opt_level > 0: if self.load_config is not None: assert ( self.load_config.dynamic_load_weight is False ), "Static graph cannot be used in RL scene temporarily" + if int(envs.ENABLE_V1_KVCACHE_SCHEDULER) == 1: assert ( int(envs.FD_DISABLED_RECOVER) == 0 From e945d4ec3d7961af426f038b82e5fdea254cc731 Mon Sep 17 00:00:00 2001 From: gongshaotian Date: Wed, 22 Oct 2025 20:47:15 +0800 Subject: [PATCH 2/2] reduce max_num_seqs and number of requests --- tests/ce/stable_cases/launch_model.sh | 2 +- tests/ce/stable_cases/run.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ce/stable_cases/launch_model.sh b/tests/ce/stable_cases/launch_model.sh index 1850dc944dc..3b758a15a2a 100644 --- a/tests/ce/stable_cases/launch_model.sh +++ b/tests/ce/stable_cases/launch_model.sh @@ -38,7 +38,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ --cache-queue-port ${FD_CACHE_QUEUE_PORT} \ 
--quantization wint8 \ --max-model-len 32768 \ - --max-num-seqs 256 \ + --max-num-seqs 1 \ --gpu-memory-utilization 0.9 \ --model "$MODEL_PATH" \ --load-strategy ipc_snapshot \ diff --git a/tests/ce/stable_cases/run.sh b/tests/ce/stable_cases/run.sh index 6b7f939bb6e..81197253ba5 100644 --- a/tests/ce/stable_cases/run.sh +++ b/tests/ce/stable_cases/run.sh @@ -12,7 +12,7 @@ PORT="${FD_API_PORT}" # 这里需要配合启动脚本那个URL PORT BASE_URL="http://$HOST:$PORT" TOTAL_ROUNDS=30 -CHAT_REQUESTS_PER_ROUND=5 +CHAT_REQUESTS_PER_ROUND=1 export CUDA_VISIBLE_DEVICES=0,1 MAX_MEMORY_MB=10240 # 10GB