From 782bb1dfcd867d5ee8f01b2ab09141f64f07be73 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Thu, 4 Sep 2025 12:07:02 +0000
Subject: [PATCH 1/5] fix bug

---
 fastdeploy/config.py                           | 5 ++++-
 fastdeploy/engine/args_utils.py                | 7 ++++++-
 fastdeploy/engine/sched/resource_manager_v1.py | 4 +++-
 fastdeploy/worker/xpu_model_runner.py          | 9 ++++++++-
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 2992312742a..ca4f1e2d5c5 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1241,7 +1241,10 @@ def postprocess(self):
             if self.cache_config.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                self.max_num_batched_tokens = self.max_model_len
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM

         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 664b2b36dc6..4497ca779f4 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -19,6 +19,8 @@
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional

+import paddle
+
 from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
@@ -1010,7 +1012,10 @@ def create_engine_config(self) -> FDConfig:
             if self.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                self.max_num_batched_tokens = self.max_model_len
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM

         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index 5ea7f094a16..339f18f32b6 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -363,7 +363,9 @@ def schedule(self):
                 while self.waiting and token_budget > 0:
                     if len(self.running) == self.max_num_seqs:
                         break
-                    if self.config.model_config.enable_mm and self.exist_prefill(scheduled_reqs):
+                    if (self.config.model_config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(
+                        scheduled_reqs
+                    ):
                         break
                     request = self.waiting[0]
                     if request.status == RequestStatus.WAITING:
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index cee71415b0b..b80aeccf966 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -377,12 +377,14 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
         """
         Process scheduler output tasks, used when ENABLE_V1_KVCACHE_SCHEDULER=1
         """
+        print("走到了insert_tasks_v1")
         # NOTE(luotingdan): Lazy initialize kv cache
         if "caches" not in self.share_inputs:
             self.initialize_kv_cache()

         req_len = len(req_dicts)
         has_prefill_task = False
+        has_decode_task = False
         for i in range(req_len):
             request = req_dicts[i]
             idx = request.idx
@@ -392,6 +394,9 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
                 prefill_end_index = request.prefill_end_index
                 length = prefill_end_index - prefill_start_index
                 input_ids = request.prompt_token_ids + request.output_token_ids
+                logger.debug(
+                    f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}"
+                )
                 self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
                     input_ids[prefill_start_index:prefill_end_index]
                 )
@@ -401,6 +406,8 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
                 self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
                     request.block_tables, dtype="int32"
                 )
+                if self.share_inputs["is_block_step"][idx]:  # has tasks to continue to decode
+                    has_decode_task = True
                 self.share_inputs["stop_flags"][idx : idx + 1] = False
                 self.share_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index
                 self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
@@ -474,7 +481,7 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
                 self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array(
                     request.get("stop_token_ids"), dtype="int64"
                 )
-        if has_prefill_task:
+        if has_prefill_task or has_decode_task:
             self.share_inputs["not_need_stop"][0] = True

     def process_prefill_inputs(self, req_dicts: List[Request]):

From 202e3a4ad7bb28b19ac83b1ce444e96cc072fcdb Mon Sep 17 00:00:00 2001
From: iosmers
Date: Thu, 4 Sep 2025 12:08:39 +0000
Subject: [PATCH 2/5] fix bug

---
 fastdeploy/worker/xpu_model_runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index b80aeccf966..09ec0ee1a34 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -377,7 +377,6 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
         """
         Process scheduler output tasks, used when ENABLE_V1_KVCACHE_SCHEDULER=1
         """
-        print("走到了insert_tasks_v1")
         # NOTE(luotingdan): Lazy initialize kv cache
         if "caches" not in self.share_inputs:
             self.initialize_kv_cache()

From 13a69e0e0b993116cbfa79336202bb2f1194a476 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 5 Sep 2025 02:12:11 +0000
Subject: [PATCH 3/5] update

---
 fastdeploy/config.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index ca4f1e2d5c5..d640eae1cd6 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1236,7 +1236,10 @@ def postprocess(self):

         if self.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
-                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.cache_config.enable_chunked_prefill:
                     self.max_num_batched_tokens = 2048

From c1e7dac3a347fb27405134627b8bd53488373aa9 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 5 Sep 2025 05:10:55 +0000
Subject: [PATCH 4/5] update

---
 fastdeploy/engine/args_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 4497ca779f4..f2488f94938 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -1007,7 +1007,10 @@ def create_engine_config(self) -> FDConfig:

         if self.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
-                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.enable_chunked_prefill:
                     self.max_num_batched_tokens = 2048

From 11a1053e10ba0ac9d517c30a2b9a44c6f21dc1bd Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 5 Sep 2025 06:16:24 +0000
Subject: [PATCH 5/5] update

---
 fastdeploy/config.py            | 5 +----
 fastdeploy/engine/args_utils.py | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index d640eae1cd6..88f1870ce94 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1244,10 +1244,7 @@ def postprocess(self):
             if self.cache_config.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                self.max_num_batched_tokens = self.max_model_len

         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index f2488f94938..f553ad2d24a 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -1015,10 +1015,7 @@ def create_engine_config(self) -> FDConfig:
             if self.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                self.max_num_batched_tokens = self.max_model_len

         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
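
Summary of the combined effect on the max_num_batched_tokens default (a minimal
sketch, not code from the patches): after PATCH 5/5, the XPU-specific default
applies only on the ENABLE_V1_KVCACHE_SCHEDULER path, while the legacy path is
restored to its original behavior. The helper resolve_max_num_batched_tokens and
its parameters below are hypothetical names used for illustration; the real logic
lives in FDConfig.postprocess() and EngineArgs.create_engine_config() and checks
paddle.is_compiled_with_xpu() and envs.ENABLE_V1_KVCACHE_SCHEDULER.

    # Hypothetical helper summarizing the default selection the series converges on.
    def resolve_max_num_batched_tokens(
        max_model_len: int,
        v1_kvcache_scheduler: bool,
        chunked_prefill: bool,
        is_xpu: bool,
    ) -> int:
        if v1_kvcache_scheduler:
            # XPU keeps the full context length; other backends stay at 8192,
            # since a max_model_len default makes OOM likely.
            return max_model_len if is_xpu else 8192
        # The legacy (non-V1) path is unchanged by PATCH 5/5.
        return 2048 if chunked_prefill else max_model_len

    # Example: V1 scheduler, 128K-context model, non-XPU build -> capped at 8192.
    assert resolve_max_num_batched_tokens(131072, True, False, False) == 8192
    # Same settings on an XPU build -> full context length.
    assert resolve_max_num_batched_tokens(131072, True, False, True) == 131072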