From 782bb1dfcd867d5ee8f01b2ab09141f64f07be73 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Thu, 4 Sep 2025 12:07:02 +0000
Subject: [PATCH 1/5] fix bug

---
 fastdeploy/config.py                           | 5 ++++-
 fastdeploy/engine/args_utils.py                | 7 ++++++-
 fastdeploy/engine/sched/resource_manager_v1.py | 4 +++-
 fastdeploy/worker/xpu_model_runner.py          | 9 ++++++++-
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 2992312742a..ca4f1e2d5c5 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1241,7 +1241,10 @@ def postprocess(self):
             if self.cache_config.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                self.max_num_batched_tokens = self.max_model_len
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM

         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 664b2b36dc6..4497ca779f4 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -19,6 +19,8 @@
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional

+import paddle
+
 from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
@@ -1010,7 +1012,10 @@ def create_engine_config(self) -> FDConfig:
             if self.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                self.max_num_batched_tokens = self.max_model_len
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM

         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index 5ea7f094a16..339f18f32b6 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -363,7 +363,9 @@ def schedule(self):
                 while self.waiting and token_budget > 0:
                     if len(self.running) == self.max_num_seqs:
                         break
-                    if self.config.model_config.enable_mm and self.exist_prefill(scheduled_reqs):
+                    if (self.config.model_config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(
+                        scheduled_reqs
+                    ):
                         break
                     request = self.waiting[0]
                     if request.status == RequestStatus.WAITING:
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index cee71415b0b..b80aeccf966 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -377,12 +377,14 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
         """
         Process scheduler output tasks, used when ENABLE_V1_KVCACHE_SCHEDULER=1
         """
+        print("走到了insert_tasks_v1")
         # NOTE(luotingdan): Lazy initialize kv cache
         if "caches" not in self.share_inputs:
             self.initialize_kv_cache()

         req_len = len(req_dicts)
         has_prefill_task = False
+        has_decode_task = False
         for i in range(req_len):
             request = req_dicts[i]
             idx = request.idx
@@ -392,6 +394,9 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
                 prefill_end_index = request.prefill_end_index
                 length = prefill_end_index - prefill_start_index
                 input_ids = request.prompt_token_ids + request.output_token_ids
+                logger.debug(
+                    f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}"
+                )
                 self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
                     input_ids[prefill_start_index:prefill_end_index]
                 )
@@ -401,6 +406,8 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
                 self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
                     request.block_tables, dtype="int32"
                 )
+                if self.share_inputs["is_block_step"][idx]:  # has tasks to continue to decode
+                    has_decode_task = True
                 self.share_inputs["stop_flags"][idx : idx + 1] = False
                 self.share_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index
                 self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
@@ -474,7 +481,7 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
                 self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array(
                     request.get("stop_token_ids"), dtype="int64"
                 )
-        if has_prefill_task:
+        if has_prefill_task or has_decode_task:
             self.share_inputs["not_need_stop"][0] = True

     def process_prefill_inputs(self, req_dicts: List[Request]):

From 202e3a4ad7bb28b19ac83b1ce444e96cc072fcdb Mon Sep 17 00:00:00 2001
From: iosmers
Date: Thu, 4 Sep 2025 12:08:39 +0000
Subject: [PATCH 2/5] fix bug

---
 fastdeploy/worker/xpu_model_runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index b80aeccf966..09ec0ee1a34 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -377,7 +377,6 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
         """
         Process scheduler output tasks, used when ENABLE_V1_KVCACHE_SCHEDULER=1
         """
-        print("走到了insert_tasks_v1")
         # NOTE(luotingdan): Lazy initialize kv cache
         if "caches" not in self.share_inputs:
             self.initialize_kv_cache()

From 13a69e0e0b993116cbfa79336202bb2f1194a476 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 5 Sep 2025 02:12:11 +0000
Subject: [PATCH 3/5] update

---
 fastdeploy/config.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index ca4f1e2d5c5..d640eae1cd6 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1236,7 +1236,10 @@ def postprocess(self):

         if self.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
-                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.cache_config.enable_chunked_prefill:
                     self.max_num_batched_tokens = 2048

From c1e7dac3a347fb27405134627b8bd53488373aa9 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 5 Sep 2025 05:10:55 +0000
Subject: [PATCH 4/5] update

---
 fastdeploy/engine/args_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 4497ca779f4..f2488f94938 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -1007,7 +1007,10 @@ def create_engine_config(self) -> FDConfig:

         if self.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
-                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.enable_chunked_prefill:
                     self.max_num_batched_tokens = 2048

From 11a1053e10ba0ac9d517c30a2b9a44c6f21dc1bd Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 5 Sep 2025 06:16:24 +0000
Subject: [PATCH 5/5] update

---
 fastdeploy/config.py            | 5 +----
 fastdeploy/engine/args_utils.py | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index d640eae1cd6..88f1870ce94 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1244,10 +1244,7 @@ def postprocess(self):
             if self.cache_config.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                self.max_num_batched_tokens = self.max_model_len

         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index f2488f94938..f553ad2d24a 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -1015,10 +1015,7 @@ def create_engine_config(self) -> FDConfig:
             if self.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
             else:
-                if paddle.is_compiled_with_xpu():
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                self.max_num_batched_tokens = self.max_model_len

         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
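
Summary of the combined effect on the max_num_batched_tokens default (a minimal
sketch, not code from the patches): after PATCH 5/5, the XPU-specific default
applies only on the ENABLE_V1_KVCACHE_SCHEDULER path, while the legacy path is
restored to its original behavior. The helper resolve_max_num_batched_tokens and
its parameters below are hypothetical names used for illustration; the real logic
lives in FDConfig.postprocess() and EngineArgs.create_engine_config() and checks
paddle.is_compiled_with_xpu() and envs.ENABLE_V1_KVCACHE_SCHEDULER.

    # Hypothetical helper summarizing the default selection the series converges on.
    def resolve_max_num_batched_tokens(
        max_model_len: int,
        v1_kvcache_scheduler: bool,
        chunked_prefill: bool,
        is_xpu: bool,
    ) -> int:
        if v1_kvcache_scheduler:
            # XPU keeps the full context length; other backends stay at 8192,
            # since a max_model_len default makes OOM likely.
            return max_model_len if is_xpu else 8192
        # The legacy (non-V1) path is unchanged by PATCH 5/5.
        return 2048 if chunked_prefill else max_model_len

    # Example: V1 scheduler, 128K-context model, non-XPU build -> capped at 8192.
    assert resolve_max_num_batched_tokens(131072, True, False, False) == 8192
    # Same settings on an XPU build -> full context length.
    assert resolve_max_num_batched_tokens(131072, True, False, True) == 131072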