Changes from all commits (40 commits)
234ef92  add model status in vl (luukunn, Sep 23, 2025)
671a4dc  add x1 parser (luukunn, Sep 24, 2025)
8bbe39d  add model_status (luukunn, Sep 24, 2025)
d087afb  fix parser (luukunn, Sep 25, 2025)
2f6f063  fix parser (luukunn, Sep 25, 2025)
41f1418  fix parser (luukunn, Sep 25, 2025)
300f446  fix parser (luukunn, Sep 25, 2025)
3b93672  Revert "fix parser" (luukunn, Sep 25, 2025)
dae8419  fix parser (luukunn, Sep 25, 2025)
e49676c  fix (luukunn, Sep 26, 2025)
2c92f6f  fix (luukunn, Sep 26, 2025)
c433e05  fix (luukunn, Sep 26, 2025)
bfdec9f  fix (luukunn, Sep 26, 2025)
bd192b2  fix parser (luukunn, Sep 28, 2025)
ae24353  fix parser (luukunn, Sep 28, 2025)
dd30110  fix unit test (luukunn, Sep 28, 2025)
31d639d  fix unit test (luukunn, Sep 29, 2025)
13ab4d4  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (luukunn, Sep 29, 2025)
46e3c13  add unit test (luukunn, Sep 29, 2025)
bc4a71e  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (luukunn, Oct 14, 2025)
ede57ff  pull (luukunn, Oct 16, 2025)
d159f27  fix (luukunn, Oct 20, 2025)
cff5840  fix (luukunn, Oct 20, 2025)
21a8d59  fix (luukunn, Oct 20, 2025)
4a2908b  add unit test (luukunn, Oct 20, 2025)
fd2a343  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (luukunn, Oct 20, 2025)
59aaa2c  fix unit test (luukunn, Oct 21, 2025)
0e2019d  add unit test (luukunn, Oct 21, 2025)
f0def03  add unit test (luukunn, Oct 21, 2025)
ea2d987  fix unit test (luukunn, Oct 22, 2025)
b8794cb  fix unit test (luukunn, Oct 22, 2025)
e32c352  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (luukunn, Oct 22, 2025)
37b320e  fix bug (luukunn, Oct 22, 2025)
34ac21a  fix unit test (luukunn, Oct 22, 2025)
1cb6205  x1 tool parser (luukunn, Oct 22, 2025)
f19c727  update (luukunn, Nov 3, 2025)
4ef4df1  fix unit test (luukunn, Nov 3, 2025)
13ba41f  Resolved merge conflicts (luukunn, Nov 20, 2025)
7c17812  fix unit test (luukunn, Nov 20, 2025)
d3171a2  fix unit test (luukunn, Nov 20, 2025)
6 changes: 1 addition & 5 deletions fastdeploy/entrypoints/openai/response_processors.py
@@ -68,13 +68,12 @@ def accumulate_token_ids(self, request_output):
else:
self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})

async def process_response_chat(self, request_outputs, stream, enable_thinking, include_stop_str_in_output):
async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output):
"""
Process a list of responses into a generator that yields each processed response as it's generated.
Args:
request_outputs: The list of outputs to be processed.
stream: Whether or not to stream the output.
enable_thinking: Whether or not to show thinking messages.
include_stop_str_in_output: Whether or not to include stop strings in the output.
"""
for request_output in request_outputs:
@@ -83,7 +82,6 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
yield self.data_processor.process_response_dict(
response_dict=request_output,
stream=stream,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
elif stream:
@@ -111,7 +109,6 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
self.data_processor.process_response_dict(
response_dict=request_output,
stream=stream,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
text = {"type": "text", "text": request_output["outputs"]["text"]}
@@ -132,7 +129,6 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
self.data_processor.process_response_dict(
response_dict=part["request_output"],
stream=False,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
text = {"type": "text", "text": part["request_output"]["outputs"]["text"]}
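The hunks above drop `enable_thinking` from `process_response_chat`, so callers pass only `stream` and `include_stop_str_in_output`; whether reasoning content is extracted is now decided inside the data processors from the prompt token ids. A minimal sketch of the new call shape (the processor class below is a stand-in for illustration, not the FastDeploy implementation):

```python
import asyncio


class StubResponseProcessor:
    """Stand-in mimicking the trimmed process_response_chat signature."""

    async def process_response_chat(self, request_outputs, stream, include_stop_str_in_output):
        # No enable_thinking argument anymore; the data processor resolves it
        # from the request's prompt token ids before generation starts.
        for request_output in request_outputs:
            yield {"stream": stream, "include_stop": include_stop_str_in_output, "output": request_output}


async def main():
    processor = StubResponseProcessor()
    outputs = [{"outputs": {"text": "hello"}}]
    async for chunk in processor.process_response_chat(outputs, stream=True, include_stop_str_in_output=False):
        print(chunk)


asyncio.run(main())
```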
22 changes: 0 additions & 22 deletions fastdeploy/entrypoints/openai/serving_chat.py
@@ -210,8 +210,6 @@ async def chat_completion_stream_generator(

max_streaming_response_tokens = max(1, max_streaming_response_tokens)

enable_thinking = self._get_thinking_status(request)

include_stop_str_in_output = request.include_stop_str_in_output

stream_options = request.stream_options
@@ -267,7 +265,6 @@ async def chat_completion_stream_generator(
generator = response_processor.process_response_chat(
response,
stream=True,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)

@@ -469,7 +466,6 @@ async def chat_completion_full_generator(
"""
created_time = int(time.time())
num_choices = 1 if request.n is None else request.n
enable_thinking = self._get_thinking_status(request)

include_stop_str_in_output = request.include_stop_str_in_output
try:
@@ -523,7 +519,6 @@ async def chat_completion_full_generator(
generator = response_processor.process_response_chat(
response,
stream=False,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
async for data in generator:
@@ -757,20 +752,3 @@ def _build_logprobs_response(
error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
return None

def _get_thinking_status(self, request: ChatCompletionRequest) -> bool:
"""
Get the thinking status from the request.
"""
enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
if enable_thinking is None:
enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
options = request.chat_template_kwargs.get("options") if request.chat_template_kwargs else None
if options:
thinking_mode = options.get("thinking_mode")
if thinking_mode:
if thinking_mode == "close" or thinking_mode == "false":
enable_thinking = False
else:
enable_thinking = True
return enable_thinking
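With `_get_thinking_status` deleted, the serving layer no longer inspects `chat_template_kwargs`, `metadata`, or the `options.thinking_mode` field. For reference, the retired resolution logic reduces to the following standalone function (reconstructed from the removed lines above; the dict arguments stand in for the request fields):

```python
def get_thinking_status(chat_template_kwargs=None, metadata=None):
    """Replicates the deleted _get_thinking_status resolution order."""
    enable_thinking = (chat_template_kwargs or {}).get("enable_thinking")
    if enable_thinking is None:
        enable_thinking = (metadata or {}).get("enable_thinking")
    options = (chat_template_kwargs or {}).get("options")
    if options:
        thinking_mode = options.get("thinking_mode")
        if thinking_mode:
            # "close" / "false" disable thinking, any other value enables it
            enable_thinking = thinking_mode not in ("close", "false")
    return enable_thinking


print(get_thinking_status({"options": {"thinking_mode": "close"}}))  # False
print(get_thinking_status(metadata={"enable_thinking": True}))       # True
```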
52 changes: 35 additions & 17 deletions fastdeploy/input/ernie4_5_processor.py
@@ -60,6 +60,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob
self.decode_status = dict()
self.tool_parser_dict = dict()
self.thinking_parser_dict = dict()
self.model_status_dict = dict()
self._load_tokenizer()
data_processor_logger.info(
f"tokenizer information: bos_token is {self.tokenizer.bos_token} \
@@ -151,8 +152,13 @@ def process_request(self, request, max_model_len=None, **kwargs):
request.set("temperature", 1)
if request.get("top_p") < _SAMPLING_EPS:
request.set("top_p", _SAMPLING_EPS)
if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
request.enable_thinking = True
if self.reasoning_parser:
real_req_id = request.request_id.split("_")[0]
n = request.get("n", 1)
model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
for idx in range(n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
request.enable_thinking = model_status == "think_start"

data_processor_logger.info(f"Processed request: {request}")
return request
@@ -229,9 +235,13 @@ def process_request_dict(self, request, max_model_len=None):
request["temperature"] = 1
if request.get("top_p") < _SAMPLING_EPS:
request["top_p"] = _SAMPLING_EPS
if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
request["enable_thinking"] = True

if self.reasoning_parser:
real_req_id = request["request_id"].split("_")[0]
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
n = request.get("n", 1)
for idx in range(n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
request["enable_thinking"] = model_status == "think_start"
data_processor_logger.info(f"Processed request dict: {request}")
return request

@@ -253,7 +263,11 @@ def process_response(self, response_dict, **kwargs):
token_ids = token_ids[:-1]
full_text = self.tokenizer.decode(token_ids)
if self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
full_text,
response_dict,
self.model_status_dict[req_id],
)
response_dict.outputs.text = text
response_dict.outputs.reasoning_content = reasoning_content
else:
@@ -264,6 +278,8 @@ def process_response(self, response_dict, **kwargs):
if tool_call_info.tools_called:
response_dict.outputs.tool_calls = tool_call_info.tool_calls
response_dict.outputs.text = tool_call_info.content
if req_id in self.model_status_dict:
del self.model_status_dict[req_id]
data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
return None
@@ -294,7 +310,6 @@ def process_response_dict_normal(self, response_dict, **kwargs):
Returns:
Dict: response contain text fields
"""
enable_thinking = kwargs.get("enable_thinking")
token_ids = response_dict["outputs"]["token_ids"]
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
@@ -304,16 +319,17 @@ def process_response_dict_normal(self, response_dict, **kwargs):
delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
if is_end:
full_text = previous_texts + delta_text
if self.reasoning_parser and (
enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
):
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
response_dict["outputs"]["text"] = full_text
if self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
full_text,
response_dict,
self.model_status_dict[req_id],
)
response_dict["outputs"]["text"] = text
response_dict["outputs"]["reasoning_content"] = reasoning_content
reasoning_tokens = self.tokenizer.tokenize(reasoning_content)
response_dict["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
else:
response_dict["outputs"]["text"] = full_text
if self.tool_parser_obj:
tool_parser = self.tool_parser_obj(self.tokenizer)
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
@@ -323,6 +339,8 @@ def process_response_dict_normal(self, response_dict, **kwargs):
response_dict["outputs"]["completion_tokens"] = full_text
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
del self.decode_status[req_id]
if req_id in self.model_status_dict:
del self.model_status_dict[req_id]
return response_dict

def process_response_dict_streaming(self, response_dict, **kwargs):
@@ -335,7 +353,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
Returns:
Dict: response contain text fields
"""
enable_thinking = kwargs.get("enable_thinking")
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
token_ids = response_dict["outputs"]["token_ids"]
@@ -345,16 +362,15 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
token_ids = token_ids[:-1]
delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
response_dict["outputs"]["completion_tokens"] = delta_text
if self.reasoning_parser and (
enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
):
if self.reasoning_parser:
reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
previous_texts,
previous_texts + delta_text,
delta_text,
previous_token_ids,
previous_token_ids + token_ids,
token_ids,
self.model_status_dict[req_id],
)
response_dict["outputs"]["delta_message"] = reasoning_delta_message
reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
@@ -381,6 +397,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
del self.decode_status[req_id]
if req_id in self.tool_parser_dict:
del self.tool_parser_dict[req_id]
if req_id in self.model_status_dict:
del self.model_status_dict[req_id]
return response_dict

def messages2ids(self, request_or_messages, **kwargs):
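The bookkeeping added in `process_request` and `process_request_dict` keys the model status by `f"{real_req_id}_{idx}"`, so every sampled choice of an `n > 1` request shares one entry, and `enable_thinking` is true only when the parser reports `"think_start"`. A minimal sketch of that fan-out (the `get_model_status` callable here is a stand-in for the reasoning parser's method):

```python
def register_model_status(model_status_dict, request, get_model_status):
    # "abc_0" -> "abc"; all n choices of the same request share one status
    real_req_id = request["request_id"].split("_")[0]
    model_status = get_model_status(request["prompt_token_ids"])
    for idx in range(request.get("n", 1)):
        model_status_dict[f"{real_req_id}_{idx}"] = model_status
    request["enable_thinking"] = model_status == "think_start"
    return request


statuses = {}
req = {"request_id": "abc_0", "prompt_token_ids": [1, 2, 3], "n": 2}
register_model_status(statuses, req, lambda ids: "think_start")
print(statuses)                # {'abc_0': 'think_start', 'abc_1': 'think_start'}
print(req["enable_thinking"])  # True
```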
26 changes: 8 additions & 18 deletions fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -58,6 +58,7 @@ def __init__(

self.tool_parser_dict = dict()
self.decode_status = dict()
self.model_status_dict = dict()
self._load_tokenizer()

# Generation config
@@ -272,6 +273,13 @@ def process_request_dict(self, request, max_model_len=None):
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
data_processor_logger.info(f"Processed request {request}")

if self.reasoning_parser:
real_req_id = request["request_id"].split("_")[0]
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
n = request.get("n", 1)
for idx in range(n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
request["enable_thinking"] = model_status == "think_start"
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
request["top_p"] = _SAMPLING_EPS

@@ -307,21 +315,3 @@ def pack_outputs(self, outs):
outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64)

return outs

def process_response_dict(self, response_dict, stream, **kwargs):
"""
Preprocess the response

Args:
response_dict (Dict): response for engine, contain ids fields

Returns:
Dict: response contain text fields
"""
enable_thinking = kwargs.pop("enable_thinking", True)
if enable_thinking is None:
enable_thinking = True
if stream:
return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs)
else:
return self.process_response_dict_normal(response_dict, enable_thinking=enable_thinking, **kwargs)
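The VL processor also drops its `process_response_dict` override, whose only job was to default `enable_thinking` to True before dispatching to the streaming or non-streaming path; it presumably falls back to the shared implementation now that the status is read from `model_status_dict`. Those per-request entries are expected to be released when a request finishes, as the ernie4_5_processor.py hunks above do. A tiny sketch of that cleanup pattern:

```python
def release_request_state(model_status_dict, req_id):
    # Drop the per-choice status entry once the request has finished so the
    # dict does not grow without bound across requests.
    if req_id in model_status_dict:
        del model_status_dict[req_id]


statuses = {"abc_0": "think_start"}
release_request_state(statuses, "abc_0")
print(statuses)  # {}
```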
7 changes: 7 additions & 0 deletions fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
@@ -269,6 +269,13 @@ def process_request_dict(self, request, max_model_len=None):
# Set default max_tokens if not specified
if request.get("max_tokens") is None:
request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) # Ensure at least 1 token
if self.reasoning_parser:
real_req_id = request["request_id"].split("_")[0]
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
n = request.get("n", 1)
for idx in range(n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
request["enable_thinking"] = model_status == "think_start"
data_processor_logger.info(f"Processed request {request}")

return request
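The Qwen VL processor registers the same per-choice status. The contract these hunks rely on is that `reasoning_parser.get_model_status(prompt_token_ids)` inspects the rendered prompt and returns a string, with `"think_start"` being the only value the diffs test for. A hypothetical illustration of that contract (the token id and the alternative status name below are made up for the sketch, not taken from the FastDeploy parsers):

```python
THINK_START_TOKEN_ID = 100_001  # placeholder id for a prompt-opening "<think>" token


def get_model_status(prompt_token_ids):
    """Toy stand-in: report "think_start" when the prompt already opens a think block."""
    if prompt_token_ids and prompt_token_ids[-1] == THINK_START_TOKEN_ID:
        return "think_start"
    return "no_think"  # hypothetical name for any non-thinking status


print(get_model_status([11, 42, THINK_START_TOKEN_ID]))  # "think_start" -> enable_thinking=True
print(get_model_status([11, 42]))                        # "no_think"    -> enable_thinking=False
```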