From 490633bb10e74bbbe480bbb872c10e1bfb8359a7 Mon Sep 17 00:00:00 2001 From: zhuzixuan Date: Tue, 9 Sep 2025 16:15:01 +0800 Subject: [PATCH 1/9] wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding. --- .../entrypoints/openai/serving_completion.py | 8 ++- .../openai/test_completion_echo.py | 8 +-- .../openai/test_create_completion.py | 58 +++++++++++++++++++ 3 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 tests/entrypoints/openai/test_create_completion.py diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 969f4de90e0..4d00c0ee8b2 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -115,7 +115,13 @@ async def create_completion(self, request: CompletionRequest): return ErrorResponse(message=error_msg, code=400) if request_prompt_ids is not None: - request_prompts = request_prompt_ids + if isinstance(request.prompt, list) and all(isinstance(item, int) for item in request.prompt): + request_prompts = [self.engine_client.data_processor.tokenizer.decode(request.prompt)] + elif isinstance(request.prompt, list) and all(isinstance(item, list) and all(isinstance(x, int) for x in item) for item in request.prompt): + request_prompts = [self.engine_client.data_processor.tokenizer.decode(item) for item in request.prompt] + else: + request_prompts = request_prompt_ids + request.prompt = request_prompts num_choices = len(request_prompts) api_server_logger.info(f"Start preprocessing request: req_id={request_id}), num_choices={num_choices}") diff --git a/tests/entrypoints/openai/test_completion_echo.py b/tests/entrypoints/openai/test_completion_echo.py index 52a2230702d..1555112bb8d 100644 --- a/tests/entrypoints/openai/test_completion_echo.py +++ b/tests/entrypoints/openai/test_completion_echo.py @@ -39,7 +39,7 @@ def setUp(self): self.completion_handler = None def test_single_prompt_non_streaming(self): - """测试单prompt非流式响应""" + """Test echo back prompt in non-streaming of the single prompt""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -71,7 +71,7 @@ def test_single_prompt_non_streaming(self): self.assertEqual(response.choices[0].text, "test prompt generated text") async def test_echo_back_prompt_and_streaming(self): - """测试_echo_back_prompt方法和流式响应的prompt拼接逻辑""" + """Test echo back prompt and streaming""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -95,7 +95,7 @@ def mock_echo_side_effect(req, res, idx): self.assertEqual(request.prompt, "test prompt") def test_multi_prompt_non_streaming(self): - """测试多prompt非流式响应""" + """Test echo back prompt in non-streaming of the multi prompts""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -187,7 +187,7 @@ async def test_1_send_idx_is_not_0(self): self.assertEqual(res["outputs"]["text"], "!") async def test_1_echo_is_false(self): - """测试echo为False时,_echo_back_prompt不拼接prompt""" + """Test echo back prompt when echo is False""" request = CompletionRequest(echo=False, prompt="Hello") res = {"outputs": {"send_idx": 0, "text": "!"}} idx = 0 diff --git a/tests/entrypoints/openai/test_create_completion.py b/tests/entrypoints/openai/test_create_completion.py new file mode 100644 index 00000000000..ebec85efb89 
--- /dev/null +++ b/tests/entrypoints/openai/test_create_completion.py @@ -0,0 +1,58 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import unittest +import asyncio +from typing import List +from unittest.mock import Mock + +from fastdeploy.entrypoints.openai.serving_completion import ( + CompletionRequest, + OpenAIServingCompletion, + RequestOutput, +) + + +class TestCreateCompletion(unittest.TestCase): + def test_create_(self): + # 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1" + engine_client = Mock() + # 创建一个OpenAIServingCompletion实例 + serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360) + + def test_request_prompt_handling(self): + engine_client = Mock() + engine_client.data_processor.tokenizer.decode = lambda x: f"decoded_{x}" + + serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360) + + # 测试整数列表输入 + request1 = CompletionRequest(prompt=[1, 2, 3], request_prompt_ids="ids1") + asyncio.run(serving_completion.create_completion(request1)) + self.assertEqual(request1.prompt, ["decoded_[1, 2, 3]"]) + + # 测试整数列表的列表输入 + request2 = CompletionRequest(prompt=[[1, 2], [3, 4]], request_prompt_ids="ids2") + asyncio.run(serving_completion.create_completion(request2)) + self.assertEqual(request2.prompt, ["decoded_[1, 2]", "decoded_[3, 4]"]) + + # 测试其他类型输入 + request3 = CompletionRequest(prompt="text prompt", request_prompt_ids="ids3") + asyncio.run(serving_completion.create_completion(request3)) + self.assertEqual(request3.prompt, "text prompt") + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 49f54a31230fbfee89a501e00003836edcff63d2 Mon Sep 17 00:00:00 2001 From: zhuzixuan Date: Tue, 9 Sep 2025 16:23:33 +0800 Subject: [PATCH 2/9] wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding. 
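For context, the behaviour this series targets from the client side: prompts may arrive as token IDs rather than text, and with echo enabled the echoed prefix should be the decoded prompt text, not the raw integer list. A rough usage sketch against the OpenAI-compatible completions endpoint follows; the base_url, api_key and model name are placeholders, not values taken from this repository.

    # Hypothetical client call; only the prompt/echo handling is the point here.
    import openai

    client = openai.OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

    resp = client.completions.create(
        model="default",
        prompt=[[101, 234, 567], [98, 76, 54]],  # list[list[int]] token-ID prompts
        max_tokens=16,
        echo=True,
    )
    for choice in resp.choices:
        # With this series the echoed prefix is the decoded prompt text.
        print(choice.text)
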
--- tests/entrypoints/openai/test_completion_echo.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/entrypoints/openai/test_completion_echo.py b/tests/entrypoints/openai/test_completion_echo.py index 1555112bb8d..762a1f59261 100644 --- a/tests/entrypoints/openai/test_completion_echo.py +++ b/tests/entrypoints/openai/test_completion_echo.py @@ -130,6 +130,7 @@ def test_multi_prompt_non_streaming(self): self.assertEqual(response.choices[1].text, "prompt2 response2") async def test_multi_prompt_streaming(self): + """Test echo back prompt in streaming of the multi prompts""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -160,6 +161,7 @@ def mock_echo_side_effect(req, res, idx): self.assertEqual(request.prompt, ["prompt1", "prompt2"]) async def test_echo_back_prompt_and_streaming1(self): + """Test echo back prompt when prompt is list and send_idx is 0""" request = CompletionRequest(echo=True, prompt=["Hello", "World"]) res = {"outputs": {"send_idx": 0, "text": "!"}} idx = 0 @@ -169,6 +171,7 @@ async def test_echo_back_prompt_and_streaming1(self): self.assertEqual(res["outputs"]["text"], "Hello!") async def test_1_prompt_is_string_and_send_idx_is_0(self): + """Test echo back prompt when prompt is string and send_idx is 0""" request = CompletionRequest(echo=True, prompt="Hello") res = {"outputs": {"send_idx": 0, "text": "!"}} idx = 0 @@ -178,6 +181,7 @@ async def test_1_prompt_is_string_and_send_idx_is_0(self): self.assertEqual(res["outputs"]["text"], "Hello!") async def test_1_send_idx_is_not_0(self): + """Test echo back prompt when prompt is string and send_idx is not 0""" request = CompletionRequest(echo=True, prompt="Hello") res = {"outputs": {"send_idx": 1, "text": "!"}} idx = 0 From 50afc0d84cb16db55d74dde281eba23e6cd43368 Mon Sep 17 00:00:00 2001 From: zhuzixuan Date: Tue, 9 Sep 2025 16:27:38 +0800 Subject: [PATCH 3/9] wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding. --- .../openai/test_create_completion.py | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/openai/test_create_completion.py b/tests/entrypoints/openai/test_create_completion.py index ebec85efb89..8e8a0fb2f3f 100644 --- a/tests/entrypoints/openai/test_create_completion.py +++ b/tests/entrypoints/openai/test_create_completion.py @@ -14,45 +14,38 @@ # limitations under the License. 
""" -import unittest import asyncio -from typing import List +import unittest from unittest.mock import Mock from fastdeploy.entrypoints.openai.serving_completion import ( CompletionRequest, OpenAIServingCompletion, - RequestOutput, ) class TestCreateCompletion(unittest.TestCase): - def test_create_(self): - # 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1" - engine_client = Mock() - # 创建一个OpenAIServingCompletion实例 - serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360) - def test_request_prompt_handling(self): engine_client = Mock() engine_client.data_processor.tokenizer.decode = lambda x: f"decoded_{x}" - + serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360) - - # 测试整数列表输入 + + # Prompt=List[int] request1 = CompletionRequest(prompt=[1, 2, 3], request_prompt_ids="ids1") asyncio.run(serving_completion.create_completion(request1)) self.assertEqual(request1.prompt, ["decoded_[1, 2, 3]"]) - - # 测试整数列表的列表输入 + + # Prompt=List[List[int]] request2 = CompletionRequest(prompt=[[1, 2], [3, 4]], request_prompt_ids="ids2") asyncio.run(serving_completion.create_completion(request2)) self.assertEqual(request2.prompt, ["decoded_[1, 2]", "decoded_[3, 4]"]) - - # 测试其他类型输入 + + # others request3 = CompletionRequest(prompt="text prompt", request_prompt_ids="ids3") asyncio.run(serving_completion.create_completion(request3)) self.assertEqual(request3.prompt, "text prompt") + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From db1368fd3566a19a16406e98475c6923777fdb65 Mon Sep 17 00:00:00 2001 From: zhuzixuan Date: Tue, 9 Sep 2025 16:29:29 +0800 Subject: [PATCH 4/9] wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding. --- fastdeploy/entrypoints/openai/serving_completion.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 4d00c0ee8b2..d56b23305da 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -117,7 +117,9 @@ async def create_completion(self, request: CompletionRequest): if request_prompt_ids is not None: if isinstance(request.prompt, list) and all(isinstance(item, int) for item in request.prompt): request_prompts = [self.engine_client.data_processor.tokenizer.decode(request.prompt)] - elif isinstance(request.prompt, list) and all(isinstance(item, list) and all(isinstance(x, int) for x in item) for item in request.prompt): + elif isinstance(request.prompt, list) and all( + isinstance(item, list) and all(isinstance(x, int) for x in item) for item in request.prompt + ): request_prompts = [self.engine_client.data_processor.tokenizer.decode(item) for item in request.prompt] else: request_prompts = request_prompt_ids From 6246913827e51387e2e85e011058128f5200ded9 Mon Sep 17 00:00:00 2001 From: zhuzixuan Date: Tue, 9 Sep 2025 22:00:13 +0800 Subject: [PATCH 5/9] wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding. 
--- .../entrypoints/openai/serving_completion.py | 22 +-- .../openai/test_completion_echo.py | 140 +++++++++++------- .../openai/test_create_completion.py | 51 ------- 3 files changed, 100 insertions(+), 113 deletions(-) delete mode 100644 tests/entrypoints/openai/test_create_completion.py diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index d56b23305da..44135d642b8 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -115,15 +115,7 @@ async def create_completion(self, request: CompletionRequest): return ErrorResponse(message=error_msg, code=400) if request_prompt_ids is not None: - if isinstance(request.prompt, list) and all(isinstance(item, int) for item in request.prompt): - request_prompts = [self.engine_client.data_processor.tokenizer.decode(request.prompt)] - elif isinstance(request.prompt, list) and all( - isinstance(item, list) and all(isinstance(x, int) for x in item) for item in request.prompt - ): - request_prompts = [self.engine_client.data_processor.tokenizer.decode(item) for item in request.prompt] - else: - request_prompts = request_prompt_ids - request.prompt = request_prompts + request_prompts = request_prompt_ids num_choices = len(request_prompts) api_server_logger.info(f"Start preprocessing request: req_id={request_id}), num_choices={num_choices}") @@ -286,6 +278,12 @@ async def completion_full_generator( async def _echo_back_prompt(self, request, res, idx): if res["outputs"].get("send_idx", -1) == 0 and request.echo: + if isinstance(request.prompt, list) and all(isinstance(item, int) for item in request.prompt): + request.prompt = [self.engine_client.data_processor.tokenizer.decode(request.prompt)] + elif isinstance(request.prompt, list) and all( + isinstance(item, list) and all(isinstance(x, int) for x in item) for item in request.prompt + ): + request.prompt = [self.engine_client.data_processor.tokenizer.decode(item) for item in request.prompt] if isinstance(request.prompt, list): prompt_text = request.prompt[idx] else: @@ -506,6 +504,12 @@ def request_output_to_completion_response( if request.echo: assert prompt_text is not None + if isinstance(prompt_text, list) and all(isinstance(item, int) for item in prompt_text): + prompt_text = [self.engine_client.data_processor.tokenizer.decode(prompt_text)] + elif isinstance(prompt_text, list) and all( + isinstance(item, list) and all(isinstance(x, int) for x in item) for item in prompt_text + ): + prompt_text = [self.engine_client.data_processor.tokenizer.decode(item) for item in prompt_text] token_ids = [*prompt_token_ids, *output["token_ids"]] if isinstance(prompt_text, list): output_text = prompt_text[idx] + output["text"] diff --git a/tests/entrypoints/openai/test_completion_echo.py b/tests/entrypoints/openai/test_completion_echo.py index 762a1f59261..1112a9f1d3b 100644 --- a/tests/entrypoints/openai/test_completion_echo.py +++ b/tests/entrypoints/openai/test_completion_echo.py @@ -15,7 +15,7 @@ """ import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock from fastdeploy.entrypoints.openai.serving_completion import ( CompletionRequest, @@ -37,9 +37,10 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase): def setUp(self): self.mock_engine = MagicMock() self.completion_handler = None + self.mock_engine.data_processor.tokenizer.decode = lambda x: f"decoded_{x}" - def test_single_prompt_non_streaming(self): - """Test echo back prompt 
in non-streaming of the single prompt""" + def test_single_str_prompt_non_streaming(self): + """Testing echo prompt in non-streaming of a single str prompt""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -70,32 +71,59 @@ def test_single_prompt_non_streaming(self): self.assertEqual(response.choices[0].text, "test prompt generated text") - async def test_echo_back_prompt_and_streaming(self): - """Test echo back prompt and streaming""" + def test_single_int_prompt_non_streaming(self): + """Testing echo prompt in non-streaming of a single int prompt""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) - request = CompletionRequest(prompt="test prompt", max_tokens=10, stream=True, echo=True) + request = CompletionRequest(prompt=[1, 2, 3], max_tokens=10, echo=True, logprobs=1) - mock_response = {"outputs": {"text": "test output", "token_ids": [1, 2, 3], "finished": True}} - - with patch.object(self.completion_handler, "_echo_back_prompt") as mock_echo: + mock_output = { + "outputs": { + "text": " generated text", + "token_ids": [1, 2, 3], + "top_logprobs": {"token1": -0.1, "token2": -0.2}, + "finished": True, + }, + "output_token_ids": 3, + } + self.mock_engine.generate.return_value = [mock_output] - def mock_echo_side_effect(req, res, idx): - res["outputs"]["text"] = req.prompt + res["outputs"]["text"] + response = self.completion_handler.request_output_to_completion_response( + final_res_batch=[mock_output], + request=request, + request_id="test_id", + created_time=12345, + model_name="test_model", + prompt_batched_token_ids=[[1, 2]], + completion_batched_token_ids=[[3, 4, 5]], + text_after_process_list=["test prompt"], + ) + self.assertEqual(response.choices[0].text, "decoded_[1, 2, 3] generated text") - mock_echo.side_effect = mock_echo_side_effect + async def test_single_str_prompt_streaming(self): + """Testing echo prompts in streaming of a single str prompt""" + request = CompletionRequest(echo=True, prompt=["Hello"]) + res = {"outputs": {"send_idx": 0, "text": "!"}} + idx = 0 - await self.completion_handler._echo_back_prompt(request, mock_response, 0) + instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) + await instance._echo_back_prompt(request, res, idx) + self.assertEqual(res["outputs"]["text"], "Hello!") - mock_echo.assert_called_once_with(request, mock_response, 0) + async def test_single_int_prompt_streaming(self): + """Testing echoing prompts in streaming of a single int prompt""" + request = CompletionRequest(prompt=[1, 2, 3], max_tokens=10, stream=True, echo=True) + res = {"outputs": {"send_idx": 0, "text": "!"}} + idx = 0 - self.assertEqual(mock_response["outputs"]["text"], "test prompttest output") - self.assertEqual(request.prompt, "test prompt") + instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) + await instance._echo_back_prompt(request, res, idx) + self.assertEqual(res["outputs"]["text"], "decoded_[1, 2, 3]!") - def test_multi_prompt_non_streaming(self): - """Test echo back prompt in non-streaming of the multi prompts""" + def test_multi_str_prompt_non_streaming(self): + """Testing echo prompts in non-streaming of multiple str prompts""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -129,59 +157,65 @@ def 
test_multi_prompt_non_streaming(self): self.assertEqual(response.choices[0].text, "prompt1 response1") self.assertEqual(response.choices[1].text, "prompt2 response2") - async def test_multi_prompt_streaming(self): - """Test echo back prompt in streaming of the multi prompts""" + def test_multi_int_prompt_non_streaming(self): + """Testing echo prompts in non-streaming of multiple int prompts""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) - request = CompletionRequest(prompt=["prompt1", "prompt2"], max_tokens=10, stream=True, echo=True) + request = CompletionRequest(prompt=[[1, 2, 3], [4, 5, 6]], max_tokens=10, echo=True) - mock_responses = [ - {"outputs": {"text": " response1", "token_ids": [1, 2], "finished": True}}, - {"outputs": {"text": " response2", "token_ids": [3, 4], "finished": True}}, + mock_outputs = [ + { + "outputs": {"text": " response1", "token_ids": [1, 2], "top_logprobs": None, "finished": True}, + "output_token_ids": 2, + }, + { + "outputs": {"text": " response2", "token_ids": [3, 4], "top_logprobs": None, "finished": True}, + "output_token_ids": 2, + }, ] + self.mock_engine.generate.return_value = mock_outputs - with patch.object(self.completion_handler, "_echo_back_prompt") as mock_echo: - - def mock_echo_side_effect(req, res, idx): - res["outputs"]["text"] = req.prompt[idx] + res["outputs"]["text"] - - mock_echo.side_effect = mock_echo_side_effect - - await self.completion_handler._echo_back_prompt(request, mock_responses[0], 0) - await self.completion_handler._echo_back_prompt(request, mock_responses[1], 1) - - self.assertEqual(mock_echo.call_count, 2) - mock_echo.assert_any_call(request, mock_responses[0], 0) - mock_echo.assert_any_call(request, mock_responses[1], 1) + response = self.completion_handler.request_output_to_completion_response( + final_res_batch=mock_outputs, + request=request, + request_id="test_id", + created_time=12345, + model_name="test_model", + prompt_batched_token_ids=[[1], [2]], + completion_batched_token_ids=[[1, 2], [3, 4]], + text_after_process_list=["prompt1", "prompt2"], + ) - self.assertEqual(mock_responses[0]["outputs"]["text"], "prompt1 response1") - self.assertEqual(mock_responses[1]["outputs"]["text"], "prompt2 response2") - self.assertEqual(request.prompt, ["prompt1", "prompt2"]) + self.assertEqual(len(response.choices), 2) + print("response.choices[0].text", response.choices[0].text) + print("response.choices[1].text", response.choices[1].text) + self.assertEqual(response.choices[0].text, "decoded_[1, 2, 3] response1") + self.assertEqual(response.choices[1].text, "decoded_[4, 5, 6] response2") - async def test_echo_back_prompt_and_streaming1(self): - """Test echo back prompt when prompt is list and send_idx is 0""" + async def test_multi_str_prompt_streaming(self): + """Testing echo prompts in streaming of multiple str prompts""" request = CompletionRequest(echo=True, prompt=["Hello", "World"]) res = {"outputs": {"send_idx": 0, "text": "!"}} - idx = 0 + idx = 1 instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) await instance._echo_back_prompt(request, res, idx) - self.assertEqual(res["outputs"]["text"], "Hello!") + self.assertEqual(res["outputs"]["text"], "World!") - async def test_1_prompt_is_string_and_send_idx_is_0(self): - """Test echo back prompt when prompt is string and send_idx is 0""" - request = CompletionRequest(echo=True, prompt="Hello") + async def test_multi_int_prompt_streaming(self): + 
"""Testing echo prompts in streaming of multiple int prompts""" + request = CompletionRequest(echo=True, prompt=[[1, 2, 3], [4, 5, 6]]) res = {"outputs": {"send_idx": 0, "text": "!"}} - idx = 0 + idx = 1 instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) await instance._echo_back_prompt(request, res, idx) - self.assertEqual(res["outputs"]["text"], "Hello!") + self.assertEqual(res["outputs"]["text"], "decoded_[4, 5, 6]!") - async def test_1_send_idx_is_not_0(self): - """Test echo back prompt when prompt is string and send_idx is not 0""" + async def test_send_idx_is_not_0(self): + """Testing send_idx is not 0""" request = CompletionRequest(echo=True, prompt="Hello") res = {"outputs": {"send_idx": 1, "text": "!"}} idx = 0 @@ -190,8 +224,8 @@ async def test_1_send_idx_is_not_0(self): await instance._echo_back_prompt(request, res, idx) self.assertEqual(res["outputs"]["text"], "!") - async def test_1_echo_is_false(self): - """Test echo back prompt when echo is False""" + async def test_echo_is_false(self): + """Testing echo prompts when echo is False""" request = CompletionRequest(echo=False, prompt="Hello") res = {"outputs": {"send_idx": 0, "text": "!"}} idx = 0 diff --git a/tests/entrypoints/openai/test_create_completion.py b/tests/entrypoints/openai/test_create_completion.py deleted file mode 100644 index 8e8a0fb2f3f..00000000000 --- a/tests/entrypoints/openai/test_create_completion.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" - -import asyncio -import unittest -from unittest.mock import Mock - -from fastdeploy.entrypoints.openai.serving_completion import ( - CompletionRequest, - OpenAIServingCompletion, -) - - -class TestCreateCompletion(unittest.TestCase): - def test_request_prompt_handling(self): - engine_client = Mock() - engine_client.data_processor.tokenizer.decode = lambda x: f"decoded_{x}" - - serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360) - - # Prompt=List[int] - request1 = CompletionRequest(prompt=[1, 2, 3], request_prompt_ids="ids1") - asyncio.run(serving_completion.create_completion(request1)) - self.assertEqual(request1.prompt, ["decoded_[1, 2, 3]"]) - - # Prompt=List[List[int]] - request2 = CompletionRequest(prompt=[[1, 2], [3, 4]], request_prompt_ids="ids2") - asyncio.run(serving_completion.create_completion(request2)) - self.assertEqual(request2.prompt, ["decoded_[1, 2]", "decoded_[3, 4]"]) - - # others - request3 = CompletionRequest(prompt="text prompt", request_prompt_ids="ids3") - asyncio.run(serving_completion.create_completion(request3)) - self.assertEqual(request3.prompt, "text prompt") - - -if __name__ == "__main__": - unittest.main() From e546d674487592b0a3c84708a3b5dc3b89c2d372 Mon Sep 17 00:00:00 2001 From: zhuzixuan Date: Wed, 10 Sep 2025 20:40:12 +0800 Subject: [PATCH 6/9] wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding. --- .../entrypoints/openai/serving_completion.py | 50 ++++++----- .../openai/test_completion_echo.py | 90 ++++++------------- 2 files changed, 58 insertions(+), 82 deletions(-) diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 44135d642b8..df32b1c0973 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -276,19 +276,32 @@ async def completion_full_generator( if dealer is not None: await self.engine_client.connection_manager.cleanup_request(request_id) - async def _echo_back_prompt(self, request, res, idx): - if res["outputs"].get("send_idx", -1) == 0 and request.echo: - if isinstance(request.prompt, list) and all(isinstance(item, int) for item in request.prompt): - request.prompt = [self.engine_client.data_processor.tokenizer.decode(request.prompt)] - elif isinstance(request.prompt, list) and all( - isinstance(item, list) and all(isinstance(x, int) for x in item) for item in request.prompt - ): - request.prompt = [self.engine_client.data_processor.tokenizer.decode(item) for item in request.prompt] - if isinstance(request.prompt, list): + def _echo_back_prompt(self, request, idx): + """ + The echo pre-process of the smallest unit + """ + if isinstance(request.prompt, str): + prompt_text = request.prompt + elif isinstance(request.prompt, list): + if all(isinstance(item, str) for item in request.prompt): prompt_text = request.prompt[idx] + elif all(isinstance(item, int) for item in request.prompt): + prompt_text = [self.engine_client.data_processor.tokenizer.decode(request.prompt)] + else: + prompt_text = [self.engine_client.data_processor.tokenizer.decode(request.prompt[idx])] + return prompt_text + + async def _process_echo_logic(self, request, idx, res_outputs, prompt_text): + """ + Process the echo logic and return the modified text. 
+ """ + if request.echo and res_outputs.get("send_idx", -1) == 0: + prompt_text = self._echo_back_prompt(request, idx) + if isinstance(prompt_text, list): + res_outputs["text"] = prompt_text[0] + (res_outputs["text"] or "") else: - prompt_text = request.prompt - res["outputs"]["text"] = prompt_text + (res["outputs"]["text"] or "") + res_outputs["text"] = prompt_text + (res_outputs["text"] or "") + return res_outputs def calc_finish_reason(self, max_tokens, token_num, output, tool_called): if max_tokens is None or token_num != max_tokens: @@ -390,7 +403,8 @@ async def completion_stream_generator( else: arrival_time = res["metrics"]["arrival_time"] - inference_start_time[idx] - await self._echo_back_prompt(request, res, idx) + prompt_text = request.prompt + await self._process_echo_logic(request, idx, res["outputs"], prompt_text) output = res["outputs"] output_top_logprobs = output["top_logprobs"] logprobs_res: Optional[CompletionLogprobs] = None @@ -503,18 +517,12 @@ def request_output_to_completion_response( aggregated_logprobs = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0) if request.echo: - assert prompt_text is not None - if isinstance(prompt_text, list) and all(isinstance(item, int) for item in prompt_text): - prompt_text = [self.engine_client.data_processor.tokenizer.decode(prompt_text)] - elif isinstance(prompt_text, list) and all( - isinstance(item, list) and all(isinstance(x, int) for x in item) for item in prompt_text - ): - prompt_text = [self.engine_client.data_processor.tokenizer.decode(item) for item in prompt_text] + prompt_text = self._echo_back_prompt(request, idx) token_ids = [*prompt_token_ids, *output["token_ids"]] if isinstance(prompt_text, list): - output_text = prompt_text[idx] + output["text"] + output_text = prompt_text[0] + output["text"] else: - output_text = str(prompt_text) + output["text"] + output_text = prompt_text + output["text"] else: token_ids = output["token_ids"] output_text = output["text"] diff --git a/tests/entrypoints/openai/test_completion_echo.py b/tests/entrypoints/openai/test_completion_echo.py index 1112a9f1d3b..c96ef5624d5 100644 --- a/tests/entrypoints/openai/test_completion_echo.py +++ b/tests/entrypoints/openai/test_completion_echo.py @@ -23,24 +23,15 @@ ) -class YourClass: - async def _1(self, a, b, c): - if b["outputs"].get("send_idx", -1) == 0 and a.echo: - if isinstance(a.prompt, list): - text = a.prompt[c] - else: - text = a.prompt - b["outputs"]["text"] = text + (b["outputs"]["text"] or "") - - class TestCompletionEcho(unittest.IsolatedAsyncioTestCase): def setUp(self): self.mock_engine = MagicMock() self.completion_handler = None self.mock_engine.data_processor.tokenizer.decode = lambda x: f"decoded_{x}" + """Testing echo prompt in non-streaming of a single str prompt""" + def test_single_str_prompt_non_streaming(self): - """Testing echo prompt in non-streaming of a single str prompt""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -71,8 +62,9 @@ def test_single_str_prompt_non_streaming(self): self.assertEqual(response.choices[0].text, "test prompt generated text") + """Testing echo prompt in non-streaming of a single int prompt""" + def test_single_int_prompt_non_streaming(self): - """Testing echo prompt in non-streaming of a single int prompt""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -102,28 +94,9 @@ def 
test_single_int_prompt_non_streaming(self): ) self.assertEqual(response.choices[0].text, "decoded_[1, 2, 3] generated text") - async def test_single_str_prompt_streaming(self): - """Testing echo prompts in streaming of a single str prompt""" - request = CompletionRequest(echo=True, prompt=["Hello"]) - res = {"outputs": {"send_idx": 0, "text": "!"}} - idx = 0 - - instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) - await instance._echo_back_prompt(request, res, idx) - self.assertEqual(res["outputs"]["text"], "Hello!") - - async def test_single_int_prompt_streaming(self): - """Testing echoing prompts in streaming of a single int prompt""" - request = CompletionRequest(prompt=[1, 2, 3], max_tokens=10, stream=True, echo=True) - res = {"outputs": {"send_idx": 0, "text": "!"}} - idx = 0 - - instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) - await instance._echo_back_prompt(request, res, idx) - self.assertEqual(res["outputs"]["text"], "decoded_[1, 2, 3]!") + """Testing echo prompts in non-streaming of multiple str prompts""" def test_multi_str_prompt_non_streaming(self): - """Testing echo prompts in non-streaming of multiple str prompts""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -157,8 +130,9 @@ def test_multi_str_prompt_non_streaming(self): self.assertEqual(response.choices[0].text, "prompt1 response1") self.assertEqual(response.choices[1].text, "prompt2 response2") + """Testing echo prompts in non-streaming of multiple int prompts""" + def test_multi_int_prompt_non_streaming(self): - """Testing echo prompts in non-streaming of multiple int prompts""" self.completion_handler = OpenAIServingCompletion( self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30 ) @@ -189,50 +163,44 @@ def test_multi_int_prompt_non_streaming(self): ) self.assertEqual(len(response.choices), 2) - print("response.choices[0].text", response.choices[0].text) - print("response.choices[1].text", response.choices[1].text) self.assertEqual(response.choices[0].text, "decoded_[1, 2, 3] response1") self.assertEqual(response.choices[1].text, "decoded_[4, 5, 6] response2") - async def test_multi_str_prompt_streaming(self): - """Testing echo prompts in streaming of multiple str prompts""" - request = CompletionRequest(echo=True, prompt=["Hello", "World"]) - res = {"outputs": {"send_idx": 0, "text": "!"}} - idx = 1 + """Testing echo prompts in streaming of a single str prompt""" - instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) - await instance._echo_back_prompt(request, res, idx) - self.assertEqual(res["outputs"]["text"], "World!") - - async def test_multi_int_prompt_streaming(self): - """Testing echo prompts in streaming of multiple int prompts""" - request = CompletionRequest(echo=True, prompt=[[1, 2, 3], [4, 5, 6]]) + async def test_single_str_prompt_streaming(self): + request = CompletionRequest(prompt="test prompt", max_tokens=10, stream=True, echo=True) res = {"outputs": {"send_idx": 0, "text": "!"}} - idx = 1 + idx = 0 instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) - await instance._echo_back_prompt(request, res, idx) - self.assertEqual(res["outputs"]["text"], "decoded_[4, 5, 6]!") + prompt_text = "" + res = await instance._process_echo_logic(request, idx, res["outputs"], prompt_text) + 
self.assertEqual(res["text"], "test prompt!") + + """Testing echo prompts in streaming of a single int prompt""" - async def test_send_idx_is_not_0(self): - """Testing send_idx is not 0""" - request = CompletionRequest(echo=True, prompt="Hello") - res = {"outputs": {"send_idx": 1, "text": "!"}} + async def test_single_int_prompt_streaming(self): + request = CompletionRequest(prompt=[1, 2, 3], max_tokens=10, stream=True, echo=True) + res = {"outputs": {"send_idx": 0, "text": "!"}} idx = 0 instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) - await instance._echo_back_prompt(request, res, idx) - self.assertEqual(res["outputs"]["text"], "!") + prompt_text = "" + res = await instance._process_echo_logic(request, idx, res["outputs"], prompt_text) + self.assertEqual(res["text"], "decoded_[1, 2, 3]!") + + """Testing echo prompts in streaming of multi str prompt""" - async def test_echo_is_false(self): - """Testing echo prompts when echo is False""" - request = CompletionRequest(echo=False, prompt="Hello") + async def test_multi_str_prompt_streaming(self): + request = CompletionRequest(prompt=["test prompt1", "test prompt2"], max_tokens=10, stream=True, echo=True) res = {"outputs": {"send_idx": 0, "text": "!"}} idx = 0 instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) - await instance._echo_back_prompt(request, res, idx) - self.assertEqual(res["outputs"]["text"], "!") + prompt_text = "" + res = await instance._process_echo_logic(request, idx, res["outputs"], prompt_text) + self.assertEqual(res["text"], "test prompt1!") if __name__ == "__main__": From 3600650abe5c4423491be2416365c65ff921989e Mon Sep 17 00:00:00 2001 From: zhuzixuan Date: Wed, 10 Sep 2025 22:45:31 +0800 Subject: [PATCH 7/9] wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding. 
--- .../entrypoints/openai/serving_completion.py | 14 ++++---------- tests/entrypoints/openai/test_completion_echo.py | 2 ++ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index df32b1c0973..653e1e8d68e 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -286,9 +286,9 @@ def _echo_back_prompt(self, request, idx): if all(isinstance(item, str) for item in request.prompt): prompt_text = request.prompt[idx] elif all(isinstance(item, int) for item in request.prompt): - prompt_text = [self.engine_client.data_processor.tokenizer.decode(request.prompt)] + prompt_text = self.engine_client.data_processor.tokenizer.decode(request.prompt) else: - prompt_text = [self.engine_client.data_processor.tokenizer.decode(request.prompt[idx])] + prompt_text = self.engine_client.data_processor.tokenizer.decode(request.prompt[idx]) return prompt_text async def _process_echo_logic(self, request, idx, res_outputs, prompt_text): @@ -297,10 +297,7 @@ async def _process_echo_logic(self, request, idx, res_outputs, prompt_text): """ if request.echo and res_outputs.get("send_idx", -1) == 0: prompt_text = self._echo_back_prompt(request, idx) - if isinstance(prompt_text, list): - res_outputs["text"] = prompt_text[0] + (res_outputs["text"] or "") - else: - res_outputs["text"] = prompt_text + (res_outputs["text"] or "") + res_outputs["text"] = prompt_text + (res_outputs["text"] or "") return res_outputs def calc_finish_reason(self, max_tokens, token_num, output, tool_called): @@ -519,10 +516,7 @@ def request_output_to_completion_response( if request.echo: prompt_text = self._echo_back_prompt(request, idx) token_ids = [*prompt_token_ids, *output["token_ids"]] - if isinstance(prompt_text, list): - output_text = prompt_text[0] + output["text"] - else: - output_text = prompt_text + output["text"] + output_text = prompt_text + output["text"] else: token_ids = output["token_ids"] output_text = output["text"] diff --git a/tests/entrypoints/openai/test_completion_echo.py b/tests/entrypoints/openai/test_completion_echo.py index c96ef5624d5..9f3a96ae572 100644 --- a/tests/entrypoints/openai/test_completion_echo.py +++ b/tests/entrypoints/openai/test_completion_echo.py @@ -163,6 +163,8 @@ def test_multi_int_prompt_non_streaming(self): ) self.assertEqual(len(response.choices), 2) + print("response.choices[0].text", response.choices[0].text) + print("response.choices[1].text", response.choices[1].text) self.assertEqual(response.choices[0].text, "decoded_[1, 2, 3] response1") self.assertEqual(response.choices[1].text, "decoded_[4, 5, 6] response2") From 54a109fe08ecd2df493c12ac7a64b447225d3617 Mon Sep 17 00:00:00 2001 From: zhuzixuan Date: Wed, 10 Sep 2025 23:03:28 +0800 Subject: [PATCH 8/9] wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding. 
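The prompt_text argument was unused, so _process_echo_logic now takes only (request, idx, res_outputs). A condensed, self-contained version of how the updated streaming tests call it (engine and tokenizer faked the same way the tests do):

    import asyncio
    from unittest.mock import MagicMock

    from fastdeploy.entrypoints.openai.serving_completion import (
        CompletionRequest,
        OpenAIServingCompletion,
    )

    engine = MagicMock()
    engine.data_processor.tokenizer.decode = lambda ids: f"decoded_{ids}"
    serving = OpenAIServingCompletion(engine, models=None, pid=123, ips=None, max_waiting_time=30)

    request = CompletionRequest(prompt=[1, 2, 3], max_tokens=10, stream=True, echo=True)
    outputs = {"send_idx": 0, "text": "!"}
    outputs = asyncio.run(serving._process_echo_logic(request, 0, outputs))
    assert outputs["text"] == "decoded_[1, 2, 3]!"
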
--- fastdeploy/entrypoints/openai/serving_completion.py | 2 +- tests/entrypoints/openai/test_completion_echo.py | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 653e1e8d68e..d59de52446a 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -291,7 +291,7 @@ def _echo_back_prompt(self, request, idx): prompt_text = self.engine_client.data_processor.tokenizer.decode(request.prompt[idx]) return prompt_text - async def _process_echo_logic(self, request, idx, res_outputs, prompt_text): + async def _process_echo_logic(self, request, idx, res_outputs): """ Process the echo logic and return the modified text. """ diff --git a/tests/entrypoints/openai/test_completion_echo.py b/tests/entrypoints/openai/test_completion_echo.py index 9f3a96ae572..3e8d8ac79dc 100644 --- a/tests/entrypoints/openai/test_completion_echo.py +++ b/tests/entrypoints/openai/test_completion_echo.py @@ -163,8 +163,6 @@ def test_multi_int_prompt_non_streaming(self): ) self.assertEqual(len(response.choices), 2) - print("response.choices[0].text", response.choices[0].text) - print("response.choices[1].text", response.choices[1].text) self.assertEqual(response.choices[0].text, "decoded_[1, 2, 3] response1") self.assertEqual(response.choices[1].text, "decoded_[4, 5, 6] response2") @@ -176,8 +174,7 @@ async def test_single_str_prompt_streaming(self): idx = 0 instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) - prompt_text = "" - res = await instance._process_echo_logic(request, idx, res["outputs"], prompt_text) + res = await instance._process_echo_logic(request, idx, res["outputs"]) self.assertEqual(res["text"], "test prompt!") """Testing echo prompts in streaming of a single int prompt""" @@ -188,8 +185,7 @@ async def test_single_int_prompt_streaming(self): idx = 0 instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) - prompt_text = "" - res = await instance._process_echo_logic(request, idx, res["outputs"], prompt_text) + res = await instance._process_echo_logic(request, idx, res["outputs"]) self.assertEqual(res["text"], "decoded_[1, 2, 3]!") """Testing echo prompts in streaming of multi str prompt""" @@ -200,8 +196,7 @@ async def test_multi_str_prompt_streaming(self): idx = 0 instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30) - prompt_text = "" - res = await instance._process_echo_logic(request, idx, res["outputs"], prompt_text) + res = await instance._process_echo_logic(request, idx, res["outputs"]) self.assertEqual(res["text"], "test prompt1!") From ec87dc08b9aa02f23248a0fdc2e7ae7d3507e311 Mon Sep 17 00:00:00 2001 From: zhuzixuan Date: Thu, 11 Sep 2025 15:04:34 +0800 Subject: [PATCH 9/9] wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding. 
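With the helpers now looking up the prompt themselves, the leftover prompt_text assignments in the stream generator and in request_output_to_completion_response are dead and can go. For reference, the non-streaming echo behaviour the series ends up with, condensed from the updated test_multi_int_prompt_non_streaming (engine mocked as in the tests):

    from unittest.mock import MagicMock

    from fastdeploy.entrypoints.openai.serving_completion import (
        CompletionRequest,
        OpenAIServingCompletion,
    )

    engine = MagicMock()
    engine.data_processor.tokenizer.decode = lambda ids: f"decoded_{ids}"
    serving = OpenAIServingCompletion(engine, models=None, pid=123, ips=None, max_waiting_time=30)

    request = CompletionRequest(prompt=[[1, 2, 3], [4, 5, 6]], max_tokens=10, echo=True)
    outputs = [
        {"outputs": {"text": " response1", "token_ids": [1, 2], "top_logprobs": None, "finished": True},
         "output_token_ids": 2},
        {"outputs": {"text": " response2", "token_ids": [3, 4], "top_logprobs": None, "finished": True},
         "output_token_ids": 2},
    ]
    response = serving.request_output_to_completion_response(
        final_res_batch=outputs,
        request=request,
        request_id="test_id",
        created_time=12345,
        model_name="test_model",
        prompt_batched_token_ids=[[1], [2]],
        completion_batched_token_ids=[[1, 2], [3, 4]],
        text_after_process_list=["prompt1", "prompt2"],
    )
    assert response.choices[0].text == "decoded_[1, 2, 3] response1"
    assert response.choices[1].text == "decoded_[4, 5, 6] response2"
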
--- fastdeploy/entrypoints/openai/serving_completion.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index d59de52446a..b3a97c42605 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -400,8 +400,7 @@ async def completion_stream_generator( else: arrival_time = res["metrics"]["arrival_time"] - inference_start_time[idx] - prompt_text = request.prompt - await self._process_echo_logic(request, idx, res["outputs"], prompt_text) + await self._process_echo_logic(request, idx, res["outputs"]) output = res["outputs"] output_top_logprobs = output["top_logprobs"] logprobs_res: Optional[CompletionLogprobs] = None @@ -503,7 +502,6 @@ def request_output_to_completion_response( final_res = final_res_batch[idx] prompt_token_ids = prompt_batched_token_ids[idx] assert prompt_token_ids is not None - prompt_text = request.prompt completion_token_ids = completion_batched_token_ids[idx] output = final_res["outputs"]