From b6c0464eee70510207abffa022a3690d60492832 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sat, 8 Nov 2025 11:42:29 +0800 Subject: [PATCH 1/7] add default temperature value --- fastdeploy/entrypoints/llm.py | 14 +++++++++++++- fastdeploy/entrypoints/openai/serving_chat.py | 2 ++ .../entrypoints/openai/serving_completion.py | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index aebaf349ff9..5c722a807db 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -95,7 +95,9 @@ def __init__( # Create the Engine self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args) - self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.model_config.max_model_len) + self.default_sampling_params = SamplingParams( + max_tokens=self.llm_engine.cfg.model_config.max_model_len, temperature=1e-06 + ) self.llm_engine.start() @@ -168,8 +170,13 @@ def generate( if isinstance(sampling_params, SamplingParams): sampling_params_len = 1 + if sampling_params.temperature is not None and sampling_params.temperature == 0: + sampling_params.temperature = 1e-06 else: sampling_params_len = len(sampling_params) + for param in sampling_params: + if param.temperature is not None and param.temperature == 0: + param.temperature = 1e-06 if isinstance(prompts, str): prompts = [prompts] @@ -234,8 +241,13 @@ def chat( if isinstance(sampling_params, SamplingParams): sampling_params_len = 1 + if sampling_params.temperature is not None and sampling_params.temperature == 0: + sampling_params.temperature = 1e-06 else: sampling_params_len = len(sampling_params) + for param in sampling_params: + if param.temperature is not None and param.temperature == 0: + param.temperature = 1e-06 if isinstance(messages, list) and isinstance(messages[0], dict): messages = [messages] diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index b0b407e05ad..4e5d1e9bd0e 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -106,6 +106,8 @@ async def create_chat_completion(self, request: ChatCompletionRequest): return ErrorResponse( error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT) ) + if request.temperature is not None and request.temperature == 0: + request.temperature = 1e-06 try: if self.max_waiting_time < 0: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index c27375305ed..0192854ff5d 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -84,6 +84,8 @@ async def create_completion(self, request: CompletionRequest): return ErrorResponse( error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT) ) + if request.temperature is not None and request.temperature == 0: + request.temperature = 1e-06 created_time = int(time.time()) if request.user is not None: request_id = f"cmpl-{request.user}-{uuid.uuid4()}" From 0b53c417898ca6e4d059587026c9a374ff766cd4 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sat, 8 Nov 2025 19:41:15 +0800 Subject: [PATCH 2/7] add unit test --- tests/utils/test_custom_chat_template.py | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/utils/test_custom_chat_template.py 
b/tests/utils/test_custom_chat_template.py index 4ca32945971..78e2198a144 100644 --- a/tests/utils/test_custom_chat_template.py +++ b/tests/utils/test_custom_chat_template.py @@ -124,6 +124,38 @@ def mock_add_request(**kwargs): result = llm.chat(["hello"], sampling_params=SamplingParams(1), chat_template="hello") self.assertEqual("hello", result) + @patch("fastdeploy.entrypoints.llm.LLM.__init__") + def test_temperature(self, mock_class): + mock_class.return_value = None + llm = LLM() + llm.llm_engine = MagicMock() + llm.default_sampling_params = MagicMock() + + def mock_run_engine(req_ids, **kwargs): + return req_ids + + def mock_add_request(**kwargs): + return kwargs.get("sampling_params") + + llm._run_engine = mock_run_engine + llm._add_request = mock_add_request + result = llm.chat(["hello"], sampling_params=SamplingParams(temperature=0), chat_template="hello") + self.assertEqual(1e-06, result.temperature) + + result = llm.chat( + ["hello", "hi"], + sampling_params=[SamplingParams(temperature=0), SamplingParams(temperature=0)], + chat_template="hello", + ) + for params in result: + self.assertEqual(1e-06, params.temperature) + + result = llm.generate( + ["hello", "hi"], sampling_params=[SamplingParams(temperature=0), SamplingParams(temperature=0)] + ) + for params in result: + self.assertEqual(1e-06, params.temperature) + if __name__ == "__main__": unittest.main() From 1bbdb6ea3179a860ab2a6b03f5c8c9974bd99933 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sat, 8 Nov 2025 20:20:32 +0800 Subject: [PATCH 3/7] update --- fastdeploy/engine/engine.py | 2 ++ fastdeploy/entrypoints/engine_client.py | 3 +- fastdeploy/entrypoints/llm.py | 10 ------ fastdeploy/entrypoints/openai/serving_chat.py | 2 -- .../entrypoints/openai/serving_completion.py | 2 -- tests/utils/test_custom_chat_template.py | 32 ------------------- 6 files changed, 4 insertions(+), 47 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index e4c0b717ad8..77f5227a2a5 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -251,6 +251,8 @@ def add_requests(self, task, sampling_params=None, **kwargs): request = Request.from_dict(task) llm_logger.info(f"Receive request {request}") if sampling_params is not None: + if sampling_params.temperature is not None and sampling_params.temperature == 0: + sampling_params.temperature = 1e-06 request.sampling_params = sampling_params request.preprocess_start_time = time.time() chat_template_kwargs = kwargs.get("chat_template_kwargs") or {} diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index e9664ddb6b9..24b16ac8219 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -304,7 +304,8 @@ def valid_parameters(self, data): api_server_logger.warning( f"req_id: {data['request_id']}, reasoning_max_tokens exceeds max_tokens, the value of reasoning_max_tokens will be adjusted to match that of max_tokens" ) - + if data.get("temperature") is not None and data["temperature"] == 0: + data["temperature"] = 1e-6 # logprobs logprobs = data.get("logprobs") top_logprobs = None diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 5c722a807db..6b334a490e2 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -170,13 +170,8 @@ def generate( if isinstance(sampling_params, SamplingParams): sampling_params_len = 1 - if sampling_params.temperature is not None and sampling_params.temperature == 0: - 
sampling_params.temperature = 1e-06 else: sampling_params_len = len(sampling_params) - for param in sampling_params: - if param.temperature is not None and param.temperature == 0: - param.temperature = 1e-06 if isinstance(prompts, str): prompts = [prompts] @@ -241,13 +236,8 @@ def chat( if isinstance(sampling_params, SamplingParams): sampling_params_len = 1 - if sampling_params.temperature is not None and sampling_params.temperature == 0: - sampling_params.temperature = 1e-06 else: sampling_params_len = len(sampling_params) - for param in sampling_params: - if param.temperature is not None and param.temperature == 0: - param.temperature = 1e-06 if isinstance(messages, list) and isinstance(messages[0], dict): messages = [messages] diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 4e5d1e9bd0e..b0b407e05ad 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -106,8 +106,6 @@ async def create_chat_completion(self, request: ChatCompletionRequest): return ErrorResponse( error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT) ) - if request.temperature is not None and request.temperature == 0: - request.temperature = 1e-06 try: if self.max_waiting_time < 0: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 0192854ff5d..c27375305ed 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -84,8 +84,6 @@ async def create_completion(self, request: CompletionRequest): return ErrorResponse( error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT) ) - if request.temperature is not None and request.temperature == 0: - request.temperature = 1e-06 created_time = int(time.time()) if request.user is not None: request_id = f"cmpl-{request.user}-{uuid.uuid4()}" diff --git a/tests/utils/test_custom_chat_template.py b/tests/utils/test_custom_chat_template.py index 78e2198a144..4ca32945971 100644 --- a/tests/utils/test_custom_chat_template.py +++ b/tests/utils/test_custom_chat_template.py @@ -124,38 +124,6 @@ def mock_add_request(**kwargs): result = llm.chat(["hello"], sampling_params=SamplingParams(1), chat_template="hello") self.assertEqual("hello", result) - @patch("fastdeploy.entrypoints.llm.LLM.__init__") - def test_temperature(self, mock_class): - mock_class.return_value = None - llm = LLM() - llm.llm_engine = MagicMock() - llm.default_sampling_params = MagicMock() - - def mock_run_engine(req_ids, **kwargs): - return req_ids - - def mock_add_request(**kwargs): - return kwargs.get("sampling_params") - - llm._run_engine = mock_run_engine - llm._add_request = mock_add_request - result = llm.chat(["hello"], sampling_params=SamplingParams(temperature=0), chat_template="hello") - self.assertEqual(1e-06, result.temperature) - - result = llm.chat( - ["hello", "hi"], - sampling_params=[SamplingParams(temperature=0), SamplingParams(temperature=0)], - chat_template="hello", - ) - for params in result: - self.assertEqual(1e-06, params.temperature) - - result = llm.generate( - ["hello", "hi"], sampling_params=[SamplingParams(temperature=0), SamplingParams(temperature=0)] - ) - for params in result: - self.assertEqual(1e-06, params.temperature) - if __name__ == "__main__": unittest.main() From 008e3b1a1b9f0a2686e755645320ba1becc59daf Mon Sep 17 00:00:00 2001 From: luukunn 
<981429396@qq.com> Date: Sat, 8 Nov 2025 20:21:54 +0800 Subject: [PATCH 4/7] update --- fastdeploy/entrypoints/llm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 6b334a490e2..aebaf349ff9 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -95,9 +95,7 @@ def __init__( # Create the Engine self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args) - self.default_sampling_params = SamplingParams( - max_tokens=self.llm_engine.cfg.model_config.max_model_len, temperature=1e-06 - ) + self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.model_config.max_model_len) self.llm_engine.start() From c68e836c3af6037226a7e02397587db38bf1821e Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sat, 8 Nov 2025 20:47:09 +0800 Subject: [PATCH 5/7] add unit test --- tests/entrypoints/test_engine_client.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/entrypoints/test_engine_client.py b/tests/entrypoints/test_engine_client.py index 5aa720bbfcc..a8340b03501 100644 --- a/tests/entrypoints/test_engine_client.py +++ b/tests/entrypoints/test_engine_client.py @@ -33,6 +33,19 @@ async def test_add_request(self): assert request["tools"] == [1] # assert request["chat_template_kwargs"]["tools"] == [1] + def test_valid_parameters(self): + request = { + "request_id": "test-request-id", + "chat_template_kwargs": {"enable_thinking": True}, + "prompt_token_ids": [1], + "chat_template": "Hello", + "max_tokens": 20, + "tools": [1], + "temperature": 0, + } + self.valid_parameters(request) + assert request["temperature"] == 1e-6 + if __name__ == "__main__": unittest.main() From d6301055ffb5604a634b28977252fa513d148aaf Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sun, 9 Nov 2025 12:24:31 +0800 Subject: [PATCH 6/7] update --- fastdeploy/engine/engine.py | 2 +- fastdeploy/entrypoints/engine_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 77f5227a2a5..21fec92ab50 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -251,7 +251,7 @@ def add_requests(self, task, sampling_params=None, **kwargs): request = Request.from_dict(task) llm_logger.info(f"Receive request {request}") if sampling_params is not None: - if sampling_params.temperature is not None and sampling_params.temperature == 0: + if sampling_params.temperature is not None and abs(sampling_params.temperature) < 1e-06: sampling_params.temperature = 1e-06 request.sampling_params = sampling_params request.preprocess_start_time = time.time() diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index 24b16ac8219..38334a90e6c 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -304,7 +304,7 @@ def valid_parameters(self, data): api_server_logger.warning( f"req_id: {data['request_id']}, reasoning_max_tokens exceeds max_tokens, the value of reasoning_max_tokens will be adjusted to match that of max_tokens" ) - if data.get("temperature") is not None and data["temperature"] == 0: + if data.get("temperature") is not None and abs(data["temperature"]) < 1e-6: data["temperature"] = 1e-6 # logprobs logprobs = data.get("logprobs") From f274719df2c6d74027157ea85b57cf578e17c84a Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sun, 9 Nov 2025 21:57:01 +0800 Subject: 
[PATCH 7/7] fix unit test --- tests/entrypoints/test_engine_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/test_engine_client.py b/tests/entrypoints/test_engine_client.py index a8340b03501..2493bb44d91 100644 --- a/tests/entrypoints/test_engine_client.py +++ b/tests/entrypoints/test_engine_client.py @@ -43,7 +43,7 @@ def test_valid_parameters(self): "tools": [1], "temperature": 0, } - self.valid_parameters(request) + self.engine_client.valid_parameters(request) assert request["temperature"] == 1e-6
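
A minimal sketch of the rule this series converges on (the helper name clamp_temperature is hypothetical and not part of the diff; the dict-shaped request and the abs(...) < 1e-6 comparison mirror EngineClient.valid_parameters after patches 3 and 6):

    def clamp_temperature(data):
        """Lift a (near-)zero temperature to 1e-6 before scheduling.

        Sampling divides logits by temperature, so a literal 0 is degenerate;
        1e-6 keeps the request effectively greedy without special-casing it.
        """
        if data.get("temperature") is not None and abs(data["temperature"]) < 1e-6:
            data["temperature"] = 1e-6

    request = {"request_id": "demo", "max_tokens": 20, "temperature": 0}
    clamp_temperature(request)
    assert request["temperature"] == 1e-6  # same check as the unit test in patches 5/7

Normalizing once at the entry points (valid_parameters for HTTP requests, LLMEngine.add_requests for SamplingParams passed to the offline API) is what allowed patches 3 and 4 to revert the per-endpoint copies of this check introduced in patch 1.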