From b6c0464eee70510207abffa022a3690d60492832 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sat, 8 Nov 2025 11:42:29 +0800 Subject: [PATCH 1/7] add default temperature value --- fastdeploy/entrypoints/llm.py | 14 +++++++++++++- fastdeploy/entrypoints/openai/serving_chat.py | 2 ++ .../entrypoints/openai/serving_completion.py | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index aebaf349ff9..5c722a807db 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -95,7 +95,9 @@ def __init__( # Create the Engine self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args) - self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.model_config.max_model_len) + self.default_sampling_params = SamplingParams( + max_tokens=self.llm_engine.cfg.model_config.max_model_len, temperature=1e-06 + ) self.llm_engine.start() @@ -168,8 +170,13 @@ def generate( if isinstance(sampling_params, SamplingParams): sampling_params_len = 1 + if sampling_params.temperature is not None and sampling_params.temperature == 0: + sampling_params.temperature = 1e-06 else: sampling_params_len = len(sampling_params) + for param in sampling_params: + if param.temperature is not None and param.temperature == 0: + param.temperature = 1e-06 if isinstance(prompts, str): prompts = [prompts] @@ -234,8 +241,13 @@ def chat( if isinstance(sampling_params, SamplingParams): sampling_params_len = 1 + if sampling_params.temperature is not None and sampling_params.temperature == 0: + sampling_params.temperature = 1e-06 else: sampling_params_len = len(sampling_params) + for param in sampling_params: + if param.temperature is not None and param.temperature == 0: + param.temperature = 1e-06 if isinstance(messages, list) and isinstance(messages[0], dict): messages = [messages] diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index b0b407e05ad..4e5d1e9bd0e 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -106,6 +106,8 @@ async def create_chat_completion(self, request: ChatCompletionRequest): return ErrorResponse( error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT) ) + if request.temperature is not None and request.temperature == 0: + request.temperature = 1e-06 try: if self.max_waiting_time < 0: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index c27375305ed..0192854ff5d 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -84,6 +84,8 @@ async def create_completion(self, request: CompletionRequest): return ErrorResponse( error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT) ) + if request.temperature is not None and request.temperature == 0: + request.temperature = 1e-06 created_time = int(time.time()) if request.user is not None: request_id = f"cmpl-{request.user}-{uuid.uuid4()}" From 0b53c417898ca6e4d059587026c9a374ff766cd4 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sat, 8 Nov 2025 19:41:15 +0800 Subject: [PATCH 2/7] add unit test --- tests/utils/test_custom_chat_template.py | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/utils/test_custom_chat_template.py 
b/tests/utils/test_custom_chat_template.py index 4ca32945971..78e2198a144 100644 --- a/tests/utils/test_custom_chat_template.py +++ b/tests/utils/test_custom_chat_template.py @@ -124,6 +124,38 @@ def mock_add_request(**kwargs): result = llm.chat(["hello"], sampling_params=SamplingParams(1), chat_template="hello") self.assertEqual("hello", result) + @patch("fastdeploy.entrypoints.llm.LLM.__init__") + def test_temperature(self, mock_class): + mock_class.return_value = None + llm = LLM() + llm.llm_engine = MagicMock() + llm.default_sampling_params = MagicMock() + + def mock_run_engine(req_ids, **kwargs): + return req_ids + + def mock_add_request(**kwargs): + return kwargs.get("sampling_params") + + llm._run_engine = mock_run_engine + llm._add_request = mock_add_request + result = llm.chat(["hello"], sampling_params=SamplingParams(temperature=0), chat_template="hello") + self.assertEqual(1e-06, result.temperature) + + result = llm.chat( + ["hello", "hi"], + sampling_params=[SamplingParams(temperature=0), SamplingParams(temperature=0)], + chat_template="hello", + ) + for params in result: + self.assertEqual(1e-06, params.temperature) + + result = llm.generate( + ["hello", "hi"], sampling_params=[SamplingParams(temperature=0), SamplingParams(temperature=0)] + ) + for params in result: + self.assertEqual(1e-06, params.temperature) + if __name__ == "__main__": unittest.main() From 1bbdb6ea3179a860ab2a6b03f5c8c9974bd99933 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sat, 8 Nov 2025 20:20:32 +0800 Subject: [PATCH 3/7] update --- fastdeploy/engine/engine.py | 2 ++ fastdeploy/entrypoints/engine_client.py | 3 +- fastdeploy/entrypoints/llm.py | 10 ------ fastdeploy/entrypoints/openai/serving_chat.py | 2 -- .../entrypoints/openai/serving_completion.py | 2 -- tests/utils/test_custom_chat_template.py | 32 ------------------- 6 files changed, 4 insertions(+), 47 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index e4c0b717ad8..77f5227a2a5 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -251,6 +251,8 @@ def add_requests(self, task, sampling_params=None, **kwargs): request = Request.from_dict(task) llm_logger.info(f"Receive request {request}") if sampling_params is not None: + if sampling_params.temperature is not None and sampling_params.temperature == 0: + sampling_params.temperature = 1e-06 request.sampling_params = sampling_params request.preprocess_start_time = time.time() chat_template_kwargs = kwargs.get("chat_template_kwargs") or {} diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index e9664ddb6b9..24b16ac8219 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -304,7 +304,8 @@ def valid_parameters(self, data): api_server_logger.warning( f"req_id: {data['request_id']}, reasoning_max_tokens exceeds max_tokens, the value of reasoning_max_tokens will be adjusted to match that of max_tokens" ) - + if data.get("temperature") is not None and data["temperature"] == 0: + data["temperature"] = 1e-6 # logprobs logprobs = data.get("logprobs") top_logprobs = None diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 5c722a807db..6b334a490e2 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -170,13 +170,8 @@ def generate( if isinstance(sampling_params, SamplingParams): sampling_params_len = 1 - if sampling_params.temperature is not None and sampling_params.temperature == 0: - 
sampling_params.temperature = 1e-06 else: sampling_params_len = len(sampling_params) - for param in sampling_params: - if param.temperature is not None and param.temperature == 0: - param.temperature = 1e-06 if isinstance(prompts, str): prompts = [prompts] @@ -241,13 +236,8 @@ def chat( if isinstance(sampling_params, SamplingParams): sampling_params_len = 1 - if sampling_params.temperature is not None and sampling_params.temperature == 0: - sampling_params.temperature = 1e-06 else: sampling_params_len = len(sampling_params) - for param in sampling_params: - if param.temperature is not None and param.temperature == 0: - param.temperature = 1e-06 if isinstance(messages, list) and isinstance(messages[0], dict): messages = [messages] diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 4e5d1e9bd0e..b0b407e05ad 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -106,8 +106,6 @@ async def create_chat_completion(self, request: ChatCompletionRequest): return ErrorResponse( error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT) ) - if request.temperature is not None and request.temperature == 0: - request.temperature = 1e-06 try: if self.max_waiting_time < 0: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 0192854ff5d..c27375305ed 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -84,8 +84,6 @@ async def create_completion(self, request: CompletionRequest): return ErrorResponse( error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT) ) - if request.temperature is not None and request.temperature == 0: - request.temperature = 1e-06 created_time = int(time.time()) if request.user is not None: request_id = f"cmpl-{request.user}-{uuid.uuid4()}" diff --git a/tests/utils/test_custom_chat_template.py b/tests/utils/test_custom_chat_template.py index 78e2198a144..4ca32945971 100644 --- a/tests/utils/test_custom_chat_template.py +++ b/tests/utils/test_custom_chat_template.py @@ -124,38 +124,6 @@ def mock_add_request(**kwargs): result = llm.chat(["hello"], sampling_params=SamplingParams(1), chat_template="hello") self.assertEqual("hello", result) - @patch("fastdeploy.entrypoints.llm.LLM.__init__") - def test_temperature(self, mock_class): - mock_class.return_value = None - llm = LLM() - llm.llm_engine = MagicMock() - llm.default_sampling_params = MagicMock() - - def mock_run_engine(req_ids, **kwargs): - return req_ids - - def mock_add_request(**kwargs): - return kwargs.get("sampling_params") - - llm._run_engine = mock_run_engine - llm._add_request = mock_add_request - result = llm.chat(["hello"], sampling_params=SamplingParams(temperature=0), chat_template="hello") - self.assertEqual(1e-06, result.temperature) - - result = llm.chat( - ["hello", "hi"], - sampling_params=[SamplingParams(temperature=0), SamplingParams(temperature=0)], - chat_template="hello", - ) - for params in result: - self.assertEqual(1e-06, params.temperature) - - result = llm.generate( - ["hello", "hi"], sampling_params=[SamplingParams(temperature=0), SamplingParams(temperature=0)] - ) - for params in result: - self.assertEqual(1e-06, params.temperature) - if __name__ == "__main__": unittest.main() From 008e3b1a1b9f0a2686e755645320ba1becc59daf Mon Sep 17 00:00:00 2001 From: luukunn 
<981429396@qq.com> Date: Sat, 8 Nov 2025 20:21:54 +0800 Subject: [PATCH 4/7] update --- fastdeploy/entrypoints/llm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 6b334a490e2..aebaf349ff9 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -95,9 +95,7 @@ def __init__( # Create the Engine self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args) - self.default_sampling_params = SamplingParams( - max_tokens=self.llm_engine.cfg.model_config.max_model_len, temperature=1e-06 - ) + self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.model_config.max_model_len) self.llm_engine.start() From c68e836c3af6037226a7e02397587db38bf1821e Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sat, 8 Nov 2025 20:47:09 +0800 Subject: [PATCH 5/7] add unit test --- tests/entrypoints/test_engine_client.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/entrypoints/test_engine_client.py b/tests/entrypoints/test_engine_client.py index 5aa720bbfcc..a8340b03501 100644 --- a/tests/entrypoints/test_engine_client.py +++ b/tests/entrypoints/test_engine_client.py @@ -33,6 +33,19 @@ async def test_add_request(self): assert request["tools"] == [1] # assert request["chat_template_kwargs"]["tools"] == [1] + def test_valid_parameters(self): + request = { + "request_id": "test-request-id", + "chat_template_kwargs": {"enable_thinking": True}, + "prompt_token_ids": [1], + "chat_template": "Hello", + "max_tokens": 20, + "tools": [1], + "temperature": 0, + } + self.valid_parameters(request) + assert request["temperature"] == 1e-6 + if __name__ == "__main__": unittest.main() From d6301055ffb5604a634b28977252fa513d148aaf Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sun, 9 Nov 2025 12:24:31 +0800 Subject: [PATCH 6/7] update --- fastdeploy/engine/engine.py | 2 +- fastdeploy/entrypoints/engine_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 77f5227a2a5..21fec92ab50 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -251,7 +251,7 @@ def add_requests(self, task, sampling_params=None, **kwargs): request = Request.from_dict(task) llm_logger.info(f"Receive request {request}") if sampling_params is not None: - if sampling_params.temperature is not None and sampling_params.temperature == 0: + if sampling_params.temperature is not None and abs(sampling_params.temperature) < 1e-06: sampling_params.temperature = 1e-06 request.sampling_params = sampling_params request.preprocess_start_time = time.time() diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index 24b16ac8219..38334a90e6c 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -304,7 +304,7 @@ def valid_parameters(self, data): api_server_logger.warning( f"req_id: {data['request_id']}, reasoning_max_tokens exceeds max_tokens, the value of reasoning_max_tokens will be adjusted to match that of max_tokens" ) - if data.get("temperature") is not None and data["temperature"] == 0: + if data.get("temperature") is not None and abs(data["temperature"]) < 1e-6: data["temperature"] = 1e-6 # logprobs logprobs = data.get("logprobs") From f274719df2c6d74027157ea85b57cf578e17c84a Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Sun, 9 Nov 2025 21:57:01 +0800 Subject: 
[PATCH 7/7] fix unit test --- tests/entrypoints/test_engine_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/test_engine_client.py b/tests/entrypoints/test_engine_client.py index a8340b03501..2493bb44d91 100644 --- a/tests/entrypoints/test_engine_client.py +++ b/tests/entrypoints/test_engine_client.py @@ -43,7 +43,7 @@ def test_valid_parameters(self): "tools": [1], "temperature": 0, } - self.valid_parameters(request) + self.engine_client.valid_parameters(request) assert request["temperature"] == 1e-6
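
A minimal sketch of the rule this series converges on (the helper name clamp_temperature is hypothetical and not part of the diff; the dict-shaped request and the abs(...) < 1e-6 comparison mirror EngineClient.valid_parameters after patches 3 and 6):

    def clamp_temperature(data):
        """Lift a (near-)zero temperature to 1e-6 before scheduling.

        Sampling divides logits by temperature, so a literal 0 is degenerate;
        1e-6 keeps the request effectively greedy without special-casing it.
        """
        if data.get("temperature") is not None and abs(data["temperature"]) < 1e-6:
            data["temperature"] = 1e-6

    request = {"request_id": "demo", "max_tokens": 20, "temperature": 0}
    clamp_temperature(request)
    assert request["temperature"] == 1e-6  # same check as the unit test in patches 5/7

Normalizing once at the entry points (valid_parameters for HTTP requests, LLMEngine.add_requests for SamplingParams passed to the offline API) is what allowed patches 3 and 4 to revert the per-endpoint copies of this check introduced in patch 1.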