From f288d98afb809da2da2342e72937c5adb5d741da Mon Sep 17 00:00:00 2001
From: zhaolei36 <zley373@gmail.com>
Date: Wed, 13 Aug 2025 16:30:30 +0800
Subject: [PATCH 1/2] feat(log): add request and response logging

---
 fastdeploy/entrypoints/openai/api_server.py   |  2 ++
 fastdeploy/entrypoints/openai/serving_chat.py |  9 ++++-
 .../entrypoints/openai/serving_completion.py  | 18 ++++++++--
 fastdeploy/utils.py                           | 36 +++++++++++++++++++
 test/utils/test_truncate.py                   | 26 ++++++++++++++
 5 files changed, 88 insertions(+), 3 deletions(-)
 create mode 100644 test/utils/test_truncate.py

diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py
index 37fc0c15871..2a4c0e7abab 100644
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -251,6 +251,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
     """
     Create a chat completion for the provided prompt and parameters.
     """
+    api_server_logger.info(f"Chat Received request: {request.model_dump_json()}")
     if app.state.dynamic_load_weight:
         status, msg = app.state.engine_client.is_workers_alive()
         if not status:
@@ -279,6 +280,7 @@ async def create_completion(request: CompletionRequest):
     """
     Create a completion for the provided prompt and parameters.
     """
+    api_server_logger.info(f"Completion Received request: {request.model_dump_json()}")
     if app.state.dynamic_load_weight:
         status, msg = app.state.engine_client.is_workers_alive()
         if not status:
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 6632fde3a8b..b14f28e627c 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -239,6 +239,7 @@ async def chat_completion_stream_generator(
                                     prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
                                 )
                             yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n"
+                            api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}")
                         first_iteration = False
 
                     output = res["outputs"]
@@ -273,6 +274,7 @@ async def chat_completion_stream_generator(
                         logprobs=logprobs_res,
                         arrival_time=arrival_time,
                     )
+
                     if res["finished"]:
                         num_choices -= 1
                         work_process_metrics.e2e_request_latency.observe(
@@ -304,6 +306,9 @@ async def chat_completion_stream_generator(
                     if len(choices) == max_streaming_response_tokens or res["finished"]:
                         chunk.choices = choices
                         yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
+                        # Log the final chunk once the request has finished.
+                        if res["finished"]:
+                            api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
                         choices = []
 
                 if choices:
@@ -456,13 +461,15 @@ async def chat_completion_full_generator(
             prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=final_res.get("num_cached_tokens", 0)),
         )
         work_process_metrics.e2e_request_latency.observe(time.time() - final_res["metrics"]["request_start_time"])
-        return ChatCompletionResponse(
+        res = ChatCompletionResponse(
             id=request_id,
             created=created_time,
             model=model_name,
             choices=choices,
             usage=usage,
         )
+        api_server_logger.info(f"Chat response: {res.model_dump_json()}")
+        return res
 
     def _create_chat_logprobs(
         self,
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 71a38ec0dc5..a6aadcf060f 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -221,8 +221,7 @@ async def completion_full_generator(
                         valid_results[rid] = data
                         num_choices -= 1
                         break
-
-            return self.request_output_to_completion_response(
+            res = self.request_output_to_completion_response(
                 final_res_batch=valid_results,
                 request=request,
                 request_id=request_id,
@@ -232,6 +231,8 @@ async def completion_full_generator(
                 completion_batched_token_ids=completion_batched_token_ids,
                 text_after_process_list=text_after_process_list,
             )
+            api_server_logger.info(f"Completion response: {res.model_dump_json()}")
+            return res
         except Exception as e:
             api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
             raise
@@ -323,6 +324,9 @@ async def completion_stream_generator(
                                 ],
                             )
                             yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
+                            api_server_logger.info(
+                                f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}"
+                            )
                         first_iteration[idx] = False
 
                     self.engine_client.data_processor.process_response_dict(
@@ -376,6 +380,15 @@ async def completion_stream_generator(
                         choices[-1].finish_reason = self.calc_finish_reason(
                             request.max_tokens, output_tokens[idx], output, tool_called
                         )
+                    send_idx = output.get("send_idx")
+                    # Log only when send_idx is exactly 0 (a missing key gives None, which does not match).
+                    if send_idx == 0 and not request.return_token_ids:
+                        chunk_temp = chunk  # NOTE(review): alias, not a copy; the next line mutates chunk too
+                        chunk_temp.choices = choices
+                        api_server_logger.info(
+                            f"Completion Streaming response send_idx 0: {chunk_temp.model_dump_json()}"
+                        )
+                        del chunk_temp  # unbinds the alias only; chunk keeps the choices assigned above
 
                     if len(choices) == max_streaming_response_tokens or res["finished"]:
                         chunk = CompletionStreamResponse(
@@ -402,6 +415,7 @@ async def completion_stream_generator(
                                 ),
                             )
                             yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
+                        api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
                 if choices:
                     chunk.choices = choices
                     yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py
index 4afcc72214b..bc9d6050a5e 100644
--- a/fastdeploy/utils.py
+++ b/fastdeploy/utils.py
@@ -18,6 +18,7 @@
 import asyncio
 import codecs
 import importlib
+import json
 import logging
 import os
 import random
@@ -736,6 +737,41 @@ def status(self) -> dict:
         }
 
 
+def truncate_text(data, n=10240):
+    """Return a deep copy of a request payload with oversized text shortened.
+
+    ``data`` is a dict or its JSON string; ``n`` is a non-negative byte limit.
+    A str ``prompt`` and every str ``messages[i]["content"]`` longer than ``n``
+    UTF-8 bytes becomes ``head + "..." + tail``, where head and tail together
+    keep at most ``n`` bytes of the original. Non-str values (``None``, lists)
+    are left unchanged, and the caller's object is never mutated.
+    """
+    import copy
+
+    def _shorten(text):
+        if not isinstance(text, str):
+            return text
+        raw = text.encode("utf-8")
+        if len(raw) <= n:
+            return text
+        # Split the budget between head and tail so the result cannot outgrow
+        # the input; errors="ignore" drops a character cut by either slice.
+        head_len, tail_len = (n + 1) // 2, n // 2
+        head = raw[:head_len].decode("utf-8", errors="ignore")
+        tail = raw[len(raw) - tail_len :].decode("utf-8", errors="ignore")  # not raw[-tail_len:]: -0 keeps all
+        return f"{head}...{tail}"
+
+    data = json.loads(data) if isinstance(data, str) else copy.deepcopy(data)
+    if not isinstance(data, dict):
+        return data
+    if "prompt" in data:
+        data["prompt"] = _shorten(data["prompt"])
+    for message in data.get("messages") or []:
+        if isinstance(message, dict) and "content" in message:
+            message["content"] = _shorten(message["content"])
+    return data
+
+
 llm_logger = get_logger("fastdeploy", "fastdeploy.log")
 data_processor_logger = get_logger("data_processor", "data_processor.log")
 scheduler_logger = get_logger("scheduler", "scheduler.log")
diff --git a/test/utils/test_truncate.py b/test/utils/test_truncate.py
new file mode 100644
index 00000000000..c5a0f2c6d17
--- /dev/null
+++ b/test/utils/test_truncate.py
@@ -0,0 +1,26 @@
+import unittest
+
+from fastdeploy.utils import truncate_text
+
+
+class TestTruncateText(unittest.TestCase):
+    """Checks truncate_text with its default byte limit on prompt and messages payloads."""
+
+    def test_truncate_prompt(self):
+        result = truncate_text({"prompt": "a" * 20000})
+        self.assertIn("...", result["prompt"])
+
+    def test_truncate_messages(self):
+        data = {"messages": [{"content": "short"}, {"content": "long" * 10000}]}
+        short, long_ = (m["content"] for m in truncate_text(data)["messages"])
+        self.assertEqual(short, "short")
+        self.assertIn("...", long_)
+        self.assertLess(len(long_), 40000)
+        self.assertEqual(len(data["messages"][1]["content"]), 40000)  # input untouched
+
+    def test_no_truncate_needed(self):
+        self.assertEqual(truncate_text({"prompt": "short"})["prompt"], "short")
+
+
+if __name__ == "__main__":
+    unittest.main()  # run the tests when this file is executed directly

From f6d3a8a5efbf3eae502cb9df0993cd9202a5a3b4 Mon Sep 17 00:00:00 2001
From: zhaolei36 <zley373@gmail.com>
Date: Thu, 4 Sep 2025 20:04:02 +0800
Subject: [PATCH 2/2] docs: move graceful shutdown guide to online_serving and add it to the mkdocs nav

---
 .../graceful_shutdown_service.md                    |   0
 .../images/graceful_shutdown.png                    | Bin
 .../graceful_shutdown_service.md                    |   0
 .../images/graceful_shutdown.png                    | Bin
 mkdocs.yml                                          |   2 ++
 5 files changed, 2 insertions(+)
 rename docs/{best_practices => online_serving}/graceful_shutdown_service.md (100%)
 rename docs/{best_practices => online_serving}/images/graceful_shutdown.png (100%)
 rename docs/zh/{best_practices => online_serving}/graceful_shutdown_service.md (100%)
 rename docs/zh/{best_practices => online_serving}/images/graceful_shutdown.png (100%)

diff --git a/docs/best_practices/graceful_shutdown_service.md b/docs/online_serving/graceful_shutdown_service.md
similarity index 100%
rename from docs/best_practices/graceful_shutdown_service.md
rename to docs/online_serving/graceful_shutdown_service.md
diff --git a/docs/best_practices/images/graceful_shutdown.png b/docs/online_serving/images/graceful_shutdown.png
similarity index 100%
rename from docs/best_practices/images/graceful_shutdown.png
rename to docs/online_serving/images/graceful_shutdown.png
diff --git a/docs/zh/best_practices/graceful_shutdown_service.md b/docs/zh/online_serving/graceful_shutdown_service.md
similarity index 100%
rename from docs/zh/best_practices/graceful_shutdown_service.md
rename to docs/zh/online_serving/graceful_shutdown_service.md
diff --git a/docs/zh/best_practices/images/graceful_shutdown.png b/docs/zh/online_serving/images/graceful_shutdown.png
similarity index 100%
rename from docs/zh/best_practices/images/graceful_shutdown.png
rename to docs/zh/online_serving/images/graceful_shutdown.png
diff --git a/mkdocs.yml b/mkdocs.yml
index 5f4a354a1e6..f42e231aa1a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -58,6 +58,7 @@ plugins:
             OpenAI-Compitable API Server: 兼容 OpenAI 协议的服务化部署
             Monitor Metrics: 监控Metrics
             Scheduler: 调度器
+            Graceful Shutdown: 服务优雅关闭
             Offline Inference: 离线推理
             Best Practices: 最佳实践
             ERNIE-4.5-0.3B: ERNIE-4.5-0.3B
@@ -107,6 +108,7 @@ nav:
       - 'OpenAI-Compitable API Server': online_serving/README.md
       - 'Monitor Metrics': online_serving/metrics.md
       - 'Scheduler': online_serving/scheduler.md
+      - 'Graceful Shutdown': online_serving/graceful_shutdown_service.md
   - 'Offline Inference': offline_inference.md
   - Best Practices:
       - ERNIE-4.5-0.3B: best_practices/ERNIE-4.5-0.3B-Paddle.md