From f288d98afb809da2da2342e72937c5adb5d741da Mon Sep 17 00:00:00 2001 From: zhaolei36 Date: Wed, 13 Aug 2025 16:30:30 +0800 Subject: [PATCH 1/2] feat(log):add_request_and_response_log --- fastdeploy/entrypoints/openai/api_server.py | 2 ++ fastdeploy/entrypoints/openai/serving_chat.py | 9 ++++- .../entrypoints/openai/serving_completion.py | 18 ++++++++-- fastdeploy/utils.py | 36 +++++++++++++++++++ test/utils/test_truncate.py | 26 ++++++++++++++ 5 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 test/utils/test_truncate.py diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 37fc0c15871..2a4c0e7abab 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -251,6 +251,7 @@ async def create_chat_completion(request: ChatCompletionRequest): """ Create a chat completion for the provided prompt and parameters. """ + api_server_logger.info(f"Chat Received request: {request.model_dump_json()}") if app.state.dynamic_load_weight: status, msg = app.state.engine_client.is_workers_alive() if not status: @@ -279,6 +280,7 @@ async def create_completion(request: CompletionRequest): """ Create a completion for the provided prompt and parameters. 
""" + api_server_logger.info(f"Completion Received request: {request.model_dump_json()}") if app.state.dynamic_load_weight: status, msg = app.state.engine_client.is_workers_alive() if not status: diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 6632fde3a8b..b14f28e627c 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -239,6 +239,7 @@ async def chat_completion_stream_generator( prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens), ) yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n" + api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}") first_iteration = False output = res["outputs"] @@ -273,6 +274,7 @@ async def chat_completion_stream_generator( logprobs=logprobs_res, arrival_time=arrival_time, ) + if res["finished"]: num_choices -= 1 work_process_metrics.e2e_request_latency.observe( @@ -304,6 +306,9 @@ async def chat_completion_stream_generator( if len(choices) == max_streaming_response_tokens or res["finished"]: chunk.choices = choices yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + # 打印尾包 + if res["finished"]: + api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}") choices = [] if choices: @@ -456,13 +461,15 @@ async def chat_completion_full_generator( prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=final_res.get("num_cached_tokens", 0)), ) work_process_metrics.e2e_request_latency.observe(time.time() - final_res["metrics"]["request_start_time"]) - return ChatCompletionResponse( + res = ChatCompletionResponse( id=request_id, created=created_time, model=model_name, choices=choices, usage=usage, ) + api_server_logger.info(f"Chat response: {res.model_dump_json()}") + return res def _create_chat_logprobs( self, diff --git a/fastdeploy/entrypoints/openai/serving_completion.py 
b/fastdeploy/entrypoints/openai/serving_completion.py index 71a38ec0dc5..a6aadcf060f 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -221,8 +221,7 @@ async def completion_full_generator( valid_results[rid] = data num_choices -= 1 break - - return self.request_output_to_completion_response( + res = self.request_output_to_completion_response( final_res_batch=valid_results, request=request, request_id=request_id, @@ -232,6 +231,8 @@ async def completion_full_generator( completion_batched_token_ids=completion_batched_token_ids, text_after_process_list=text_after_process_list, ) + api_server_logger.info(f"Completion response: {res.model_dump_json()}") + return res except Exception as e: api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True) raise @@ -323,6 +324,9 @@ async def completion_stream_generator( ], ) yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + api_server_logger.info( + f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}" + ) first_iteration[idx] = False self.engine_client.data_processor.process_response_dict( @@ -376,6 +380,15 @@ async def completion_stream_generator( choices[-1].finish_reason = self.calc_finish_reason( request.max_tokens, output_tokens[idx], output, tool_called ) + send_idx = output.get("send_idx") + # 只有当 send_idx 明确为 0 时才记录日志 + if send_idx == 0 and not request.return_token_ids: + chunk_temp = chunk + chunk_temp.choices = choices + api_server_logger.info( + f"Completion Streaming response send_idx 0: {chunk_temp.model_dump_json()}" + ) + del chunk_temp if len(choices) == max_streaming_response_tokens or res["finished"]: chunk = CompletionStreamResponse( @@ -402,6 +415,7 @@ async def completion_stream_generator( ), ) yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n" + api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}") if choices: 
def truncate_text(data, n=10240):
    """Return a copy of a request payload with oversized text fields shortened.

    Two payload layouts are handled:

    1. a top-level ``"prompt"`` field;
    2. a top-level ``"messages"`` list whose entries carry a ``"content"`` field.

    A text value whose UTF-8 encoding is longer than ``n`` bytes is replaced by
    ``"<head>...<tail>"``, where head and tail together keep at most ``n`` bytes
    of the original text (the ``"..."`` marker is not counted). Multi-byte
    characters cut by the byte slice are dropped rather than emitted as garbage.

    Values are handled by shape:

    * ``str``: shortened as described above;
    * ``list``: every element is processed recursively, so a list of prompt
      strings is shortened element-wise and a list of token ids is left alone;
    * ``dict`` with a string ``"text"`` entry: that entry is shortened;
    * anything else (``None``, numbers, other dicts): returned untouched.

    Args:
        data: The payload as a dict, or as a JSON string that is parsed first.
        n: Byte budget per text value. Negative values are treated as 0.

    Returns:
        A deep copy of the (parsed) payload with long text shortened. The
        caller's object is never modified. A payload that is not a dict is
        returned as an unmodified copy.

    Raises:
        json.JSONDecodeError: If ``data`` is a string that is not valid JSON.
    """
    # Function-scope imports keep this helper self-contained.
    import copy
    import json

    if isinstance(data, str):
        data = json.loads(data)

    # Work on a private copy so the in-place edits below never leak to the caller.
    data = copy.deepcopy(data)
    if not isinstance(data, dict):
        return data

    limit = max(n, 0)
    # Split the budget between head and tail. Keeping ``n`` bytes on both sides
    # would make any text between n and 2n bytes longer than it was.
    head_len = limit // 2
    tail_len = limit - head_len

    def _shorten(text):
        raw = text.encode("utf-8")
        if len(raw) <= limit:
            return text
        head = raw[:head_len].decode("utf-8", errors="ignore")
        # Slice from an explicit start index: ``raw[-0:]`` would be the whole text.
        tail = raw[len(raw) - tail_len:].decode("utf-8", errors="ignore")
        return f"{head}...{tail}"

    def _shorten_value(value):
        if isinstance(value, str):
            return _shorten(value)
        if isinstance(value, list):
            return [_shorten_value(item) for item in value]
        if isinstance(value, dict) and isinstance(value.get("text"), str):
            value["text"] = _shorten(value["text"])
        return value

    if "prompt" in data:
        data["prompt"] = _shorten_value(data["prompt"])

    messages = data.get("messages")
    if isinstance(messages, list):
        for message in messages:
            # Entries without a "content" key are left exactly as they are.
            if isinstance(message, dict) and "content" in message:
                message["content"] = _shorten_value(message["content"])

    return data
in result["prompt"]) + + def test_truncate_messages(self): + data = {"messages": [{"content": "short"}, {"content": "long" * 10000}]} + result = truncate_text(data) + self.assertEqual(len(result["messages"][0]["content"]), 5) + self.assertTrue(len(result["messages"][1]["content"]) < 40000) + self.assertTrue("..." in result["messages"][1]["content"]) + + def test_no_truncate_needed(self): + data = {"prompt": "short"} + result = truncate_text(data) + self.assertEqual(result["prompt"], "short") + + +if __name__ == "__main__": + unittest.main() From f6d3a8a5efbf3eae502cb9df0993cd9202a5a3b4 Mon Sep 17 00:00:00 2001 From: zhaolei36 Date: Thu, 4 Sep 2025 20:04:02 +0800 Subject: [PATCH 2/2] modify markdown graceful shutdown --- .../graceful_shutdown_service.md | 0 .../images/graceful_shutdown.png | Bin .../graceful_shutdown_service.md | 0 .../images/graceful_shutdown.png | Bin mkdocs.yml | 2 ++ 5 files changed, 2 insertions(+) rename docs/{best_practices => online_serving}/graceful_shutdown_service.md (100%) rename docs/{best_practices => online_serving}/images/graceful_shutdown.png (100%) rename docs/zh/{best_practices => online_serving}/graceful_shutdown_service.md (100%) rename docs/zh/{best_practices => online_serving}/images/graceful_shutdown.png (100%) diff --git a/docs/best_practices/graceful_shutdown_service.md b/docs/online_serving/graceful_shutdown_service.md similarity index 100% rename from docs/best_practices/graceful_shutdown_service.md rename to docs/online_serving/graceful_shutdown_service.md diff --git a/docs/best_practices/images/graceful_shutdown.png b/docs/online_serving/images/graceful_shutdown.png similarity index 100% rename from docs/best_practices/images/graceful_shutdown.png rename to docs/online_serving/images/graceful_shutdown.png diff --git a/docs/zh/best_practices/graceful_shutdown_service.md b/docs/zh/online_serving/graceful_shutdown_service.md similarity index 100% rename from docs/zh/best_practices/graceful_shutdown_service.md rename to 
docs/zh/online_serving/graceful_shutdown_service.md diff --git a/docs/zh/best_practices/images/graceful_shutdown.png b/docs/zh/online_serving/images/graceful_shutdown.png similarity index 100% rename from docs/zh/best_practices/images/graceful_shutdown.png rename to docs/zh/online_serving/images/graceful_shutdown.png diff --git a/mkdocs.yml b/mkdocs.yml index 5f4a354a1e6..f42e231aa1a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -58,6 +58,7 @@ plugins: OpenAI-Compitable API Server: 兼容 OpenAI 协议的服务化部署 Monitor Metrics: 监控Metrics Scheduler: 调度器 + Graceful Shutdown: 服务优雅关闭 Offline Inference: 离线推理 Best Practices: 最佳实践 ERNIE-4.5-0.3B: ERNIE-4.5-0.3B @@ -107,6 +108,7 @@ nav: - 'OpenAI-Compitable API Server': online_serving/README.md - 'Monitor Metrics': online_serving/metrics.md - 'Scheduler': online_serving/scheduler.md + - 'Graceful Shutdown': online_serving/graceful_shutdown_service.md - 'Offline Inference': offline_inference.md - Best Practices: - ERNIE-4.5-0.3B: best_practices/ERNIE-4.5-0.3B-Paddle.md