Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/usage/environment_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,8 @@ environment_variables: dict[str, Callable[[], Any]] = {

# Count for cache_transfer_manager process error
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),

# Worker process health check timeout when waiting for responses in seconds (default: 30)
"FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")),
}
```
6 changes: 5 additions & 1 deletion docs/zh/usage/environment_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")),

# cache_transfer_manager 进程残留时连续错误阈值
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),}
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),

# Worker 进程响应等待时的健康检查超时时间(秒),默认 30 秒
"FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")),
}
```
5 changes: 3 additions & 2 deletions fastdeploy/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import numpy as np

import fastdeploy.envs as envs
from fastdeploy.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
Expand Down Expand Up @@ -264,7 +265,7 @@ async def chat_completion_stream_generator(
except asyncio.TimeoutError:
current_waiting_time += 10
if current_waiting_time == 300:
status, msg = self.engine_client.check_health()
status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT)
if not status:
if choices:
chunk.choices = choices
Expand Down Expand Up @@ -557,7 +558,7 @@ async def chat_completion_full_generator(
except asyncio.TimeoutError:
current_waiting_time += 10
if current_waiting_time == 300:
status, msg = self.engine_client.check_health()
status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT)
if not status:
raise ValueError(f"Engine is not healthy: {msg}")
else:
Expand Down
5 changes: 3 additions & 2 deletions fastdeploy/entrypoints/openai/serving_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

import numpy as np

import fastdeploy.envs as envs
from fastdeploy.engine.request import RequestOutput
from fastdeploy.entrypoints.openai.protocol import (
CompletionLogprobs,
Expand Down Expand Up @@ -280,7 +281,7 @@ async def completion_full_generator(
except asyncio.TimeoutError:
current_waiting_time += 10
if current_waiting_time == 300:
status, msg = self.engine_client.check_health()
status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT)
if not status:
raise ValueError(f"Engine is not healthy: {msg}")
else:
Expand Down Expand Up @@ -436,7 +437,7 @@ async def completion_stream_generator(
except asyncio.TimeoutError:
current_waiting_time += 10
if current_waiting_time == 300:
status, msg = self.engine_client.check_health()
status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT)
if not status:
raise ValueError(f"Engine is not healthy: {msg}")
else:
Expand Down
2 changes: 2 additions & 0 deletions fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@
# "Number of tokens in the group for Mixture of Experts (MoE) computation processing on HPU"
"FD_HPU_CHUNK_SIZE": lambda: int(os.getenv("FD_HPU_CHUNK_SIZE", "64")),
"FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS": lambda: int(os.getenv("FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS", "30")),
# Timeout for worker process health check in seconds
"FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")),
}


Expand Down
Loading