From 93eebeb703ce8a9e0f4a834645bdb2c31bb4123c Mon Sep 17 00:00:00 2001
From: Gilad Freidkin
Date: Tue, 2 Sep 2025 23:21:20 +0300
Subject: [PATCH 1/2] LocalBackend.__monitor_openai_server():

- do not load the server with additional inference requests while performing
  the health check; instead, check that the actual model is loaded on the
  server
- added a configurable timeout for the health check
---
 src/art/local/backend.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index c9f79e3e..22508fd5 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -315,11 +315,11 @@ async def _monitor_openai_server(
         if running_requests == 0 and pending_requests == 0:
             try:
                 # Send a health check with a 5 second timeout
-                await openai_client.completions.create(
+                timeout = float(os.environ.get("ART_SERVER_MONITOR_TIMEOUT", 5.0))
+                # Verify the model is loaded without issuing an inference request
+                await openai_client.models.retrieve(
                     model=model_name,
-                    prompt="Hi",
-                    max_tokens=1,
-                    timeout=5,
+                    timeout=timeout,
                 )
             except Exception as e:
                 # If the server is sleeping, a failed health check is okay

From 43400c0fcb466057a6c6444136ca9443a4136893 Mon Sep 17 00:00:00 2001
From: Gilad Freidkin
Date: Tue, 2 Sep 2025 23:26:36 +0300
Subject: [PATCH 2/2] linting

---
 src/art/local/backend.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index 22508fd5..48465ff6 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -315,7 +315,9 @@ async def _monitor_openai_server(
         if running_requests == 0 and pending_requests == 0:
             try:
                 # Send a health check with a 5 second timeout
-                timeout = float(os.environ.get("ART_SERVER_MONITOR_TIMEOUT", 5.0))
+                timeout = float(
+                    os.environ.get("ART_SERVER_MONITOR_TIMEOUT", 5.0)
+                )
                 # Verify the model is loaded without issuing an inference request
                 await openai_client.models.retrieve(
                     model=model_name,