From 93eebeb703ce8a9e0f4a834645bdb2c31bb4123c Mon Sep 17 00:00:00 2001
From: Gilad Freidkin
Date: Tue, 2 Sep 2025 23:21:20 +0300
Subject: [PATCH 1/2] LocalBackend.__monitor_openai_server():

- do not load the server with additional inference requests while performing
  the health check; instead, check that the actual model is loaded on the
  server
- added a configurable timeout for the health check
---
 src/art/local/backend.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index c9f79e3e..22508fd5 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -315,11 +315,11 @@ async def _monitor_openai_server(
         if running_requests == 0 and pending_requests == 0:
             try:
                 # Send a health check with a 5 second timeout
-                await openai_client.completions.create(
+                timeout = float(os.environ.get("ART_SERVER_MONITOR_TIMEOUT", 5.0))
+                # Verify the model is loaded without issuing an inference request
+                await openai_client.models.retrieve(
                     model=model_name,
-                    prompt="Hi",
-                    max_tokens=1,
-                    timeout=5,
+                    timeout=timeout,
                 )
             except Exception as e:
                 # If the server is sleeping, a failed health check is okay

From 43400c0fcb466057a6c6444136ca9443a4136893 Mon Sep 17 00:00:00 2001
From: Gilad Freidkin
Date: Tue, 2 Sep 2025 23:26:36 +0300
Subject: [PATCH 2/2] linting

---
 src/art/local/backend.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/art/local/backend.py b/src/art/local/backend.py
index 22508fd5..48465ff6 100644
--- a/src/art/local/backend.py
+++ b/src/art/local/backend.py
@@ -315,7 +315,9 @@ async def _monitor_openai_server(
         if running_requests == 0 and pending_requests == 0:
             try:
                 # Send a health check with a 5 second timeout
-                timeout = float(os.environ.get("ART_SERVER_MONITOR_TIMEOUT", 5.0))
+                timeout = float(
+                    os.environ.get("ART_SERVER_MONITOR_TIMEOUT", 5.0)
+                )
                 # Verify the model is loaded without issuing an inference request
                 await openai_client.models.retrieve(
                     model=model_name,