From 1717c6d787526fb4240a34effe50aaa043d5a7dd Mon Sep 17 00:00:00 2001
From: Lennox <105095085+LennoxLiu@users.noreply.github.com>
Date: Tue, 16 Apr 2024 11:13:58 +0200
Subject: [PATCH] Apply fix for issue 48

Added a timeout when waiting for server to respond. If a timeout happens,
rerun job.sh to restart the server. While waiting, the server is polled
once per second rather than in a tight loop.
---
 hpc/Makefile          |  2 ++
 hpc/hq_scripts/job.sh | 21 +++++++++++++++------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/hpc/Makefile b/hpc/Makefile
index e7ec401..af41e05 100644
--- a/hpc/Makefile
+++ b/hpc/Makefile
@@ -6,6 +6,8 @@ build-load-balancer:
 	- g++ -O3 -Wno-unused-result -std=c++17 $(load-balancer-files) -o load-balancer -pthread
 
 run-load-balancer:
+	rm -f retry-respond-job_id.txt
+
 	if ! printenv PORT > /dev/null; then \
 		echo "PORT environment variable not set. Using default value 4242."; \
 		export PORT=4242; \
diff --git a/hpc/hq_scripts/job.sh b/hpc/hq_scripts/job.sh
index 94e7b0b..fbaa6b9 100755
--- a/hpc/hq_scripts/job.sh
+++ b/hpc/hq_scripts/job.sh
@@ -6,12 +6,14 @@
 #HQ --stdout none
 #HQ --stderr none
 
+# Remove "#HQ --stdout none" and "#HQ --stderr none" if you want to see the output of the job.
+
 # Launch model server, send back server URL
 # and wait to ensure that HQ won't schedule any more jobs to this allocation.
 
 function get_avaliable_port {
     # Define the range of ports to select from
-    MIN_PORT=1024
+    MIN_PORT=49152
     MAX_PORT=65535
 
     # Generate a random port number
@@ -34,14 +36,21 @@ export PORT=$port
 
 load_balancer_dir="/load/balancer/directory" # CHANGE ME!
 
-
 host=$(hostname -I | awk '{print $1}')
 
+timeout=60 # timeout in seconds, might need to be increased if the model server takes longer to start
 echo "Waiting for model server to respond at $host:$port..."
-while ! curl -s "http://$host:$port/Info" > /dev/null; do
-    sleep 1
-done
-echo "Model server responded"
+if timeout "$timeout" sh -c 'while ! curl -s "http://'"$host"':'"$port"'/Info" > /dev/null ; do sleep 1; done'; then
+    echo "Model server responded within $timeout seconds"
+else
+    echo "Timeout: Model server did not respond within $timeout seconds"
+    echo "$HQ_JOB_ID" > "$load_balancer_dir/retry-respond-job_id.txt"
+
+    # clear the server here if needed
+
+    # restart the job
+    "$load_balancer_dir/hq_scripts/job.sh"
+fi
 
 # Write server URL to file identified by HQ job ID.
 mkdir -p "$load_balancer_dir/urls"