Skip to content

Commit

Permalink
Apply fix for issue 48
Browse files Browse the repository at this point in the history
Added a timeout when waiting for the model server to respond. If the timeout expires, job.sh re-invokes itself to restart the server.
  • Loading branch information
LennoxLiu committed Apr 16, 2024
1 parent 54b6bab commit 1717c6d
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
2 changes: 2 additions & 0 deletions hpc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ build-load-balancer:
- g++ -O3 -Wno-unused-result -std=c++17 $(load-balancer-files) -o load-balancer -pthread

run-load-balancer:
rm -f retry-respond-job_id.txt

if ! printenv PORT > /dev/null; then \
echo "PORT environment variable not set. Using default value 4242."; \
export PORT=4242; \
Expand Down
21 changes: 15 additions & 6 deletions hpc/hq_scripts/job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
#HQ --stdout none
#HQ --stderr none

# Remove "#HQ --stdout none" and "#HQ --stderr none" if you want to see the output of the job.

# Launch model server, send back server URL
# and wait to ensure that HQ won't schedule any more jobs to this allocation.

function get_avaliable_port {
# Define the range of ports to select from
MIN_PORT=1024
MIN_PORT=49152
MAX_PORT=65535

# Generate a random port number
Expand All @@ -34,14 +36,21 @@ export PORT=$port

load_balancer_dir="/load/balancer/directory" # CHANGE ME!


host=$(hostname -I | awk '{print $1}')

timeout=60 # timeout in seconds, might need to be increased if the model server takes longer to start
echo "Waiting for model server to respond at $host:$port..."
while ! curl -s "http://$host:$port/Info" > /dev/null; do
sleep 1
done
echo "Model server responded"
if timeout $timeout sh -c 'while ! curl -s "http://'"$host"':'"$port"'/Info" > /dev/null ; do :; done'; then
echo "Model server responded within $timeout seconds"
else
echo "Timeout: Model server did not respond within $timeout seconds"
echo "$HQ_JOB_ID" > "$load_balancer_dir/retry-respond-job_id.txt"

# clear the server here if needed

# restart the job
$load_balancer_dir/hq_scripts/job.sh
fi

# Write server URL to file identified by HQ job ID.
mkdir -p "$load_balancer_dir/urls"
Expand Down

0 comments on commit 1717c6d

Please sign in to comment.