From 3749c022465ad33df3175fcf17e8860130b154b6 Mon Sep 17 00:00:00 2001 From: Linus Seelinger Date: Tue, 28 May 2024 23:33:09 +0200 Subject: [PATCH 1/2] Simplify job.sh (relative paths, working default), build needed test model by default --- hpc/Makefile | 9 +++++---- hpc/hq_scripts/job.sh | 6 +++--- models/testmodel/minimal-server.cpp | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/hpc/Makefile b/hpc/Makefile index 3617429..5757bea 100644 --- a/hpc/Makefile +++ b/hpc/Makefile @@ -1,6 +1,7 @@ -all: build-load-balancer - -load-balancer-files = LoadBalancer.cpp LoadBalancer.hpp ../lib/httplib.h ../lib/json.hpp ../lib/umbridge.h +all: build-load-balancer build-testmodel build-load-balancer: - - g++ -O3 -Wno-unused-result -std=c++17 $(load-balancer-files) -o load-balancer -pthread + - g++ -O3 -Wno-unused-result -std=c++17 -I../lib/ LoadBalancer.cpp -o load-balancer -pthread + +build-testmodel: + - g++ -O3 -Wno-unused-result -std=c++17 -I../lib/ ../models/testmodel/minimal-server.cpp -o testmodel -pthread diff --git a/hpc/hq_scripts/job.sh b/hpc/hq_scripts/job.sh index 94e7b0b..03cb381 100755 --- a/hpc/hq_scripts/job.sh +++ b/hpc/hq_scripts/job.sh @@ -30,9 +30,8 @@ port=$(get_avaliable_port) export PORT=$port # Assume that server sets the port according to the environment variable 'PORT'. -/your/model/server/call & # CHANGE ME! - -load_balancer_dir="/load/balancer/directory" # CHANGE ME! +# Otherwise the job script will be stuck waiting for model server's response. +./testmodel & # CHANGE ME! host=$(hostname -I | awk '{print $1}') @@ -44,6 +43,7 @@ done echo "Model server responded" # Write server URL to file identified by HQ job ID. +load_balancer_dir="." mkdir -p "$load_balancer_dir/urls" echo "http://$host:$port" > "$load_balancer_dir/urls/url-$HQ_JOB_ID.txt" diff --git a/models/testmodel/minimal-server.cpp b/models/testmodel/minimal-server.cpp index e0d0ad4..6e83a16 100644 --- a/models/testmodel/minimal-server.cpp +++ b/models/testmodel/minimal-server.cpp @@ -4,7 +4,7 @@ #include // Needed for HTTPS, implies the need for openssl, may be omitted if HTTP suffices -#define CPPHTTPLIB_OPENSSL_SUPPORT +// #define CPPHTTPLIB_OPENSSL_SUPPORT #include "umbridge.h" From c33b330d1dc9787e2d999f0e528837747d760ee9 Mon Sep 17 00:00:00 2001 From: Linus Seelinger Date: Wed, 29 May 2024 19:21:23 +0200 Subject: [PATCH 2/2] Update docs --- hpc/README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/hpc/README.md b/hpc/README.md index e12eec5..038cbb7 100644 --- a/hpc/README.md +++ b/hpc/README.md @@ -5,19 +5,19 @@ This load balancer allows any scaling up UM-Bridge applications to HPC systems. ## Installation 1. **Build the load balancer** - + Clone the UM-Bridge repository. - + ``` git clone https://github.com/UM-Bridge/umbridge.git ``` - + Then navigate to the `hpc` directory. ``` cd umbridge/hpc ``` - + Finally, compile the load balancer. Depending on your HPC system, you likely have to load a module providing a recent c++ compiler. ``` @@ -25,7 +25,7 @@ This load balancer allows any scaling up UM-Bridge applications to HPC systems. ``` 2. **Download HyperQueue** - + Download HyperQueue from the most recent release at https://github.com/It4innovations/hyperqueue/releases and place the `hq` binary in the `hpc` directory next to the load balancer. ## Usage @@ -35,7 +35,7 @@ The load balancer is primarily intended to run on a login node. 1. **Configure resource allocation** The load balancer instructs HyperQueue to allocate batches of resources on the HPC system, depending on demand for model evaluations. HyperQueue will submit SLURM or PBS jobs on the HPC system when needed, scheduling requested model runs within those jobs. When demand decreases, HyperQueue will cancel some of those jobs again. - + Adapt the configuration in ``hpc/hq_scripts/allocation_queue.sh`` to your needs. For example, when running a very fast UM-Bridge model on an HPC cluster, it is advisable to choose medium-sized jobs for resource allocation. That will avoid submitting large numbers of jobs to the HPC system's scheduler, while HyperQueue itself will handle large numbers of small model runs within those allocated jobs. @@ -44,8 +44,7 @@ The load balancer is primarily intended to run on a login node. Adapt the configuration in ``hpc/hq_scripts/job.sh`` to your needs: * Specify what UM-Bridge model server to run, - * set `#HQ` variables at the top to specify what resources each instance should receive, - * and set the directory of your load balancer binary in `load_balancer_dir`. + * and set `#HQ` variables at the top to specify what resources each instance should receive. Importantly, the UM-Bridge model server must serve its models at the port specified by the environment variable `PORT`. The value of `PORT` is automatically determined by `job.sh`, avoiding potential conflicts if multiple servers run on the same compute node.