diff --git a/.github/workflows/hpc-load-balancer.yml b/.github/workflows/hpc-load-balancer.yml new file mode 100644 index 0000000..90ace9b --- /dev/null +++ b/.github/workflows/hpc-load-balancer.yml @@ -0,0 +1,42 @@ +name: hpc-load-balancer + +on: + push: + pull_request: + branches: + - 'main' + + +jobs: + + build-and-setup: + runs-on: ubuntu-latest + container: ubuntu:latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Dependencies + run: | + apt update -qq && DEBIAN_FRONTEND="noninteractive" apt install -yq g++ make wget curl tar + + - name: Build load balancer binary + run: | + cd hpc && make build-load-balancer + + - name: Download and setup hq binary + run: | + url=$(curl -sSL https://api.github.com/repos/It4innovations/hyperqueue/releases/latest | \ + grep -o "\"browser_download_url\": \"https://[^\"]*-linux-x64.tar.gz\"" | \ + cut -d '"' -f 4) + if [ -z "$url" ]; then + echo "Error: URL not found" + exit 1 + fi + + filename="hq-linux-x64.tar.gz" + wget -q $url -O $filename + tar xzf $filename + ./hq --version + diff --git a/hpc/LoadBalancer.cpp b/hpc/LoadBalancer.cpp index c23bdff..b839583 100644 --- a/hpc/LoadBalancer.cpp +++ b/hpc/LoadBalancer.cpp @@ -25,15 +25,19 @@ void clear_url(std::string directory) { } void launch_hq_with_alloc_queue() { - std::system("hq server stop &> /dev/null"); + std::system("./hq server stop > /dev/null 2>&1"); - std::system("hq server start &"); + std::system("./hq server start &"); sleep(1); // Workaround: give the HQ server enough time to start. // Create HQ allocation queue std::system("hq_scripts/allocation_queue.sh"); } +bool file_exists(const std::string& path) { + return std::filesystem::exists(path); +} + const std::vector get_model_names() { // Don't start a client, always use the default job submission script. 
HyperQueueJob hq_job("", false, true); @@ -49,6 +53,13 @@ int main(int argc, char *argv[]) create_directory_if_not_existing("sub-jobs"); clear_url("urls"); + // Check if the hq binary exists + std::string hq_binary_path = "./hq"; + if (!file_exists(hq_binary_path)) { + std::cerr << "Error: hq binary does not exist at " << hq_binary_path << std::endl; + return 1; + } + launch_hq_with_alloc_queue(); // Read environment variables for configuration diff --git a/hpc/LoadBalancer.hpp b/hpc/LoadBalancer.hpp index 20d877d..8a4e433 100644 --- a/hpc/LoadBalancer.hpp +++ b/hpc/LoadBalancer.hpp @@ -90,7 +90,7 @@ class HyperQueueJob ~HyperQueueJob() { // Cancel the SLURM job - std::system(("hq job cancel " + job_id).c_str()); + std::system(("./hq job cancel " + job_id).c_str()); // Delete the url text file std::system(("rm ./urls/url-" + job_id + ".txt").c_str()); @@ -113,7 +113,7 @@ class HyperQueueJob const std::filesystem::path submission_script_generic("job.sh"); const std::filesystem::path submission_script_model_specific("job_" + model_name + ".sh"); - std::string hq_command = "hq submit --output-mode=quiet "; + std::string hq_command = "./hq submit --output-mode=quiet "; hq_command += "--priority=" + std::to_string(job_count) + " "; if (std::filesystem::exists(submission_script_dir / submission_script_model_specific) && !force_default_submission_script) { @@ -154,7 +154,7 @@ class HyperQueueJob // state = ["WAITING", "RUNNING", "FINISHED", "CANCELED"] bool waitForHQJobState(const std::string &job_id, const std::string &state) { - const std::string command = "hq job info " + job_id + " | grep State | awk '{print $4}'"; + const std::string command = "./hq job info " + job_id + " | grep State | awk '{print $4}'"; // std::cout << "Checking runtime: " << command << std::endl; std::string job_status; diff --git a/hpc/hq_scripts/allocation_queue.sh b/hpc/hq_scripts/allocation_queue.sh index ae13b8d..35e4783 100755 --- a/hpc/hq_scripts/allocation_queue.sh +++ 
b/hpc/hq_scripts/allocation_queue.sh @@ -1,10 +1,14 @@ #! /bin/bash # Note: For runs on systems without SLURM, replace the slurm allocator by -# hq worker start & +# ./hq worker start & +if [[ ! -f "./hq" ]]; then + echo "Error: hq binary does not exist at ./hq" + exit 1 +fi -hq alloc add slurm --time-limit 10m \ +./hq alloc add slurm --time-limit 10m \ --idle-timeout 3m \ --backlog 1 \ --workers-per-alloc 1 \