From 87330be623e6f563a2611432d20951ea4ecf9181 Mon Sep 17 00:00:00 2001
From: Crambor
Date: Sun, 17 Mar 2024 18:55:14 +0000
Subject: [PATCH] fix: #59 directly check for hq binary in hpc directory

---
 hpc/LoadBalancer.cpp               | 15 +++++++++++++--
 hpc/LoadBalancer.hpp               |  6 +++---
 hpc/hq_scripts/allocation_queue.sh |  2 +-
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/hpc/LoadBalancer.cpp b/hpc/LoadBalancer.cpp
index c23bdff..b839583 100644
--- a/hpc/LoadBalancer.cpp
+++ b/hpc/LoadBalancer.cpp
@@ -25,15 +25,19 @@ void clear_url(std::string directory) {
 }
 
 void launch_hq_with_alloc_queue() {
-    std::system("hq server stop &> /dev/null");
+    std::system("./hq server stop &> /dev/null");
 
-    std::system("hq server start &");
+    std::system("./hq server start &");
     sleep(1); // Workaround: give the HQ server enough time to start.
 
     // Create HQ allocation queue
     std::system("hq_scripts/allocation_queue.sh");
 }
 
+bool file_exists(const std::string& path) {
+    return std::filesystem::exists(path);
+}
+
 const std::vector<std::string> get_model_names() {
     // Don't start a client, always use the default job submission script.
     HyperQueueJob hq_job("", false, true);
@@ -49,6 +53,13 @@ int main(int argc, char *argv[])
     create_directory_if_not_existing("sub-jobs");
     clear_url("urls");
 
+    // Check if the hq binary exists
+    std::string hq_binary_path = "./hq";
+    if (!file_exists(hq_binary_path)) {
+        std::cerr << "Error: hq binary does not exist at " << hq_binary_path << std::endl;
+        return 1;
+    }
+
     launch_hq_with_alloc_queue();
 
     // Read environment variables for configuration
diff --git a/hpc/LoadBalancer.hpp b/hpc/LoadBalancer.hpp
index 20d877d..8a4e433 100644
--- a/hpc/LoadBalancer.hpp
+++ b/hpc/LoadBalancer.hpp
@@ -90,7 +90,7 @@ class HyperQueueJob
     ~HyperQueueJob()
     {
         // Cancel the SLURM job
-        std::system(("hq job cancel " + job_id).c_str());
+        std::system(("./hq job cancel " + job_id).c_str());
 
         // Delete the url text file
         std::system(("rm ./urls/url-" + job_id + ".txt").c_str());
@@ -113,7 +113,7 @@ class HyperQueueJob
         const std::filesystem::path submission_script_generic("job.sh");
         const std::filesystem::path submission_script_model_specific("job_" + model_name + ".sh");
 
-        std::string hq_command = "hq submit --output-mode=quiet ";
+        std::string hq_command = "./hq submit --output-mode=quiet ";
         hq_command += "--priority=" + std::to_string(job_count) + " ";
         if (std::filesystem::exists(submission_script_dir / submission_script_model_specific) && !force_default_submission_script)
         {
@@ -154,7 +154,7 @@ class HyperQueueJob
     // state = ["WAITING", "RUNNING", "FINISHED", "CANCELED"]
     bool waitForHQJobState(const std::string &job_id, const std::string &state)
     {
-        const std::string command = "hq job info " + job_id + " | grep State | awk '{print $4}'";
+        const std::string command = "./hq job info " + job_id + " | grep State | awk '{print $4}'";
         // std::cout << "Checking runtime: " << command << std::endl;
         std::string job_status;
diff --git a/hpc/hq_scripts/allocation_queue.sh b/hpc/hq_scripts/allocation_queue.sh
index ae13b8d..6e48073 100755
--- a/hpc/hq_scripts/allocation_queue.sh
+++ b/hpc/hq_scripts/allocation_queue.sh
@@ -4,7 +4,7 @@
 # hq worker start &
 
-hq alloc add slurm --time-limit 10m \
+./hq alloc add slurm --time-limit 10m \
         --idle-timeout 3m \
         --backlog 1 \
         --workers-per-alloc 1 \