Skip to content

Commit

Permalink
fix: #59 directly check for hq binary in hpc directory
Browse files Browse the repository at this point in the history
  • Loading branch information
Crambor committed Mar 17, 2024
1 parent 2d12741 commit 87330be
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 6 deletions.
15 changes: 13 additions & 2 deletions hpc/LoadBalancer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,19 @@ void clear_url(std::string directory) {
}

// Restarts the HyperQueue (HQ) server through shell commands and then runs the
// allocation queue script. The return value of every std::system call is
// ignored, so a failing command is not reported to the caller.
//
// NOTE(review): this listing appears to come from a diff view, so each
// `hq ...` command is shown next to its `./hq ...` counterpart; presumably only
// the `./hq` variant (binary in the current working directory) is meant to
// remain -- confirm against the actual file.
void launch_hq_with_alloc_queue() {
// Ask a possibly running HQ server to stop; `&> /dev/null` is meant to hide the output.
// NOTE(review): `&>` is a bash extension; std::system hands the string to the
// host command processor, which may parse it as "run in background" plus a
// separate redirection -- TODO confirm which shell is used on the target system.
std::system("hq server stop &> /dev/null");
std::system("./hq server stop &> /dev/null");

// Launch the HQ server; the trailing `&` makes the shell run it in the
// background so this call does not wait for the server to exit.
std::system("hq server start &");
std::system("./hq server start &");
sleep(1); // Workaround: give the HQ server enough time to start.

// Create HQ allocation queue (script path is relative to the current working directory).
std::system("hq_scripts/allocation_queue.sh");
}

// Reports whether `path` refers to an existing file system entry.
//
// Uses the std::error_code overload of std::filesystem::exists so that an
// OS-level failure while querying the path (for example, no permission to
// search a parent directory) yields `false` instead of throwing
// std::filesystem::filesystem_error out of a simple yes/no check.
//
// @param path Path to probe; relative paths are resolved against the current
//             working directory.
// @return true if the entry exists, false if it does not exist or its status
//         could not be determined.
bool file_exists(const std::string& path) {
std::error_code ec;
const bool found = std::filesystem::exists(path, ec);
// `ec` is set only when the status query itself failed; treat that as "not found".
return !ec && found;
}

const std::vector<std::string> get_model_names() {
// Don't start a client, always use the default job submission script.
HyperQueueJob hq_job("", false, true);
Expand All @@ -49,6 +53,13 @@ int main(int argc, char *argv[])
create_directory_if_not_existing("sub-jobs");
clear_url("urls");

// Check if the hq binary exists
std::string hq_binary_path = "./hq";
if (!file_exists(hq_binary_path)) {
std::cerr << "Error: hq binary does not exist at " << hq_binary_path << std::endl;
return 1;
}

launch_hq_with_alloc_queue();

// Read environment variables for configuration
Expand Down
6 changes: 3 additions & 3 deletions hpc/LoadBalancer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class HyperQueueJob
~HyperQueueJob()
{
// Cancel the SLURM job
std::system(("hq job cancel " + job_id).c_str());
std::system(("./hq job cancel " + job_id).c_str());

// Delete the url text file
std::system(("rm ./urls/url-" + job_id + ".txt").c_str());
Expand All @@ -113,7 +113,7 @@ class HyperQueueJob
const std::filesystem::path submission_script_generic("job.sh");
const std::filesystem::path submission_script_model_specific("job_" + model_name + ".sh");

std::string hq_command = "hq submit --output-mode=quiet ";
std::string hq_command = "./hq submit --output-mode=quiet ";
hq_command += "--priority=" + std::to_string(job_count) + " ";
if (std::filesystem::exists(submission_script_dir / submission_script_model_specific) && !force_default_submission_script)
{
Expand Down Expand Up @@ -154,7 +154,7 @@ class HyperQueueJob
// state = ["WAITING", "RUNNING", "FINISHED", "CANCELED"]
bool waitForHQJobState(const std::string &job_id, const std::string &state)
{
const std::string command = "hq job info " + job_id + " | grep State | awk '{print $4}'";
const std::string command = "./hq job info " + job_id + " | grep State | awk '{print $4}'";
// std::cout << "Checking runtime: " << command << std::endl;
std::string job_status;

Expand Down
2 changes: 1 addition & 1 deletion hpc/hq_scripts/allocation_queue.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# hq worker start &


hq alloc add slurm --time-limit 10m \
./hq alloc add slurm --time-limit 10m \
--idle-timeout 3m \
--backlog 1 \
--workers-per-alloc 1 \
Expand Down

0 comments on commit 87330be

Please sign in to comment.