Skip to content

Commit

Permalink
Merge branch 'main' into singularity
Browse files Browse the repository at this point in the history
  • Loading branch information
LennoxLiu committed Apr 7, 2024
2 parents 173a92c + e85094f commit 7494061
Show file tree
Hide file tree
Showing 29 changed files with 295 additions and 34,359 deletions.
1 change: 1 addition & 0 deletions .github/workflows/client-R-cran-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-R-cran-package

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-R-git-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-R-git-package

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-c++-example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-c++-example

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-c++.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-c++

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-matlab.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-matlab

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-python-emcee.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-python-emcee

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-python-example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-python-example

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-python-pymc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-python-pymc

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-python-qmcpy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-python-qmcpy

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-python-raw.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-python-raw

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-python-tinyDA.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-python-tinyDA

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/client-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ name: client-python

on:
push:
pull_request:
branches:
- 'main'

Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ __pycache__/
build/
.ipynb_checkpoints/
.vscode/
.idea/
kubernetes/setup/secret.txt
2 changes: 1 addition & 1 deletion clients/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ std::vector<std::vector<double>> inputs {{100.0, 18.0}};
The input vector can then be passed into the model.

```
std::vector<std::vector<double>> outputs = client.Evaluate(input);
std::vector<std::vector<double>> outputs = client.Evaluate(inputs);
```

The output of the model evaluation is a `std::vector<std::vector<double>>` containing the output defined by the model.
Expand Down
25 changes: 14 additions & 11 deletions hpc/LoadBalancer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <unistd.h>
#include <limits.h>

#include "lib/umbridge.h"
#include "../lib/umbridge.h"

void create_directory_if_not_existing(std::string directory) {
if (!std::filesystem::is_directory(directory) || !std::filesystem::exists(directory)) {
Expand All @@ -24,12 +24,6 @@ void clear_url(std::string directory) {
}
}

std::string get_hostname() {
char hostname[HOST_NAME_MAX];
gethostname(hostname, HOST_NAME_MAX);
return std::string(hostname);
}

void launch_hq_with_alloc_queue() {
std::system("hq server stop &> /dev/null");

Expand All @@ -41,11 +35,14 @@ void launch_hq_with_alloc_queue() {
}

const std::vector<std::string> get_model_names() {
HyperQueueJob hq_job("", false); // Don't start a client.
// Don't start a client, always use the default job submission script.
HyperQueueJob hq_job("", false, true);

return umbridge::SupportedModels(hq_job.server_url);
}

std::atomic<int32_t> HyperQueueJob::job_count = 0;

int main(int argc, char *argv[])
{
create_directory_if_not_existing("urls");
Expand All @@ -67,6 +64,13 @@ int main(int argc, char *argv[])
port = atoi(port_cstr);
}

char const *delay_cstr = std::getenv("HQ_SUBMIT_DELAY_MS");
if (delay_cstr != NULL)
{
hq_submit_delay_ms = atoi(delay_cstr);
}
std::cout << "HQ_SUBMIT_DELAY_MS set to " << hq_submit_delay_ms << std::endl;

// Initialize load balancer for each available model on the model server.
const std::vector<std::string> model_names = get_model_names();

Expand All @@ -82,7 +86,6 @@ int main(int argc, char *argv[])
std::transform(LB_vector.begin(), LB_vector.end(), LB_ptr_vector.begin(),
[](LoadBalancer& obj) { return &obj; });

std::cout << "Load balancer running on host " << get_hostname()
<< " and bound to 0.0.0.0:" << port << std::endl;
umbridge::serveModels(LB_ptr_vector, "0.0.0.0", port, false);
std::cout << "Load balancer running port" << port << std::endl;
umbridge::serveModels(LB_ptr_vector, "0.0.0.0", port, true, false);
}
139 changes: 86 additions & 53 deletions hpc/LoadBalancer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <tuple>
#include <memory>
#include <filesystem>
#include "lib/umbridge.h"
#include "../lib/umbridge.h"

// run and get the result of command
std::string getCommandOutput(const std::string command)
Expand Down Expand Up @@ -65,63 +65,17 @@ std::string readUrl(const std::string &filename)
return url;
}

// state = ["WAITING", "RUNNING", "FINISHED", "CANCELED"]
bool waitForHQJobState(const std::string &job_id, const std::string &state = "COMPLETED")
{
const std::string command = "hq job info " + job_id + " | grep State | awk '{print $4}'";
// std::cout << "Checking runtime: " << command << std::endl;
std::string job_status;

do
{
job_status = getCommandOutput(command);

// Delete the line break
if (!job_status.empty())
job_status.pop_back();

// Don't wait if there is an error or the job is ended
if (job_status == "" || (state != "FINISHED" && job_status == "FINISHED") || job_status == "FAILED" || job_status == "CANCELED")
{
std::cerr << "Wait for job status failure, status : " << job_status << std::endl;
return false;
}
// std::cout<<"Job status: "<<job_status<<std::endl;
sleep(1);
} while (job_status != state);

return true;
}

std::string submitHQJob()
{
std::string hq_command = "hq submit --output-mode=quiet hq_scripts/job.sh";

std::string job_id = getCommandOutput(hq_command);

// Delete the line break
if (!job_id.empty())
job_id.pop_back();

std::cout << "Waiting for job " << job_id << " to start." << std::endl;

// Wait for the HQ Job to start
waitForHQJobState(job_id, "RUNNING");

// Also wait until job is running and url file is written
waitForFile("./urls/url-" + job_id + ".txt");

std::cout << "Job " << job_id << " started." << std::endl;

return job_id;
}
std::mutex job_submission_mutex;
int hq_submit_delay_ms = 0;

class HyperQueueJob
{
public:
HyperQueueJob(std::string model_name, bool start_client=true)
static std::atomic<int32_t> job_count;
HyperQueueJob(std::string model_name, bool start_client=true,
bool force_default_submission_script=false)
{
job_id = submitHQJob();
job_id = submitHQJob(model_name, force_default_submission_script);

// Get the server URL
server_url = readUrl("./urls/url-" + job_id + ".txt");
Expand All @@ -146,6 +100,85 @@ class HyperQueueJob
std::unique_ptr<umbridge::HTTPModel> client_ptr;

private:
std::string submitHQJob(const std::string &model_name, bool force_default_submission_script=false)
{
// Add optional delay to job submissions to prevent issues in some cases.
if (hq_submit_delay_ms) {
std::lock_guard<std::mutex> lock(job_submission_mutex);
std::this_thread::sleep_for(std::chrono::milliseconds(hq_submit_delay_ms));
}

// Use model specific job script if available, default otherwise.
const std::filesystem::path submission_script_dir("./hq_scripts");
const std::filesystem::path submission_script_generic("job.sh");
const std::filesystem::path submission_script_model_specific("job_" + model_name + ".sh");

std::string hq_command = "hq submit --output-mode=quiet ";
hq_command += "--priority=" + std::to_string(job_count) + " ";
if (std::filesystem::exists(submission_script_dir / submission_script_model_specific) && !force_default_submission_script)
{
hq_command += (submission_script_dir / submission_script_model_specific).string();
}
else if (std::filesystem::exists(submission_script_dir / submission_script_generic))
{
hq_command += (submission_script_dir / submission_script_generic).string();
}
else
{
throw std::runtime_error("Job submission script not found: Check that file 'hq_script/job.sh' exists.");
}

// Submit the HQ job and retrieve the HQ job ID.
std::string job_id = getCommandOutput(hq_command);
job_count--;

// Delete the line break.
if (!job_id.empty())
{
job_id.pop_back();
}

std::cout << "Waiting for job " << job_id << " to start." << std::endl;

// Wait for the HQ Job to start
waitForHQJobState(job_id, "RUNNING");

// Also wait until job is running and url file is written
waitForFile("./urls/url-" + job_id + ".txt");

std::cout << "Job " << job_id << " started." << std::endl;

return job_id;
}

// state = ["WAITING", "RUNNING", "FINISHED", "CANCELED"]
bool waitForHQJobState(const std::string &job_id, const std::string &state)
{
const std::string command = "hq job info " + job_id + " | grep State | awk '{print $4}'";
// std::cout << "Checking runtime: " << command << std::endl;
std::string job_status;

do
{
job_status = getCommandOutput(command);

// Delete the line break
if (!job_status.empty())
job_status.pop_back();

// Don't wait if there is an error or the job is ended
if (job_status == "" || (state != "FINISHED" && job_status == "FINISHED") || job_status == "FAILED" || job_status == "CANCELED")
{
std::cerr << "Wait for job status failure, status : " << job_status << std::endl;
return false;
}

sleep(1);
} while (job_status != state);

return true;
}

std::string job_id;
};

Expand Down
2 changes: 1 addition & 1 deletion hpc/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
all: build-load-balancer

load-balancer-files = LoadBalancer.cpp LoadBalancer.hpp lib/httplib.h lib/json.hpp lib/umbridge.h
load-balancer-files = LoadBalancer.cpp LoadBalancer.hpp ../lib/httplib.h ../lib/json.hpp ../lib/umbridge.h

build-load-balancer:
- g++ -O3 -Wno-unused-result -std=c++17 $(load-balancer-files) -o load-balancer -pthread
Loading

0 comments on commit 7494061

Please sign in to comment.