Skip to content

Commit

Permalink
Modified load balancer for SLURM version. Should become a separate ba…
Browse files Browse the repository at this point in the history
…ckend
  • Loading branch information
linusseelinger committed Jul 4, 2024
1 parent 99424b4 commit a793c04
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 13 deletions.
11 changes: 6 additions & 5 deletions hpc/LoadBalancer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ void launch_hq_with_alloc_queue() {

const std::vector<std::string> get_model_names() {
// Don't start a client, always use the default job submission script.
HyperQueueJob hq_job("", false, true);
HyperQueueJob hq_job("", false, true);

return umbridge::SupportedModels(hq_job.server_url);
}
Expand All @@ -49,7 +49,7 @@ void print_model_and_job_script_information(const std::vector<std::string>& mode

const std::string SECTION_START_DELIMITER = "==============================MODEL INFO==============================";
const std::string SECTION_END_DELIMITER = "======================================================================";

// Sort the model names in alphabetical order for cleaner output.
std::vector<std::string> model_names_sorted = model_names;
std::sort(model_names_sorted.begin(), model_names_sorted.end());
Expand All @@ -69,7 +69,7 @@ void print_model_and_job_script_information(const std::vector<std::string>& mode
std::cout << "* Model '" << model_name << "' --> '" << used_job_script << "'\n";
}
std::cout << std::endl;


// Check if there are job scripts that are unused and print a warning.
std::vector<std::string> unused_job_scripts;
Expand All @@ -87,7 +87,7 @@ void print_model_and_job_script_information(const std::vector<std::string>& mode
std::smatch match_result;
if (std::regex_search(filename, match_result, format_regex)) {
// Extract first matched subexpression, i.e. the model name.
const std::string model_name = match_result[1].str();
const std::string model_name = match_result[1].str();
// Check if a corresponding model exists. If not, mark job script as unused.
if (!std::binary_search(model_names_sorted.begin(), model_names_sorted.end(), model_name)) {
unused_job_scripts.push_back(filename);
Expand Down Expand Up @@ -122,7 +122,8 @@ int main(int argc, char *argv[])
create_directory_if_not_existing("sub-jobs");
clear_url("urls");

launch_hq_with_alloc_queue();
// SLURM version
//launch_hq_with_alloc_queue();

// Read environment variables for configuration
char const *port_cstr = std::getenv("PORT");
Expand Down
28 changes: 20 additions & 8 deletions hpc/LoadBalancer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class HyperQueueJob
{
public:
static std::atomic<int32_t> job_count;
HyperQueueJob(std::string model_name, bool start_client=true,
HyperQueueJob(std::string model_name, bool start_client=true,
bool force_default_submission_script=false)
{
job_id = submitHQJob(model_name, force_default_submission_script);
Expand All @@ -90,7 +90,9 @@ class HyperQueueJob
~HyperQueueJob()
{
// Cancel the SLURM job
std::system(("./hq job cancel " + job_id).c_str());
// SLURM version
//std::system(("./hq job cancel " + job_id).c_str());
std::system(("scancel " + job_id).c_str());

// Delete the url text file
std::system(("rm ./urls/url-" + job_id + ".txt").c_str());
Expand All @@ -107,19 +109,23 @@ class HyperQueueJob
std::lock_guard<std::mutex> lock(job_submission_mutex);
std::this_thread::sleep_for(std::chrono::milliseconds(hq_submit_delay_ms));
}

// Use model specific job script if available, default otherwise.
const std::filesystem::path submission_script_dir("./hq_scripts");
// SLURM version
//const std::filesystem::path submission_script_dir("./hq_scripts");
const std::filesystem::path submission_script_dir("./slurm_scripts");
const std::filesystem::path submission_script_generic("job.sh");
const std::filesystem::path submission_script_model_specific("job_" + model_name + ".sh");

std::string hq_command = "./hq submit --output-mode=quiet ";
hq_command += "--priority=" + std::to_string(job_count) + " ";
// SLURM version
//std::string hq_command = "./hq submit --output-mode=quiet ";
//hq_command += "--priority=" + std::to_string(job_count) + " ";
std::string hq_command = "sbatch ";
if (std::filesystem::exists(submission_script_dir / submission_script_model_specific) && !force_default_submission_script)
{
hq_command += (submission_script_dir / submission_script_model_specific).string();
}
else if (std::filesystem::exists(submission_script_dir / submission_script_generic))
else if (std::filesystem::exists(submission_script_dir / submission_script_generic))
{
hq_command += (submission_script_dir / submission_script_generic).string();
}
Expand All @@ -130,6 +136,11 @@ class HyperQueueJob

// Submit the HQ job and retrieve the HQ job ID.
std::string job_id = getCommandOutput(hq_command);
// SLURM version
// Get job ID from sbatch output (last word in the output)
// Example: "Submitted batch job 4010093"
job_id = job_id.substr(job_id.find_last_of(" ") + 1);

job_count--;

// Delete the line break.
Expand All @@ -141,7 +152,8 @@ class HyperQueueJob
std::cout << "Waiting for job " << job_id << " to start." << std::endl;

// Wait for the HQ Job to start
waitForHQJobState(job_id, "RUNNING");
// SLURM version
//waitForHQJobState(job_id, "RUNNING");

// Also wait until job is running and url file is written
waitForFile("./urls/url-" + job_id + ".txt");
Expand Down
44 changes: 44 additions & 0 deletions hpc/slurm_scripts/job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --partition=devel
#SBATCH --ntasks=1
#SBATCH --time=00:05:00

# Print a randomly chosen TCP port that has no local listener at the moment.
#
# Candidates are drawn with `shuf` from the range 1024-65535 and probed with
# `lsof`; a new candidate is drawn for as long as `lsof` reports a process
# listening on the current one.
#
# Outputs: the selected port number on stdout.
# Note: the port is only checked, not reserved, so another process may still
#       bind it before the caller does.
function get_avaliable_port {
    # Keep all working variables local so that calling this function directly
    # (outside a command substitution) does not overwrite the caller's `port`.
    local min_port=1024
    local max_port=65535
    local port

    # Generate a random port number
    port=$(shuf -i "${min_port}-${max_port}" -n 1)

    # Redraw while some process is already listening on the candidate
    while lsof -Pi ":${port}" -sTCP:LISTEN -t >/dev/null; do
        port=$(shuf -i "${min_port}-${max_port}" -n 1)
    done

    echo "$port"
}

# Pick a free port and pass it to the model server through the environment.
port=$(get_avaliable_port)
export PORT="$port"

# Assume that server sets the port according to the environment variable 'PORT'.
# Otherwise the job script will be stuck waiting for model server's response.
./testmodel & # CHANGE ME!
# PID of the background command started above; used below to detect an early exit.
# NOTE(review): assumes the command above stays in the foreground of its own
# process (does not daemonize) — confirm when replacing it.
server_pid=$!


# First address printed by `hostname -I`; used to build the advertised URL.
host=$(hostname -I | awk '{print $1}')

echo "Waiting for model server to respond at $host:$port..."
while ! curl -s "http://$host:$port/Info" > /dev/null; do
    # If the server process is gone it can never answer; stop instead of
    # polling indefinitely.
    if ! kill -0 "$server_pid" 2> /dev/null; then
        echo "Model server exited before responding" >&2
        exit 1
    fi
    sleep 1
done
echo "Model server responded"

# Write server URL to file identified by SLURM job ID.
load_balancer_dir="."
mkdir -p "$load_balancer_dir/urls"
echo "http://$host:$port" > "$load_balancer_dir/urls/url-$SLURM_JOB_ID.txt"

sleep infinity # keep the job occupied

0 comments on commit a793c04

Please sign in to comment.