Skip to content

Commit

Permalink
Add SchedulerParameters option of bf_hetjob_immediate.
Browse files Browse the repository at this point in the history
Instruct the backfill scheduler to attempt to start a heterogeneous job
as soon as all of its components are determined able to do so.

Bug 5579.
  • Loading branch information
asanchez1987 authored and dannyauble committed Mar 13, 2019
1 parent e076cbb commit 44ed6bc
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 41 deletions.
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ documents those changes that are of interest to users and administrators.
* Changes in Slurm 18.08.7
==========================
-- Set debug statement to debug2 to avoid benign error messages.
-- Add SchedulerParameters option of bf_hetjob_immediate to attempt to start
a heterogeneous job as soon as all of its components are determined able to
do so.

* Changes in Slurm 18.08.6-2
============================
Expand Down
10 changes: 10 additions & 0 deletions doc/man/man5/slurm.conf.5
Original file line number Diff line number Diff line change
Expand Up @@ -2519,6 +2519,16 @@ This can result in lower priority jobs being backfill scheduled instead
of newly arrived higher priority jobs, but will permit more queued jobs to be
considered for backfill scheduling.
.TP
\fBbf_hetjob_immediate\fR
Instruct the backfill scheduler to attempt to start a heterogeneous job as
soon as all of its components are determined able to do so. Otherwise, the
backfill scheduler will delay heterogeneous job initiation attempts until
after the rest of the queue has been processed. This delay may result in lower
priority jobs being allocated resources, which could delay the initiation of
the heterogeneous job due to account and/or QOS limits being reached. This
option is disabled by default. If enabled and no \fBbf_hetjob_prio\fR option is
set, then \fBbf_hetjob_prio=min\fR is set automatically.
.TP
\fBbf_hetjob_prio=[min|avg|max]\fR
At the beginning of each backfill scheduling cycle, the list of pending jobs
to be scheduled is sorted according to the precedence order configured in
Expand Down
131 changes: 90 additions & 41 deletions src/plugins/sched/backfill/backfill.c
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ typedef struct pack_job_rec {

/* Scheduling state tracked for one heterogeneous (pack) job */
typedef struct pack_job_map {
	uint32_t comp_time_limit;	/* Time limit for pack job */
	time_t prev_start;		/* Expected start time from last test */
	uint32_t pack_job_id;		/* ID of the pack job */
	List pack_job_list;		/* List of pack_job_rec_t */
} pack_job_map_t;
Expand Down Expand Up @@ -170,6 +170,7 @@ static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
static int bf_min_age_reserve = 0;
static uint32_t bf_min_prio_reserve = 0;
static List deadlock_global_list;
static bool bf_hetjob_immediate = false;
static uint16_t bf_hetjob_prio = 0;
static int max_backfill_job_cnt = 100;
static int max_backfill_job_per_assoc = 0;
Expand Down Expand Up @@ -217,7 +218,11 @@ static void _pack_start_clear(void);
static time_t _pack_start_find(struct job_record *job_ptr, time_t now);
static void _pack_start_set(struct job_record *job_ptr, time_t latest_start,
uint32_t comp_time_limit);
/* Test whether heterogeneous (pack) jobs can start now */
static void _pack_start_test_single(node_space_map_t *node_space,
				    pack_job_map_t *map, bool single);
static int _pack_start_test_list(void *map, void *node_space);
static void _pack_start_test(node_space_map_t *node_space,
			     uint32_t pack_job_id);
static void _reset_job_time_limit(struct job_record *job_ptr, time_t now,
node_space_map_t *node_space);
static int _set_hetjob_details(void *x, void *arg);
Expand Down Expand Up @@ -869,6 +874,14 @@ static void _load_config(void)
tmp_ptr);
}

bf_hetjob_immediate = false;
if (sched_params && (strstr(sched_params, "bf_hetjob_immediate")))
bf_hetjob_immediate = true;

if (bf_hetjob_immediate && !bf_hetjob_prio) {
bf_hetjob_prio |= HETJOB_PRIO_MIN;
info("bf_hetjob_immediate automatically sets bf_hetjob_prio=min");
}

if (sched_params && (tmp_ptr = strstr(sched_params, "max_rpc_cnt=")))
defer_rpc_cnt = atoi(tmp_ptr + 12);
Expand Down Expand Up @@ -2433,6 +2446,9 @@ static int _attempt_backfill(void)
_pack_start_set(job_ptr, job_ptr->start_time,
comp_time_limit);
_set_job_time_limit(job_ptr, orig_time_limit);
if (bf_hetjob_immediate)
_pack_start_test(node_space,
job_ptr->pack_job_id);
}

if ((job_ptr->start_time > now) && (job_no_reserve != 0)) {
Expand Down Expand Up @@ -2635,7 +2651,8 @@ static int _attempt_backfill(void)
&tmp_preempt_in_progress);

_job_pack_deadlock_fini();
_pack_start_test(node_space);
if (!bf_hetjob_immediate)
_pack_start_test(node_space, 0);

xfree(bf_part_jobs);
xfree(bf_part_resv);
Expand Down Expand Up @@ -3456,59 +3473,91 @@ static void _pack_kill_now(pack_job_map_t *map)
}

/*
* If all components of a pack job can start now, then do so
* If all components of a heterogeneous job can start now, then do so
* node_space IN - map of available resources through time
* map IN - info about this heterogeneous job
* single IN - true if testing single heterogeneous jobs
*/
static void _pack_start_test(node_space_map_t *node_space)
static void _pack_start_test_single(node_space_map_t *node_space,
pack_job_map_t *map, bool single)
{
ListIterator iter;
pack_job_map_t *map;
time_t now = time(NULL);
int rc;

iter = list_iterator_create(pack_job_list);
while ((map = (pack_job_map_t *) list_next (iter))) {
if (!_pack_job_full(map)) {
if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
info("Pack job %u has indefinite start time",
map->pack_job_id);
}
map->prev_start = now + YEAR_SECONDS;
continue;
}
if (!map)
return;

map->prev_start = _pack_start_compute(map, 0);
if (map->prev_start > now) {
if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
info("Pack job %u should be able to start in %u seconds",
map->pack_job_id,
(uint32_t) (map->prev_start - now));
}
continue;
if (!_pack_job_full(map)) {
if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
info("Pack job %u has indefinite start time",
map->pack_job_id);
}

if (!_pack_job_limit_check(map, now)) {
if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
info("Pack job %u prevented from starting by account/QOS limit",
map->pack_job_id);
}
if (!single)
map->prev_start = now + YEAR_SECONDS;
continue;
return;
}

map->prev_start = _pack_start_compute(map, 0);
if (map->prev_start > now) {
if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
info("Pack job %u should be able to start in %u seconds",
map->pack_job_id,
(uint32_t) (map->prev_start - now));
}
return;
}

if (!_pack_job_limit_check(map, now)) {
if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
info("Attempting to start pack job %u",
info("Pack job %u prevented from starting by account/QOS limit",
map->pack_job_id);
}
rc = _pack_start_now(map, node_space);
if (rc != SLURM_SUCCESS) {
if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
info("Failed to start pack job %u",
map->pack_job_id);
}
_pack_kill_now(map);
map->prev_start = now + YEAR_SECONDS;
return;
}

if (debug_flags & DEBUG_FLAG_HETERO_JOBS)
info("Attempting to start pack job %u", map->pack_job_id);

rc = _pack_start_now(map, node_space);
if (rc != SLURM_SUCCESS) {
if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
info("Failed to start pack job %u",
map->pack_job_id);
}
_pack_kill_now(map);
}
}

static int _pack_start_test_list(void *map, void *node_space)
{
_pack_start_test_single(node_space, map, false);

return SLURM_SUCCESS;
}


/*
 * If all components of a heterogeneous job can start now, then do so
 * node_space IN - map of available resources through time
 * pack_job_id IN - the ID of the heterogeneous job to evaluate,
 *		    if zero then evaluate all heterogeneous jobs
 */
static void _pack_start_test(node_space_map_t *node_space, uint32_t pack_job_id)
{
	pack_job_map_t *map = NULL;

	if (!pack_job_id) {
		/* Test all maps. */
		(void)list_for_each(pack_job_list,
				    _pack_start_test_list, node_space);
	} else {
		/*
		 * Test single map. A NULL result from the lookup is
		 * tolerated: _pack_start_test_single() returns immediately
		 * when given a NULL map.
		 */
		map = (pack_job_map_t *)list_find_first(pack_job_list,
							_pack_find_map,
							&pack_job_id);
		_pack_start_test_single(node_space, map, true);
	}
}

static void _deadlock_global_list_del(void *x)
Expand Down
10 changes: 10 additions & 0 deletions src/slurmctld/job_scheduler.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ static int sched_pend_thread = 0;
static bool sched_running = false;
static struct timeval sched_last = {0, 0};
static uint32_t max_array_size = NO_VAL;
static bool bf_hetjob_immediate = false;
static uint16_t bf_hetjob_prio = 0;
#ifdef HAVE_ALPS_CRAY
static int sched_min_interval = 1000000;
Expand Down Expand Up @@ -1330,6 +1331,15 @@ static int _schedule(uint32_t job_limit)
tmp_ptr);
}

bf_hetjob_immediate = false;
if (sched_params && strstr(sched_params, "bf_hetjob_immediate"))
bf_hetjob_immediate = true;

if (bf_hetjob_immediate && !bf_hetjob_prio) {
bf_hetjob_prio |= HETJOB_PRIO_MIN;
info("bf_hetjob_immediate automatically sets bf_hetjob_prio=min");
}

if (sched_params &&
(tmp_ptr = strstr(sched_params, "partition_job_depth="))) {
max_jobs_per_part = atoi(tmp_ptr + 20);
Expand Down

0 comments on commit 44ed6bc

Please sign in to comment.