diff --git a/NEWS b/NEWS
index 02b986d6c50..fcdcfe792d4 100644
--- a/NEWS
+++ b/NEWS
@@ -41,6 +41,8 @@ documents those changes that are of interest to users and administrators.
     strictly enforced rather than advisory).
  -- Change how a node's allocated CPU count is calculated to avoid double
     counting CPUs allocated to multiple jobs at the same time.
+ -- Added SchedulerParameters option of "bf_min_prio_reserve". Jobs below
+    the specified threshold will not have resources reserved for them.
 
 * Changes in Slurm 16.05.0pre1
 ==============================
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index a7b7c4f6723..c393233bb75 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -2512,6 +2512,19 @@ This can be valuable if jobs lack time limits or all time limits have the same
 value.
 The default value is zero, which will reserve resources for any pending job
 and delay initiation of lower priority jobs.
+Also see bf_min_prio_reserve.
+.TP
+\fBbf_min_prio_reserve=#\fR
+The backfill and main scheduling logic will not reserve resources for pending
+jobs unless they have a priority equal to or higher than the specified value.
+In addition, pending jobs below this priority threshold will not prevent a
+newly submitted job from starting immediately, even if the newly submitted
+job has a lower priority than they do.
+This can be valuable if one wishes to maximize system utilization without
+regard for job priority below a certain threshold.
+The default value is zero, which will reserve resources for any pending job
+and delay initiation of lower priority jobs.
+Also see bf_min_age_reserve.
 .TP
 \fBbf_resolution=#\fR
 The number of seconds in the resolution of data maintained about when jobs
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index fa7c722dc5d..ce5ff7e6814 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -981,6 +981,7 @@ enum ctx_keys {
 #define HAS_STATE_DIR     0x00000004 /* Used by slurmctld to track state dir */
 #define BACKFILL_TEST     0x00000008 /* Backfill test in progress */
 #define GRES_ENFORCE_BIND 0x00000010 /* Enforce CPU/GRES binding */
+#define TEST_NOW_ONLY     0x00000020 /* Test for immediate start only */
 
 /*****************************************************************************\
  * SLURM HOSTLIST FUNCTIONS
diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index e6edcfb7b40..6fea9b7dab0 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -128,6 +128,7 @@ static int backfill_resolution = BACKFILL_RESOLUTION;
 static int backfill_window = BACKFILL_WINDOW;
 static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
 static int bf_min_age_reserve = 0;
+static uint32_t bf_min_prio_reserve = 0;
 static int max_backfill_job_cnt = 100;
 static int max_backfill_job_per_part = 0;
 static int max_backfill_job_per_user = 0;
@@ -596,16 +597,28 @@ static void _load_config(void)
 		max_backfill_job_per_user = 0;
 	}
 
+	bf_min_age_reserve = 0;
 	if (sched_params &&
 	    (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
-		bf_min_age_reserve = atoi(tmp_ptr + 19);
-		if (bf_min_age_reserve < 0) {
+		int min_age = atoi(tmp_ptr + 19);
+		if (min_age < 0) {
 			error("Invalid SchedulerParameters bf_min_age_reserve: %d",
-			      bf_min_age_reserve);
-			bf_min_age_reserve = 0;
+			      min_age);
+		} else {
+			bf_min_age_reserve = min_age;
+		}
+	}
+
+	bf_min_prio_reserve = 0;
+	if (sched_params &&
+	    (tmp_ptr = strstr(sched_params, "bf_min_prio_reserve="))) {
+		int64_t min_prio = (int64_t) atoll(tmp_ptr + 20);
+		if (min_prio < 0) {
+			error("Invalid SchedulerParameters bf_min_prio_reserve: %"PRIi64,
+			      min_prio);
+		} else {
+			bf_min_prio_reserve = (uint32_t) min_prio;
 		}
-	} else {
-		bf_min_age_reserve = 0;
 	}
 
 	/* bf_continue makes backfill continue where it was if interrupted */
@@ -861,7 +874,7 @@ static int _attempt_backfill(void)
 	struct timeval start_tv;
 	uint32_t test_array_job_id = 0;
 	uint32_t test_array_count = 0;
-	uint32_t acct_max_nodes, wait_reason = 0;
+	uint32_t acct_max_nodes, wait_reason = 0, job_no_reserve;
 	bool resv_overlap = false;
 	uint8_t save_share_res, save_whole_node;
 	int test_fini;
@@ -1002,6 +1015,7 @@ static int _attempt_backfill(void)
 		job_ptr  = job_queue_rec->job_ptr;
 		part_ptr = job_queue_rec->part_ptr;
 		job_ptr->part_ptr = part_ptr;
+		job_ptr->priority = job_queue_rec->priority;
 		mcs_select = slurm_mcs_get_select(job_ptr);
 
 		/* With bf_continue configured, the original job could have
@@ -1072,6 +1086,17 @@ static int _attempt_backfill(void)
 		    !acct_policy_job_runnable_pre_select(job_ptr))
 			continue;
 
+		job_no_reserve = 0;
+		if (bf_min_prio_reserve &&
+		    (job_ptr->priority < bf_min_prio_reserve)) {
+			job_no_reserve = TEST_NOW_ONLY;
+		} else if (bf_min_age_reserve && job_ptr->details->begin_time) {
+			pend_time = difftime(time(NULL),
+					     job_ptr->details->begin_time);
+			if (pend_time < bf_min_age_reserve)
+				job_no_reserve = TEST_NOW_ONLY;
+		}
+
 		orig_start_time = job_ptr->start_time;
 		orig_time_limit = job_ptr->time_limit;
 		xfree(job_queue_rec);
@@ -1410,6 +1435,7 @@ static int _attempt_backfill(void)
 			build_active_feature_bitmap(job_ptr, avail_bitmap,
 						    &active_bitmap);
 			job_ptr->bit_flags |= BACKFILL_TEST;
+			job_ptr->bit_flags |= job_no_reserve; /* 0 or TEST_NOW_ONLY */
 			if (active_bitmap) {
 				j = _try_sched(job_ptr, &active_bitmap, min_nodes,
 					       max_nodes, req_nodes, exc_core_bitmap);
@@ -1436,6 +1462,7 @@ static int _attempt_backfill(void)
 			}
 		}
 		job_ptr->bit_flags &= ~BACKFILL_TEST;
+		job_ptr->bit_flags &= ~TEST_NOW_ONLY;
 
 		now = time(NULL);
 		if (j != SLURM_SUCCESS) {
@@ -1588,13 +1615,8 @@ static int _attempt_backfill(void)
 			_set_job_time_limit(job_ptr, orig_time_limit);
 		}
 
-		if ((job_ptr->start_time > now) &&
-		    (bf_min_age_reserve && job_ptr->details->begin_time)) {
-			pend_time = difftime(time(NULL),
-					     job_ptr->details->begin_time);
-			if (pend_time < bf_min_age_reserve)
-				continue;
-		}
+		if ((job_ptr->start_time > now) && (job_no_reserve != 0))
+			continue;
 
 		if (later_start && (job_ptr->start_time > later_start)) {
 			/* Try later when some nodes currently reserved for
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 40ebb8c1ba2..1e740d418a5 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -1839,7 +1839,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 	/* Remove the running jobs one at a time from exp_node_cr and try
 	 * scheduling the pending job after each one.
 	 */
-	if (rc != SLURM_SUCCESS) {
+	if ((rc != SLURM_SUCCESS) &&
+	    ((job_ptr->bit_flags & TEST_NOW_ONLY) == 0)) {
 		list_sort(cr_job_list, _cr_job_list_sort);
 		job_iterator = list_iterator_create(cr_job_list);
 		while ((tmp_job_ptr = list_next(job_iterator))) {
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index 3c961aac5bd..d642150b450 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -3392,7 +3392,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 
 	/* Remove the running jobs one at a time from exp_node_cr and try
 	 * scheduling the pending job after each one */
-	if (rc != SLURM_SUCCESS) {
+	if ((rc != SLURM_SUCCESS) &&
+	    ((job_ptr->bit_flags & TEST_NOW_ONLY) == 0)) {
 		list_sort(cr_job_list, _cr_job_list_sort);
 		job_iterator = list_iterator_create(cr_job_list);
 		while ((tmp_job_ptr = (struct job_record *)
diff --git a/src/plugins/select/serial/select_serial.c b/src/plugins/select/serial/select_serial.c
index f6eb110f05f..b7787179214 100644
--- a/src/plugins/select/serial/select_serial.c
+++ b/src/plugins/select/serial/select_serial.c
@@ -1293,7 +1293,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 	/* Remove the running jobs one at a time from exp_node_cr and try
 	 * scheduling the pending job after each one.
 	 */
-	if (rc != SLURM_SUCCESS) {
+	if ((rc != SLURM_SUCCESS) &&
+	    ((job_ptr->bit_flags & TEST_NOW_ONLY) == 0)) {
 		list_sort(cr_job_list, _cr_job_list_sort);
 		job_iterator = list_iterator_create(cr_job_list);
 		while ((tmp_job_ptr = list_next(job_iterator))) {
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 208d70e70e1..88dc868a6c9 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -4166,13 +4166,12 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 			defer_sched = 1;
 		else
 			defer_sched = 0;
+		bf_min_age_reserve = 0;
 		if (sched_params &&
 		    (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
-			bf_min_age_reserve = atoi(tmp_ptr + 19);
-			if (bf_min_age_reserve < 0)
-				bf_min_age_reserve = 0;
-		} else {
-			bf_min_age_reserve = 0;
+			int min_age = atoi(tmp_ptr + 19);
+			if (min_age > 0)
+				bf_min_age_reserve = min_age;
 		}
 		xfree(sched_params);
 	}
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 09f0c5993cd..f8feb707bb4 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -1119,6 +1119,7 @@ static int _schedule(uint32_t job_limit)
 	static int sched_timeout = 0;
 	static int sched_max_job_start = 0;
 	static int bf_min_age_reserve = 0;
+	static uint32_t bf_min_prio_reserve = 0;
 	static int def_job_limit = 100;
 	static int max_jobs_per_part = 0;
 	static int defer_rpc_cnt = 0;
@@ -1191,13 +1192,20 @@ static int _schedule(uint32_t job_limit)
 			batch_sched_delay = 3;
 		}
 
+		bf_min_age_reserve = 0;
 		if (sched_params &&
 		    (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
-			bf_min_age_reserve = atoi(tmp_ptr + 19);
-			if (bf_min_age_reserve < 0)
-				bf_min_age_reserve = 0;
-		} else {
-			bf_min_age_reserve = 0;
+			int min_age = atoi(tmp_ptr + 19);
+			if (min_age > 0)
+				bf_min_age_reserve = min_age;
+		}
+
+		bf_min_prio_reserve = 0;
+		if (sched_params &&
+		    (tmp_ptr = strstr(sched_params, "bf_min_prio_reserve="))) {
+			int64_t min_prio = (int64_t) atoll(tmp_ptr + 20);
+			if (min_prio > 0)
+				bf_min_prio_reserve = (uint32_t) min_prio;
 		}
 
 		if (sched_params &&
@@ -1451,6 +1459,7 @@ next_part:	part_ptr = (struct part_record *)
 		array_task_id = job_queue_rec->array_task_id;
 		job_ptr  = job_queue_rec->job_ptr;
 		part_ptr = job_queue_rec->part_ptr;
+		job_ptr->priority = job_queue_rec->priority;
 		xfree(job_queue_rec);
 		if (!avail_front_end(job_ptr)) {
 			job_ptr->state_reason = WAIT_FRONT_END;
@@ -1896,6 +1905,9 @@ next_part:	part_ptr = (struct part_record *)
 				fail_by_part = false;
 			}
 		}
+		if (fail_by_part && bf_min_prio_reserve &&
+		    (job_ptr->priority < bf_min_prio_reserve))
+			fail_by_part = false;
 		if (fail_by_part) {
 			/* do not schedule more jobs in this partition or on
 			 * nodes in this partition */
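
Usage sketch (illustrative; not part of the patch). With this change applied, an
administrator could set a priority floor for reservations in slurm.conf,
optionally combined with the pre-existing age threshold; the values 1000000 and
300 below are arbitrary examples:

    SchedulerParameters=bf_min_prio_reserve=1000000,bf_min_age_reserve=300

In the backfill scheduler, a pending job with a priority below 1000000 (or,
failing that test, one whose begin time is less than 300 seconds old) is
flagged TEST_NOW_ONLY, so the select plugins' _will_run_test() skips simulating
the completion of running jobs and no resource reservation is created for it.
In the main scheduler, such a job also no longer sets fail_by_part, so it
cannot block lower priority jobs in its partition from being considered.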
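
Since the threshold is compared directly against job_ptr->priority, a useful
value depends on the priorities the site's priority plugin actually assigns.
As a rough check before picking a threshold, pending job priorities can be
inspected with the standard tools, e.g.:

    $ sprio -l                  # per-factor priority breakdown of pending jobs
    $ squeue -t PD -o "%i %Q"   # job id and priority of each pending job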