Skip to content

Commit

Permalink
Added SchedulerParameters option of "bf_min_prio_reserve"
Browse files Browse the repository at this point in the history
Jobs below the specified threshold will not have resources reserved for them.
bug 2565
  • Loading branch information
jette committed Mar 18, 2016
1 parent 78d1bcf commit 4556087
Show file tree
Hide file tree
Showing 9 changed files with 78 additions and 27 deletions.
2 changes: 2 additions & 0 deletions NEWS
Expand Up @@ -41,6 +41,8 @@ documents those changes that are of interest to users and administrators.
strictly enforced rather than advisory).
-- Change how a node's allocated CPU count is calculated to avoid double
counting CPUs allocated to multiple jobs at the same time.
-- Added SchedulerParameters option of "bf_min_prio_reserve". Jobs below
the specified threshold will not have resources reserved for them.

* Changes in Slurm 16.05.0pre1
==============================
Expand Down
12 changes: 12 additions & 0 deletions doc/man/man5/slurm.conf.5
Expand Up @@ -2512,6 +2512,18 @@ This can be valuable if jobs lack time limits or all time limits have the same
value.
The default value is zero, which will reserve resources for any pending job
and delay initiation of lower priority jobs.
Also see bf_min_prio_reserve.
.TP
\fBbf_min_prio_reserve=#\fR
The backfill and main scheduling logic will not reserve resources for pending
jobs unless they have a priority equal to or higher than the specified value.
In addition, jobs with a lower priority will not prevent a newly submitted job
from starting immediately, even if the newly submitted job has a lower priority.
This can be valuable if one wishes to maximize system utilization without regard
for job priority below a certain threshold.
The default value is zero, which will reserve resources for any pending job
and delay initiation of lower priority jobs.
Also see bf_min_age_reserve.
.TP
\fBbf_resolution=#\fR
The number of seconds in the resolution of data maintained about when jobs
Expand Down
1 change: 1 addition & 0 deletions slurm/slurm.h.in
Expand Up @@ -981,6 +981,7 @@ enum ctx_keys {
#define HAS_STATE_DIR 0x00000004 /* Used by slurmctld to track state dir */
#define BACKFILL_TEST 0x00000008 /* Backfill test in progress */
#define GRES_ENFORCE_BIND 0x00000010 /* Enforce CPU/GRES binding */
#define TEST_NOW_ONLY 0x00000020 /* Test for immediate start only */

/*****************************************************************************\
* SLURM HOSTLIST FUNCTIONS
Expand Down
50 changes: 36 additions & 14 deletions src/plugins/sched/backfill/backfill.c
Expand Up @@ -128,6 +128,7 @@ static int backfill_resolution = BACKFILL_RESOLUTION;
static int backfill_window = BACKFILL_WINDOW;
static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
static int bf_min_age_reserve = 0;
static uint32_t bf_min_prio_reserve = 0;
static int max_backfill_job_cnt = 100;
static int max_backfill_job_per_part = 0;
static int max_backfill_job_per_user = 0;
Expand Down Expand Up @@ -596,16 +597,28 @@ static void _load_config(void)
max_backfill_job_per_user = 0;
}

bf_min_age_reserve = 0;
if (sched_params &&
(tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
bf_min_age_reserve = atoi(tmp_ptr + 19);
if (bf_min_age_reserve < 0) {
int min_age = atoi(tmp_ptr + 19);
if (min_age < 0) {
error("Invalid SchedulerParameters bf_min_age_reserve: %d",
bf_min_age_reserve);
bf_min_age_reserve = 0;
min_age);
} else {
bf_min_age_reserve = min_age;
}
}

bf_min_prio_reserve = 0;
if (sched_params &&
(tmp_ptr = strstr(sched_params, "bf_min_prio_reserve="))) {
int64_t min_prio = (int64_t) atoll(tmp_ptr + 20);
if (min_prio < 0) {
error("Invalid SchedulerParameters bf_min_prio_reserve: %"PRIi64,
min_prio);
} else {
bf_min_prio_reserve = (uint32_t) min_prio;
}
} else {
bf_min_age_reserve = 0;
}

/* bf_continue makes backfill continue where it was if interrupted */
Expand Down Expand Up @@ -861,7 +874,7 @@ static int _attempt_backfill(void)
struct timeval start_tv;
uint32_t test_array_job_id = 0;
uint32_t test_array_count = 0;
uint32_t acct_max_nodes, wait_reason = 0;
uint32_t acct_max_nodes, wait_reason = 0, job_no_reserve;
bool resv_overlap = false;
uint8_t save_share_res, save_whole_node;
int test_fini;
Expand Down Expand Up @@ -1002,6 +1015,7 @@ static int _attempt_backfill(void)
job_ptr = job_queue_rec->job_ptr;
part_ptr = job_queue_rec->part_ptr;
job_ptr->part_ptr = part_ptr;
job_ptr->priority = job_queue_rec->priority;
mcs_select = slurm_mcs_get_select(job_ptr);

/* With bf_continue configured, the original job could have
Expand Down Expand Up @@ -1072,6 +1086,17 @@ static int _attempt_backfill(void)
!acct_policy_job_runnable_pre_select(job_ptr))
continue;

job_no_reserve = 0;
if (bf_min_prio_reserve &&
(job_ptr->priority < bf_min_prio_reserve)) {
job_no_reserve = TEST_NOW_ONLY;
} else if (bf_min_age_reserve && job_ptr->details->begin_time) {
pend_time = difftime(time(NULL),
job_ptr->details->begin_time);
if (pend_time < bf_min_age_reserve)
job_no_reserve = TEST_NOW_ONLY;
}

orig_start_time = job_ptr->start_time;
orig_time_limit = job_ptr->time_limit;
xfree(job_queue_rec);
Expand Down Expand Up @@ -1410,6 +1435,7 @@ static int _attempt_backfill(void)
build_active_feature_bitmap(job_ptr, avail_bitmap,
&active_bitmap);
job_ptr->bit_flags |= BACKFILL_TEST;
job_ptr->bit_flags |= job_no_reserve; /* 0 or TEST_NOW_ONLY */
if (active_bitmap) {
j = _try_sched(job_ptr, &active_bitmap, min_nodes,
max_nodes, req_nodes, exc_core_bitmap);
Expand All @@ -1436,6 +1462,7 @@ static int _attempt_backfill(void)
}
}
job_ptr->bit_flags &= ~BACKFILL_TEST;
job_ptr->bit_flags &= ~TEST_NOW_ONLY;

now = time(NULL);
if (j != SLURM_SUCCESS) {
Expand Down Expand Up @@ -1588,13 +1615,8 @@ static int _attempt_backfill(void)
_set_job_time_limit(job_ptr, orig_time_limit);
}

if ((job_ptr->start_time > now) &&
(bf_min_age_reserve && job_ptr->details->begin_time)) {
pend_time = difftime(time(NULL),
job_ptr->details->begin_time);
if (pend_time < bf_min_age_reserve)
continue;
}
if ((job_ptr->start_time > now) && (job_no_reserve != 0))
continue;

if (later_start && (job_ptr->start_time > later_start)) {
/* Try later when some nodes currently reserved for
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/select/cons_res/select_cons_res.c
Expand Up @@ -1839,7 +1839,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,

/* Remove the running jobs one at a time from exp_node_cr and try
* scheduling the pending job after each one. */
if (rc != SLURM_SUCCESS) {
if ((rc != SLURM_SUCCESS) &&
((job_ptr->bit_flags & TEST_NOW_ONLY) == 0)) {
list_sort(cr_job_list, _cr_job_list_sort);
job_iterator = list_iterator_create(cr_job_list);
while ((tmp_job_ptr = list_next(job_iterator))) {
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/select/linear/select_linear.c
Expand Up @@ -3392,7 +3392,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,

/* Remove the running jobs one at a time from exp_node_cr and try
* scheduling the pending job after each one */
if (rc != SLURM_SUCCESS) {
if ((rc != SLURM_SUCCESS) &&
((job_ptr->bit_flags & TEST_NOW_ONLY) == 0)) {
list_sort(cr_job_list, _cr_job_list_sort);
job_iterator = list_iterator_create(cr_job_list);
while ((tmp_job_ptr = (struct job_record *)
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/select/serial/select_serial.c
Expand Up @@ -1293,7 +1293,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,

/* Remove the running jobs one at a time from exp_node_cr and try
* scheduling the pending job after each one. */
if (rc != SLURM_SUCCESS) {
if ((rc != SLURM_SUCCESS) &&
((job_ptr->bit_flags & TEST_NOW_ONLY) == 0)) {
list_sort(cr_job_list, _cr_job_list_sort);
job_iterator = list_iterator_create(cr_job_list);
while ((tmp_job_ptr = list_next(job_iterator))) {
Expand Down
9 changes: 4 additions & 5 deletions src/slurmctld/job_mgr.c
Expand Up @@ -4166,13 +4166,12 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
defer_sched = 1;
else
defer_sched = 0;
bf_min_age_reserve = 0;
if (sched_params &&
(tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
bf_min_age_reserve = atoi(tmp_ptr + 19);
if (bf_min_age_reserve < 0)
bf_min_age_reserve = 0;
} else {
bf_min_age_reserve = 0;
int min_age = atoi(tmp_ptr + 19);
if (min_age > 0)
bf_min_age_reserve = min_age;
}
xfree(sched_params);
}
Expand Down
22 changes: 17 additions & 5 deletions src/slurmctld/job_scheduler.c
Expand Up @@ -1119,6 +1119,7 @@ static int _schedule(uint32_t job_limit)
static int sched_timeout = 0;
static int sched_max_job_start = 0;
static int bf_min_age_reserve = 0;
static uint32_t bf_min_prio_reserve = 0;
static int def_job_limit = 100;
static int max_jobs_per_part = 0;
static int defer_rpc_cnt = 0;
Expand Down Expand Up @@ -1191,13 +1192,20 @@ static int _schedule(uint32_t job_limit)
batch_sched_delay = 3;
}

bf_min_age_reserve = 0;
if (sched_params &&
(tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
bf_min_age_reserve = atoi(tmp_ptr + 19);
if (bf_min_age_reserve < 0)
bf_min_age_reserve = 0;
} else {
bf_min_age_reserve = 0;
int min_age = atoi(tmp_ptr + 19);
if (min_age > 0)
bf_min_age_reserve = min_age;
}

bf_min_prio_reserve = 0;
if (sched_params &&
(tmp_ptr = strstr(sched_params, "bf_min_prio_reserve="))) {
int64_t min_prio = (int64_t) atoll(tmp_ptr + 20);
if (min_prio > 0)
bf_min_prio_reserve = (uint32_t) min_prio;
}

if (sched_params &&
Expand Down Expand Up @@ -1451,6 +1459,7 @@ next_part: part_ptr = (struct part_record *)
array_task_id = job_queue_rec->array_task_id;
job_ptr = job_queue_rec->job_ptr;
part_ptr = job_queue_rec->part_ptr;
job_ptr->priority = job_queue_rec->priority;
xfree(job_queue_rec);
if (!avail_front_end(job_ptr)) {
job_ptr->state_reason = WAIT_FRONT_END;
Expand Down Expand Up @@ -1896,6 +1905,9 @@ next_part: part_ptr = (struct part_record *)
fail_by_part = false;
}
}
if (fail_by_part && bf_min_prio_reserve &&
(job_ptr->priority < bf_min_prio_reserve))
fail_by_part = false;
if (fail_by_part) {
/* do not schedule more jobs in this partition or on
* nodes in this partition */
Expand Down

0 comments on commit 4556087

Please sign in to comment.