Skip to content

Commit

Permalink
Merge branch 'accel'
Browse files Browse the repository at this point in the history
  • Loading branch information
jette committed Jun 4, 2015
2 parents 9128814 + 598fb6f commit 1ae5ad6
Show file tree
Hide file tree
Showing 19 changed files with 649 additions and 36 deletions.
4 changes: 4 additions & 0 deletions NEWS
Expand Up @@ -14,6 +14,10 @@ documents those changes that are of interest to users and administrators.
that job will be assigned a slurmd shepherd daemon on the same host as used
to execute the command when possible rather than an slurmd daemon on an
arbitrary front-end node.
-- Add srun --accel-bind option to control how tasks are bound to GPUs and NIC
Generic RESources (GRES).
-- gres/nic plugin modified to set OMPI_MCA_btl_openib_if_include environment
variable based upon allocated devices (usable with OpenMPI and Mellanox).

* Changes in Slurm 15.08.0pre5
==============================
Expand Down
2 changes: 2 additions & 0 deletions RELEASE_NOTES
Expand Up @@ -164,6 +164,8 @@ COMMAND CHANGES (see man pages for details)
users to specify which behavior they want if a job dependency is not
satisfied.
-- Add scontrol options to view and modify layouts tables.
-- Add srun --accel-bind option to control how tasks are bound to GPUs and NIC
Generic RESources (GRES).

OTHER CHANGES
=============
Expand Down
19 changes: 19 additions & 0 deletions doc/man/man1/srun.1
Expand Up @@ -18,6 +18,25 @@ http://slurm.schedmd.com/cpu_management.html
.SH "OPTIONS"
.LP

.TP
\fB\-\-accel\-bind\fR=<\fIoptions\fR>
Control how tasks are bound to generic resources of type gpu, mic and nic.
Multiple options may be specified. Supported options include:
.RS
.TP
\fBg\fR
Bind each task to GPUs which are closest to the allocated CPUs.
.TP
\fBm\fR
Bind each task to MICs which are closest to the allocated CPUs.
.TP
\fBn\fR
Bind each task to NICs which are closest to the allocated CPUs.
.TP
\fBv\fR
Verbose mode. Log how tasks are bound to GPU and NIC devices.
.RE

.TP
\fB\-A\fR, \fB\-\-account\fR=<\fIaccount\fR>
Charge resources used by this job to specified account.
Expand Down
8 changes: 8 additions & 0 deletions slurm/slurm.h.in
Expand Up @@ -665,6 +665,13 @@ typedef enum mem_bind_type { /* memory binding type from --mem_bind=... */
MEM_BIND_LOCAL = 0x20 /* =local */
} mem_bind_type_t;

/*
 * Accelerator binding flags from --accel-bind=...
 * Every value is a distinct bit, so several options can be ORed together and
 * stored in one accel_bind_type field.
 */
typedef enum accel_bind_type {
	ACCEL_BIND_VERBOSE     = 0x01,	/* 'v' verbose */
	ACCEL_BIND_CLOSEST_GPU = 0x02,	/* 'g' Use closest GPU to the CPU */
	ACCEL_BIND_CLOSEST_MIC = 0x04,	/* 'm' Use closest MIC to the CPU */
	ACCEL_BIND_CLOSEST_NIC = 0x08	/* 'n' Use closest NIC to the CPU */
} accel_bind_type_t;

/* The last entry in node_states must be STATE_END, keep in sync with
* node_state_string. values may be ORed with NODE_STATE_FLAGS below.
* Node states typically alternate between NODE_STATE_IDLE and
Expand Down Expand Up @@ -1650,6 +1657,7 @@ typedef struct {
uint32_t cpu_freq_gov; /* cpu frequency governor */
uint16_t mem_bind_type; /* use mem_bind_type_t */
char *mem_bind;
uint16_t accel_bind_type; /* --accel-bind= */

uint16_t max_sockets;
uint16_t max_cores;
Expand Down
2 changes: 2 additions & 0 deletions src/api/step_launch.c
Expand Up @@ -265,6 +265,7 @@ int slurm_step_launch (slurm_step_ctx_t *ctx,
launch.cpu_freq_gov = params->cpu_freq_gov;
launch.mem_bind_type = params->mem_bind_type;
launch.mem_bind = params->mem_bind;
launch.accel_bind_type = params->accel_bind_type;
launch.multi_prog = params->multi_prog ? 1 : 0;
launch.cpus_per_task = params->cpus_per_task;
launch.task_dist = params->task_dist;
Expand Down Expand Up @@ -444,6 +445,7 @@ int slurm_step_launch_add (slurm_step_ctx_t *ctx,
launch.cpu_freq_gov = params->cpu_freq_gov;
launch.mem_bind_type = params->mem_bind_type;
launch.mem_bind = params->mem_bind;
launch.accel_bind_type = params->accel_bind_type;
launch.multi_prog = params->multi_prog ? 1 : 0;
launch.cpus_per_task = params->cpus_per_task;
launch.task_dist = params->task_dist;
Expand Down
156 changes: 145 additions & 11 deletions src/common/gres.c
Expand Up @@ -37,6 +37,7 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#define _GNU_SOURCE
#if HAVE_CONFIG_H
# include "config.h"
# if STDC_HEADERS
Expand Down Expand Up @@ -66,12 +67,14 @@
# include <string.h>
#endif /* HAVE_CONFIG_H */

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"
#include "src/common/bitstring.h"
#include "src/common/gres.h"
#include "src/common/list.h"
#include "src/common/log.h"
Expand All @@ -95,6 +98,9 @@ typedef struct slurm_gres_ops {
void *gres_ptr );
void (*step_set_env) ( char ***job_env_ptr,
void *gres_ptr );
void (*step_reset_env) ( char ***job_env_ptr,
void *gres_ptr,
bitstr_t *usable_gres );
void (*send_stepd) ( int fd );
void (*recv_stepd) ( int fd );
int (*job_info) ( gres_job_state_t *job_gres_data,
Expand Down Expand Up @@ -325,6 +331,7 @@ static int _load_gres_plugin(char *plugin_name,
"node_config_load",
"job_set_env",
"step_set_env",
"step_reset_env",
"send_stepd",
"recv_stepd",
"job_info",
Expand Down Expand Up @@ -624,6 +631,7 @@ static void _destroy_gres_slurmd_conf(void *x)

xassert(p);
xfree(p->cpus);
FREE_NULL_BITMAP(p->cpus_bitmap);
xfree(p->file); /* Only used by slurmd */
xfree(p->name);
xfree(p->type);
Expand Down Expand Up @@ -775,15 +783,13 @@ static int _parse_gres_config(void **dest, slurm_parser_enum_t type,

p->cpu_cnt = gres_cpu_cnt;
if (s_p_get_string(&p->cpus, "CPUs", tbl)) {
bitstr_t *cpu_bitmap; /* Just use to validate config */
cpu_bitmap = bit_alloc(gres_cpu_cnt);
i = bit_unfmt(cpu_bitmap, p->cpus);
p->cpus_bitmap = bit_alloc(gres_cpu_cnt);
i = bit_unfmt(p->cpus_bitmap, p->cpus);
if (i != 0) {
fatal("Invalid gres data for %s, CPUs=%s (only %u CPUs"
" are available)",
p->name, p->cpus, gres_cpu_cnt);
}
FREE_NULL_BITMAP(cpu_bitmap);
}

if (s_p_get_string(&p->file, "File", tbl)) {
Expand Down Expand Up @@ -5428,42 +5434,170 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
return SLURM_ERROR;
}

/*
 * Return the count of GRES of a specific name allocated to a job step on
 * this machine
 * IN step_gres_list - generated by gres_plugin_step_allocate(), may be NULL
 * IN gres_name - name of the GRES to match, may be NULL
 * RET sum of gres_cnt_alloc over the step's records for this GRES name, or
 *     NO_VAL64 if either argument is NULL, the name matches no loaded GRES
 *     context, or the step has no record for that GRES
 */
extern uint64_t gres_plugin_step_count(List step_gres_list, char *gres_name)
{
	uint64_t gres_cnt = NO_VAL64;
	gres_state_t *gres_ptr = NULL;
	gres_step_state_t *gres_step_ptr = NULL;
	ListIterator gres_iter;
	int i;

	/* Nothing to count. gres_plugin_step_set_env() also treats the step
	 * list as optional, so do not build an iterator on a NULL list or
	 * hand a NULL name to strcmp(). */
	if (!step_gres_list || !gres_name)
		return gres_cnt;

	slurm_mutex_lock(&gres_context_lock);
	for (i = 0; i < gres_context_cnt; i++) {
		if (strcmp(gres_context[i].gres_name, gres_name))
			continue;
		gres_iter = list_iterator_create(step_gres_list);
		while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
			if (gres_ptr->plugin_id != gres_context[i].plugin_id)
				continue;
			gres_step_ptr = (gres_step_state_t *)
					gres_ptr->gres_data;
			/* The first match replaces the NO_VAL64 sentinel,
			 * later matches are accumulated */
			if (gres_cnt == NO_VAL64)
				gres_cnt = gres_step_ptr->gres_cnt_alloc;
			else
				gres_cnt += gres_step_ptr->gres_cnt_alloc;
		}
		list_iterator_destroy(gres_iter);
		break;	/* Only the first context with this name is used */
	}
	slurm_mutex_unlock(&gres_context_lock);

	return gres_cnt;
}

/* Given a GRES context index, return a bitmap representing those GRES
* which are available from the CPUs current allocated to this process */
static bitstr_t * _get_usable_gres(int context_inx)
{
cpu_set_t mask;
bitstr_t *usable_gres = NULL;
int i, i_last, rc;
ListIterator iter;
gres_slurmd_conf_t *gres_slurmd_conf;
int gres_inx = 0;


CPU_ZERO(&mask);
#ifdef SCHED_GETAFFINITY_THREE_ARGS
rc = sched_getaffinity(0, sizeof(mask), &mask);
#else
rc = sched_getaffinity(0, &mask);
#endif
if (rc) {
error("sched_getaffinity error: %m");
return usable_gres;
}

usable_gres = bit_alloc(MAX_GRES_BITMAP);
iter = list_iterator_create(gres_conf_list);
while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
if (gres_slurmd_conf->plugin_id !=
gres_context[context_inx].plugin_id)
continue;
if (gres_inx + gres_slurmd_conf->count >= MAX_GRES_BITMAP) {
error("GRES %s bitmap overflow",gres_slurmd_conf->name);
continue;
}
if (!gres_slurmd_conf->cpus_bitmap) {
bit_nset(usable_gres, gres_inx,
gres_inx + gres_slurmd_conf->count - 1);
} else {
i_last = bit_fls(gres_slurmd_conf->cpus_bitmap);
for (i = 0; i <= i_last; i++) {
if (!bit_test(gres_slurmd_conf->cpus_bitmap,i))
continue;
if (!CPU_ISSET(i, &mask))
continue;
bit_nset(usable_gres, gres_inx,
gres_inx + gres_slurmd_conf->count-1);
break;
}
}
gres_inx += gres_slurmd_conf->count;
}
list_iterator_destroy(iter);

return usable_gres;
}

/*
 * Set environment variables as required for all tasks of a job step
 * IN/OUT job_env_ptr - environment variable array
 * IN step_gres_list - generated by gres_plugin_step_allocate(), may be NULL
 * IN accel_bind_type - GRES binding options (ACCEL_BIND_* flags)
 *
 * With accel_bind_type == 0 every plugin's step_set_env() is called.
 * With any flag set the plugin's step_reset_env() is called instead. If one
 * of the "closest" flags is set, only the gpu/mic/nic plugins whose flag is
 * set are processed, and each receives a bitmap of the devices usable from
 * the CPUs this process is bound to.
 */
extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list,
				     uint16_t accel_bind_type)
{
	int i;
	ListIterator gres_iter;
	gres_state_t *gres_ptr = NULL;
	bool bind_gpu = accel_bind_type & ACCEL_BIND_CLOSEST_GPU;
	bool bind_nic = accel_bind_type & ACCEL_BIND_CLOSEST_NIC;
	bool bind_mic = accel_bind_type & ACCEL_BIND_CLOSEST_MIC;
	bitstr_t *usable_gres = NULL;

	(void) gres_plugin_init();

	slurm_mutex_lock(&gres_context_lock);
	for (i = 0; i < gres_context_cnt; i++) {
		if (gres_context[i].ops.step_set_env == NULL)
			continue;	/* No plugin to call */
		if (accel_bind_type &&
		    (gres_context[i].ops.step_reset_env == NULL))
			continue;	/* No plugin to call */
		if (bind_gpu || bind_mic || bind_nic) {
			/* Only the GRES types selected for binding are
			 * handled; every other plugin is skipped */
			if (!strcmp(gres_context[i].gres_name, "gpu")) {
				if (!bind_gpu)
					continue;
			} else if (!strcmp(gres_context[i].gres_name, "mic")) {
				if (!bind_mic)
					continue;
			} else if (!strcmp(gres_context[i].gres_name, "nic")) {
				if (!bind_nic)
					continue;
			} else {
				continue;
			}
			usable_gres = _get_usable_gres(i);
		}
		gres_ptr = NULL;
		if (step_gres_list) {
			gres_iter = list_iterator_create(step_gres_list);
			while ((gres_ptr = (gres_state_t *)
					   list_next(gres_iter))) {
				if (gres_ptr->plugin_id !=
				    gres_context[i].plugin_id)
					continue;
				if (accel_bind_type) {
					(*(gres_context[i].ops.step_reset_env))
						(job_env_ptr,
						 gres_ptr->gres_data,
						 usable_gres);
				} else {
					(*(gres_context[i].ops.step_set_env))
						(job_env_ptr,
						 gres_ptr->gres_data);
				}
				break;
			}
			list_iterator_destroy(gres_iter);
		}
		if (gres_ptr == NULL) { /* No data found */
			if (accel_bind_type) {
				(*(gres_context[i].ops.step_reset_env))
					(job_env_ptr, NULL, NULL);
			} else {
				(*(gres_context[i].ops.step_set_env))
					(job_env_ptr, NULL);
			}
		}
		/* Released every iteration, so it is always NULL when the
		 * next plugin is examined and after the loop ends */
		FREE_NULL_BITMAP(usable_gres);
	}
	slurm_mutex_unlock(&gres_context_lock);
}

static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id,
Expand Down
16 changes: 13 additions & 3 deletions src/common/gres.h
Expand Up @@ -61,6 +61,7 @@ typedef struct gres_slurmd_conf {
/* Specific CPUs associated with this configuration record */
uint16_t cpu_cnt;
char *cpus;
bitstr_t *cpus_bitmap; /* Using LOCAL mapping */

/* Device file associated with this configuration record */
char *file;
Expand Down Expand Up @@ -612,12 +613,21 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
uint32_t job_id, uint32_t step_id,
uint16_t protocol_version);

/* Return the count of GRES of a specific name allocated to a job step on
 * this machine
 * IN step_gres_list - generated by gres_plugin_step_allocate()
 * IN gres_name - name of the GRES to match
 * RET sum of the allocated counts in the step's records for this GRES name,
 *     or NO_VAL64 if no matching record is found
 */
extern uint64_t gres_plugin_step_count(List step_gres_list, char *gres_name);

/*
 * Set environment variables as required for all tasks of a job step
 * IN/OUT job_env_ptr - environment variable array
 * IN step_gres_list - generated by gres_plugin_step_allocate()
 * IN accel_bind_type - GRES binding options (ACCEL_BIND_* flags)
 */
extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list,
				     uint16_t accel_bind_type);

/*
* Log a step's current gres state
Expand Down
1 change: 1 addition & 0 deletions src/common/slurm_protocol_defs.h
Expand Up @@ -738,6 +738,7 @@ typedef struct launch_tasks_request_msg {
char *cpu_bind; /* binding map for map/mask_cpu */
uint16_t mem_bind_type; /* --mem_bind= */
char *mem_bind; /* binding map for tasks to memory */
uint16_t accel_bind_type; /* --accel-bind= */
uint16_t num_resp_port;
uint16_t *resp_port; /* array of available response ports */

Expand Down
2 changes: 2 additions & 0 deletions src/common/slurm_protocol_pack.c
Expand Up @@ -9169,6 +9169,7 @@ _pack_launch_tasks_request_msg(launch_tasks_request_msg_t * msg, Buf buffer,
pack16(msg->task_dist, buffer);
pack16(msg->node_cpus, buffer);
pack16(msg->job_core_spec, buffer);
pack16(msg->accel_bind_type, buffer);

slurm_cred_pack(msg->cred, buffer, protocol_version);
for (i = 0; i < msg->nnodes; i++) {
Expand Down Expand Up @@ -9345,6 +9346,7 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t **
safe_unpack16(&msg->task_dist, buffer);
safe_unpack16(&msg->node_cpus, buffer);
safe_unpack16(&msg->job_core_spec, buffer);
safe_unpack16(&msg->accel_bind_type, buffer);

if (!(msg->cred = slurm_cred_unpack(buffer, protocol_version)))
goto unpack_error;
Expand Down

0 comments on commit 1ae5ad6

Please sign in to comment.