diff --git a/NEWS b/NEWS index 641e03f0cbf..c84414fd7bb 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,10 @@ documents those changes that are of interest to users and administrators. that job will be assigned a slurmd shepherd daemon on the same host as used to execute the command when possible rather than an slurmd daemon on an arbitrary front-end node. + -- Add srun --accel-bind option to control how tasks are bound to GPUs and NIC + Generic RESources (GRES). + -- gres/nic plugin modified to set OMPI_MCA_btl_openib_if_include environment + variable based upon allocated devices (usable with OpenMPI and Mellanox). * Changes in Slurm 15.08.0pre5 ============================== diff --git a/RELEASE_NOTES b/RELEASE_NOTES index edcdf4994b8..9a5aae02ca2 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -164,6 +164,8 @@ COMMAND CHANGES (see man pages for details) users to specify which behavior they want if a job dependency is not satisfied. -- Add scontrol options to view and modify layouts tables. + -- Add srun --accel-bind option to control how tasks are bound to GPUs and NIC + Generic RESources (GRES). OTHER CHANGES ============= diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 9dde513060c..8512b2b4018 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -18,6 +18,25 @@ http://slurm.schedmd.com/cpu_management.html .SH "OPTIONS" .LP +.TP +\fB\-\-accel\-bind\fR=<\fIoptions\fR> +Control how tasks are bound to generic resources of type gpu, mic and nic. +Multiple options may be specified. Supported options include: +.RS +.TP +\fBg\fR +Bind each task to GPUs which are closest to the allocated CPUs. +.TP +\fBm\fR +Bind each task to MICs which are closest to the allocated CPUs. +.TP +\fBn\fR +Bind each task to NICs which are closest to the allocated CPUs. +.TP +\fBv\fR +Verbose mode. Log how tasks are bound to GPU and NIC devices. +.RE + .TP \fB\-A\fR, \fB\-\-account\fR=<\fIaccount\fR> Charge resources used by this job to specified account. 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index a44d3465b4b..e24a6e8d908 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -665,6 +665,13 @@ typedef enum mem_bind_type { /* memory binding type from --mem_bind=... */ MEM_BIND_LOCAL = 0x20 /* =local */ } mem_bind_type_t; +typedef enum accel_bind_type { /* accelerator binding from --accel-bind= */ + ACCEL_BIND_VERBOSE = 0x01, /* 'v' verbose */ + ACCEL_BIND_CLOSEST_GPU = 0x02, /* 'g' Use closest GPU to the CPU */ + ACCEL_BIND_CLOSEST_MIC = 0x04, /* 'm' Use closest MIC to CPU */ + ACCEL_BIND_CLOSEST_NIC = 0x08, /* 'n' Use closest NIC to CPU */ +} accel_bind_type_t; + /* The last entry in node_states must be STATE_END, keep in sync with * node_state_string. values may be ORed with NODE_STATE_FLAGS below. * Node states typically alternate between NODE_STATE_IDLE and @@ -1650,6 +1657,7 @@ typedef struct { uint32_t cpu_freq_gov; /* cpu frequency governor */ uint16_t mem_bind_type; /* use mem_bind_type_t */ char *mem_bind; + uint16_t accel_bind_type; /* --accel-bind= */ uint16_t max_sockets; uint16_t max_cores; diff --git a/src/api/step_launch.c b/src/api/step_launch.c index 3871998763e..b312b365929 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -265,6 +265,7 @@ int slurm_step_launch (slurm_step_ctx_t *ctx, launch.cpu_freq_gov = params->cpu_freq_gov; launch.mem_bind_type = params->mem_bind_type; launch.mem_bind = params->mem_bind; + launch.accel_bind_type = params->accel_bind_type; launch.multi_prog = params->multi_prog ? 1 : 0; launch.cpus_per_task = params->cpus_per_task; launch.task_dist = params->task_dist; @@ -444,6 +445,7 @@ int slurm_step_launch_add (slurm_step_ctx_t *ctx, launch.cpu_freq_gov = params->cpu_freq_gov; launch.mem_bind_type = params->mem_bind_type; launch.mem_bind = params->mem_bind; + launch.accel_bind_type = params->accel_bind_type; launch.multi_prog = params->multi_prog ? 
1 : 0; launch.cpus_per_task = params->cpus_per_task; launch.task_dist = params->task_dist; diff --git a/src/common/gres.c b/src/common/gres.c index f39ecb713ed..033a8de44f6 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -37,6 +37,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ +#define _GNU_SOURCE #if HAVE_CONFIG_H # include "config.h" # if STDC_HEADERS @@ -66,12 +67,14 @@ # include #endif /* HAVE_CONFIG_H */ +#include #include #include #include #include "slurm/slurm.h" #include "slurm/slurm_errno.h" +#include "src/common/bitstring.h" #include "src/common/gres.h" #include "src/common/list.h" #include "src/common/log.h" @@ -95,6 +98,9 @@ typedef struct slurm_gres_ops { void *gres_ptr ); void (*step_set_env) ( char ***job_env_ptr, void *gres_ptr ); + void (*step_reset_env) ( char ***job_env_ptr, + void *gres_ptr, + bitstr_t *usable_gres ); void (*send_stepd) ( int fd ); void (*recv_stepd) ( int fd ); int (*job_info) ( gres_job_state_t *job_gres_data, @@ -325,6 +331,7 @@ static int _load_gres_plugin(char *plugin_name, "node_config_load", "job_set_env", "step_set_env", + "step_reset_env", "send_stepd", "recv_stepd", "job_info", @@ -624,6 +631,7 @@ static void _destroy_gres_slurmd_conf(void *x) xassert(p); xfree(p->cpus); + FREE_NULL_BITMAP(p->cpus_bitmap); xfree(p->file); /* Only used by slurmd */ xfree(p->name); xfree(p->type); @@ -775,15 +783,13 @@ static int _parse_gres_config(void **dest, slurm_parser_enum_t type, p->cpu_cnt = gres_cpu_cnt; if (s_p_get_string(&p->cpus, "CPUs", tbl)) { - bitstr_t *cpu_bitmap; /* Just use to validate config */ - cpu_bitmap = bit_alloc(gres_cpu_cnt); - i = bit_unfmt(cpu_bitmap, p->cpus); + p->cpus_bitmap = bit_alloc(gres_cpu_cnt); + i = bit_unfmt(p->cpus_bitmap, p->cpus); if (i != 0) { fatal("Invalid gres data for %s, CPUs=%s (only %u CPUs" " are available)", p->name, p->cpus, gres_cpu_cnt); } - FREE_NULL_BITMAP(cpu_bitmap); } 
if (s_p_get_string(&p->file, "File", tbl)) { @@ -5428,23 +5434,136 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer, return SLURM_ERROR; } +/* Return the count of GRES of a specific name on this machine + * IN step_gres_list - generated by gres_plugin_step_allocate() + * IN gres_name - name of the GRES to match + * RET count of GRES of this specific name available to the job or NO_VAL64 + */ +extern uint64_t gres_plugin_step_count(List step_gres_list, char *gres_name) +{ + uint64_t gres_cnt = NO_VAL64; + gres_state_t *gres_ptr = NULL; + gres_step_state_t *gres_step_ptr = NULL; + ListIterator gres_iter; + int i; + + slurm_mutex_lock(&gres_context_lock); + for (i = 0; i < gres_context_cnt; i++) { + if (strcmp(gres_context[i].gres_name, gres_name)) + continue; + gres_iter = list_iterator_create(step_gres_list); + while ((gres_ptr = (gres_state_t *)list_next(gres_iter))) { + if (gres_ptr->plugin_id != gres_context[i].plugin_id) + continue; + gres_step_ptr = (gres_step_state_t*)gres_ptr->gres_data; + if (gres_cnt == NO_VAL64) + gres_cnt = gres_step_ptr->gres_cnt_alloc; + else + gres_cnt += gres_step_ptr->gres_cnt_alloc; + } + list_iterator_destroy(gres_iter); + break; + } + slurm_mutex_unlock(&gres_context_lock); + + return gres_cnt; +} + +/* Given a GRES context index, return a bitmap representing those GRES + * which are available from the CPUs current allocated to this process */ +static bitstr_t * _get_usable_gres(int context_inx) +{ + cpu_set_t mask; + bitstr_t *usable_gres = NULL; + int i, i_last, rc; + ListIterator iter; + gres_slurmd_conf_t *gres_slurmd_conf; + int gres_inx = 0; + + + CPU_ZERO(&mask); +#ifdef SCHED_GETAFFINITY_THREE_ARGS + rc = sched_getaffinity(0, sizeof(mask), &mask); +#else + rc = sched_getaffinity(0, &mask); +#endif + if (rc) { + error("sched_getaffinity error: %m"); + return usable_gres; + } + + usable_gres = bit_alloc(MAX_GRES_BITMAP); + iter = list_iterator_create(gres_conf_list); + while ((gres_slurmd_conf = 
(gres_slurmd_conf_t *) list_next(iter))) { + if (gres_slurmd_conf->plugin_id != + gres_context[context_inx].plugin_id) + continue; + if (gres_inx + gres_slurmd_conf->count >= MAX_GRES_BITMAP) { + error("GRES %s bitmap overflow",gres_slurmd_conf->name); + continue; + } + if (!gres_slurmd_conf->cpus_bitmap) { + bit_nset(usable_gres, gres_inx, + gres_inx + gres_slurmd_conf->count - 1); + } else { + i_last = bit_fls(gres_slurmd_conf->cpus_bitmap); + for (i = 0; i <= i_last; i++) { + if (!bit_test(gres_slurmd_conf->cpus_bitmap,i)) + continue; + if (!CPU_ISSET(i, &mask)) + continue; + bit_nset(usable_gres, gres_inx, + gres_inx + gres_slurmd_conf->count-1); + break; + } + } + gres_inx += gres_slurmd_conf->count; + } + list_iterator_destroy(iter); + + return usable_gres; +} + /* * Set environment variables as required for all tasks of a job step * IN/OUT job_env_ptr - environment variable array - * IN gres_list - generated by gres_plugin_step_allocate() - */ -extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list) + * IN step_gres_list - generated by gres_plugin_step_allocate() + * IN accel_bind_type - GRES binding options + */ +extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list, + uint16_t accel_bind_type) { int i; ListIterator gres_iter; gres_state_t *gres_ptr = NULL; + bool bind_gpu = accel_bind_type & ACCEL_BIND_CLOSEST_GPU; + bool bind_nic = accel_bind_type & ACCEL_BIND_CLOSEST_NIC; + bool bind_mic = accel_bind_type & ACCEL_BIND_CLOSEST_MIC; + bitstr_t *usable_gres = NULL; (void) gres_plugin_init(); slurm_mutex_lock(&gres_context_lock); - for (i=0; iplugin_id != gres_context[i].plugin_id) continue; - (*(gres_context[i].ops.step_set_env)) - (job_env_ptr, gres_ptr->gres_data); + if (accel_bind_type) { + (*(gres_context[i].ops.step_reset_env)) + (job_env_ptr, + gres_ptr->gres_data, + usable_gres); + } else { + (*(gres_context[i].ops.step_set_env)) + (job_env_ptr, + gres_ptr->gres_data); + } break; } 
list_iterator_destroy(gres_iter); } if (gres_ptr == NULL) { /* No data found */ - (*(gres_context[i].ops.step_set_env)) + if (accel_bind_type) { + (*(gres_context[i].ops.step_reset_env)) + (job_env_ptr, NULL, NULL); + } else { + (*(gres_context[i].ops.step_set_env)) (job_env_ptr, NULL); + } } + FREE_NULL_BITMAP(usable_gres); } slurm_mutex_unlock(&gres_context_lock); + FREE_NULL_BITMAP(usable_gres); } static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id, diff --git a/src/common/gres.h b/src/common/gres.h index 9a7ed2ce8d3..146ed7182ed 100644 --- a/src/common/gres.h +++ b/src/common/gres.h @@ -61,6 +61,7 @@ typedef struct gres_slurmd_conf { /* Specific CPUs associated with this configuration record */ uint16_t cpu_cnt; char *cpus; + bitstr_t *cpus_bitmap; /* Using LOCAL mapping */ /* Device file associated with this configuration record */ char *file; @@ -612,12 +613,21 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer, uint32_t job_id, uint32_t step_id, uint16_t protocol_version); +/* Return the count of GRES of a specific name on this machine + * IN step_gres_list - generated by gres_plugin_step_allocate() + * IN gres_name - name of the GRES to match + * RET count of GRES of this specific name available to the job or NO_VAL64 + */ +extern uint64_t gres_plugin_step_count(List step_gres_list, char *gres_name); + /* * Set environment variables as required for all tasks of a job step * IN/OUT job_env_ptr - environment variable array - * IN gres_list - generated by gres_plugin_step_allocate() - */ -extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list); + * IN step_gres_list - generated by gres_plugin_step_allocate() + * IN accel_bind_type - GRES binding options + */ +extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list, + uint16_t accel_bind_type); /* * Log a step's current gres state diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 
d7cf3e52b9d..87f68cabdc6 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -738,6 +738,7 @@ typedef struct launch_tasks_request_msg { char *cpu_bind; /* binding map for map/mask_cpu */ uint16_t mem_bind_type; /* --mem_bind= */ char *mem_bind; /* binding map for tasks to memory */ + uint16_t accel_bind_type; /* --accel-bind= */ uint16_t num_resp_port; uint16_t *resp_port; /* array of available response ports */ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 87becbc8287..f85b66298e3 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -9169,6 +9169,7 @@ _pack_launch_tasks_request_msg(launch_tasks_request_msg_t * msg, Buf buffer, pack16(msg->task_dist, buffer); pack16(msg->node_cpus, buffer); pack16(msg->job_core_spec, buffer); + pack16(msg->accel_bind_type, buffer); slurm_cred_pack(msg->cred, buffer, protocol_version); for (i = 0; i < msg->nnodes; i++) { @@ -9345,6 +9346,7 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t ** safe_unpack16(&msg->task_dist, buffer); safe_unpack16(&msg->node_cpus, buffer); safe_unpack16(&msg->job_core_spec, buffer); + safe_unpack16(&msg->accel_bind_type, buffer); if (!(msg->cred = slurm_cred_unpack(buffer, protocol_version))) goto unpack_error; diff --git a/src/plugins/gres/gpu/gres_gpu.c b/src/plugins/gres/gpu/gres_gpu.c index a08d9c02f3f..5afbf700628 100644 --- a/src/plugins/gres/gpu/gres_gpu.c +++ b/src/plugins/gres/gpu/gres_gpu.c @@ -36,6 +36,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
\*****************************************************************************/ +#define _GNU_SOURCE #if HAVE_CONFIG_H # include "config.h" # if STDC_HEADERS @@ -61,9 +62,10 @@ # include #endif /* HAVE_CONFIG_H */ +#include +#include #include #include -#include #include "slurm/slurm.h" #include "slurm/slurm_errno.h" @@ -124,6 +126,7 @@ extern int fini(void) return SLURM_SUCCESS; } + /* * We could load gres state or validate it using various mechanisms here. * This only validates that the configuration was specified in gres.conf. @@ -282,8 +285,7 @@ extern void job_set_env(char ***job_env_ptr, void *gres_ptr) } else if (gres_job_ptr && (gres_job_ptr->gres_cnt_alloc > 0)) { /* The gres.conf file must identify specific device files * in order to set the CUDA_VISIBLE_DEVICES env var */ - error("gres/gpu unable to set CUDA_VISIBLE_DEVICES, " - "no device files configured"); + debug("gres/gpu unable to set CUDA_VISIBLE_DEVICES, no device files configured"); } else { xstrcat(local_list, "NoDevFiles"); } @@ -343,9 +345,71 @@ extern void step_set_env(char ***job_env_ptr, void *gres_ptr) } if (dev_list) { - env_array_overwrite(job_env_ptr,"CUDA_VISIBLE_DEVICES", + env_array_overwrite(job_env_ptr, "CUDA_VISIBLE_DEVICES", dev_list); - env_array_overwrite(job_env_ptr,"GPU_DEVICE_ORDINAL", + env_array_overwrite(job_env_ptr, "GPU_DEVICE_ORDINAL", + dev_list); + xfree(dev_list); + } +} + +/* + * Reset environment variables as appropriate for a job (i.e. this one tasks) + * based upon the job step's GRES state and assigned CPUs. 
+ */ +extern void step_reset_env(char ***job_env_ptr, void *gres_ptr, + bitstr_t *usable_gres) +{ + int i, len, local_inx = 0, first_match = -1; + char *dev_list = NULL; + gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr; + bool use_local_dev_index = _use_local_device_index(); + + if ((gres_step_ptr != NULL) && + (gres_step_ptr->node_cnt == 1) && + (gres_step_ptr->gres_bit_alloc != NULL) && + (gres_step_ptr->gres_bit_alloc[0] != NULL) && + (usable_gres != NULL)) { + len = MIN(bit_size(gres_step_ptr->gres_bit_alloc[0]), + bit_size(usable_gres)); + for (i = 0; i < len; i++) { + if (!bit_test(gres_step_ptr->gres_bit_alloc[0], i)) + continue; + if (first_match == -1) + first_match = i; + if (!bit_test(usable_gres, i)) + continue; + if (!dev_list) + dev_list = xmalloc(128); + else + xstrcat(dev_list, ","); + if (use_local_dev_index) { + xstrfmtcat(dev_list, "%d", local_inx++); + } else if (gpu_devices && (i < nb_available_files) && + (gpu_devices[i] >= 0)) { + xstrfmtcat(dev_list, "%d", gpu_devices[i]); + } else { + xstrfmtcat(dev_list, "%d", i); + } + } + if (!dev_list && (first_match != -1)) { + i = first_match; + dev_list = xmalloc(128); + if (use_local_dev_index) { + xstrfmtcat(dev_list, "%d", local_inx++); + } else if (gpu_devices && (i < nb_available_files) && + (gpu_devices[i] >= 0)) { + xstrfmtcat(dev_list, "%d", gpu_devices[i]); + } else { + xstrfmtcat(dev_list, "%d", i); + } + } + } + + if (dev_list) { + env_array_overwrite(job_env_ptr, "CUDA_VISIBLE_DEVICES", + dev_list); + env_array_overwrite(job_env_ptr, "GPU_DEVICE_ORDINAL", dev_list); xfree(dev_list); } diff --git a/src/plugins/gres/mic/gres_mic.c b/src/plugins/gres/mic/gres_mic.c index 71ace5bdfe7..5c418e3ab43 100644 --- a/src/plugins/gres/mic/gres_mic.c +++ b/src/plugins/gres/mic/gres_mic.c @@ -252,6 +252,56 @@ extern void step_set_env(char ***job_env_ptr, void *gres_ptr) } } +/* + * Reset environment variables as appropriate for a job (i.e. 
this one tasks) + * based upon the job step's GRES state and assigned CPUs. + */ +extern void step_reset_env(char ***job_env_ptr, void *gres_ptr, + bitstr_t *usable_gres) +{ + int i, len, first_match = -1; + char *dev_list = NULL; + gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr; + + if ((gres_step_ptr != NULL) && + (gres_step_ptr->node_cnt == 1) && + (gres_step_ptr->gres_bit_alloc != NULL) && + (gres_step_ptr->gres_bit_alloc[0] != NULL) && + (usable_gres != NULL)) { + len = MIN(bit_size(gres_step_ptr->gres_bit_alloc[0]), + bit_size(usable_gres)); + for (i = 0; i < len; i++) { + if (!bit_test(gres_step_ptr->gres_bit_alloc[0], i)) + continue; + if (first_match == -1) + first_match = i; + if (!bit_test(usable_gres, i)) + continue; + if (!dev_list) + dev_list = xmalloc(128); + else + xstrcat(dev_list, ","); + if (mic_devices && (mic_devices[i] >= 0)) + xstrfmtcat(dev_list, "%d", mic_devices[i]); + else + xstrfmtcat(dev_list, "%d", i); + } + if (!dev_list && (first_match != -1)) { + i = first_match; + dev_list = xmalloc(128); + if (mic_devices && (mic_devices[i] >= 0)) + xstrfmtcat(dev_list, "%d", mic_devices[i]); + else + xstrfmtcat(dev_list, "%d", i); + } + } + if (dev_list) { + env_array_overwrite(job_env_ptr,"OFFLOAD_DEVICES", + dev_list); + xfree(dev_list); + } +} + /* Send GRES information to slurmstepd on the specified file descriptor */ extern void send_stepd(int fd) { diff --git a/src/plugins/gres/nic/gres_nic.c b/src/plugins/gres/nic/gres_nic.c index 484087ab064..1fa13a8b44a 100644 --- a/src/plugins/gres/nic/gres_nic.c +++ b/src/plugins/gres/nic/gres_nic.c @@ -61,6 +61,7 @@ # include #endif /* HAVE_CONFIG_H */ +#include #include #include @@ -69,8 +70,11 @@ #include "src/common/slurm_xlator.h" #include "src/common/bitstring.h" +#include "src/common/env.h" #include "src/common/gres.h" #include "src/common/list.h" +#include "src/common/xcgroup_read_config.c" +#include "src/common/xstring.h" /* * These variables are required by the generic 
plugin interface. If they @@ -103,6 +107,24 @@ const uint32_t plugin_version = SLURM_VERSION_NUMBER; static char gres_name[] = "nic"; +static int *nic_devices = NULL; +static int nb_available_files = 0; + +extern int init(void) +{ + debug("%s: %s loaded", __func__, plugin_name); + + return SLURM_SUCCESS; +} +extern int fini(void) +{ + debug("%s: unloading %s", __func__, plugin_name); + xfree(nic_devices); + nb_available_files = 0; + + return SLURM_SUCCESS; +} + /* * We could load gres state or validate it using various mechanisms here. * This only validates that the configuration was specified in gres.conf. @@ -110,31 +132,160 @@ static char gres_name[] = "nic"; */ extern int node_config_load(List gres_conf_list) { - int rc = SLURM_ERROR; + int i, rc = SLURM_SUCCESS; ListIterator iter; gres_slurmd_conf_t *gres_slurmd_conf; + int nb_nic = 0; /* Number of NICs in the list */ + int available_files_index = 0; xassert(gres_conf_list); iter = list_iterator_create(gres_conf_list); while ((gres_slurmd_conf = list_next(iter))) { - if (strcmp(gres_slurmd_conf->name, gres_name) == 0) { - rc = SLURM_SUCCESS; + if (strcmp(gres_slurmd_conf->name, gres_name)) + continue; + if (gres_slurmd_conf->file) + nb_nic++; + } + list_iterator_destroy(iter); + xfree(nic_devices); /* No-op if NULL */ + nb_available_files = -1; + /* (Re-)Allocate memory if number of files changed */ + if (nb_nic > nb_available_files) { + nic_devices = (int *) xmalloc(sizeof(int) * nb_nic); + nb_available_files = nb_nic; + for (i = 0; i < nb_available_files; i++) + nic_devices[i] = -1; + } + + iter = list_iterator_create(gres_conf_list); + while ((gres_slurmd_conf = list_next(iter))) { + if ((strcmp(gres_slurmd_conf->name, gres_name) == 0) && + gres_slurmd_conf->file) { + /* Populate nic_devices array with number + * at end of the file name */ + char *bracket, *fname, *tmp_name; + hostlist_t hl; + bracket = strrchr(gres_slurmd_conf->file, '['); + if (bracket) + tmp_name = xstrdup(bracket); + else + tmp_name = 
xstrdup(gres_slurmd_conf->file); + hl = hostlist_create(tmp_name); + xfree(tmp_name); + if (!hl) { + rc = EINVAL; + break; + } + while ((fname = hostlist_shift(hl))) { + if (available_files_index == + nb_available_files) { + nb_available_files++; + xrealloc(nic_devices, sizeof(int) * + nb_available_files); + nic_devices[available_files_index] = -1; + } + for (i = 0; fname[i]; i++) { + if (!isdigit(fname[i])) + continue; + nic_devices[available_files_index] = + atoi(fname + i); + break; + } + available_files_index++; + free(fname); + } + hostlist_destroy(hl); } } list_iterator_destroy(iter); if (rc != SLURM_SUCCESS) fatal("%s failed to load configuration", plugin_name); + + for (i = 0; i < nb_available_files; i++) + info("nic %d is device number %d", i, nic_devices[i]); + return rc; } +/* + * Test if OMPI_MCA_btl_openib_if_include should be set to global device ID or a + * device ID that always starts at zero (based upon what the application can see). + * RET true if TaskPlugin=task/cgroup AND ConstrainDevices=yes (in cgroup.conf). + */ +static bool _use_local_device_index(void) +{ + slurm_cgroup_conf_t slurm_cgroup_conf; + char *task_plugin = slurm_get_task_plugin(); + bool use_cgroup = false, use_local_index = false; + + if (!task_plugin) + return use_local_index; + + if (strstr(task_plugin, "cgroup")) + use_cgroup = true; + xfree(task_plugin); + if (!use_cgroup) + return use_local_index; + + /* Read and parse cgroup.conf */ + bzero(&slurm_cgroup_conf, sizeof(slurm_cgroup_conf_t)); + if (read_slurm_cgroup_conf(&slurm_cgroup_conf) != SLURM_SUCCESS) + return use_local_index; + if (slurm_cgroup_conf.constrain_devices) + use_local_index = true; + free_slurm_cgroup_conf(&slurm_cgroup_conf); + + return use_local_index; +} + /* * Set environment variables as appropriate for a job (i.e. all tasks) based * upon the job's GRES state. 
*/ extern void job_set_env(char ***job_env_ptr, void *gres_ptr) { - /* EMPTY */ + int i, len, local_inx = 0; + char *dev_list = NULL; + gres_job_state_t *gres_job_ptr = (gres_job_state_t *) gres_ptr; + bool use_local_dev_index = _use_local_device_index(); + + if ((gres_job_ptr != NULL) && + (gres_job_ptr->node_cnt == 1) && + (gres_job_ptr->gres_bit_alloc != NULL) && + (gres_job_ptr->gres_bit_alloc[0] != NULL)) { + len = bit_size(gres_job_ptr->gres_bit_alloc[0]); + for (i = 0; i < len; i++) { + if (!bit_test(gres_job_ptr->gres_bit_alloc[0], i)) + continue; + if (!dev_list) + dev_list = xmalloc(128); + else + xstrcat(dev_list, ","); + if (use_local_dev_index) { + xstrfmtcat(dev_list, "mlx4_%d", local_inx++); + } else if (nic_devices && (i < nb_available_files) && + (nic_devices[i] >= 0)) { + xstrfmtcat(dev_list, "mlx4_%d", nic_devices[i]); + } else { + xstrfmtcat(dev_list, "mlx4_%d", i); + } + } + } else if (gres_job_ptr && (gres_job_ptr->gres_cnt_alloc > 0)) { + /* The gres.conf file must identify specific device files + * in order to set the OMPI_MCA_btl_openib_if_include env var */ + debug("gres/nic unable to set OMPI_MCA_btl_openib_if_include, no device files configured"); + } else { + xstrcat(dev_list, "NoDevFiles"); + } + + if (dev_list) { + /* we assume mellanox cards and OpenMPI programm */ + env_array_overwrite(job_env_ptr, + "OMPI_MCA_btl_openib_if_include", + dev_list); + xfree(dev_list); + } } /* @@ -143,19 +294,135 @@ extern void job_set_env(char ***job_env_ptr, void *gres_ptr) */ extern void step_set_env(char ***job_env_ptr, void *gres_ptr) { - /* EMPTY */ + int i, len, local_inx = 0; + char *dev_list = NULL; + gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr; + bool use_local_dev_index = _use_local_device_index(); + + if ((gres_step_ptr != NULL) && + (gres_step_ptr->node_cnt == 1) && + (gres_step_ptr->gres_bit_alloc != NULL) && + (gres_step_ptr->gres_bit_alloc[0] != NULL)) { + len = bit_size(gres_step_ptr->gres_bit_alloc[0]); + for (i 
= 0; i < len; i++) { + if (!bit_test(gres_step_ptr->gres_bit_alloc[0], i)) + continue; + if (!dev_list) + dev_list = xmalloc(128); + else + xstrcat(dev_list, ","); + if (use_local_dev_index) { + xstrfmtcat(dev_list, "mlx4_%d", local_inx++); + } else if (nic_devices && (i < nb_available_files) && + (nic_devices[i] >= 0)) { + xstrfmtcat(dev_list, "mlx4_%d", nic_devices[i]); + } else { + xstrfmtcat(dev_list, "mlx4_%d", i); + } + } + } else if (gres_step_ptr && (gres_step_ptr->gres_cnt_alloc > 0)) { + /* The gres.conf file must identify specific device files + * in order to set the OMPI_MCA_btl_openib_if_include env var */ + error("gres/nic unable to set OMPI_MCA_btl_openib_if_include, " + "no device files configured"); + } else { + xstrcat(dev_list, "NoDevFiles"); + } + + if (dev_list) { + /* we assume mellanox cards and OpenMPI programm */ + env_array_overwrite(job_env_ptr, + "OMPI_MCA_btl_openib_if_include", + dev_list); + xfree(dev_list); + } +} + +/* + * Reset environment variables as appropriate for a job (i.e. this one tasks) + * based upon the job step's GRES state and assigned CPUs. 
+ */ +extern void step_reset_env(char ***job_env_ptr, void *gres_ptr, + bitstr_t *usable_gres) +{ + int i, len, first_match = -1; + char *dev_list = NULL; + gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr; + + if ((gres_step_ptr != NULL) && + (gres_step_ptr->node_cnt == 1) && + (gres_step_ptr->gres_bit_alloc != NULL) && + (gres_step_ptr->gres_bit_alloc[0] != NULL) && + (usable_gres != NULL)) { + len = MIN(bit_size(gres_step_ptr->gres_bit_alloc[0]), + bit_size(usable_gres)); + for (i = 0; i < len; i++) { + if (!bit_test(gres_step_ptr->gres_bit_alloc[0], i)) + continue; + if (first_match == -1) + first_match = i; + if (!bit_test(usable_gres, i)) + continue; + if (!dev_list) + dev_list = xmalloc(128); + else + xstrcat(dev_list, ","); + if (nic_devices && (i < nb_available_files) && + (nic_devices[i] >= 0)) { + xstrfmtcat(dev_list, "mlx4_%d", nic_devices[i]); + } else { + xstrfmtcat(dev_list, "mlx4_%d", i); + } + } + if (!dev_list && (first_match != -1)) { + i = first_match; + dev_list = xmalloc(128); + if (nic_devices && (i < nb_available_files) && + (nic_devices[i] >= 0)) { + xstrfmtcat(dev_list, "mlx4_%d", nic_devices[i]); + } else { + xstrfmtcat(dev_list, "mlx4_%d", i); + } + } + } + + if (dev_list) { + /* we assume mellanox cards and OpenMPI programm */ + env_array_overwrite(job_env_ptr, + "OMPI_MCA_btl_openib_if_include", + dev_list); + xfree(dev_list); + } } /* Send GRES information to slurmstepd on the specified file descriptor*/ extern void send_stepd(int fd) { - /* EMPTY */ + int i; + + safe_write(fd, &nb_available_files, sizeof(int)); + for (i = 0; i < nb_available_files; i++) + safe_write(fd, &nic_devices[i], sizeof(int)); + return; + +rwfail: error("gres_plugin_send_stepd failed"); } -/* Receive GRES information from slurmd on the specified file descriptor*/ +/* Receive GRES information from slurmd on the specified file descriptor */ extern void recv_stepd(int fd) { - /* EMPTY */ + int i; + + safe_read(fd, &nb_available_files, 
sizeof(int)); + if (nb_available_files > 0) { + xfree(nic_devices); /* No-op if NULL */ + nic_devices = xmalloc(sizeof(int) * nb_available_files); + } + for (i = 0; i < nb_available_files; i++) + safe_read(fd, &nic_devices[i], sizeof(int)); + return; + +rwfail: error("gres_plugin_recv_stepd failed"); } extern int job_info(gres_job_state_t *job_gres_data, uint32_t node_inx, diff --git a/src/plugins/launch/slurm/launch_slurm.c b/src/plugins/launch/slurm/launch_slurm.c index 545661f3d83..d1c1c83f397 100644 --- a/src/plugins/launch/slurm/launch_slurm.c +++ b/src/plugins/launch/slurm/launch_slurm.c @@ -552,6 +552,7 @@ extern int launch_p_step_launch( launch_params.cpu_bind_type = opt.cpu_bind_type; launch_params.mem_bind = opt.mem_bind; launch_params.mem_bind_type = opt.mem_bind_type; + launch_params.accel_bind_type = opt.accel_bind_type; launch_params.open_mode = opt.open_mode; if (opt.acctg_freq >= 0) launch_params.acctg_freq = opt.acctg_freq; diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 65bc20974eb..8e3b3159c52 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -90,6 +90,7 @@ #include "src/common/env.h" #include "src/common/fd.h" #include "src/common/forward.h" +#include "src/common/gres.h" #include "src/common/hostlist.h" #include "src/common/log.h" #include "src/common/mpi.h" @@ -251,6 +252,7 @@ mgr_launch_tasks_setup(launch_tasks_request_msg_t *msg, slurm_addr_t *cli, job->envtp->cli = cli; job->envtp->self = self; job->envtp->select_jobinfo = msg->select_jobinfo; + job->accel_bind_type = msg->accel_bind_type; return job; } @@ -1090,6 +1092,9 @@ job_manager(stepd_step_rec_t *job) if (job->stepid == SLURM_EXTERN_CONT) return _spawn_job_container(job); + if (!job->batch && job->accel_bind_type) + (void) gres_plugin_node_config_load(conf->cpus,conf->node_name); + #ifdef HAVE_ALPS_CRAY /* * Note that the previously called proctrack_g_create function is @@ -1153,6 +1158,23 @@ job_manager(stepd_step_rec_t *job) 
goto fail2; } + if (!job->batch && job->accel_bind_type && (job->node_tasks <= 1)) + job->accel_bind_type = 0; + if (!job->batch && job->accel_bind_type && (job->node_tasks > 1)) { + uint64_t gpu_cnt, mic_cnt, nic_cnt; + gpu_cnt = gres_plugin_step_count(job->step_gres_list, "gpu"); + mic_cnt = gres_plugin_step_count(job->step_gres_list, "mic"); + nic_cnt = gres_plugin_step_count(job->step_gres_list, "nic"); + if ((gpu_cnt <= 1) || (gpu_cnt == NO_VAL64)) + job->accel_bind_type &= (~ACCEL_BIND_CLOSEST_GPU); + if ((mic_cnt <= 1) || (mic_cnt == NO_VAL64)) + job->accel_bind_type &= (~ACCEL_BIND_CLOSEST_MIC); + if ((nic_cnt <= 1) || (nic_cnt == NO_VAL64)) + job->accel_bind_type &= (~ACCEL_BIND_CLOSEST_NIC); + if (job->accel_bind_type == ACCEL_BIND_VERBOSE) + job->accel_bind_type = 0; + } + /* Calls pam_setup() and requires pam_finish() if * successful. Only check for < 0 here since other slurm * error codes could come that are more descriptive. */ @@ -1349,7 +1371,7 @@ static void _exec_wait_info_destroy (struct exec_wait_info *e) xfree(e); } -static pid_t exec_wait_get_pid (struct exec_wait_info *e) +static pid_t _exec_wait_get_pid (struct exec_wait_info *e) { if (e == NULL) return (-1); @@ -1595,7 +1617,7 @@ _fork_all_tasks(stepd_step_rec_t *job, bool *io_initialized) exec_wait_kill_children (exec_wait_list); rc = SLURM_ERROR; goto fail4; - } else if ((pid = exec_wait_get_pid (ei)) == 0) { /* child */ + } else if ((pid = _exec_wait_get_pid (ei)) == 0) { /* child */ /* * Destroy exec_wait_list in the child. 
* Only exec_wait_info for previous tasks have been @@ -2665,7 +2687,7 @@ _run_script_as_user(const char *name, const char *path, stepd_step_rec_t *job, error ("executing %s: fork: %m", name); return -1; } - if ((cpid = exec_wait_get_pid (ei)) == 0) { + if ((cpid = _exec_wait_get_pid (ei)) == 0) { struct priv_state sprivs; char *argv[2]; diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c index de520255f8c..037d883de91 100644 --- a/src/slurmd/slurmstepd/slurmstepd.c +++ b/src/slurmd/slurmstepd/slurmstepd.c @@ -513,7 +513,7 @@ _step_setup(slurm_addr_t *cli, slurm_addr_t *self, slurm_msg_t *msg) { stepd_step_rec_t *job = NULL; - switch(msg->msg_type) { + switch (msg->msg_type) { case REQUEST_BATCH_JOB_LAUNCH: debug2("setup for a batch_job"); job = mgr_launch_batch_job_setup(msg->data, cli); @@ -545,7 +545,7 @@ _step_setup(slurm_addr_t *cli, slurm_addr_t *self, slurm_msg_t *msg) if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH) gres_plugin_job_set_env(&job->env, job->job_gres_list); else if (msg->msg_type == REQUEST_LAUNCH_TASKS) - gres_plugin_step_set_env(&job->env, job->step_gres_list); + gres_plugin_step_set_env(&job->env, job->step_gres_list, 0); /* * Add slurmd node topology informations to job env array diff --git a/src/slurmd/slurmstepd/slurmstepd_job.h b/src/slurmd/slurmstepd/slurmstepd_job.h index efd03775fac..cdf0698bb0b 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.h +++ b/src/slurmd/slurmstepd/slurmstepd_job.h @@ -153,6 +153,7 @@ typedef struct { char *cpu_bind; /* binding map for map/mask_cpu */ mem_bind_type_t mem_bind_type; /* --mem_bind= */ char *mem_bind; /* binding map for tasks to memory */ + uint16_t accel_bind_type; /* --accel_bind= */ uint32_t cpu_freq_min; /* Minimum cpu frequency */ uint32_t cpu_freq_max; /* Maximum cpu frequency */ uint32_t cpu_freq_gov; /* cpu frequency governor */ diff --git a/src/slurmd/slurmstepd/task.c b/src/slurmd/slurmstepd/task.c index 8ba32ec739b..4ed5cb91780 100644 --- 
a/src/slurmd/slurmstepd/task.c +++ b/src/slurmd/slurmstepd/task.c @@ -75,6 +75,7 @@ #include "src/common/checkpoint.h" #include "src/common/env.h" +#include "src/common/gres.h" #include "src/common/fd.h" #include "src/common/log.h" #include "src/common/mpi.h" @@ -442,8 +443,19 @@ exec_task(stepd_step_rec_t *job, int i) /* task plugin hook */ if (task_g_pre_launch(job)) { - error ("Failed task affinity setup"); - exit (1); + error("Failed task affinity setup"); + exit(1); + } + if (!job->batch && job->accel_bind_type) { + /* Modify copy of job's environment. Do not alter in place or + * concurrent searches of the environment can generate invalid + * memory references. */ + job->envtp->env = env_array_copy((const char **) job->env); + gres_plugin_step_set_env(&job->envtp->env, job->step_gres_list, + job->accel_bind_type); + tmp_env = job->env; + job->env = job->envtp->env; + env_array_free(tmp_env); } if (spank_user_task (job, i) < 0) { diff --git a/src/srun/libsrun/opt.c b/src/srun/libsrun/opt.c index 1d7f77afae0..443693170c6 100644 --- a/src/srun/libsrun/opt.c +++ b/src/srun/libsrun/opt.c @@ -205,6 +205,7 @@ #define LONG_OPT_PROFILE 0x157 #define LONG_OPT_EXPORT 0x158 #define LONG_OPT_PRIORITY 0x160 +#define LONG_OPT_ACCEL_BIND 0x161 extern char **environ; @@ -415,6 +416,7 @@ static void _opt_default(void) opt.cpu_bind = NULL; opt.mem_bind_type = 0; opt.mem_bind = NULL; + opt.accel_bind_type = 0; opt.core_spec = (uint16_t) NO_VAL; opt.core_spec_set = false; opt.time_limit = NO_VAL; @@ -896,6 +898,7 @@ static void _set_options(const int argc, char **argv) {"exclude", required_argument, 0, 'x'}, {"disable-status", no_argument, 0, 'X'}, {"no-allocate", no_argument, 0, 'Z'}, + {"accel-bind", required_argument, 0, LONG_OPT_ACCEL_BIND}, {"acctg-freq", required_argument, 0, LONG_OPT_ACCTG_FREQ}, {"bb", required_argument, 0, LONG_OPT_BURST_BUFFER}, {"begin", required_argument, 0, LONG_OPT_BEGIN}, @@ -994,8 +997,8 @@ static void _set_options(const int argc, char **argv) 
else error("opt.progname is already set."); optind = 0; - while((opt_char = getopt_long(argc, argv, opt_string, - optz, &option_index)) != -1) { + while ((opt_char = getopt_long(argc, argv, opt_string, + optz, &option_index)) != -1) { switch (opt_char) { case (int)'?': @@ -1674,6 +1677,16 @@ static void _set_options(const int argc, char **argv) opt.core_spec = _get_int(optarg, "thread_spec", true) | CORE_SPEC_THREAD; break; + case LONG_OPT_ACCEL_BIND: + if (strchr(optarg, 'v')) + opt.accel_bind_type |= ACCEL_BIND_VERBOSE; + if (strchr(optarg, 'g')) + opt.accel_bind_type |= ACCEL_BIND_CLOSEST_GPU; + if (strchr(optarg, 'm')) + opt.accel_bind_type |= ACCEL_BIND_CLOSEST_MIC; + if (strchr(optarg, 'n')) + opt.accel_bind_type |= ACCEL_BIND_CLOSEST_NIC; + break; default: if (spank_process_option (opt_char, optarg) < 0) { exit(error_exit); diff --git a/src/srun/libsrun/opt.h b/src/srun/libsrun/opt.h index da08707d169..a8885443216 100644 --- a/src/srun/libsrun/opt.h +++ b/src/srun/libsrun/opt.h @@ -99,6 +99,7 @@ typedef struct srun_options { char *cpu_bind; /* binding map for map/mask_cpu */ mem_bind_type_t mem_bind_type; /* --mem_bind= */ char *mem_bind; /* binding map for map/mask_mem */ + uint16_t accel_bind_type; /* --accel-bind= */ bool nodes_set; /* true if nodes explicitly set */ bool nodes_set_env; /* true if nodes set via SLURM_NNODES */ bool nodes_set_opt; /* true if nodes explicitly set using