Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base fork: SchedMD/slurm
...
head fork: SchedMD/slurm
  • 5 commits
  • 8 files changed
  • 0 commit comments
  • 1 contributor
View
8 NEWS
@@ -56,6 +56,14 @@ documents those changes that are of interest to users and admins.
for each Linux session ID.
-- Cray - For srun wrapper when creating a job allocation, set the default job
name to the executable file's name.
+ -- Add support for Cray ALPS 5.0.0
+ -- FRONTEND - If a front end unexpectedly reboots, kill all jobs but don't
+ mark the front end node down.
+ -- FRONTEND - don't down a front end node if you have an epilog error.
+ -- Cray - Fix so that when a frontend slurmd is started after the slurmctld
+ has already pinged it on startup, the not-responding flag is removed from
+ the frontend node.
+ -- Cray - Fix issue with smap not displaying the grid correctly.
* Changes in SLURM 2.3.3
========================
View
1  src/plugins/select/cray/basil_alps.h
@@ -56,6 +56,7 @@ enum basil_version {
BV_3_1, /* Basil 1.1 CLE 3.x (XE/Gemini support) */
BV_4_0, /* Basil 1.2 CLE 4.x unconfirmed simulator version */
BV_4_1, /* Basil 1.2 CLE 4.x unconfirmed simulator version */
+ BV_5_0, /* Basil 1.2 CLE 5.x unconfirmed simulator version */
BV_MAX
};
View
2  src/plugins/select/cray/libalps/do_query.c
@@ -75,6 +75,8 @@ extern enum basil_version get_basil_version(void)
if (_get_alps_engine(engine_version, sizeof(engine_version)) == NULL)
fatal("can not determine ALPS Engine version");
+ else if (strncmp(engine_version, "5.0.0", 5) == 0)
+ bv = BV_5_0;
else if (strncmp(engine_version, "4.1.0", 5) == 0)
bv = BV_4_1;
else if (strncmp(engine_version, "4.0", 3) == 0)
View
3  src/plugins/select/cray/libalps/parser_common.c
@@ -486,7 +486,8 @@ static const struct element_handler *basil_tables[BV_MAX] = {
[BV_1_2] = basil_1_1_elements, /* Basil 1.2 behaves like 1.1 */
[BV_3_1] = basil_3_1_elements,
[BV_4_0] = basil_4_0_elements,
- [BV_4_1] = basil_4_0_elements
+ [BV_4_1] = basil_4_0_elements,
+ [BV_5_0] = basil_4_0_elements
};
/**
View
6 src/plugins/select/cray/parser_common.h
@@ -20,7 +20,8 @@ const char *bv_names[BV_MAX] = { /* Basil Protocol version */
[BV_1_2] = "1.1",
[BV_3_1] = "1.1",
[BV_4_0] = "1.2",
- [BV_4_1] = "1.2"
+ [BV_4_1] = "1.2",
+ [BV_5_0] = "1.2"
};
const char *bv_names_long[BV_MAX] = { /* Actual version name */
@@ -29,7 +30,8 @@ const char *bv_names_long[BV_MAX] = { /* Actual version name */
[BV_1_2] = "1.2",
[BV_3_1] = "3.1",
[BV_4_0] = "4.0",
- [BV_4_1] = "4.1"
+ [BV_4_1] = "4.1",
+ [BV_5_0] = "5.0"
};
/* Basil methods */
View
15 src/slurmctld/job_mgr.c
@@ -8603,14 +8603,15 @@ extern bool job_epilog_complete(uint32_t job_id, char *node_name,
#ifdef HAVE_FRONT_END
xassert(job_ptr->batch_host);
- if (return_code) {
- error("Epilog error for job %u on %s, setting DOWN",
+ /* If there is a bad epilog error don't down the frontend
+ node. If needed (not on a bluegene) the nodes in use by
+ the job will be downed below.
+ */
+ if (return_code)
+ error("Epilog error for job %u on %s",
job_ptr->job_id, job_ptr->batch_host);
- if (job_ptr->front_end_ptr) {
- set_front_end_down(job_ptr->front_end_ptr,
- "Epilog error");
- }
- } else if (job_ptr->front_end_ptr && IS_JOB_COMPLETING(job_ptr)) {
+
+ if (job_ptr->front_end_ptr && IS_JOB_COMPLETING(job_ptr)) {
front_end_record_t *front_end_ptr = job_ptr->front_end_ptr;
if (front_end_ptr->job_cnt_comp)
front_end_ptr->job_cnt_comp--;
View
14 src/slurmctld/node_mgr.c
@@ -1797,12 +1797,11 @@ static front_end_record_t * _front_end_reg(
front_end_ptr->boot_time = now - reg_msg->up_time;
if (front_end_ptr->last_response &&
- (front_end_ptr->boot_time > front_end_ptr->last_response) &&
- (slurmctld_conf.ret2service != 2)) {
- set_front_end_down(front_end_ptr,
- "Front end unexpectedly rebooted");
- info("Front end %s unexpectedly rebooted",
+ (front_end_ptr->boot_time > front_end_ptr->last_response)) {
+ info("front end %s unexpectedly rebooted, "
+ "killing all previously running jobs running on it.",
reg_msg->node_name);
+ (void) kill_job_by_front_end_name(front_end_ptr->name);
reg_msg->job_count = 0;
}
@@ -1821,10 +1820,9 @@ static front_end_record_t * _front_end_reg(
}
if (state_base == NODE_STATE_UNKNOWN)
state_base = NODE_STATE_IDLE;
-#ifndef HAVE_CRAY
- /* This is handled by the select/cray plugin */
+
state_flags &= (~NODE_STATE_NO_RESPOND);
-#endif
+
front_end_ptr->node_state = state_base | state_flags;
last_front_end_update = now;
return front_end_ptr;
View
5 src/smap/smap.c
@@ -121,9 +121,8 @@ int main(int argc, char *argv[])
}
}
-#ifdef HAVE_BG
- bg_configure_ba_init(new_node_ptr, 0);
-#endif
+ select_g_ba_init(new_node_ptr, 0);
+
if (dim_size == NULL) {
dim_size = get_cluster_dims(new_node_ptr);
if ((dim_size == NULL) || (dim_size[0] < 1))

No commit comments for this range

Something went wrong with that request. Please try again.