Skip to content

Commit 2cb65cb

Browse files
nateriniasanchez1987
authored and committed
Fatal if unable to load script or env when building a launch job msg.
If a file that slurmctld wrote disappears (job script or job environment), then the StateSaveLocation is in an unexpected state and Slurm should fatal out immediately before corrupting anything else. Give the user the option to explicitly ignore the error at startup, at the cost of losing the job. Bug 7783.
1 parent 2bf797d commit 2cb65cb

File tree

2 files changed

+27
-16
lines changed

2 files changed

+27
-16
lines changed

NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ documents those changes that are of interest to users and administrators.
3838
conflicts on development systems running multiple Slurm instances.
3939
-- Return a newly added ESLURM_DEFER error and set a job state reason to
4040
FAIL_DEFER for immediate alloc requests if defer in SchedulerParameters.
41+
-- Make slurmctld fatal if unable to load a script or a job environment when
42+
building the launch job message.
4143

4244
* Changes in Slurm 19.05.5
4345
==========================

src/slurmctld/job_scheduler.c

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2040,6 +2040,7 @@ static void _split_env(batch_job_launch_msg_t *launch_msg_ptr)
20402040
static batch_job_launch_msg_t *_build_launch_job_msg(job_record_t *job_ptr,
20412041
uint16_t protocol_version)
20422042
{
2043+
char *fail_why = NULL;
20432044
batch_job_launch_msg_t *launch_msg_ptr;
20442045

20452046
/* Initialization of data structures */
@@ -2054,18 +2055,10 @@ static batch_job_launch_msg_t *_build_launch_job_msg(job_record_t *job_ptr,
20542055
launch_msg_ptr->gid = job_ptr->group_id;
20552056

20562057
if (!(launch_msg_ptr->script_buf = get_job_script(job_ptr))) {
2057-
error("Can not find batch script, aborting batch %pJ",
2058-
job_ptr);
2059-
/* FIXME: This is a kludge, but this event indicates a missing
2060-
* batch script and should never happen. We are too deep into
2061-
* the job launch to gracefully clean up here. */
2062-
slurm_free_job_launch_msg(launch_msg_ptr);
2063-
job_complete(job_ptr->job_id, slurmctld_conf.slurm_user_id,
2064-
true, false, 0);
2065-
return NULL;
2058+
fail_why = "Unable to load job batch script";
2059+
goto job_failed;
20662060
}
20672061

2068-
20692062
launch_msg_ptr->ntasks = job_ptr->details->num_tasks;
20702063
launch_msg_ptr->alias_list = xstrdup(job_ptr->alias_list);
20712064
launch_msg_ptr->nodes = xstrdup(job_ptr->nodes);
@@ -2111,13 +2104,10 @@ static batch_job_launch_msg_t *_build_launch_job_msg(job_record_t *job_ptr,
21112104
launch_msg_ptr->environment = get_job_env(job_ptr,
21122105
&launch_msg_ptr->envc);
21132106
if (launch_msg_ptr->environment == NULL) {
2114-
error("%s: environment missing or corrupted aborting %pJ",
2115-
__func__, job_ptr);
2116-
slurm_free_job_launch_msg(launch_msg_ptr);
2117-
job_complete(job_ptr->job_id, slurmctld_conf.slurm_user_id,
2118-
false, true, 0);
2119-
return NULL;
2107+
fail_why = "Unable to load job environment";
2108+
goto job_failed;
21202109
}
2110+
21212111
_split_env(launch_msg_ptr);
21222112
launch_msg_ptr->job_mem = job_ptr->details->pn_min_memory;
21232113
launch_msg_ptr->num_cpu_groups = job_ptr->job_resrcs->cpu_array_cnt;
@@ -2146,7 +2136,26 @@ static batch_job_launch_msg_t *_build_launch_job_msg(job_record_t *job_ptr,
21462136
launch_msg_ptr->account = xstrdup(job_ptr->account);
21472137
launch_msg_ptr->resv_name = xstrdup(job_ptr->resv_name);
21482138

2139+
xassert(!fail_why);
21492140
return launch_msg_ptr;
2141+
2142+
job_failed:
2143+
/* fatal or kill the job as it can never be recovered */
2144+
if (!ignore_state_errors)
2145+
fatal("%s: %s for %pJ. Check file system serving StateSaveLocation as that directory may be missing or corrupted. Start with '-i' to ignore this error and kill the afflicted jobs.",
2146+
__func__, fail_why, job_ptr);
2147+
2148+
error("%s: %s for %pJ. %pJ will be killed due to system error.",
2149+
__func__, fail_why, job_ptr, job_ptr);
2150+
xfree(job_ptr->state_desc);
2151+
job_ptr->state_desc = xstrdup(fail_why);
2152+
job_ptr->state_reason_prev = job_ptr->state_reason;
2153+
job_ptr->state_reason = FAIL_SYSTEM;
2154+
slurm_free_job_launch_msg(launch_msg_ptr);
2155+
/* ignore the return as job is in an unknown state anyway */
2156+
job_complete(job_ptr->job_id, slurmctld_conf.slurm_user_id, false,
2157+
false, 1);
2158+
return NULL;
21502159
}
21512160

21522161
/* Validate the job is ready for launch

0 commit comments

Comments
 (0)