Skip to content

Commit

Permalink
Fix handling of REQUEST_STEP_COMPLETE retries
Browse files Browse the repository at this point in the history
If child slurmstepd gets a timeout when sending a REQUEST_STEP_COMPLETE
to its parent it will retry it (_one_step_complete_msg()). Those
messages are handled on _handle_completion(), where it does the
jobacctinfo_aggregate().

This commit detects if the message was already processed to avoid errors on
the aggregated jobacctinfo.

The same thing can happen on the slurmctld side as well.

Bug 10723
  • Loading branch information
agilmor authored and gaijin03 committed May 7, 2021
1 parent cc8d000 commit c3325f6
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 9 deletions.
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ documents those changes that are of interest to users and administrators.
-- torque/qstat - fix printf error message in output.
-- When adding associations or wckeys avoid checking multiple times a user or
cluster name.
-- Fix wrong jobacctgather information on a step on multiple nodes
due to timeouts sending its the information gathered on its node.

* Changes in Slurm 20.11.6
==========================
Expand Down
41 changes: 38 additions & 3 deletions src/slurmctld/step_mgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -3448,6 +3448,9 @@ extern int step_partial_comp(step_complete_msg_t *req, uid_t uid, bool finish,
job_record_t *job_ptr;
step_record_t *step_ptr;
int nodes, rem_nodes;
#ifndef HAVE_FRONT_END
int range_bits, set_bits;
#endif

xassert(rem);

Expand Down Expand Up @@ -3492,9 +3495,6 @@ extern int step_partial_comp(step_complete_msg_t *req, uid_t uid, bool finish,
return EINVAL;
}

ext_sensors_g_get_stependdata(step_ptr);
jobacctinfo_aggregate(step_ptr->jobacct, req->jobacct);

/* we have been adding task average frequencies for
* jobacct->act_cpufreq so we need to divide with the
* total number of tasks/cpus for the step average frequency */
Expand Down Expand Up @@ -3525,10 +3525,45 @@ extern int step_partial_comp(step_complete_msg_t *req, uid_t uid, bool finish,
bit_set_all(step_ptr->exit_node_bitmap);
rem_nodes = 0;
#else
range_bits = req->range_last + 1 - req->range_first;
set_bits = bit_set_count_range(step_ptr->exit_node_bitmap,
req->range_first,
req->range_last + 1);

/* Check if any stepd of the range was already received */
if (set_bits) {
/* If all are already received skip jobacctinfo_aggregate */
if (set_bits == range_bits) {
debug("Step complete from %d to %d was already processed. Probably a RPC was resent from a child.",
req->range_first, req->range_last);
goto no_aggregate;
}

/*
* If partially received, we cannot recover the right gathered
* information. If we don't gather the new one we'll miss some
* information, and if we gather it some of the info will be
* duplicated. We log that error and chose to partially
* duplicate because it's probably a smaller error.
*/
error("Step complete from %d to %d was already processed (%d of %d). Probably a RPC was resent from a child and gathered information is partially duplicated.",
req->range_first, req->range_last,
set_bits, range_bits);
}

bit_nset(step_ptr->exit_node_bitmap,
req->range_first, req->range_last);

#endif

ext_sensors_g_get_stependdata(step_ptr);
jobacctinfo_aggregate(step_ptr->jobacct, req->jobacct);

#ifndef HAVE_FRONT_END
no_aggregate:
rem_nodes = bit_clear_count(step_ptr->exit_node_bitmap);
#endif

*rem = rem_nodes;
if (rem_nodes == 0) {
/* release all switch windows */
Expand Down
9 changes: 8 additions & 1 deletion src/slurmd/slurmstepd/mgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,14 @@ _one_step_complete_msg(stepd_step_rec_t *job, int first, int last)
if ((retcode == 0) && (rc == 0))
goto finished;
}
/* on error AGAIN, send to the slurmctld instead */
/*
* On error AGAIN, send to the slurmctld instead.
* This is useful if parent_rank gave up waiting for us
* on stepd_wait_for_children_slurmstepd.
* If it's just busy handeling our prev messages we'll need
* to handle duplicated messages in both the parent and
* slurmctld.
*/
debug3("Rank %d sending complete to slurmctld instead, range "
"%d to %d", step_complete.rank, first, last);
} else {
Expand Down
27 changes: 22 additions & 5 deletions src/slurmd/slurmstepd/req.c
Original file line number Diff line number Diff line change
Expand Up @@ -1669,17 +1669,34 @@ _handle_completion(int fd, stepd_step_rec_t *job, uid_t uid)
* without the hostlist from the credential.
*/
if (step_complete.bits && (step_complete.rank >= 0)) {
int32_t set_bits;
int32_t first_bit = first - (step_complete.rank + 1);
int32_t last_bit = last - (step_complete.rank + 1);
/* bit_set_count_range is [first, end) so +1 last_bit */
int32_t last_bit_range = last_bit + 1;

#if 0
char bits_string[128];
debug2("Setting range %d (bit %d) through %d(bit %d)",
first, first-(step_complete.rank+1),
last, last-(step_complete.rank+1));
first, first_bit,
last, last_bit);
bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
debug2(" before bits: %s", bits_string);
#endif
bit_nset(step_complete.bits,
first - (step_complete.rank+1),
last - (step_complete.rank+1));
if (!(set_bits = bit_set_count_range(step_complete.bits,
first_bit,
last_bit_range))) {
bit_nset(step_complete.bits, first_bit, last_bit);
} else if (set_bits == (last_bit_range - first_bit)) {
debug("Step complete from %d to %d was already processed on rank %d. Probably a RPC was resent from a child.",
first, last, step_complete.rank);
goto timeout;
} else {
error("Step complete from %d to %d was half-way processed on rank %d. This should never happen.",
first, last, step_complete.rank);
goto timeout;
}

#if 0
bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
debug2(" after bits: %s", bits_string);
Expand Down

0 comments on commit c3325f6

Please sign in to comment.