Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 89 additions & 22 deletions testsuite/fleettest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,11 @@
removed when the run ends -- on success or failure, and best-effort on
Ctrl-C/kill (pass --keep to retain it for inspection). A run that is hard-killed
(SIGKILL), or signalled mid-push, or whose ssh dies during cleanup can leave a
stray <builddir>-<id> behind; sweep those with `fleettest.py --cleanup`
(optionally scoped with --targets). Because each
stray <builddir>-<id> behind -- plus an orphaned path-flipper or test rsyncd on
platforms without a parent-death backstop; sweep all of those (root-owned files
included, via sudo -n) with `fleettest.py --cleanup` (optionally scoped with
--targets). Run --cleanup between runs, not during one: its process kills are
host-global and would also catch a concurrent run's flipper/daemon. Because each
run starts from a fresh dir, every build is a full configure + build.

PROVISIONING: each target must have the build toolchain its workflow's prepare
Expand Down Expand Up @@ -771,33 +774,94 @@ def _on_signal(signum, frame):
os._exit(130 if signum == signal.SIGINT else 143)


# sweep() counts a pattern, kills it (best effort; sudo -n retry for processes a
# root-running test left), then RE-counts after a settle so we report what
# actually died (KILLED = before-after) and flag any survivor (SURVIVED, sets
# fail) instead of claiming success when pkill couldn't reach it. The patterns
# use the pgrep self-exclusion trick -- 'r[e]name'/'det[a]ch' match a real
# process's "rename"/"detach" but not the bracketed literal in this script's own
# argv (run_on passes the whole script as the remote argv), so we never signal
# ourselves. @BASE@ is substituted with the target's run-dir prefix.
_CLEANUP_SCRIPT = r'''fail=0
sweep() {
command -v pgrep >/dev/null 2>&1 || return 0
before=$(pgrep -f "$2" 2>/dev/null | wc -l | tr -d ' ')
[ "$before" -gt 0 ] 2>/dev/null || return 0
pkill -f "$2" 2>/dev/null
sudo -n pkill -f "$2" 2>/dev/null
sleep 1
after=$(pgrep -f "$2" 2>/dev/null | wc -l | tr -d ' ')
killed=$((before - after))
[ "$killed" -gt 0 ] 2>/dev/null && echo "KILLED $killed stray $1(s)"
if [ "$after" -gt 0 ] 2>/dev/null; then
echo "SURVIVED $after stray $1(s)"
fail=1
fi
}
sweep flipper 'r[e]name.*r[e]name.*r[e]name'
sweep daemon 'det[a]ch --address=127.0.0.1'
for d in @BASE@-*; do
[ -e "$d" ] || continue
if rm -rf -- "$d" 2>/dev/null || sudo -n rm -rf -- "$d" 2>/dev/null; then
echo "REMOVED $d"
else
echo "FAILED $d"
fail=1
fi
done
exit $fail
'''


def cleanup_remnants(targets: list[Target]) -> int:
"""--cleanup mode: remove every <base>-* run dir on each target, reporting
what each removed. Returns a process exit code. Only suffixed run dirs are
swept -- a bare <base> is left alone."""
"""--cleanup mode: on each target, kill the stray processes a killed run can
leave behind, then remove every <base>-* run dir, reporting what went.
Returns a process exit code. Only suffixed run dirs are swept -- a bare
<base> is left alone.

A run that is SIGKILLed (or whose ssh drops) can strand two kinds of process
on platforms without a parent-death backstop: the TOCTOU path-flipper (a
busy `python -c` rename loop that pins a CPU) and an orphaned test rsyncd
(`--no-detach --address=127.0.0.1`, which then squats its fixed port -- the
very wedge claim_ports()' bind-probe now reports). Both are killed best
effort (sudo -n retry for root-owned ones); a kill is verified by re-counting
afterwards, and a process that survives is reported and fails the run.

CAVEAT: the kill patterns are host-global, not scoped to a particular run, so
--cleanup assumes no *other* fleettest run is active on the target -- it
would also kill a concurrent run's flipper/daemon (and any manual `rsync
--daemon --no-detach --address=127.0.0.1`). Run it between runs, not during
one. Run dirs whose contents a root test owns are removed via a `sudo -n rm`
fallback; only a dir that survives even that is a failure."""
rc = 0
for t in targets:
base = t.builddir
if _unsafe_builddir(base):
log(f"[{t.name}] skipped (unsafe builddir {base!r})")
continue
# Echo each match before removing it so the harness can report what
# went; an unmatched glob stays literal and is skipped by the -e test.
script = (f'set -e\n'
f'for d in {base}-*; do\n'
f' [ -e "$d" ] || continue\n'
f' echo "$d"\n'
f' rm -rf -- "$d"\n'
f'done\n')
r = run_on(t, script, timeout=120)
removed = [ln for ln in r.out.splitlines() if ln.strip()]
if r.rc != 0:
# Structured markers (KILLED/SURVIVED/REMOVED/FAILED) keep the report
# clean even though run_on() folds stderr into stdout.
r = run_on(t, _CLEANUP_SCRIPT.replace("@BASE@", base), timeout=120)
lines = r.out.splitlines()
removed = [ln.split(" ", 1)[1] for ln in lines if ln.startswith("REMOVED ")]
failed = [ln.split(" ", 1)[1] for ln in lines if ln.startswith("FAILED ")]
killed = [ln.replace("KILLED ", "killed ", 1)
for ln in lines if ln.startswith("KILLED ")]
survived = [ln.replace("SURVIVED ", "still alive: ", 1)
for ln in lines if ln.startswith("SURVIVED ")]
msgs = killed[:]
if removed:
msgs.append("removed: " + " ".join(removed))
if survived:
rc = 1
msgs += survived
if failed:
rc = 1
msgs.append("could not remove (even with sudo): " + " ".join(failed))
if r.rc not in (0, 1):
rc = 1
log(f"[{t.name}] cleanup error (rc={r.rc}): {r.out.strip()[:200]}")
elif removed:
log(f"[{t.name}] removed: {' '.join(removed)}")
else:
log(f"[{t.name}] nothing to remove")
msgs.append(f"cleanup error rc={r.rc}: {r.out.strip()[:160]}")
log(f"[{t.name}] " + ("; ".join(msgs) if msgs else "nothing to remove"))
return rc


Expand All @@ -813,7 +877,10 @@ def main() -> int:
ap.add_argument("--keep", action="store_true",
help="keep each run's build dir (default: remove it at exit)")
ap.add_argument("--cleanup", action="store_true",
help="remove stray <builddir>-* run dirs on the targets, then exit")
help="kill stray flippers/test daemons and remove stray "
"<builddir>-* run dirs (root-owned via sudo -n) on the "
"targets, then exit; run between runs, not during one "
"(kills are host-global)")
ap.add_argument("--jobs", type=int, help="override -j for both transports")
ap.add_argument("--timing", action="store_true",
help="report per-target wall-clock (push/build/test) to find "
Expand Down
39 changes: 39 additions & 0 deletions testsuite/rsyncfns.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,42 @@ def _open_lock_file() -> int:
return fd


def _probe_bindable(port: int) -> 'None':
"""Confirm `port` is actually free once we hold its claim_ports() lock.

The byte-range lock only coordinates *live* test drivers, and the kernel
releases it the instant the holding process dies -- even if that driver left
an orphaned daemon still bound to the port. That happens when a run is
SIGKILLed (or its ssh drops) on a platform with no parent-death backstop:
rsyncfns only arms PR_SET_PDEATHSIG, which is Linux-only, so on the
BSDs/Solaris/macOS a killed fleettest run can strand its rsyncd, which then
squats the fixed test port forever. A later run wins the (now-free) lock but
the socket is still taken, and the daemon dies with a cryptic "bind() failed:
Address already in use" / the client "did not see server greeting".

So actually try to bind it. SO_REUSEADDR is used so a port merely in
TIME_WAIT (recently and cleanly closed) is NOT a false positive; only a
live bound/listening socket -- a real squatter -- makes the bind fail, and
then we stop here with an actionable message instead of failing obscurely
later. The probe socket is closed immediately, freeing the port for the
daemon that is about to bind it.
"""
s = _socket.socket(_socket.AF_INET, _socket.SOCK_STREAM)
s.setsockopt(_socket.SOL_SOCKET, _socket.SO_REUSEADDR, 1)
try:
s.bind(('127.0.0.1', port))
except OSError as e:
test_fail(
f"port {port} was claimed for this run but something is still bound "
f"to 127.0.0.1:{port} ({e.strerror}). The claim_ports() lock only "
"serializes live test runs, so a still-bound port almost always "
"means an orphaned 'rsync --daemon' from a previously killed run "
f"(find it with `fstat | grep {port}` / `netstat -an | grep {port}` "
"and kill it, or run `fleettest.py --cleanup`), then retry.")
finally:
s.close()


def claim_ports(*ports: int) -> 'None':
"""Reserve the given TCP port numbers for the rest of this process.

Expand Down Expand Up @@ -210,6 +246,9 @@ def claim_ports(*ports: int) -> 'None':
# F_SETLKW via fcntl.lockf(LOCK_EX, length, start): exclusive
# byte-range lock on byte `port`, blocking until acquired.
fcntl.lockf(_port_lock_fd, fcntl.LOCK_EX, 1, port)
# The lock only proves no other live test run owns the port; an orphaned
# daemon from a killed run can still squat it (see _probe_bindable).
_probe_bindable(port)


# --- standalone rsyncd helpers ---------------------------------------------
Expand Down
Loading