From 214b91b96507508ff6968ca449463282a041d21e Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Mon, 8 Jun 2026 08:49:24 +1000 Subject: [PATCH 1/2] testsuite: verify a claimed test port is actually bindable claim_ports() takes a POSIX byte-range lock per port, which serializes concurrent live test runs. But the kernel drops that lock the instant the holding process dies, even if the run left an orphaned rsync --daemon still bound to the port -- which happens when a run is SIGKILLed on a platform with no parent-death backstop (rsyncfns only arms PR_SET_PDEATHSIG, Linux-only, so the BSDs/Solaris/macOS can strand a daemon). A later run then wins the freed lock while the socket is still squatted and dies with a cryptic "bind() failed: Address already in use" / "did not see server greeting". After taking each lock, actually bind the port (SO_REUSEADDR, so a port merely in TIME_WAIT is not a false positive; only a live squatter fails) and close it immediately. On failure stop with an actionable message naming the port and the likely orphaned daemon. Closes the gap that masked the OpenBSD daemon-auth wedge. --- testsuite/rsyncfns.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/testsuite/rsyncfns.py b/testsuite/rsyncfns.py index 9a96c2b05..2e00abb39 100644 --- a/testsuite/rsyncfns.py +++ b/testsuite/rsyncfns.py @@ -175,6 +175,42 @@ def _open_lock_file() -> int: return fd +def _probe_bindable(port: int) -> 'None': + """Confirm `port` is actually free once we hold its claim_ports() lock. + + The byte-range lock only coordinates *live* test drivers, and the kernel + releases it the instant the holding process dies -- even if that driver left + an orphaned daemon still bound to the port. That happens when a run is + SIGKILLed (or its ssh drops) on a platform with no parent-death backstop: + rsyncfns only arms PR_SET_PDEATHSIG, which is Linux-only, so on the + BSDs/Solaris/macOS a killed fleettest run can strand its rsyncd, which then + squats the fixed test port forever. A later run wins the (now-free) lock but + the socket is still taken, and the daemon dies with a cryptic "bind() failed: + Address already in use" / the client "did not see server greeting". + + So actually try to bind it. SO_REUSEADDR is used so a port merely in + TIME_WAIT (recently and cleanly closed) is NOT a false positive; only a + live bound/listening socket -- a real squatter -- makes the bind fail, and + then we stop here with an actionable message instead of failing obscurely + later. The probe socket is closed immediately, freeing the port for the + daemon that is about to bind it. + """ + s = _socket.socket(_socket.AF_INET, _socket.SOCK_STREAM) + s.setsockopt(_socket.SOL_SOCKET, _socket.SO_REUSEADDR, 1) + try: + s.bind(('127.0.0.1', port)) + except OSError as e: + test_fail( + f"port {port} was claimed for this run but something is still bound " + f"to 127.0.0.1:{port} ({e.strerror}). The claim_ports() lock only " + "serializes live test runs, so a still-bound port almost always " + "means an orphaned 'rsync --daemon' from a previously killed run " + f"(find it with `fstat | grep {port}` / `netstat -an | grep {port}` " + "and kill it, or run `fleettest.py --cleanup`), then retry.") + finally: + s.close() + + def claim_ports(*ports: int) -> 'None': """Reserve the given TCP port numbers for the rest of this process. @@ -210,6 +246,9 @@ def claim_ports(*ports: int) -> 'None': # F_SETLKW via fcntl.lockf(LOCK_EX, length, start): exclusive # byte-range lock on byte `port`, blocking until acquired. fcntl.lockf(_port_lock_fd, fcntl.LOCK_EX, 1, port) + # The lock only proves no other live test run owns the port; an orphaned + # daemon from a killed run can still squat it (see _probe_bindable). + _probe_bindable(port) # --- standalone rsyncd helpers --------------------------------------------- From ee2fdcffca2f20acd6c5b580fffa4eb46658893e Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Mon, 8 Jun 2026 08:58:09 +1000 Subject: [PATCH 2/2] fleettest: --cleanup also kills stray flippers/daemons and root-owned dirs A run killed without a parent-death backstop can strand a TOCTOU path-flipper (a busy `python -c` rename loop that pins a CPU) and an orphaned test rsyncd (--no-detach --address=127.0.0.1) that squats its fixed port -- the wedge the claim_ports() bind-probe now reports and points at --cleanup. Sweep both, best effort, before removing the run dirs. Each sweep counts the pattern, kills it (with a `sudo -n` retry for a process a root-running test left), then re-counts after a settle: KILLED reports what actually died, and a process that survives (pkill blocked, no passwordless sudo, missing/limited pkill) is reported as SURVIVED and fails the run instead of falsely claiming success. Run-dir removal falls back to `sudo -n rm` so a dir whose contents a root test owns is removed instead of failing with "Permission denied" (the failure mode seen on the ubuntu/mac targets); only a dir that survives even sudo is failed. The kill patterns use the pgrep self-exclusion trick ('r[e]name', 'det[a]ch') so they match a real process's "rename"/"detach" but not the literal pattern in the cleanup shell's own argv -- run_on() passes the whole script as the remote argv, so without it --cleanup would signal itself. The patterns are host-global (not scoped to one run), so --cleanup is documented to run between runs, not during one. --- testsuite/fleettest.py | 111 +++++++++++++++++++++++++++++++++-------- 1 file changed, 89 insertions(+), 22 deletions(-) diff --git a/testsuite/fleettest.py b/testsuite/fleettest.py index b200604a1..7c758708b 100755 --- a/testsuite/fleettest.py +++ b/testsuite/fleettest.py @@ -31,8 +31,11 @@ removed when the run ends -- on success or failure, and best-effort on Ctrl-C/kill (pass --keep to retain it for inspection). A run that is hard-killed (SIGKILL), or signalled mid-push, or whose ssh dies during cleanup can leave a -stray - behind; sweep those with `fleettest.py --cleanup` -(optionally scoped with --targets). Because each +stray - behind -- plus an orphaned path-flipper or test rsyncd on +platforms without a parent-death backstop; sweep all of those (root-owned files +included, via sudo -n) with `fleettest.py --cleanup` (optionally scoped with +--targets). Run --cleanup between runs, not during one: its process kills are +host-global and would also catch a concurrent run's flipper/daemon. Because each run starts from a fresh dir, every build is a full configure + build. PROVISIONING: each target must have the build toolchain its workflow's prepare @@ -771,33 +774,94 @@ def _on_signal(signum, frame): os._exit(130 if signum == signal.SIGINT else 143) +# sweep() counts a pattern, kills it (best effort; sudo -n retry for processes a +# root-running test left), then RE-counts after a settle so we report what +# actually died (KILLED = before-after) and flag any survivor (SURVIVED, sets +# fail) instead of claiming success when pkill couldn't reach it. The patterns +# use the pgrep self-exclusion trick -- 'r[e]name'/'det[a]ch' match a real +# process's "rename"/"detach" but not the bracketed literal in this script's own +# argv (run_on passes the whole script as the remote argv), so we never signal +# ourselves. @BASE@ is substituted with the target's run-dir prefix. +_CLEANUP_SCRIPT = r'''fail=0 +sweep() { + command -v pgrep >/dev/null 2>&1 || return 0 + before=$(pgrep -f "$2" 2>/dev/null | wc -l | tr -d ' ') + [ "$before" -gt 0 ] 2>/dev/null || return 0 + pkill -f "$2" 2>/dev/null + sudo -n pkill -f "$2" 2>/dev/null + sleep 1 + after=$(pgrep -f "$2" 2>/dev/null | wc -l | tr -d ' ') + killed=$((before - after)) + [ "$killed" -gt 0 ] 2>/dev/null && echo "KILLED $killed stray $1(s)" + if [ "$after" -gt 0 ] 2>/dev/null; then + echo "SURVIVED $after stray $1(s)" + fail=1 + fi +} +sweep flipper 'r[e]name.*r[e]name.*r[e]name' +sweep daemon 'det[a]ch --address=127.0.0.1' +for d in @BASE@-*; do + [ -e "$d" ] || continue + if rm -rf -- "$d" 2>/dev/null || sudo -n rm -rf -- "$d" 2>/dev/null; then + echo "REMOVED $d" + else + echo "FAILED $d" + fail=1 + fi +done +exit $fail +''' + + def cleanup_remnants(targets: list[Target]) -> int: - """--cleanup mode: remove every -* run dir on each target, reporting - what each removed. Returns a process exit code. Only suffixed run dirs are - swept -- a bare is left alone.""" + """--cleanup mode: on each target, kill the stray processes a killed run can + leave behind, then remove every -* run dir, reporting what went. + Returns a process exit code. Only suffixed run dirs are swept -- a bare + is left alone. + + A run that is SIGKILLed (or whose ssh drops) can strand two kinds of process + on platforms without a parent-death backstop: the TOCTOU path-flipper (a + busy `python -c` rename loop that pins a CPU) and an orphaned test rsyncd + (`--no-detach --address=127.0.0.1`, which then squats its fixed port -- the + very wedge claim_ports()' bind-probe now reports). Both are killed best + effort (sudo -n retry for root-owned ones); a kill is verified by re-counting + afterwards, and a process that survives is reported and fails the run. + + CAVEAT: the kill patterns are host-global, not scoped to a particular run, so + --cleanup assumes no *other* fleettest run is active on the target -- it + would also kill a concurrent run's flipper/daemon (and any manual `rsync + --daemon --no-detach --address=127.0.0.1`). Run it between runs, not during + one. Run dirs whose contents a root test owns are removed via a `sudo -n rm` + fallback; only a dir that survives even that is a failure.""" rc = 0 for t in targets: base = t.builddir if _unsafe_builddir(base): log(f"[{t.name}] skipped (unsafe builddir {base!r})") continue - # Echo each match before removing it so the harness can report what - # went; an unmatched glob stays literal and is skipped by the -e test. - script = (f'set -e\n' - f'for d in {base}-*; do\n' - f' [ -e "$d" ] || continue\n' - f' echo "$d"\n' - f' rm -rf -- "$d"\n' - f'done\n') - r = run_on(t, script, timeout=120) - removed = [ln for ln in r.out.splitlines() if ln.strip()] - if r.rc != 0: + # Structured markers (KILLED/SURVIVED/REMOVED/FAILED) keep the report + # clean even though run_on() folds stderr into stdout. + r = run_on(t, _CLEANUP_SCRIPT.replace("@BASE@", base), timeout=120) + lines = r.out.splitlines() + removed = [ln.split(" ", 1)[1] for ln in lines if ln.startswith("REMOVED ")] + failed = [ln.split(" ", 1)[1] for ln in lines if ln.startswith("FAILED ")] + killed = [ln.replace("KILLED ", "killed ", 1) + for ln in lines if ln.startswith("KILLED ")] + survived = [ln.replace("SURVIVED ", "still alive: ", 1) + for ln in lines if ln.startswith("SURVIVED ")] + msgs = killed[:] + if removed: + msgs.append("removed: " + " ".join(removed)) + if survived: + rc = 1 + msgs += survived + if failed: + rc = 1 + msgs.append("could not remove (even with sudo): " + " ".join(failed)) + if r.rc not in (0, 1): rc = 1 - log(f"[{t.name}] cleanup error (rc={r.rc}): {r.out.strip()[:200]}") - elif removed: - log(f"[{t.name}] removed: {' '.join(removed)}") - else: - log(f"[{t.name}] nothing to remove") + msgs.append(f"cleanup error rc={r.rc}: {r.out.strip()[:160]}") + log(f"[{t.name}] " + ("; ".join(msgs) if msgs else "nothing to remove")) return rc @@ -813,7 +877,10 @@ def main() -> int: ap.add_argument("--keep", action="store_true", help="keep each run's build dir (default: remove it at exit)") ap.add_argument("--cleanup", action="store_true", - help="remove stray -* run dirs on the targets, then exit") + help="kill stray flippers/test daemons and remove stray " + "-* run dirs (root-owned via sudo -n) on the " + "targets, then exit; run between runs, not during one " + "(kills are host-global)") ap.add_argument("--jobs", type=int, help="override -j for both transports") ap.add_argument("--timing", action="store_true", help="report per-target wall-clock (push/build/test) to find "