From 214b91b96507508ff6968ca449463282a041d21e Mon Sep 17 00:00:00 2001
From: Andrew Tridgell <andrew@tridgell.net>
Date: Mon, 8 Jun 2026 08:49:24 +1000
Subject: [PATCH 1/2] testsuite: verify a claimed test port is actually
 bindable

claim_ports() takes a POSIX byte-range lock per port, which serializes
concurrent live test runs. But the kernel drops that lock the instant the
holding process dies, even if the run left an orphaned rsync --daemon still
bound to the port -- which happens when a run is SIGKILLed on a platform with
no parent-death backstop (rsyncfns only arms PR_SET_PDEATHSIG, Linux-only, so
the BSDs/Solaris/macOS can strand a daemon). A later run then wins the freed
lock while the socket is still squatted and dies with a cryptic "bind() failed:
Address already in use" / "did not see server greeting".

After taking each lock, actually bind the port (SO_REUSEADDR, so a port merely
in TIME_WAIT is not a false positive; only a live squatter fails) and close it
immediately. On failure stop with an actionable message naming the port and the
likely orphaned daemon. Closes the gap that masked the OpenBSD daemon-auth wedge.
---
 testsuite/rsyncfns.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/testsuite/rsyncfns.py b/testsuite/rsyncfns.py
index 9a96c2b05..2e00abb39 100644
--- a/testsuite/rsyncfns.py
+++ b/testsuite/rsyncfns.py
@@ -175,6 +175,42 @@ def _open_lock_file() -> int:
     return fd
 
 
+def _probe_bindable(port: int) -> 'None':
+    """Confirm `port` is actually free once we hold its claim_ports() lock.
+
+    The byte-range lock only coordinates *live* test drivers, and the kernel
+    releases it the instant the holding process dies -- even if that driver left
+    an orphaned daemon still bound to the port. That happens when a run is
+    SIGKILLed (or its ssh drops) on a platform with no parent-death backstop:
+    rsyncfns only arms PR_SET_PDEATHSIG, which is Linux-only, so on the
+    BSDs/Solaris/macOS a killed fleettest run can strand its rsyncd, which then
+    squats the fixed test port forever. A later run wins the (now-free) lock but
+    the socket is still taken, and the daemon dies with a cryptic "bind() failed:
+    Address already in use" / the client "did not see server greeting".
+
+    So actually try to bind it. SO_REUSEADDR is used so a port merely in
+    TIME_WAIT (recently and cleanly closed) is NOT a false positive; only a
+    live bound/listening socket -- a real squatter -- makes the bind fail, and
+    then we stop here with an actionable message instead of failing obscurely
+    later. The probe socket is closed immediately, freeing the port for the
+    daemon that is about to bind it.
+    """
+    s = _socket.socket(_socket.AF_INET, _socket.SOCK_STREAM)
+    s.setsockopt(_socket.SOL_SOCKET, _socket.SO_REUSEADDR, 1)
+    try:
+        s.bind(('127.0.0.1', port))
+    except OSError as e:
+        test_fail(
+            f"port {port} was claimed for this run but something is still bound "
+            f"to 127.0.0.1:{port} ({e.strerror}). The claim_ports() lock only "
+            "serializes live test runs, so a still-bound port almost always "
+            "means an orphaned 'rsync --daemon' from a previously killed run "
+            f"(find it with `fstat | grep {port}` / `netstat -an | grep {port}` "
+            "and kill it, or run `fleettest.py --cleanup`), then retry.")
+    finally:
+        s.close()
+
+
 def claim_ports(*ports: int) -> 'None':
     """Reserve the given TCP port numbers for the rest of this process.
 
@@ -210,6 +246,9 @@ def claim_ports(*ports: int) -> 'None':
         # F_SETLKW via fcntl.lockf(LOCK_EX, length, start): exclusive
         # byte-range lock on byte `port`, blocking until acquired.
         fcntl.lockf(_port_lock_fd, fcntl.LOCK_EX, 1, port)
+        # The lock only proves no other live test run owns the port; an orphaned
+        # daemon from a killed run can still squat it (see _probe_bindable).
+        _probe_bindable(port)
 
 
 # --- standalone rsyncd helpers ---------------------------------------------

From ee2fdcffca2f20acd6c5b580fffa4eb46658893e Mon Sep 17 00:00:00 2001
From: Andrew Tridgell <andrew@tridgell.net>
Date: Mon, 8 Jun 2026 08:58:09 +1000
Subject: [PATCH 2/2] fleettest: --cleanup also kills stray flippers/daemons
 and root-owned dirs

A run killed without a parent-death backstop can strand a TOCTOU path-flipper
(a busy `python -c` rename loop that pins a CPU) and an orphaned test rsyncd
(--no-detach --address=127.0.0.1) that squats its fixed port -- the wedge the
claim_ports() bind-probe now reports and points at --cleanup. Sweep both, best
effort, before removing the run dirs.

Each sweep counts the pattern, kills it (with a `sudo -n` retry for a process a
root-running test left), then re-counts after a settle: KILLED reports what
actually died, and a process that survives (pkill blocked, no passwordless sudo,
missing/limited pkill) is reported as SURVIVED and fails the run instead of
falsely claiming success.

Run-dir removal falls back to `sudo -n rm` so a dir whose contents a root test
owns is removed instead of failing with "Permission denied" (the failure mode
seen on the ubuntu/mac targets); only a dir that survives even sudo is failed.

The kill patterns use the pgrep self-exclusion trick ('r[e]name', 'det[a]ch')
so they match a real process's "rename"/"detach" but not the literal pattern in
the cleanup shell's own argv -- run_on() passes the whole script as the remote
argv, so without it --cleanup would signal itself. The patterns are host-global
(not scoped to one run), so --cleanup is documented to run between runs, not
during one.
---
 testsuite/fleettest.py | 111 +++++++++++++++++++++++++++++++++--------
 1 file changed, 89 insertions(+), 22 deletions(-)

diff --git a/testsuite/fleettest.py b/testsuite/fleettest.py
index b200604a1..7c758708b 100755
--- a/testsuite/fleettest.py
+++ b/testsuite/fleettest.py
@@ -31,8 +31,11 @@
 removed when the run ends -- on success or failure, and best-effort on
 Ctrl-C/kill (pass --keep to retain it for inspection). A run that is hard-killed
 (SIGKILL), or signalled mid-push, or whose ssh dies during cleanup can leave a
-stray <builddir>-<id> behind; sweep those with `fleettest.py --cleanup`
-(optionally scoped with --targets). Because each
+stray <builddir>-<id> behind -- plus an orphaned path-flipper or test rsyncd on
+platforms without a parent-death backstop; sweep all of those (root-owned files
+included, via sudo -n) with `fleettest.py --cleanup` (optionally scoped with
+--targets). Run --cleanup between runs, not during one: its process kills are
+host-global and would also catch a concurrent run's flipper/daemon. Because each
 run starts from a fresh dir, every build is a full configure + build.
 
 PROVISIONING: each target must have the build toolchain its workflow's prepare
@@ -771,33 +774,94 @@ def _on_signal(signum, frame):
     os._exit(130 if signum == signal.SIGINT else 143)
 
 
+# sweep() counts a pattern, kills it (best effort; sudo -n retry for processes a
+# root-running test left), then RE-counts after a settle so we report what
+# actually died (KILLED = before-after) and flag any survivor (SURVIVED, sets
+# fail) instead of claiming success when pkill couldn't reach it. The patterns
+# use the pgrep self-exclusion trick -- 'r[e]name'/'det[a]ch' match a real
+# process's "rename"/"detach" but not the bracketed literal in this script's own
+# argv (run_on passes the whole script as the remote argv), so we never signal
+# ourselves. @BASE@ is substituted with the target's run-dir prefix.
+_CLEANUP_SCRIPT = r'''fail=0
+sweep() {
+  command -v pgrep >/dev/null 2>&1 || return 0
+  before=$(pgrep -f "$2" 2>/dev/null | wc -l | tr -d ' ')
+  [ "$before" -gt 0 ] 2>/dev/null || return 0
+  pkill -f "$2" 2>/dev/null
+  sudo -n pkill -f "$2" 2>/dev/null
+  sleep 1
+  after=$(pgrep -f "$2" 2>/dev/null | wc -l | tr -d ' ')
+  killed=$((before - after))
+  [ "$killed" -gt 0 ] 2>/dev/null && echo "KILLED $killed stray $1(s)"
+  if [ "$after" -gt 0 ] 2>/dev/null; then
+    echo "SURVIVED $after stray $1(s)"
+    fail=1
+  fi
+}
+sweep flipper 'r[e]name.*r[e]name.*r[e]name'
+sweep daemon 'det[a]ch --address=127.0.0.1'
+for d in @BASE@-*; do
+  [ -e "$d" ] || continue
+  if rm -rf -- "$d" 2>/dev/null || sudo -n rm -rf -- "$d" 2>/dev/null; then
+    echo "REMOVED $d"
+  else
+    echo "FAILED $d"
+    fail=1
+  fi
+done
+exit $fail
+'''
+
+
 def cleanup_remnants(targets: list[Target]) -> int:
-    """--cleanup mode: remove every <base>-* run dir on each target, reporting
-    what each removed. Returns a process exit code. Only suffixed run dirs are
-    swept -- a bare <base> is left alone."""
+    """--cleanup mode: on each target, kill the stray processes a killed run can
+    leave behind, then remove every <base>-* run dir, reporting what went.
+    Returns a process exit code. Only suffixed run dirs are swept -- a bare
+    <base> is left alone.
+
+    A run that is SIGKILLed (or whose ssh drops) can strand two kinds of process
+    on platforms without a parent-death backstop: the TOCTOU path-flipper (a
+    busy `python -c` rename loop that pins a CPU) and an orphaned test rsyncd
+    (`--no-detach --address=127.0.0.1`, which then squats its fixed port -- the
+    very wedge claim_ports()' bind-probe now reports). Both are killed best
+    effort (sudo -n retry for root-owned ones); a kill is verified by re-counting
+    afterwards, and a process that survives is reported and fails the run.
+
+    CAVEAT: the kill patterns are host-global, not scoped to a particular run, so
+    --cleanup assumes no *other* fleettest run is active on the target -- it
+    would also kill a concurrent run's flipper/daemon (and any manual `rsync
+    --daemon --no-detach --address=127.0.0.1`). Run it between runs, not during
+    one. Run dirs whose contents a root test owns are removed via a `sudo -n rm`
+    fallback; only a dir that survives even that is a failure."""
     rc = 0
     for t in targets:
         base = t.builddir
         if _unsafe_builddir(base):
             log(f"[{t.name}] skipped (unsafe builddir {base!r})")
             continue
-        # Echo each match before removing it so the harness can report what
-        # went; an unmatched glob stays literal and is skipped by the -e test.
-        script = (f'set -e\n'
-                  f'for d in {base}-*; do\n'
-                  f'  [ -e "$d" ] || continue\n'
-                  f'  echo "$d"\n'
-                  f'  rm -rf -- "$d"\n'
-                  f'done\n')
-        r = run_on(t, script, timeout=120)
-        removed = [ln for ln in r.out.splitlines() if ln.strip()]
-        if r.rc != 0:
+        # Structured markers (KILLED/SURVIVED/REMOVED/FAILED) keep the report
+        # clean even though run_on() folds stderr into stdout.
+        r = run_on(t, _CLEANUP_SCRIPT.replace("@BASE@", base), timeout=120)
+        lines = r.out.splitlines()
+        removed = [ln.split(" ", 1)[1] for ln in lines if ln.startswith("REMOVED ")]
+        failed = [ln.split(" ", 1)[1] for ln in lines if ln.startswith("FAILED ")]
+        killed = [ln.replace("KILLED ", "killed ", 1)
+                  for ln in lines if ln.startswith("KILLED ")]
+        survived = [ln.replace("SURVIVED ", "still alive: ", 1)
+                    for ln in lines if ln.startswith("SURVIVED ")]
+        msgs = killed[:]
+        if removed:
+            msgs.append("removed: " + " ".join(removed))
+        if survived:
+            rc = 1
+            msgs += survived
+        if failed:
+            rc = 1
+            msgs.append("could not remove (even with sudo): " + " ".join(failed))
+        if r.rc not in (0, 1):
             rc = 1
-            log(f"[{t.name}] cleanup error (rc={r.rc}): {r.out.strip()[:200]}")
-        elif removed:
-            log(f"[{t.name}] removed: {' '.join(removed)}")
-        else:
-            log(f"[{t.name}] nothing to remove")
+            msgs.append(f"cleanup error rc={r.rc}: {r.out.strip()[:160]}")
+        log(f"[{t.name}] " + ("; ".join(msgs) if msgs else "nothing to remove"))
     return rc
 
 
@@ -813,7 +877,10 @@ def main() -> int:
     ap.add_argument("--keep", action="store_true",
                     help="keep each run's build dir (default: remove it at exit)")
     ap.add_argument("--cleanup", action="store_true",
-                    help="remove stray <builddir>-* run dirs on the targets, then exit")
+                    help="kill stray flippers/test daemons and remove stray "
+                    "<builddir>-* run dirs (root-owned via sudo -n) on the "
+                    "targets, then exit; run between runs, not during one "
+                    "(kills are host-global)")
     ap.add_argument("--jobs", type=int, help="override -j for both transports")
     ap.add_argument("--timing", action="store_true",
                     help="report per-target wall-clock (push/build/test) to find "