From e8e39f6a84fb7364190c9c35b946111eb98eaabf Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 14 May 2026 08:58:46 +0000 Subject: [PATCH] fix(sandbox): retry rm -rf during delete to handle kill -9 file-handle race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #121. `RemoveSandbox` (and `RemoveCustomSandbox`, plus an earlier custom-delete path) runs the sandbox's stop / send_kill script and then immediately `rm -rf`s the sandbox dir. The `destroy` mode of `send_kill` does `kill -9 $MYPID`, which doesn't synchronously wait for the killed process's open file descriptors to be released. During that brief window, the dying mysqld's last buffered I/O can land inside `master/data/` after `rm`'s readdir() walk has already passed — making the final `rmdir(2)` return ENOTEMPTY: rm: cannot remove '/master/data': Directory not empty error while deleting sandbox Recently observed on: - master Integration Tests run 25839621095 (`MariaDB (10.11.9)`) - PR #119 hit the same flake on 4 consecutive runs before clearing on the 5th retry - PR #114 hit it during initial review Fix: wrap the rm in a small exponential-backoff retry loop (200ms + 400ms + 800ms + 1.6s + 3.2s ≈ 6.2s max wait). The race resolves in milliseconds in practice; retries handle it without changing the stop-script contract or template scripts across topologies. A more thorough fix would be to have `send_kill` wait for the killed PIDs to disappear from /proc before returning, but that's a larger change touching template scripts across single, replication, multiple, tidb, etc. The localized retry in `RemoveSandbox` solves the user-visible symptom with one focused change. Applies to all four `rm -rf` call-sites in sandbox/sandbox.go: - The early custom-delete path (around line 213) used by sandboxDef recreation, both sandbox dir and log directory. - The synchronous (non-concurrent) branch of `RemoveSandbox`. - The synchronous branch of `RemoveCustomSandbox`. The concurrent-mode branches (which enqueue an `ExecCommand` into the parallel execution list) are unchanged — those run later and their race window is already wider by virtue of scheduling. --- sandbox/sandbox.go | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/sandbox/sandbox.go b/sandbox/sandbox.go index 979c53fe..268d5ca0 100644 --- a/sandbox/sandbox.go +++ b/sandbox/sandbox.go @@ -210,13 +210,11 @@ func checkDirectory(sandboxDef SandboxDef) (SandboxDef, error) { if err != nil { return sandboxDef, fmt.Errorf(globals.ErrWhileStoppingSandbox, sandboxDir) } - _, err = common.RunCmdWithArgs("rm", []string{"-rf", sandboxDir}) - if err != nil { + if err = removeDirWithRetry(sandboxDir); err != nil { return sandboxDef, fmt.Errorf(globals.ErrWhileDeletingSandbox, sandboxDir) } if logDirectory != "" { - _, err = common.RunCmdWithArgs("rm", []string{"-rf", logDirectory}) - if err != nil { + if err = removeDirWithRetry(logDirectory); err != nil { return sandboxDef, fmt.Errorf("error while deleting log directory %s", logDirectory) } } @@ -1183,6 +1181,30 @@ func getLogDirFromSbDescription(fullPath string) (string, error) { return logDirectory, nil } +// removeDirWithRetry runs `rm -rf ` with a short exponential backoff +// retry loop. After the stop/send_kill script returns, the killed mysqld +// process may not yet have finished releasing its open file descriptors — +// during that brief window, last buffered writes can land in master/data/ +// after rm's readdir() walk, making the final rmdir(2) return ENOTEMPTY +// ("Directory not empty"). Retrying handles that race (issue #121). +// Total max wait: 200+400+800+1600+3200ms ≈ 6.2s. +func removeDirWithRetry(target string) error { + backoff := 200 * time.Millisecond + var lastErr error + for attempt := 0; attempt < 5; attempt++ { + if attempt > 0 { + time.Sleep(backoff) + backoff *= 2 + } + if _, err := common.RunCmdWithArgs("rm", []string{"-rf", target}); err == nil { + return nil + } else { + lastErr = err + } + } + return lastErr +} + // Deprecated: use RemoveCustomSandbox instead func RemoveSandbox(sandboxDir, sandbox string, runConcurrently bool) (execList []concurrent.ExecutionList, err error) { fullPath := path.Join(sandboxDir, sandbox) @@ -1273,8 +1295,7 @@ func RemoveSandbox(sandboxDir, sandbox string, runConcurrently bool) (execList [ if globals.UsingDbDeployer && target != logDirectory { common.CondPrintf("Running %s\n", cmdStr) } - _, err = common.RunCmdWithArgs("rm", rmArgs) - if err != nil { + if err = removeDirWithRetry(target); err != nil { return emptyExecutionList, fmt.Errorf(globals.ErrWhileDeletingSandbox, target) } if globals.UsingDbDeployer && target != logDirectory { @@ -1394,8 +1415,7 @@ func RemoveCustomSandbox(sandboxDir, sandbox string, runConcurrently, useStop bo if globals.UsingDbDeployer && target != logDirectory { common.CondPrintf("Running %s\n", cmdStr) } - _, err = common.RunCmdWithArgs("rm", rmArgs) - if err != nil { + if err = removeDirWithRetry(target); err != nil { return emptyExecutionList, fmt.Errorf(globals.ErrWhileDeletingSandbox, target) } if globals.UsingDbDeployer && target != logDirectory {