Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cmd/replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,17 @@ func deployReplicationNonMySQL(cmd *cobra.Command, args []string, providerName s
fmt.Printf("WARNING: could not write check_recovery script: %s\n", err)
}

testReplOpts := postgresql.ScriptOptions{
SandboxDir: topologyDir,
BinDir: binDir,
LibDir: libDir,
Port: primaryPort,
}
testReplScript := postgresql.GenerateTestReplicationScript(testReplOpts, len(replicaPorts))
Comment on lines +157 to +163
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The testReplOpts variable is redundant. You can reuse the existing scriptOpts (defined on line 141) by setting its SandboxDir field. This makes the code cleaner and avoids duplicating the configuration for BinDir, LibDir, and Port.

	scriptOpts.SandboxDir = topologyDir
	testReplScript := postgresql.GenerateTestReplicationScript(scriptOpts, len(replicaPorts))

if err := os.WriteFile(path.Join(topologyDir, "test_replication"), []byte(testReplScript), 0755); err != nil { //nolint:gosec // scripts must be executable
fmt.Printf("WARNING: could not write test_replication script: %s\n", err)
}

// Handle --with-proxysql
withProxySQL, _ := flags.GetBool("with-proxysql")
if withProxySQL {
Expand Down
28 changes: 28 additions & 0 deletions providers/postgresql/postgresql_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,34 @@ func TestGenerateCheckRecoveryScript(t *testing.T) {
}
}

func TestGenerateTestReplicationScript(t *testing.T) {
script := GenerateTestReplicationScript(ScriptOptions{
SandboxDir: "/home/test/sandboxes/postgresql_repl_16614",
BinDir: "/opt/postgresql/16.13/bin",
LibDir: "/opt/postgresql/16.13/lib",
Port: 16614,
}, 2)
expectedSubstrings := []string{
`SBDIR=/home/test/sandboxes/postgresql_repl_16614`,
"./primary/use",
"pg_current_wal_lsn()",
"pg_last_wal_replay_lsn()",
"pg_is_in_recovery()",
"./replica1/use",
"./replica2/use",
"test_summary",
}
for _, want := range expectedSubstrings {
if !strings.Contains(script, want) {
t.Errorf("script missing %q", want)
}
}
// numReplicas=2 should NOT mention replica3
if strings.Contains(script, "./replica3/use") {
t.Error("script unexpectedly references replica3")
}
}

func TestGenerateScripts(t *testing.T) {
opts := ScriptOptions{
SandboxDir: "/tmp/pg_sandbox",
Expand Down
88 changes: 88 additions & 0 deletions providers/postgresql/scripts.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,91 @@ func GenerateCheckRecoveryScript(opts ScriptOptions, replicaPorts []int) string
}
return b.String()
}

// GenerateTestReplicationScript emits a topology-level test_replication
// script that verifies replication is actually working: it creates a
// table on the primary, inserts rows, captures the primary's WAL LSN,
// then queries each replica to confirm the LSN matches and the row
// count is in sync. Modeled on the MySQL `test_replication` template
// (sandbox/templates/replication/test_replication.gotxt) but adapted to
// PostgreSQL primitives (pg_current_wal_lsn / pg_last_wal_replay_lsn).
//
// numReplicas is the count of replica sub-sandboxes laid down by
// `deploy replication --provider=postgresql` (i.e. one less than --nodes).
// The generated script invokes ./primary/use and ./replicaN/use, which
// the PostgreSQL provider already creates as per-sandbox `use` scripts.
func GenerateTestReplicationScript(opts ScriptOptions, numReplicas int) string {
preamble := fmt.Sprintf(envPreamble, opts.LibDir)
var b strings.Builder
b.WriteString(preamble)
b.WriteString(fmt.Sprintf(`SBDIR=%s
cd "$SBDIR"

if [ ! -x ./primary/use ]; then
echo "# No primary found at ./primary/use"
exit 1
fi
PRIMARY=./primary/use

FAILED=0
PASSED=0

function ok_equal {
fact="$1"
expected="$2"
msg="$3"
if [ "$fact" = "$expected" ]; then
echo "ok - (expected: <$expected>) - $msg"
PASSED=$((PASSED+1))
else
echo "not ok - (expected: <$expected> found: <$fact>) - $msg"
FAILED=$((FAILED+1))
fi
}

function test_summary {
TESTS=$((PASSED+FAILED))
printf "# Tests : %%5d\n" $TESTS
printf "# Passed: %%5d\n" $PASSED
printf "# Failed: %%5d\n" $FAILED
if [ "$FAILED" != "0" ]; then exit 1; fi
exit 0
}

YYYY_MM=$(date +%%Y-%%m)
echo "# Creating and populating table on primary..."
$PRIMARY -qc 'DROP TABLE IF EXISTS dbdeployer_test_replication'
$PRIMARY -qc 'CREATE TABLE dbdeployer_test_replication (i INT NOT NULL PRIMARY KEY, msg VARCHAR(50), d DATE, t TIME, dt TIMESTAMP)'
for N in $(seq -f '%%02.0f' 1 20); do
$PRIMARY -qc "INSERT INTO dbdeployer_test_replication VALUES ($N, 'test sandbox $N', '${YYYY_MM}-$N', '11:23:$N', '${YYYY_MM}-01 12:34:$N')"
done

PRIMARY_RECS=$($PRIMARY -Atqc 'SELECT count(*) FROM dbdeployer_test_replication')
PRIMARY_LSN=$($PRIMARY -Atqc 'SELECT pg_current_wal_lsn()')
echo "# Primary: rows=$PRIMARY_RECS LSN=$PRIMARY_LSN"

# Allow a brief window for streaming replication to catch up
sleep 1

`, opts.SandboxDir))

for n := 1; n <= numReplicas; n++ {
b.WriteString(fmt.Sprintf(`echo "# Testing replica %d"
if [ -x ./replica%d/use ]; then
REPLICA_LSN=$(./replica%d/use -Atqc 'SELECT pg_last_wal_replay_lsn()')
REPLICA_RECS=$(./replica%d/use -Atqc 'SELECT count(*) FROM dbdeployer_test_replication')
REPLICA_IS_REPLICA=$(./replica%d/use -Atqc 'SELECT pg_is_in_recovery()')
ok_equal "$REPLICA_IS_REPLICA" "t" "replica %d is in recovery mode"
ok_equal "$REPLICA_LSN" "$PRIMARY_LSN" "replica %d LSN matches primary"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Comparing LSNs as strings for exact equality is fragile in PostgreSQL. Even in an idle sandbox, background processes (like the checkpointer or autovacuum) can advance the WAL LSN on the primary between the time the inserts finish and the LSN is captured, or during the sleep period. A more robust approach would be to use PostgreSQL's LSN comparison operator to ensure the replica has replayed at least up to the primary's LSN: SELECT pg_last_wal_replay_lsn() >= '$PRIMARY_LSN'::pg_lsn. This would return 't' if the replica is caught up or ahead, avoiding false negatives.

ok_equal "$REPLICA_RECS" "$PRIMARY_RECS" "replica %d row count matches primary"
Comment on lines +126 to +139
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🌐 Web query:

PostgreSQL standby validation best practice: should replication catch-up checks use pg_last_wal_replay_lsn() >= ''::pg_lsn with polling timeout instead of fixed sleep + strict equality?

💡 Result:

Yes—best practice is not to rely on fixed sleeps plus strict equality checks. Prefer waiting semantics (deadline/polling with timeout) against LSN replay progress, typically using a “wait until LSN is reached” mechanism; if you must poll, use a >= comparison rather than ==. Rationale (grounded in PostgreSQL behavior and docs): 1) Replay progresses in steps and may temporarily not exactly match your target LSN. Using a threshold check like pg_last_wal_replay_lsn >= target is robust against minor timing differences, and polling with a timeout avoids both indefinite hangs and coarse fixed sleeps. The WAL replay position on the standby is specifically exposed via pg_last_wal_replay_lsn as the “most recent WAL location replayed” by a streaming standby [1]. 2) PostgreSQL provides a built-in SQL facility designed for this exact pattern: WAIT FOR, which “waits until the specified lsn is reached” and supports timeout. It also clarifies standby modes: standby_replay waits for replay (applied to the database) which matches your use case (“catch-up” before validation) [2]. On timeout it errors unless NO_THROW is set [2]. 3) Replication catch-up is about lags and states rather than instantaneous equality. The documentation for warm standby explains the system transitions from “catchup” until the lag between standby and primary reaches zero for the first time, after which it becomes real-time streaming; this naturally implies using lag/position thresholds and waits rather than strict equality at an arbitrary sampling time [3]. Concrete recommendation: - If you can, use WAIT FOR with standby_replay and a TIMEOUT rather than custom loops. Example (conceptual): store the target LSN on the primary client side, then on the standby: WAIT FOR <target_lsn> IN standby_replay TIMEOUT '' (syntax/interval depends on your PostgreSQL version/client usage) [2]. - If you cannot use WAIT FOR, implement polling on the standby replay LSN with a timeout and a >= comparison: - compute target_lsn (from the primary, e.g., primary current WAL LSN) - loop: SELECT pg_last_wal_replay_lsn; succeed when replay_lsn >= target_lsn - abort when deadline exceeded This aligns with how pg_last_wal_replay_lsn represents “replayed” progress [1] and with “wait until reached” semantics [2]. So to your exact question: - Should you use pg_last_wal_replay_lsn >= ''::pg_lsn with polling timeout instead of fixed sleep + strict equality? Yes: >= + timeout polling (or ideally WAIT FOR standby_replay + TIMEOUT) is the safer, more correct best practice than == with fixed sleep [2][1].

Citations:


🏁 Script executed:

# First, check if the file exists and examine its structure
head -20 providers/postgresql/scripts.go

Repository: ProxySQL/dbdeployer

Length of output: 363


🏁 Script executed:

# Check for the GenerateTestReplicationScript function and the specific code section
sed -n '64,150p' providers/postgresql/scripts.go | head -100

Repository: ProxySQL/dbdeployer

Length of output: 3266


🏁 Script executed:

# Verify copyright header requirement for .go files per guidelines
head -1 providers/postgresql/scripts.go

Repository: ProxySQL/dbdeployer

Length of output: 82


Use polling with >= comparison instead of fixed sleep plus strict LSN equality to avoid flaky replica validation.

Fixed sleep 1 followed by strict string equality on LSN (REPLICA_LSN == PRIMARY_LSN) causes false failures when replay progresses or lags naturally occur. PostgreSQL replication best practice is to poll with a timeout using pg_last_wal_replay_lsn() >= target_lsn comparison, or use WAIT FOR with timeout if available. This ensures the test tolerates normal replication timing variations.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@providers/postgresql/scripts.go` around lines 126 - 139, The test currently
does a fixed "sleep 1" and then strict string equality between REPLICA_LSN and
PRIMARY_LSN (variables used in the replica loop and the ok_equal assertions),
which is flaky; replace that with a polling loop per replica inside the
numReplicas loop that repeatedly queries pg_last_wal_replay_lsn() (via
./replica%d/use) and checks whether the replay LSN is >= PRIMARY_LSN (or use
pg_lsn comparison/CAST) until either the condition is true or a short timeout
(e.g., several seconds) elapses, then assert success/failure; remove the fixed
sleep and change the ok_equal LSN assertion to assert that the poll succeeded
(and keep the row-count and recovery-mode checks as before).

else
echo "not ok - replica %d/use not found"
FAILED=$((FAILED+1))
fi

`, n, n, n, n, n, n, n, n, n))
}

b.WriteString("test_summary\n")
return b.String()
}
Loading