Skip to content

Commit 51c1b78

Browse files
committed
Fix config loading from snapshots on startup.
This fixes an issue with loading the node configuration from a snapshot into the Raft library when RedisRaft restarts and loads from a snapshot. Due to missing initialization, the Raft library remained in a state where the loaded configuration was *working*, but an attempt to re-create a "2nd generation" snapshot from it resulted in missing nodes. This may have been obscured by #33, as well as by the fact that additional node membership operations, if done before a snapshot is taken, would correct this state. This seems to be the root cause of #44.
1 parent b9ee410 commit 51c1b78

File tree

5 files changed

+76
-0
lines changed

5 files changed

+76
-0
lines changed

Diff for: raft.c

+5
Original file line numberDiff line numberDiff line change
@@ -984,7 +984,12 @@ static void configureFromSnapshot(RedisRaftCtx *rr)
984984
c->id, c->addr.host, c->addr.port, c->active, c->voting);
985985
}
986986

987+
/* Apply the configuration read from the snapshot to the Raft library.
988+
*/
987989
configRaftFromSnapshotInfo(rr);
990+
raft_end_load_snapshot(rr->raft);
991+
raft_set_snapshot_metadata(rr->raft, rr->snapshot_info.last_applied_term,
992+
rr->snapshot_info.last_applied_idx);
988993
}
989994

990995
RRStatus RedisRaftInit(RedisModuleCtx *ctx, RedisRaftCtx *rr, RedisRaftConfig *config)

Diff for: snapshot.c

+12
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,18 @@ void handleLoadSnapshot(RedisRaftCtx *rr, RaftReq *req)
498498
rr->config);
499499
}
500500

501+
/* Recreate the snapshot key in keyspace, to be sure we'll get a chance to
502+
* serialize it into the RDB file when it is saved.
503+
*
504+
* Note: this is just a precaution, because the snapshot we load should contain
505+
* the meta-key anyway so we should be safe either way.
506+
*
507+
* Future improvement: consider using hooks to automatically handle this. It
508+
* would not only be cleaner but also fool-proof in case someone decides to
509+
* manually dump an RDB file etc.
510+
*/
511+
initializeSnapshotInfo(rr);
512+
501513
RedisModule_ThreadSafeContextUnlock(rr->ctx);
502514
RedisModule_ReplyWithLongLong(req->ctx, 1);
503515

File renamed without changes.

Diff for: tests/integration/test_fuzzing.py

+25
Original file line numberDiff line numberDiff line change
@@ -176,3 +176,28 @@ def test_proxy_stability_under_load(cluster, workload):
176176
last_commit_index = new_commit_index
177177

178178
workload.stop()
179+
180+
181+
@pytest.mark.slow
182+
def test_stability_with_snapshots_and_restarts(cluster, workload):
183+
"""
184+
Test stability of the cluster with frequent snapshotting and node restarts.
185+
"""
186+
187+
thread_count = 100
188+
duration = 300
189+
190+
cluster.create(5, raft_args={'follower-proxy': 'yes',
191+
'raftize-all-commands': 'yes',
192+
'raft-log-max-file-size': '2000'})
193+
194+
workload.start(thread_count, cluster, MultiWithLargeReply)
195+
196+
# Monitor progress
197+
start = time.time()
198+
last_commit_index = 0
199+
while start + duration > time.time():
200+
time.sleep(2)
201+
cluster.random_node().restart()
202+
203+
workload.stop()

Diff for: tests/integration/test_snapshots.py

+34
Original file line numberDiff line numberDiff line change
@@ -309,3 +309,37 @@ def test_loading_log_tail_after_rewrite(cluster):
309309
# Log contains last 3 entries
310310
# Snapshot has first 3 entries
311311
assert r1.client.get('testkey') == b'6'
312+
313+
314+
def test_config_from_second_generation_snapshot(cluster):
315+
"""
316+
A regression test for #44: confirm that if we load a snapshot
317+
on startup, do nothing, then re-create a snapshot we don't end
318+
up with a messed up nodes config.
319+
"""
320+
cluster.create(3)
321+
322+
# Bump the log a bit
323+
for _ in range(20):
324+
assert cluster.raft_exec('INCR', 'testkey')
325+
326+
# Compact to get rid of logs
327+
node3 = cluster.node(3)
328+
assert node3.client.execute_command('RAFT.DEBUG', 'COMPACT') == b'OK'
329+
330+
# Restart node
331+
node3.restart()
332+
node3.wait_for_node_voting()
333+
334+
# Bump the log a bit
335+
for _ in range(20):
336+
assert cluster.raft_exec('INCR', 'testkey')
337+
338+
# Recompact
339+
cluster.wait_for_unanimity()
340+
assert node3.client.execute_command('RAFT.DEBUG', 'COMPACT') == b'OK'
341+
342+
node3.restart()
343+
node3.wait_for_node_voting()
344+
345+
assert node3.raft_info()['num_nodes'] == 3

0 commit comments

Comments
 (0)