Fix config loading from snapshots on startup.
This fixes an issue with loading node configuration from a snapshot into
the Raft library when RedisRaft restarts and loads from a snapshot.

Due to missing initialization, the Raft library remained in a state where
the loaded configuration was *working*, but an attempt to re-create a
"2nd generation" snapshot from it resulted in missing nodes.

This may have been obscured by #33, as well as by the fact that
additional node membership operations, if performed before a snapshot is
taken, would correct this state.

This seems to be the root cause of #44.
yossigo committed May 8, 2020
1 parent b9ee410 commit 51c1b782493512533d7f8d277c95531023d68b95
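
For context, loading a snapshot into the embedded Raft library is a bracketed sequence: begin the load, feed in the stored configuration, then end the load. The fix below adds the missing tail of that sequence. A minimal sketch of the intended flow, assuming the willemt/raft-style API RedisRaft embeds (only the calls that appear in the diff below are taken from the source; the begin call and its placement are assumptions):

    /* Hypothetical sketch of the snapshot-load sequence on startup; names
     * not shown in the diff below (raft_begin_load_snapshot() and its
     * arguments) are assumptions based on the embedded library's API. */
    raft_begin_load_snapshot(rr->raft,
            rr->snapshot_info.last_applied_term,
            rr->snapshot_info.last_applied_idx);
    configRaftFromSnapshotInfo(rr);   /* re-add nodes from the stored config */
    raft_end_load_snapshot(rr->raft); /* complete the library's load state */
    raft_set_snapshot_metadata(rr->raft, rr->snapshot_info.last_applied_term,
            rr->snapshot_info.last_applied_idx);
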
Showing 5 changed files with 76 additions and 0 deletions.
raft.c
@@ -984,7 +984,12 @@ static void configureFromSnapshot(RedisRaftCtx *rr)
                c->id, c->addr.host, c->addr.port, c->active, c->voting);
    }

    /* Load the configuration from the snapshot into the Raft library.
     */
    configRaftFromSnapshotInfo(rr);
    raft_end_load_snapshot(rr->raft);
    raft_set_snapshot_metadata(rr->raft, rr->snapshot_info.last_applied_term,
            rr->snapshot_info.last_applied_idx);
}

RRStatus RedisRaftInit(RedisModuleCtx *ctx, RedisRaftCtx *rr, RedisRaftConfig *config)
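
The configRaftFromSnapshotInfo() helper itself is not shown in this diff. As a rough idea of what it has to do, a hedged sketch follows, using the raft_add_node()/raft_add_non_voting_node() calls of the embedded Raft library; the cfg list layout and rr->config->id are assumptions, not taken from the RedisRaft source (c->id and c->voting do appear in the context lines above):

    /* Hypothetical sketch: rebuild the Raft library's node list from the
     * configuration carried in the snapshot. The cfg/next linkage and
     * rr->config->id are assumptions. */
    static void configRaftFromSnapshotInfoSketch(RedisRaftCtx *rr)
    {
        SnapshotCfgEntry *c;

        for (c = rr->snapshot_info.cfg; c != NULL; c = c->next) {
            int is_self = (c->id == rr->config->id);
            if (c->voting) {
                raft_add_node(rr->raft, NULL, c->id, is_self);
            } else {
                raft_add_non_voting_node(rr->raft, NULL, c->id, is_self);
            }
        }
    }
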
@@ -498,6 +498,18 @@ void handleLoadSnapshot(RedisRaftCtx *rr, RaftReq *req)
            rr->config);
    }

    /* Re-create the snapshot key in the keyspace, to be sure we'll get a
     * chance to serialize it into the RDB file when it is saved.
     *
     * Note: this is just a precaution; the snapshot we load should contain
     * the meta-key anyway, so we should be safe either way.
     *
     * Future improvement: consider using hooks to handle this automatically.
     * That would not just be cleaner, but also fool-proof in case someone
     * decides to manually dump an RDB file, etc.
     */
    initializeSnapshotInfo(rr);

    RedisModule_ThreadSafeContextUnlock(rr->ctx);
    RedisModule_ReplyWithLongLong(req->ctx, 1);

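initializeSnapshotInfo() is likewise defined elsewhere. For illustration only, re-creating a module-type meta-key with the Redis module API could look like the sketch below; the key name and the RedisRaftType variable are assumptions, not taken from the RedisRaft source:

    /* Hypothetical sketch: re-create the snapshot-info meta-key so that the
     * next RDB save serializes it via the module type's rdb_save callback.
     * The key name and RedisRaftType are assumptions. */
    static void recreateSnapshotInfoKey(RedisRaftCtx *rr)
    {
        RedisModuleString *name = RedisModule_CreateString(rr->ctx,
                "__raft_snapshot_info__", 22);
        RedisModuleKey *key = RedisModule_OpenKey(rr->ctx, name,
                REDISMODULE_WRITE);

        RedisModule_ModuleTypeSetValue(key, RedisRaftType, &rr->snapshot_info);
        RedisModule_CloseKey(key);
        RedisModule_FreeString(rr->ctx, name);
    }
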
File renamed without changes.
@@ -176,3 +176,28 @@ def test_proxy_stability_under_load(cluster, workload):
        last_commit_index = new_commit_index

    workload.stop()


@pytest.mark.slow
def test_stability_with_snapshots_and_restarts(cluster, workload):
    """
    Test stability of the cluster under frequent snapshotting and restarts.
    """

    thread_count = 100
    duration = 300

    cluster.create(5, raft_args={'follower-proxy': 'yes',
                                 'raftize-all-commands': 'yes',
                                 'raft-log-max-file-size': '2000'})

    workload.start(thread_count, cluster, MultiWithLargeReply)

    # Keep restarting random nodes for the duration of the test
    start = time.time()
    last_commit_index = 0
    while start + duration > time.time():
        time.sleep(2)
        cluster.random_node().restart()

    workload.stop()
@@ -309,3 +309,37 @@ def test_loading_log_tail_after_rewrite(cluster):
    # Log contains last 3 entries
    # Snapshot has first 3 entries
    assert r1.client.get('testkey') == b'6'


def test_config_from_second_generation_snapshot(cluster):
    """
    A regression test for #44: confirm that if we load a snapshot
    on startup, do nothing, and then re-create a snapshot, we don't
    end up with a corrupted nodes configuration.
    """
    cluster.create(3)

    # Bump the log a bit
    for _ in range(20):
        assert cluster.raft_exec('INCR', 'testkey')

    # Compact to get rid of the log
    node3 = cluster.node(3)
    assert node3.client.execute_command('RAFT.DEBUG', 'COMPACT') == b'OK'

    # Restart the node so it loads from the snapshot
    node3.restart()
    node3.wait_for_node_voting()

    # Bump the log a bit more
    for _ in range(20):
        assert cluster.raft_exec('INCR', 'testkey')

    # Compact again, creating a "2nd generation" snapshot
    cluster.wait_for_unanimity()
    assert node3.client.execute_command('RAFT.DEBUG', 'COMPACT') == b'OK'

    node3.restart()
    node3.wait_for_node_voting()

    assert node3.raft_info()['num_nodes'] == 3
