Skip to content

Commit

Permalink
config: validate names and UUIDs before box.cfg
Browse files Browse the repository at this point in the history
Currently only instance_uuid is validated before the recovery process.
All names and replicaset_uuid are checked only when recovery is done,
which can take a long time. This can be frustrating for users who
have waited several hours only to get a name mismatch error.

Let's read a small part of the snapshot file before calling box.cfg
in order to figure out whether the names and UUIDs passed in the
configuration match the ones saved inside the snapshot. Let's also
save all names that are missing from the snapshot in order to set
them automatically in the future.

Needed for tarantool#8978

NO_DOC=tarantool/doc#3661
  • Loading branch information
Serpentian committed Oct 12, 2023
1 parent 8259abc commit dc1267a
Show file tree
Hide file tree
Showing 3 changed files with 363 additions and 2 deletions.
5 changes: 5 additions & 0 deletions changelogs/unreleased/config-validate-identifiers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
## feature/config

* Introduced validation for replicaset_name/uuid and instance_name/uuid
mismatches before the recovery process when Tarantool is configured via
a YAML file or etcd.
193 changes: 191 additions & 2 deletions src/box/lua/config/configdata.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,18 @@

local fio = require('fio')
local fun = require('fun')
local json = require('json')
local urilib = require('uri')
local digest = require('digest')
local xlog = require('xlog')
local instance_config = require('internal.config.instance_config')
local cluster_config = require('internal.config.cluster_config')

-- The map of peers, which don't have names set. Global, as it should
-- not change during fetching new config data. Created only during
-- the first startup.
local missing_names = {}

local function choose_iconfig(self, opts)
if opts ~= nil and opts.peer ~= nil then
local peers = self._peers
Expand Down Expand Up @@ -246,6 +253,177 @@ local function effective_snapshot_dir(iconfig)
return fio.abspath(fio.pathjoin(work_dir, snap_dir))
end

-- Read snap file and return a map of saved UUIDs and names for
-- all instances and for the current replicaset.
--
-- The returned table has the following fields:
--
-- * replicaset_name -- value of the 'replicaset_name' record of
--   _schema, nil if there is no such record;
-- * replicaset_uuid -- value of the 'replicaset_uuid' record of
--   _schema (or of the legacy 'cluster' record, see below);
-- * instance_uuid -- instance UUID taken from the snapshot meta;
-- * instance_name -- name stored in the _cluster record, whose UUID
--   equals instance_uuid, nil if the name is not stored;
-- * peers -- array of {instance_uuid = ..., instance_name = ...},
--   one element per _cluster record.
local function snapshot_names(snap_path)
    -- Hardcoded space IDs, used to determine the names, persisted
    -- in snapshot. Don't determine IDs dynamically, as it's needed
    -- to read xlog file twice: getting IDs from _space and only after
    -- that getting names from _schema and _cluster. IDs are anyway
    -- hardcoded in box/schema_def.h.
    local space_ids = {
        ['_schema'] = 272,
        ['_cluster'] = 320,
    }

    local peers = {}
    local instance_uuid = xlog.meta(snap_path).instance_uuid
    local instance_name, replicaset_name, replicaset_uuid
    for _, row in xlog.pairs(snap_path) do
        local body = row.BODY
        local space_id = body.space_id
        -- Rows without a space ID carry nothing we're interested in.
        if space_id then
            if space_id > space_ids._cluster then
                -- No sense in scanning after _cluster.
                break
            end

            local tuple = body.tuple
            if space_id == space_ids._schema then
                local key = tuple[1]
                if key == 'replicaset_uuid' then
                    replicaset_uuid = tuple[2]
                elseif key == 'cluster' and replicaset_uuid == nil then
                    -- Snapshots written before the schema upgrade to
                    -- 3.0 keep the replicaset UUID under the legacy
                    -- 'cluster' key. Without this fallback such a
                    -- snapshot yields no replicaset UUID at all. The
                    -- new key, if present, always takes precedence.
                    replicaset_uuid = tuple[2]
                elseif key == 'replicaset_name' then
                    replicaset_name = tuple[2]
                end
            elseif space_id == space_ids._cluster then
                -- _cluster tuple: {<id>, <uuid>[, <name>]}.
                if tuple[2] == instance_uuid then
                    instance_name = tuple[3]
                end

                table.insert(peers, {
                    instance_uuid = tuple[2],
                    instance_name = tuple[3],
                })
            end
        end
    end

    return {
        replicaset_name = replicaset_name,
        replicaset_uuid = replicaset_uuid,
        instance_name = instance_name,
        instance_uuid = instance_uuid,
        peers = peers,
    }
end

-- Search the map of config peers ({<name> = <peer>}) for a peer, whose
-- 'database.instance_uuid' option equals the given UUID. Returns the
-- name of the matching peer or nil if no peer matches.
local function find_peer_by_uuid(peers, instance_uuid)
    local found_name = nil
    for peer_name, peer_data in pairs(peers) do
        local peer_uuid = instance_config:get(peer_data.iconfig_def,
                                              'database.instance_uuid')
        if peer_uuid == instance_uuid then
            found_name = peer_name
            break
        end
    end
    return found_name
end

-- Check the UUIDs and names given in the config against the ones
-- persisted in the snapshot. An error is raised on the first mismatch,
-- so that a wrong config is rejected before all data is recovered.
--
-- Returns a map of names, which are present in the config, but are
-- missing in the snapshot, in the following format: {<name> = <uuid>}.
-- The replicaset name is added to the same map (mapped to the
-- replicaset UUID) when the snapshot doesn't store it.
local function validate_names(snap_names, config_names)
    -- The snapshot is expected to always provide the replicaset
    -- UUID, the instance UUID and at least one _cluster record.
    assert(snap_names.replicaset_uuid)
    assert(snap_names.instance_uuid)
    assert(#snap_names.peers > 0)
    -- The config is expected to always provide the names.
    assert(config_names.replicaset_name ~= nil)
    assert(config_names.instance_name ~= nil)

    -- UUIDs are optional in the config: compare only when given.
    local cfg_rs_uuid = config_names.replicaset_uuid
    if cfg_rs_uuid ~= nil and cfg_rs_uuid ~= snap_names.replicaset_uuid then
        error(string.format('Replicaset UUID mismatch. Snapshot: %s, ' ..
                            'config: %s.', snap_names.replicaset_uuid,
                            cfg_rs_uuid))
    end

    -- Names are optional in the snapshot: compare only when stored.
    local snap_rs_name = snap_names.replicaset_name
    if snap_rs_name ~= nil and snap_rs_name ~= config_names.replicaset_name then
        error(string.format('Replicaset name mismatch. Snapshot: %s, ' ..
                            'config: %s.', snap_rs_name,
                            config_names.replicaset_name))
    end

    local cfg_uuid = config_names.instance_uuid
    if cfg_uuid ~= nil and cfg_uuid ~= snap_names.instance_uuid then
        error(string.format('Instance UUID mismatch. Snapshot: %s, ' ..
                            'config: %s.', snap_names.instance_uuid,
                            cfg_uuid))
    end

    local snap_name = snap_names.instance_name
    if snap_name ~= nil and snap_name ~= config_names.instance_name then
        error(string.format('Instance name mismatch. Snapshot: %s, ' ..
                            'config: %s.', snap_name,
                            config_names.instance_name))
    end

    -- Neither the name in the snapshot nor the UUID in the config is
    -- available: there is nothing to identify the instance by, so
    -- fail early.
    --
    -- TODO: it's NoOp for now, as instance_uuid is automatically
    -- generated for vshard configuration. This error and the error
    -- about unmatched UUIDs below can't happen and consequently are
    -- not tested. We must get rid of generating UUIDs and test it
    -- properly.
    --
    -- For now it's impossible to recover from snaps without manually
    -- specifying UUIDs.
    if snap_name == nil and cfg_uuid == nil then
        error(string.format('Instance name is not set in snapshot and UUID ' ..
                            'is missing in the config. Found %s in snapshot.',
                            snap_names.instance_uuid))
    end

    -- Every peer, which has no name in the snapshot, must have its
    -- UUID in the config, otherwise it's impossible to validate, that
    -- the right config is applied to the instance.
    --
    -- Don't fail on the first UUID, which can't be matched. Gather
    -- a batch of all such UUIDs and report them together.
    local unmatched_uuids = {}
    local names_to_set = {}
    for _, snap_peer in ipairs(snap_names.peers) do
        if snap_peer.instance_name == nil then
            local peer_uuid = snap_peer.instance_uuid
            assert(peer_uuid ~= nil)
            local peer_name = find_peer_by_uuid(config_names.peers, peer_uuid)
            if peer_name == nil then
                unmatched_uuids[#unmatched_uuids + 1] = peer_uuid
            else
                names_to_set[peer_name] = peer_uuid
            end
        end
    end

    if #unmatched_uuids > 0 then
        -- We don't have any info about names, which should be associated
        -- with these UUIDs. E.g. if no UUIDs are given, master has name
        -- set in snapshot, and replica doesn't, then master will fail with
        -- this error, but replica - with error 'Instance name is not set'
        -- (see above). So, replica will say, which UUID should be set,
        -- but everyone fails to start.
        error(string.format("Some replicas without names doesn't have UUIDs " ..
                            "set in config. Cannot match the following UUIDs" ..
                            " with names: %s", json.encode(unmatched_uuids)))
    end

    -- The replicaset name is missing in the snapshot too: report it
    -- in the same map.
    if snap_rs_name == nil then
        names_to_set[config_names.replicaset_name] = snap_names.replicaset_uuid
    end

    return names_to_set
end

local function new(iconfig, cconfig, instance_name)
-- Precalculate configuration with applied defaults.
local iconfig_def = instance_config:apply_default(iconfig)
Expand Down Expand Up @@ -399,15 +577,25 @@ local function new(iconfig, cconfig, instance_name)

-- Snapshot, used for recovery.
local snapshot_path = nil
-- Recovery can be done only during startup. Don't set snapshot_path,
-- when config reload is done. It's not needed anymore.
-- Recovery can be done only during startup. Don't set snapshot_path and
-- missing_names, when config reload is done.
if type(box.cfg) == 'function' then
local snap_dir = effective_snapshot_dir(iconfig_def)
local glob = fio.glob(fio.pathjoin(snap_dir, '*.snap'))
if #glob > 0 then
table.sort(glob)
snapshot_path = glob[#glob]
end

if snapshot_path ~= nil then
missing_names = validate_names(snapshot_names(snapshot_path), {
replicaset_uuid = replicaset_uuid,
replicaset_name = found.replicaset_name,
instance_uuid = instance_uuid,
instance_name = instance_name,
peers = peers,
})
end
end

return setmetatable({
Expand All @@ -426,6 +614,7 @@ local function new(iconfig, cconfig, instance_name)
_bootstrap_leader = bootstrap_leader,
_bootstrap_leader_name = bootstrap_leader_name,
_snapshot_path = snapshot_path,
_missing_names = missing_names,
}, mt)
end

Expand Down

0 comments on commit dc1267a

Please sign in to comment.