Skip to content

Commit

Permalink
cleanup msgpack related str/bytes mess, fixes borgbackup#968
Browse files Browse the repository at this point in the history
see ticket and borg.helpers.msgpack docstring.

this changeset implements the full migration to
msgpack 2.0 spec (use_bin_type=True, raw=False).

still needed compat to the past is done via want_bytes decoder in borg.item.
  • Loading branch information
ThomasWaldmann committed May 6, 2022
1 parent 9447273 commit b2cfad4
Show file tree
Hide file tree
Showing 11 changed files with 126 additions and 131 deletions.
11 changes: 4 additions & 7 deletions src/borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -1753,13 +1753,10 @@ def rebuild_manifest(self):
Iterates through all objects in the repository looking for archive metadata blocks.
"""
required_archive_keys = frozenset(key.encode() for key in REQUIRED_ARCHIVE_KEYS)

def valid_archive(obj):
if not isinstance(obj, dict):
return False
keys = set(obj)
return required_archive_keys.issubset(keys)
return REQUIRED_ARCHIVE_KEYS.issubset(obj)

logger.info('Rebuilding missing manifest, this might take some time...')
# as we have lost the manifest, we do not know any more what valid item keys we had.
Expand Down Expand Up @@ -1939,10 +1936,10 @@ def list_keys_safe(keys):
def valid_item(obj):
if not isinstance(obj, StableDict):
return False, 'not a dictionary'
# A bug in Attic up to and including release 0.13 added a (meaningless) b'acl' key to every item.
# A bug in Attic up to and including release 0.13 added a (meaningless) 'acl' key to every item.
# We ignore it here, should it exist. See test_attic013_acl_bug for details.
obj.pop(b'acl', None)
keys = set(k.decode('utf-8', errors='replace') for k in obj)
obj.pop('acl', None)
keys = set(obj)
if not required_item_keys.issubset(keys):
return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys)
if not keys.issubset(item_keys):
Expand Down
2 changes: 1 addition & 1 deletion src/borg/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -2327,7 +2327,7 @@ def output(fd):

unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
first = True
for item_id in archive_org_dict[b'items']:
for item_id in archive_org_dict['items']:
data = key.decrypt(item_id, repository.get(item_id))
unpacker.feed(data)
for item in unpacker:
Expand Down
10 changes: 5 additions & 5 deletions src/borg/crypto/key.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,24 +232,24 @@ def unpack_and_verify_manifest(self, data, force_tam_not_required=False):
unpacker = get_limited_unpacker('manifest')
unpacker.feed(data)
unpacked = unpacker.unpack()
if b'tam' not in unpacked:
if 'tam' not in unpacked:
if tam_required:
raise TAMRequiredError(self.repository._location.canonical_path())
else:
logger.debug('TAM not found and not required')
return unpacked, False
tam = unpacked.pop(b'tam', None)
tam = unpacked.pop('tam', None)
if not isinstance(tam, dict):
raise TAMInvalid()
tam_type = tam.get(b'type', b'<none>').decode('ascii', 'replace')
tam_type = tam.get('type', '<none>')
if tam_type != 'HKDF_HMAC_SHA512':
if tam_required:
raise TAMUnsupportedSuiteError(repr(tam_type))
else:
logger.debug('Ignoring TAM made with unsupported suite, since TAM is not required: %r', tam_type)
return unpacked, False
tam_hmac = tam.get(b'hmac')
tam_salt = tam.get(b'salt')
tam_hmac = tam.get('hmac')
tam_salt = tam.get('salt')
if not isinstance(tam_salt, bytes) or not isinstance(tam_hmac, bytes):
raise TAMInvalid()
offset = data.index(tam_hmac)
Expand Down
34 changes: 15 additions & 19 deletions src/borg/helpers/msgpack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
wrapping msgpack
================
Due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it -
to avoid having lots of clutter in the calling code. see tickets #968 and #3632.
We wrap msgpack here the way we need it - to avoid having lots of clutter in the calling code.
Packing
-------
Expand All @@ -22,30 +21,27 @@
Unpacking
---------
- raw = True (the old way, used by borg <= 1.3)
This is currently still needed to not try to decode "raw" msgpack objects.
These could come either from str (new or old msgpack) or bytes (old msgpack).
Thus, we basically must know what we want and either keep the bytes we get
or decode them to str, if we want str.
- raw = False (the new way)
This can be used in future, when we do not have to deal with data any more that was packed the old way.
- raw = False (used by borg since borg 1.3)
We already can use this with borg 1.3 due to the want_bytes decoder.
This decoder can be removed in future, when we do not have to deal with data any more that was packed the old way.
It will then unpack according to the msgpack 2.0 spec format and directly output bytes or str.
- raw = True (the old way, used by borg < 1.3)
- unicode_errors = 'surrogateescape' -> see description above (will be used when raw is False).
As of borg 1.3, we have the first part on the way to fix the msgpack str/bytes mess, #968.
borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely.
But from now on, borg only **writes** new data according to the new msgpack spec,
thus we can complete the fix for #968 in a later borg release.
As of borg 1.3, we have fixed most of the msgpack str/bytes mess, #968.
Borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely.
But from now on, borg only **writes** new data according to the new msgpack 2.0 spec,
thus we can remove some legacy support in a later borg release (some places are marked with "legacy").
current way in msgpack terms
----------------------------
- pack with use_bin_type=True (according to msgpack 2.0 spec)
- packs str -> raw and bytes -> bin
- unpack with raw=True (aka "the old way")
- unpacks raw to bytes (thus we always need to decode manually if we want str)
- unpack with raw=False (according to msgpack 2.0 spec, using unicode_errors='surrogateescape')
- unpacks bin to bytes and raw to str (thus we need to re-encode manually if we want bytes from "raw")
"""

from .datastruct import StableDict
Expand All @@ -66,8 +62,8 @@
version = mp_version

USE_BIN_TYPE = True
RAW = True # should become False later when we do not need to read old stuff any more
UNICODE_ERRORS = 'surrogateescape' # previously done by safe_encode, safe_decode
RAW = False
UNICODE_ERRORS = 'surrogateescape'


class PackException(Exception):
Expand Down Expand Up @@ -161,7 +157,7 @@ def unpackb(packed, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
def unpack(stream, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
strict_map_key=False,
**kwargs):
# assert raw == RAW
assert raw == RAW
assert unicode_errors == UNICODE_ERRORS
try:
kw = dict(raw=raw, unicode_errors=unicode_errors,
Expand Down
35 changes: 22 additions & 13 deletions src/borg/item.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ def fix_tuple_of_str_and_int(t):
return t


def want_bytes(v):
"""we know that we want bytes and the value should be bytes"""
# legacy support: it being str can be caused by msgpack unpack decoding old data that was packed with use_bin_type=False
if isinstance(v, str):
v = v.encode('utf-8', errors='surrogateescape')
assert isinstance(v, bytes)
return v


class PropDict:
"""
Manage a dictionary via properties.
Expand Down Expand Up @@ -204,10 +213,10 @@ class Item(PropDict):
user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None')
group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None')

acl_access = PropDict._make_property('acl_access', bytes)
acl_default = PropDict._make_property('acl_default', bytes)
acl_extended = PropDict._make_property('acl_extended', bytes)
acl_nfs4 = PropDict._make_property('acl_nfs4', bytes)
acl_access = PropDict._make_property('acl_access', bytes, decode=want_bytes)
acl_default = PropDict._make_property('acl_default', bytes, decode=want_bytes)
acl_extended = PropDict._make_property('acl_extended', bytes, decode=want_bytes)
acl_nfs4 = PropDict._make_property('acl_nfs4', bytes, decode=want_bytes)

mode = PropDict._make_property('mode', int)
uid = PropDict._make_property('uid', int)
Expand All @@ -224,7 +233,7 @@ class Item(PropDict):
# compatibility note: this is a new feature, in old archives size will be missing.
size = PropDict._make_property('size', int)

hlid = PropDict._make_property('hlid', bytes) # hard link id: same value means same hard link.
hlid = PropDict._make_property('hlid', bytes, decode=want_bytes) # hard link id: same value means same hard link.
hardlink_master = PropDict._make_property('hardlink_master', bool) # legacy

chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None')
Expand Down Expand Up @@ -364,9 +373,9 @@ class EncryptedKey(PropDict):
version = PropDict._make_property('version', int)
algorithm = PropDict._make_property('algorithm', str)
iterations = PropDict._make_property('iterations', int)
salt = PropDict._make_property('salt', bytes)
hash = PropDict._make_property('hash', bytes)
data = PropDict._make_property('data', bytes)
salt = PropDict._make_property('salt', bytes, decode=want_bytes)
hash = PropDict._make_property('hash', bytes, decode=want_bytes)
data = PropDict._make_property('data', bytes, decode=want_bytes)
argon2_time_cost = PropDict._make_property('argon2_time_cost', int)
argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int)
argon2_parallelism = PropDict._make_property('argon2_parallelism', int)
Expand Down Expand Up @@ -400,10 +409,10 @@ class Key(PropDict):
__slots__ = ("_dict", ) # avoid setting attributes not supported by properties

version = PropDict._make_property('version', int)
repository_id = PropDict._make_property('repository_id', bytes)
enc_key = PropDict._make_property('enc_key', bytes)
enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes)
id_key = PropDict._make_property('id_key', bytes)
repository_id = PropDict._make_property('repository_id', bytes, decode=want_bytes)
enc_key = PropDict._make_property('enc_key', bytes, decode=want_bytes)
enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes, decode=want_bytes)
id_key = PropDict._make_property('id_key', bytes, decode=want_bytes)
chunk_seed = PropDict._make_property('chunk_seed', int)
tam_required = PropDict._make_property('tam_required', bool)

Expand Down Expand Up @@ -444,7 +453,7 @@ class ArchiveItem(PropDict):
chunker_params = PropDict._make_property('chunker_params', tuple)
recreate_cmdline = PropDict._make_property('recreate_cmdline', list) # list of s-e-str
# recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. b2
recreate_source_id = PropDict._make_property('recreate_source_id', bytes)
recreate_source_id = PropDict._make_property('recreate_source_id', bytes, decode=want_bytes)
recreate_args = PropDict._make_property('recreate_args', list) # list of s-e-str
recreate_partial_chunks = PropDict._make_property('recreate_partial_chunks', list) # list of tuples
size = PropDict._make_property('size', int)
Expand Down

0 comments on commit b2cfad4

Please sign in to comment.