Skip to content

Commit

Permalink
Better approach to store dict attributes in HDF5
Browse files Browse the repository at this point in the history
Dictionaries are serialized to a string and
deserialized using YAML.
  • Loading branch information
claudiodsf committed Mar 26, 2024
1 parent 050da51 commit 145b456
Showing 1 changed file with 36 additions and 52 deletions.
88 changes: 36 additions & 52 deletions sourcespec/spectrum.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,41 +362,28 @@ def _write_hdf5(self, group):
:param group: The HDF5 group to write to.
"""
for attr, value in self.stats.items():
# check if value is dict-like
stats = _normalize_metadata_object(self.stats)
for attr, value in stats.items():
# convert dictionaries to strings
if hasattr(value, 'items'):
# basic support for dict-like attributes, no nested dicts
_keys = list(value.keys())
_values = list(value.values())
try:
group.attrs[f'_dict_{attr}_keys'] = _keys
group.attrs[f'_dict_{attr}_values'] = _values
except TypeError:
warnings.warn(
f'Attribute "{attr}" is not a supported type and will '
'be ignored'
value = str(value)
# if value is list-like,
# check if all elements are of the same type
elif hasattr(value, '__iter__') and len(value) > 0:
type0 = type(value[0])
if not all(isinstance(v, type0) for v in value):
raise ValueError(
f'All values of attribute "{attr}" must be of the '
'same type'
)
continue
# check if it is list-like
elif hasattr(value, '__iter__'):
if len(value) > 0:
type0 = type(value[0])
if not all(isinstance(v, type0) for v in value):
raise ValueError(
f'All values of attribute "{attr}" must be of the '
'same type'
)
group.attrs[attr] = value
# check if it is a number
elif isinstance(value, (int, float)):
group.attrs[attr] = value
# ignore other types
else:
# if value is not a dict, nor a list, nor a number, then ignore it
elif not isinstance(value, (int, float)):
warnings.warn(
f'Attribute "{attr}" is not a supported type and will be '
'ignored'
)
continue
group.attrs[attr] = value
group.create_dataset('freq', data=self.freq)
group.create_dataset('data', data=self.data)
group.create_dataset('data_mag', data=self.data_mag)
Expand All @@ -414,8 +401,7 @@ def _write_ascii(self, filename):
with open(filename, 'w', encoding='utf-8') as fp:
fp.write('# %SOURCESPEC ASCII SPECTRUM FORMAT 1.0\n')
fp.write('# %BEGIN STATS YAML\n')
# make sure all dict-like values in stats are converted to dicts
stats = _normalize_value_for_yaml(self.stats)
stats = _normalize_metadata_object(self.stats)
stats_str = yaml.safe_dump(
stats,
sort_keys=False
Expand Down Expand Up @@ -465,6 +451,9 @@ def write(self, filename, format='HDF5', append=False):
if format == 'HDF5':
if append:
fp = h5py.File(filename, 'a')
fp.attrs['format'] = 'SourceSpec SpectrumStream HDF5'
fp.attrs['version'] = '1.0'
fp.attrs['url'] = 'https://sourcespec.seismicsource.org'
try:
lastgroup = sorted(fp.keys())[-1]
newgroup = f'spectrum_{int(lastgroup[-4:])+1:04d}'
Expand Down Expand Up @@ -575,21 +564,22 @@ def _default_yaml_representer(dumper, data):
None, _default_yaml_representer)


def _normalize_value_for_yaml(value):
def _normalize_metadata_object(obj):
"""
Normalize a value to a type supported by YAML serialization.
Normalize a metadata object to use dictionaries instead of custom objects
and numbers instead of numeric objects.
:param value: The value to normalize.
:param obj: The object to normalize.
:return: A dictionary, a float or the original value.
"""
if hasattr(value, 'items'):
if hasattr(obj, 'items'):
return {
key: _normalize_value_for_yaml(val) for key, val in value.items()
key: _normalize_metadata_object(val) for key, val in obj.items()
}
# check if value is numeric
with contextlib.suppress(TypeError, ValueError):
return float(value)
return value
return float(obj)
return obj


def _read_spectrum_from_hdf5_group(group):
Expand All @@ -601,21 +591,15 @@ def _read_spectrum_from_hdf5_group(group):
"""
spectrum = Spectrum()
for attr, value in group.attrs.items():
# basic support for dict-like attributes, no nested dicts
if attr.startswith('_dict_'):
dict_attr = attr.split('_')[2]
if dict_attr in spectrum.stats:
# already processed
continue
_keys_attr = f'_dict_{dict_attr}_keys'
_values_attr = f'_dict_{dict_attr}_values'
if _keys_attr not in group.attrs or\
_values_attr not in group.attrs:
continue
keys = group.attrs[_keys_attr]
values = group.attrs[_values_attr]
spectrum.stats[dict_attr] = dict(zip(keys, values))
continue
# convert strings back to dictionaries, using YAML
if isinstance(value, str):
try:
value = yaml.safe_load(value)
except yaml.YAMLError:
warnings.warn(
f'Attribute "{attr}" is not a supported type and will be '
'ignored'
)
spectrum.stats[attr] = value
spectrum.freq = group['freq']
spectrum.data = group['data']
Expand Down

0 comments on commit 145b456

Please sign in to comment.