Merge pull request #1072 from PyTables/direct-chunking-b2ndarray
Foreign b2nd array compatibility.

This expands the work on b2nd array support with direct chunking (#1056) to better handle such arrays when they were created by other tools: the filter values carrying the chunk rank and shape may be missing, and the b2nd array used for storing the chunk may follow a more relaxed format (e.g. it need not consist of a single inner chunk, nor have a specific blocksize). Some previously missing checks on chunk data shape have been added to the optimized read path.

A new example script has been added to test writing and reading partial chunks.

Finally, the workaround for stack smashing with some versions of GCC has been removed, as it no longer seems to be needed.
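
As an illustration (not part of this commit), a foreign writer might store a chunk in a layout that PyTables itself never produces, e.g. a b2nd array made of several inner chunks and blocks. The following sketch assumes python-blosc2 and hdf5plugin; the ``foreign.h5`` file and ``/data`` dataset names are hypothetical::

    import blosc2
    import h5py
    import hdf5plugin
    import numpy

    # A 4x4 HDF5 chunk stored as a b2nd array with two 2x4 inner chunks
    # and 2x2 blocks, instead of a single chunk-sized inner chunk and block.
    a = numpy.arange(16, dtype="int64").reshape((4, 4))
    b2 = blosc2.asarray(a, chunks=(2, 4), blocks=(2, 2))

    h5f = h5py.File("foreign.h5", "w")
    dataset = h5f.create_dataset(
        "data", a.shape, dtype=a.dtype, chunks=a.shape, **hdf5plugin.Blosc2())
    # Store the whole b2nd container verbatim as the HDF5 chunk at (0, 0).
    dataset.id.write_direct_chunk((0, 0), b2._schunk.to_cframe())
    h5f.close()
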
ivilata committed Oct 19, 2023
2 parents 1d6f564 + d5536db commit bb02c88
Showing 3 changed files with 160 additions and 28 deletions.
112 changes: 112 additions & 0 deletions examples/direct-chunk-shape.py
@@ -0,0 +1,112 @@
# This script uses h5py to write two versions of the following 6x6 array into
# two datasets with 4x4 chunks::
#
# chunk boundary
# |
# [[ 0 1 2 3 0 1]
# [ 4 5 6 7 4 5]
# [ 8 9 10 11 8 9]
# [12 13 14 15 12 13] __ chunk boundary
# [ 0 1 2 3 0 1]
# [ 4 5 6 7 4 5]]
#
# So its chunks contain, left-to-right, top-to-bottom, 4x4, 4x2, 2x4, and 2x2
# items worth of data.
#
# - ``/data_full`` has all its chunks with shape 4x4 in storage, even if the
# actual data does not fill the whole chunk.
# - ``/data_part`` has chunks with the shape of actual data, as listed above.
#
# Direct chunking is used in h5py to control the exact storage format for each
# chunk, using a Blosc2 ND container. Then PyTables is used to read both
# datasets and compare them against the original array, so as to test whether
# there are problems in loading partial chunks.
#
# Since PyTables has an optimized path for uncompressing Blosc2 ND data that
# does not use the HDF5 filter pipeline, you may force it to only use the
# latter by setting ``BLOSC2_FILTER=1`` in the environment.

import blosc2
import h5py
import hdf5plugin
import numpy
import os
import tempfile

# Both parameter sets below are equivalent.
fparams = hdf5plugin.Blosc2(cname='zstd', clevel=1, filters=hdf5plugin.Blosc2.SHUFFLE)
cparams = {
"codec": blosc2.Codec.ZSTD,
"clevel": 1,
"filters": [blosc2.Filter.SHUFFLE],
}

def np2b2(a):
return blosc2.asarray(numpy.ascontiguousarray(a),
chunks=a.shape, blocks=a.shape, cparams=cparams)

# Assemble the array.
achunk = numpy.arange(4 * 4).reshape((4, 4))
adata = numpy.zeros((6, 6), dtype=achunk.dtype)
adata[0:4, 0:4] = achunk[:, :]
adata[0:4, 4:6] = achunk[:, 0:2]
adata[4:6, 0:4] = achunk[0:2, :]
adata[4:6, 4:6] = achunk[0:2, 0:2]

h5fdesc, h5fpath = tempfile.mkstemp()
os.close(h5fdesc)

h5f = h5py.File(h5fpath, "w")
print(f"Writing with full chunks to {h5fpath}:/data_full...")
dataset = h5f.create_dataset(
"data_full", adata.shape, dtype=adata.dtype, chunks=achunk.shape,
**fparams)
b2chunk = np2b2(achunk) # need to keep the ref or cframe becomes bogus
b2frame = b2chunk._schunk.to_cframe()
dataset.id.write_direct_chunk((0, 0), b2frame)
dataset.id.write_direct_chunk((0, 4), b2frame)
dataset.id.write_direct_chunk((4, 0), b2frame)
dataset.id.write_direct_chunk((4, 4), b2frame)
print(f"Writing with partial chunks to {h5fpath}:/data_part...")
dataset = h5f.create_dataset(
"data_part", adata.shape, dtype=adata.dtype, chunks=achunk.shape,
**fparams)
b2chunk = np2b2(achunk[:, :]) # need to keep the ref or cframe becomes bogus
b2frame = b2chunk._schunk.to_cframe()
dataset.id.write_direct_chunk((0, 0), b2frame)
b2chunk = np2b2(achunk[:, 0:2])
# Uncomment to introduce a bogus partial chunk in the middle of the data,
# too small even for a margin chunk.
#b2chunk = np2b2(achunk[0:2, 0:2])
b2frame = b2chunk._schunk.to_cframe()
dataset.id.write_direct_chunk((0, 4), b2frame)
b2chunk = np2b2(achunk[0:2, :])
b2frame = b2chunk._schunk.to_cframe()
dataset.id.write_direct_chunk((4, 0), b2frame)
b2chunk = np2b2(achunk[0:2, 0:2])
b2frame = b2chunk._schunk.to_cframe()
dataset.id.write_direct_chunk((4, 4), b2frame)
h5f.close()

# Import PyTables only after the h5py part is done, so that it uses its own `hdf5-blosc2`.
import tables

h5f = tables.open_file(h5fpath, "r")
print(f"Reading with full chunks from {h5fpath}:/data_full...")
a_full = h5f.root.data_full[:]
print(f"Reading with partial chunks from {h5fpath}:/data_part...")
a_part = h5f.root.data_part[:]
h5f.close()

if not (a_full == adata).all():
print("FULL CHUNKS FAILED (original, read):")
print(adata)
print(a_full)

if not (a_part == adata).all():
print("PARTIAL CHUNKS FAILED (original, read):")
print(adata)
print(a_part)

print(f"Removing {h5fpath}...")
os.remove(h5fpath)
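
For instance, both read paths may be exercised by running the new example twice, a sketch assuming it is invoked from the source tree::

    import os
    import subprocess
    import sys

    # Once with the optimized B2ND path, once forcing the HDF5 filter pipeline.
    for filter_only in ("0", "1"):
        env = dict(os.environ, BLOSC2_FILTER=filter_only)
        subprocess.run([sys.executable, "examples/direct-chunk-shape.py"],
                       env=env, check=True)
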
41 changes: 28 additions & 13 deletions hdf5-blosc2/src/blosc2_filter.c
@@ -105,7 +105,7 @@ herr_t blosc2_set_local(hid_t dcpl, hid_t type, hid_t space) {

unsigned int typesize, basetypesize;
unsigned int bufsize;
- hsize_t chunkshape[32];
+ hsize_t chunkshape[H5S_MAX_RANK];
unsigned int flags;
size_t nelements = MAX_FILTER_VALUES;
unsigned int values[MAX_FILTER_VALUES];
@@ -122,11 +122,11 @@ herr_t blosc2_set_local(hid_t dcpl, hid_t type, hid_t space) {
/* Set Blosc2 info in first slot */
values[0] = FILTER_BLOSC2_VERSION;

- ndim = H5Pget_chunk(dcpl, 32, chunkshape);
+ ndim = H5Pget_chunk(dcpl, H5S_MAX_RANK, chunkshape);
if (ndim < 0)
return -1;
- if (ndim > 32) {
-   PUSH_ERR("blosc2_set_local", H5E_CALLBACK, "Chunk rank exceeds limit");
+ if (ndim > H5S_MAX_RANK) {
+   PUSH_ERR("blosc2_set_local", H5E_CALLBACK, "Chunk rank exceeds HDF5 limit");
return -1;
}

@@ -171,8 +171,8 @@ herr_t blosc2_set_local(hid_t dcpl, hid_t type, hid_t space) {
} else if (ndim > 1) {
/* The user may be expecting more efficient storage than we can currently provide,
* so convey some information when tracing. */
BLOSC_TRACE_ERROR("Chunk rank %d exceeds B2ND build limit %d, "
"using plain Blosc2 instead", ndim, BLOSC2_MAX_DIM);
BLOSC_TRACE_WARNING("Chunk rank %d exceeds B2ND build limit %d, "
"using plain Blosc2 instead", ndim, BLOSC2_MAX_DIM);
}

r = H5Pmodify_filter(dcpl, FILTER_BLOSC2, flags, nelements, values);
@@ -236,8 +236,8 @@ int32_t compute_b2nd_block_shape(size_t block_size,
}

if (nitems_new > nitems) {
BLOSC_TRACE_ERROR("Target block size is too small (%lu items), raising to %lu items",
nitems, nitems_new);
BLOSC_TRACE_INFO("Target block size is too small (%lu items), raising to %lu items",
nitems, nitems_new);
}
if (nitems_new >= nitems) {
return nitems_new * type_size;
@@ -331,7 +331,7 @@ size_t blosc2_filter_function(unsigned flags, size_t cd_nelmts,

if (cd_nelmts < 6) {
PUSH_ERR("blosc2_filter", H5E_CALLBACK,
"Too few filter parameters for B2ND compression: %d", cd_nelmts);
"Too few filter parameters for Blosc2 compression: %d", cd_nelmts);
goto failed;
}

@@ -459,9 +459,18 @@ size_t blosc2_filter_function(unsigned flags, size_t cd_nelmts,
goto failed;
}

+ /* Although blosc2_decompress_ctx ("else" branch) can decompress b2nd-formatted data,
+  * there may be padding bytes when the chunkshape is not a multiple of the blockshape,
+  * and only the b2nd machinery knows how to handle these correctly.
+  */
if (blosc2_meta_exists(schunk, "b2nd") >= 0
|| blosc2_meta_exists(schunk, "caterva") >= 0) {

+   if (ndim < 0) {
+     BLOSC_TRACE_WARNING("Auxiliary Blosc2 filter values contain no chunk rank/shape, "
+                         "some checks on B2ND arrays will be disabled");
+   }

b2nd_array_t *array = NULL;

if (b2nd_from_schunk(schunk, &array) < 0) {
@@ -471,7 +480,7 @@ size_t blosc2_filter_function(unsigned flags, size_t cd_nelmts,
schunk = NULL; // owned by the array now, do not free on its own

/* Check array metadata against filter values */
-   if (array->ndim != ndim) {
+   if (ndim >= 0 && array->ndim != ndim) {
PUSH_ERR("blosc2_filter", H5E_CALLBACK,
"B2ND array rank (%hhd) != filter rank (%d)", array->ndim, ndim);
goto b2nd_decomp_out;
@@ -481,10 +490,16 @@ size_t blosc2_filter_function(unsigned flags, size_t cd_nelmts,
start[i] = 0;
stop[i] = array->shape[i];
size *= array->shape[i];
-     if (array->chunkshape[i] != chunkshape[i]) {
+     if (ndim >= 0 && array->shape[i] != chunkshape[i]) {
+       /* The HDF5 filter pipeline needs the filter to always return full chunks,
+        * even for margin chunks where data does not fill the whole chunk
+        * (in which case padding should be used,
+        * at least until the last byte with actual data).
+        * Of course, we need to know the chunkshape to check this.
+        */
PUSH_ERR("blosc2_filter", H5E_CALLBACK,
"B2ND array chunkshape[%d] (%d) != filter chunkshape[%d] (%d)",
i, array->chunkshape[i], i, chunkshape[i]);
"B2ND array shape[%d] (%ld) != filter chunkshape[%d] (%d)",
i, array->shape[i], i, chunkshape[i]);
goto b2nd_decomp_out;
}
}
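
The padding mentioned in the comment above can be observed with python-blosc2 alone, a minimal sketch outside HDF5: when the chunkshape is not a multiple of the blockshape, blocks are padded in storage, and slicing through the b2nd machinery strips the padding back out::

    import blosc2
    import numpy

    a = numpy.arange(5 * 5).reshape((5, 5))
    # 5 is not a multiple of 3, so the 3x3 blocks are padded in storage.
    b2 = blosc2.asarray(a, chunks=(5, 5), blocks=(3, 3))
    # Slicing via b2nd returns exactly the original 25 items, no padding.
    assert (b2[...] == a).all()
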
35 changes: 20 additions & 15 deletions src/H5ARRAY-opt.c
@@ -49,6 +49,8 @@ herr_t read_chunk_slice_b2nd(const char *filename,
hid_t dataset_id,
hid_t space_id,
hsize_t chunk_idx,
+                             int rank,
+                             const hsize_t *chunk_shape,
const int64_t *slice_shape,
const int64_t *slice_start,
const int64_t *slice_stop,
@@ -208,7 +210,7 @@ herr_t get_blosc2_slice(char *filename,

herr_t rv;
IF_NEG_OUT_RET(rv = read_chunk_slice_b2nd(filename, dataset_id, space_id,
-                                           slice_chunk_idx, chunk_slice_shape,
+                                           slice_chunk_idx, rank, chunk_shape, chunk_slice_shape,
start_in_stored_chunk, stop_in_stored_chunk,
chunk_slice_size, chunk_slice_data),
rv - 50);
@@ -483,21 +485,12 @@ hid_t H5ARRAYOmake(hid_t loc_id,
}


- /* There are some weird issues with optimization under GCC and Linux:
-  * this code causes stack smashing on Ubuntu with optimization level > 0 (with GCC >= 10),
-  * and segmentation faults on Guix with optimization level > 1 (with GCC >= 12).
-  * Clang on Linux seems ok, as well as GCC and Clang on macOS.
-  * For the moment, disable optimization entirely for this function
-  * under GCC (and compatible compilers),
-  * which should not cause a noticeable performance penalty anyway.
-  */
- #ifdef __GNUC__
- __attribute__((optimize("O0")))
- #endif
herr_t read_chunk_slice_b2nd(const char *filename,
hid_t dataset_id,
hid_t space_id,
hsize_t chunk_idx,
+                             int rank,
+                             const hsize_t *chunk_shape,
const int64_t *slice_shape,
const int64_t *slice_start,
const int64_t *slice_stop,
@@ -510,15 +503,27 @@ herr_t read_chunk_slice_b2nd(const char *filename,
haddr_t address;
IF_NEG_OUT_BTRACE(H5Dget_chunk_info(dataset_id, space_id, chunk_idx,
NULL, NULL, &address, NULL),
"Failed getting chunk info of array in %s", filename);
"Failed getting chunk info of B2ND array in %s", filename);

/* Open the schunk on-disk */
IF_TRUE_OUT_BTRACE(b2nd_open_offset(filename, &array, address) != BLOSC2_ERROR_SUCCESS,
"Cannot open array in %s", filename);
"Cannot open B2ND array in %s", filename);
+   IF_TRUE_OUT_BTRACE(array->ndim != rank,
+                      "B2ND array rank (%hhd) != chunk rank (%d)", array->ndim, rank);
+   for (int i = 0; i < rank; i++) {
+     /* The HDF5 filter pipeline needs the filter to always return full chunks.
+      * This is not strictly needed here,
+      * but we enforce that requirement to avoid issues with other applications
+      * which use the pipeline.
+      */
+     IF_TRUE_OUT_BTRACE(array->shape[i] != (int64_t)(chunk_shape[i]),
+                        "B2ND array shape[%d] (%ld) != chunk shape[%d] (%lu)",
+                        i, array->shape[i], i, chunk_shape[i]);
+   }

IF_TRUE_OUT_BTRACE(b2nd_get_slice_cbuffer(array, slice_start, slice_stop,
slice_data, slice_shape, slice_size) != BLOSC2_ERROR_SUCCESS,
"Failed getting slice of array in %s", filename);
"Failed getting slice of B2ND array in %s", filename);

//out_success:
retval = 0;
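
To close the loop, a slice read through PyTables may take this optimized path; a sketch reusing the hypothetical ``foreign.h5`` file from the first example above::

    import numpy
    import tables

    h5f = tables.open_file("foreign.h5", "r")
    part = h5f.root.data[1:3, 1:3]  # a slice inside the stored chunk
    h5f.close()
    expected = numpy.arange(16, dtype="int64").reshape((4, 4))[1:3, 1:3]
    assert (part == expected).all()
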
