diff --git a/CMakeLists.txt b/CMakeLists.txt index a5b962909fc..4b7a3a04d2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -268,7 +268,7 @@ if (TILEDB_TESTS) add_custom_target(tests) add_dependencies(tests tiledb_unit) add_dependencies(tests unit_interval unit_datum unit_dynamic_memory unit_thread_pool) - add_dependencies(tests unit_array_schema unit_filter unit_filter_pipeline unit_metadata) + add_dependencies(tests unit_array_schema unit_filter_create unit_filter_pipeline unit_metadata) add_dependencies(tests unit_compressors) add_dependencies(tests unit_range_subset) endif() diff --git a/test/src/unit-Tile.cc b/test/src/unit-Tile.cc index bc3cfb33095..d927683142f 100644 --- a/test/src/unit-Tile.cc +++ b/test/src/unit-Tile.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -155,7 +155,7 @@ TEST_CASE("Tile: Test move constructor", "[Tile][move_constructor]") { // Verify all public attributes are identical. CHECK(tile2.cell_size() == cell_size); CHECK(tile2.cell_num() == buffer_len); - CHECK(tile2.dim_num() == dim_num); + CHECK(tile2.zipped_coords_dim_num() == dim_num); CHECK(tile2.empty() == false); CHECK(tile2.filtered() == false); CHECK(tile2.format_version() == format_version); @@ -204,7 +204,7 @@ TEST_CASE("Tile: Test move-assignment", "[Tile][move_assignment]") { // Verify all public attributes are identical. CHECK(tile2.cell_size() == cell_size); CHECK(tile2.cell_num() == buffer_len); - CHECK(tile2.dim_num() == dim_num); + CHECK(tile2.zipped_coords_dim_num() == dim_num); CHECK(tile2.empty() == false); CHECK(tile2.filtered() == false); CHECK(tile2.format_version() == format_version); diff --git a/test/src/unit-compression-rle.cc b/test/src/unit-compression-rle.cc index bed972d2681..b41c245e25b 100644 --- a/test/src/unit-compression-rle.cc +++ b/test/src/unit-compression-rle.cc @@ -538,7 +538,6 @@ TEMPLATE_LIST_TEST_CASE( 127}; // Compress the input array - // TBD: how to caclulate exp_size, maybe an overhead function? const auto num_of_unique_runs = 6; const auto exp_size = num_of_unique_runs * 2; std::vector compressed(exp_size); diff --git a/test/src/unit-cppapi-filter.cc b/test/src/unit-cppapi-filter.cc index ba23707a5a5..363e0549ec2 100644 --- a/test/src/unit-cppapi-filter.cc +++ b/test/src/unit-cppapi-filter.cc @@ -229,3 +229,72 @@ TEST_CASE("C++ API: Filter lists on array", "[cppapi][filter]") { if (vfs.is_dir(array_name)) vfs.remove_dir(array_name); } + +TEST_CASE("C++ API: Filter strings with RLE", "[cppapi][filter][rle-strings]") { + using namespace tiledb; + Context ctx; + VFS vfs(ctx); + std::string array_name = "cpp_unit_array"; + + if (vfs.is_dir(array_name)) + vfs.remove_dir(array_name); + + // Create schema with filter lists + FilterList a1_filters(ctx); + a1_filters.add_filter({ctx, TILEDB_FILTER_RLE}); + + auto a1 = Attribute::create(ctx, "a1"); + a1.set_filter_list(a1_filters); + + Domain domain(ctx); + auto d1 = Dimension::create(ctx, "d1", {{0, 100}}, 10); + auto d2 = Dimension::create(ctx, "d2", {{0, 100}}, 10); + domain.add_dimensions(d1, d2); + + ArraySchema schema(ctx, TILEDB_SPARSE); + schema.set_domain(domain); + schema.add_attributes(a1); + + // Create array + Array::create(array_name, schema); + + // Write to array + std::vector a1_data = { + "foo", "foo", "foobar", "bar", "bar", "bar", "bar"}; + auto a1buf = ungroup_var_buffer(a1_data); + std::vector coords = { + 0, 0, 10, 10, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40}; + Array array(ctx, array_name, TILEDB_WRITE); + Query query(ctx, array); + query.set_data_buffer("a1", a1buf.second) + .set_offsets_buffer("a1", a1buf.first) + .set_coordinates(coords) + .set_layout(TILEDB_UNORDERED); + REQUIRE(query.submit() == Query::Status::COMPLETE); + array.close(); + + // Sanity check reading + array.open(TILEDB_READ); + std::vector subarray = {0, 40, 0, 40}; + std::vector a1_read_off(7); + std::string a1_read_data; + a1_read_data.resize(24); + Query query_r(ctx, array); + query_r.set_subarray(subarray) + .set_layout(TILEDB_ROW_MAJOR) + .set_data_buffer("a1", a1_read_data) + .set_offsets_buffer("a1", a1_read_off); + REQUIRE(query_r.submit() == Query::Status::COMPLETE); + array.close(); + auto ret = query_r.result_buffer_elements(); + REQUIRE(ret.size() == 1); + REQUIRE(ret["a1"].first == 7); + REQUIRE(ret["a1"].second == 24); + std::vector exp_offsets = {0, 3, 6, 12, 15, 18, 21}; + REQUIRE(a1_read_off == exp_offsets); + REQUIRE(a1_read_data.substr(0, 24) == "foofoofoobarbarbarbarbar"); + + // Clean up + if (vfs.is_dir(array_name)) + vfs.remove_dir(array_name); +} diff --git a/test/src/unit-cppapi-string-dims.cc b/test/src/unit-cppapi-string-dims.cc index bc435632d5e..0d5e0e67b57 100644 --- a/test/src/unit-cppapi-string-dims.cc +++ b/test/src/unit-cppapi-string-dims.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2021 TileDB Inc. + * @copyright Copyright (c) 2021-2022 TileDB Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -1446,51 +1446,157 @@ TEST_CASE( } TEST_CASE( - "C++ API: Test filtering of string dimension", + "C++ API: Test filtering of string dimensions", "[cppapi][string-dim][rle-strings][sparse]") { std::string array_name = "test_rle_string_dim"; - /* - * Write an array with string dimension and add RLE filter. This will result - * in tile filtering instead of chunk filtering. For now make sure we don't - * fail. In the future check filtering/unfiltering is done correclty - */ - // Create data buffer to use - std::string data = "aabbbcdddd"; - std::vector data_elem_offsets = {0, 2, 5, 6}; - - Context ctx; - Domain domain(ctx); - auto dim = - Dimension::create(ctx, "dim1", TILEDB_STRING_ASCII, nullptr, nullptr); + std::stringstream repetitions; + size_t repetition_num = 100; + for (size_t i = 0; i < repetition_num; i++) + repetitions << "GLSD987JHY"; + std::string data = + "ATSD987JIO" + std::string(repetitions.str()) + "TGSD987JPO"; + // Create the corresponding offsets buffer + std::vector data_elem_offsets(repetition_num + 2); + int start = -10; + std::generate(data_elem_offsets.begin(), data_elem_offsets.end(), [&] { + return start += 10; + }); - // Create compressor as a filter - Filter filter(ctx, TILEDB_FILTER_RLE); - // Create filter list - FilterList filter_list(ctx); - // Add compressor to filter list - filter_list.add_filter(filter); - dim.set_filter_list(filter_list); - domain.add_dimension(dim); + { + Context ctx; + Domain domain(ctx); + auto dim = + Dimension::create(ctx, "dim1", TILEDB_STRING_ASCII, nullptr, nullptr); + + // Create compressor as a filter + Filter filter(ctx, TILEDB_FILTER_RLE); + // Create filter list + FilterList filter_list(ctx); + // Add compressor to filter list + filter_list.add_filter(filter); + dim.set_filter_list(filter_list); + + domain.add_dimension(dim); + + ArraySchema schema(ctx, TILEDB_SPARSE); + schema.set_domain(domain); + schema.set_allows_dups(true); - ArraySchema schema(ctx, TILEDB_SPARSE); - schema.set_domain(domain); + tiledb::Array::create(array_name, schema); - tiledb::Array::create(array_name, schema); + auto array = tiledb::Array(ctx, array_name, TILEDB_WRITE); + Query query(ctx, array, TILEDB_WRITE); + query.set_data_buffer("dim1", (char*)data.data(), data.size()); + query.set_offsets_buffer( + "dim1", data_elem_offsets.data(), data_elem_offsets.size()); - auto array = tiledb::Array(ctx, array_name, TILEDB_WRITE); - Query query(ctx, array, TILEDB_WRITE); - query.set_data_buffer("dim1", (char*)data.data(), data.size()); - query.set_offsets_buffer( - "dim1", data_elem_offsets.data(), data_elem_offsets.size()); + query.set_layout(TILEDB_UNORDERED); + query.submit(); + query.finalize(); + array.close(); + } - query.set_layout(TILEDB_UNORDERED); - query.submit(); - query.finalize(); - array.close(); + { + Context ctx; + std::vector offsets_back(data_elem_offsets.size()); + std::string data_back; + data_back.resize(data.size()); + + auto array = tiledb::Array(ctx, array_name, TILEDB_READ); + Query query(ctx, array, TILEDB_READ); + query.add_range( + "dim1", std::string("ATSD987JIO"), std::string("TGSD987JPO")); + query.set_data_buffer("dim1", (char*)data_back.data(), data_back.size()); + query.set_offsets_buffer("dim1", offsets_back.data(), offsets_back.size()); + + query.submit(); + + CHECK(query.query_status() == Query::Status::COMPLETE); + CHECK(offsets_back == data_elem_offsets); + CHECK(data_back == data); + } + Context ctx; VFS vfs(ctx); if (vfs.is_dir(array_name)) vfs.remove_dir(array_name); } + +TEST_CASE( + "C++ API: Test adding RLE filter of string dimensions", + "[cppapi][string-dim][rle-strings][sparse]") { + std::string array_name = "test_rle_string_dim"; + + Context ctx; + Domain domain(ctx); + // Create var-length string dimension + auto dim_var_string = + Dimension::create(ctx, "dim1", TILEDB_STRING_ASCII, nullptr, nullptr); + auto dim_not_var_string = + tiledb::Dimension::create(ctx, "id", {{1, 100}}, 10); + + // Create filters + Filter rle_filter(ctx, TILEDB_FILTER_RLE); + Filter another_filter(ctx, TILEDB_FILTER_CHECKSUM_MD5); + + // Create filter list with RLE only + FilterList filter_list_rle_only(ctx); + filter_list_rle_only.add_filter(rle_filter); + + // Create filter list with RLE and other filters + FilterList filter_list_with_others(ctx); + filter_list_with_others.add_filter(another_filter); + filter_list_with_others.add_filter(rle_filter); + + { + // Add dimension that is not var length string + domain.add_dimension(dim_not_var_string); + ArraySchema schema(ctx, TILEDB_SPARSE); + schema.set_domain(domain); + + CHECK_NOTHROW(schema.set_coords_filter_list(filter_list_rle_only)); + CHECK_NOTHROW(schema.set_coords_filter_list(filter_list_with_others)); + + // Add var length string dimension + domain.add_dimension(dim_var_string); + schema.set_domain(domain); + + // Test set_coords_filter_list + { + // Case 1: There is no more specific filter list for this dimension + // Adding RLE with other filters to var-length string dimension is not + // allowed + CHECK_THROWS(schema.set_coords_filter_list(filter_list_with_others)); + // If only RLE is used, it's allowed + CHECK_NOTHROW(schema.set_coords_filter_list(filter_list_rle_only)); + } + + { + // set coords Case 2: There is a more specific filter, so whatever we set + // with set coords should not matter + CHECK_NOTHROW(dim_var_string.set_filter_list(filter_list_rle_only)); + + // We need to use another domain, as adding a dimension with the same name + // doesn't replace the old one + Domain domain2(ctx); + domain2.add_dimension(dim_var_string); + schema.set_domain(domain2); + CHECK_NOTHROW(schema.set_coords_filter_list(filter_list_with_others)); + } + + // Test set_filter_list + { + // Adding RLE with other filters to var-length string dimension is not + // allowed + CHECK_THROWS(dim_var_string.set_filter_list(filter_list_with_others)); + + // The rest of the cases are allowed + CHECK_NOTHROW(dim_var_string.set_filter_list(filter_list_rle_only)); + CHECK_NOTHROW(dim_not_var_string.set_filter_list(filter_list_rle_only)); + CHECK_NOTHROW( + dim_not_var_string.set_filter_list(filter_list_with_others)); + } + } +} \ No newline at end of file diff --git a/test/src/unit-filter-pipeline.cc b/test/src/unit-filter-pipeline.cc index 8a4cd9ffcc3..a7db570a0fd 100644 --- a/test/src/unit-filter-pipeline.cc +++ b/test/src/unit-filter-pipeline.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * @copyright Copyright (c) 2016 MIT and Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -79,6 +79,7 @@ class Add1InPlace : public tiledb::sm::Filter { Status run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -148,6 +149,7 @@ class Add1OutOfPlace : public tiledb::sm::Filter { Status run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -239,6 +241,7 @@ class AddNInPlace : public tiledb::sm::Filter { Status run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -320,6 +323,7 @@ class PseudoChecksumFilter : public tiledb::sm::Filter { Status run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -409,6 +413,7 @@ class Add1IncludingMetadataFilter : public tiledb::sm::Filter { Status run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/array_schema/array_schema.cc b/tiledb/sm/array_schema/array_schema.cc index 0b7fa62ce86..8a13204daee 100644 --- a/tiledb/sm/array_schema/array_schema.cc +++ b/tiledb/sm/array_schema/array_schema.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * @copyright Copyright (c) 2016 MIT and Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -249,7 +249,8 @@ Status ArraySchema::check() const { } } - RETURN_NOT_OK(check_double_delta_compressor()); + RETURN_NOT_OK(check_double_delta_compressor(coords_filters())); + RETURN_NOT_OK(check_rle_compressor(coords_filters())); if (!check_attribute_dimension_names()) return LOG_STATUS( @@ -570,7 +571,8 @@ Status ArraySchema::deserialize(ConstBuffer* buff) { RETURN_NOT_OK(buff->read(&capacity_, sizeof(uint64_t))); // Load coords filters - auto&& [st_coords_filters, coords_filters]{FilterPipeline::deserialize(buff)}; + auto&& [st_coords_filters, coords_filters]{ + FilterPipeline::deserialize(buff, version_)}; if (!st_coords_filters.ok()) { return Status_ArraySchemaError("Cannot deserialize coords filters"); } @@ -578,7 +580,7 @@ Status ArraySchema::deserialize(ConstBuffer* buff) { // Load offsets filters auto&& [st_cell_var_filters, cell_var_filters]{ - FilterPipeline::deserialize(buff)}; + FilterPipeline::deserialize(buff, version_)}; if (!st_coords_filters.ok()) { return Status_ArraySchemaError("Cannot deserialize cell var filters"); } @@ -587,7 +589,7 @@ Status ArraySchema::deserialize(ConstBuffer* buff) { // Load validity filters if (version_ >= 7) { auto&& [st_cell_validity_filters, cell_validity_filters]{ - FilterPipeline::deserialize(buff)}; + FilterPipeline::deserialize(buff, version_)}; if (!st_cell_validity_filters.ok()) { return Status_ArraySchemaError( "Cannot deserialize cell validity filters"); @@ -665,6 +667,10 @@ void ArraySchema::set_capacity(uint64_t capacity) { } Status ArraySchema::set_coords_filter_pipeline(const FilterPipeline* pipeline) { + assert(pipeline); + RETURN_NOT_OK(check_rle_compressor(*pipeline)); + RETURN_NOT_OK(check_double_delta_compressor(*pipeline)); + coords_filters_ = *pipeline; return Status::Ok(); } @@ -823,11 +829,12 @@ bool ArraySchema::check_attribute_dimension_names() const { return (names.size() == attributes_.size() + dim_num); } -Status ArraySchema::check_double_delta_compressor() const { +Status ArraySchema::check_double_delta_compressor( + const FilterPipeline& coords_filters) const { // Check if coordinate filters have DOUBLE DELTA as a compressor bool has_double_delta = false; - for (unsigned i = 0; i < coords_filters_.size(); ++i) { - if (coords_filters_.get_filter(i)->type() == + for (unsigned i = 0; i < coords_filters.size(); ++i) { + if (coords_filters.get_filter(i)->type() == FilterType::FILTER_DOUBLE_DELTA) { has_double_delta = true; break; @@ -854,6 +861,31 @@ Status ArraySchema::check_double_delta_compressor() const { return Status::Ok(); } +Status ArraySchema::check_rle_compressor(const FilterPipeline& filters) const { + // There is no error if only 1 filter is used for RLE + if (filters.size() <= 1 || !filters.has_filter(FilterType::FILTER_RLE)) { + return Status::Ok(); + } + + // Error if there are also other filters set for a string dimension together + // with RLE + auto dim_num = domain_->dim_num(); + for (unsigned d = 0; d < dim_num; ++d) { + auto dim = domain_->dimension(d); + const auto& dim_filters = dim->filters(); + // if it's a var-length string dimension and there is no specific filter + // list already set for that dimension (then coords_filters_ will be used) + if (dim->type() == Datatype::STRING_ASCII && dim->var_size() && + dim_filters.empty()) { + return LOG_STATUS(Status_ArraySchemaError( + "RLE filter cannot be combined with other filters when applied to " + "variable length string dimensions")); + } + } + + return Status::Ok(); +} + void ArraySchema::clear() { array_uri_ = URI(); uri_ = URI(); diff --git a/tiledb/sm/array_schema/array_schema.h b/tiledb/sm/array_schema/array_schema.h index 39f19a5335d..1f35bee07b0 100644 --- a/tiledb/sm/array_schema/array_schema.h +++ b/tiledb/sm/array_schema/array_schema.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * @copyright Copyright (c) 2016 MIT and Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -438,7 +438,14 @@ class ArraySchema { * Returns error if double delta compression is used in the zipped * coordinate filters and is inherited by a dimension. */ - Status check_double_delta_compressor() const; + Status check_double_delta_compressor( + const FilterPipeline& coords_filters) const; + + /** + * Returns error if RLE is used for string dimensions but it is not the only + * filter in the filter list. + */ + Status check_rle_compressor(const FilterPipeline& coords_filters) const; /** Clears all members. Use with caution! */ void clear(); diff --git a/tiledb/sm/array_schema/attribute.cc b/tiledb/sm/array_schema/attribute.cc index c5855abc166..1e81ed7342e 100644 --- a/tiledb/sm/array_schema/attribute.cc +++ b/tiledb/sm/array_schema/attribute.cc @@ -131,7 +131,8 @@ tuple> Attribute::deserialize( RETURN_NOT_OK_TUPLE(buff->read(&cell_val_num, sizeof(uint32_t)), nullopt); // Load filter pipeline - auto&& [st_filterpipeline, filterpipeline]{FilterPipeline::deserialize(buff)}; + auto&& [st_filterpipeline, filterpipeline]{ + FilterPipeline::deserialize(buff, version)}; if (!st_filterpipeline.ok()) { return {st_filterpipeline, nullopt}; } @@ -282,8 +283,16 @@ Status Attribute::set_filter_pipeline(const FilterPipeline* pipeline) { if (datatype_is_real(type_) && pipeline->get_filter(i)->type() == FilterType::FILTER_DOUBLE_DELTA) return LOG_STATUS( - Status_AttributeError("Cannot set DOUBLE DELTA filter to a " - "dimension with a real datatype")); + Status_AttributeError("Cannot set DOUBLE DELTA filter to an " + "attribute with a real datatype")); + } + + if (type_ == Datatype::STRING_ASCII && var_size() && pipeline->size() > 1) { + if (pipeline->has_filter(FilterType::FILTER_RLE)) { + return LOG_STATUS(Status_AttributeError( + "RLE filter cannot be combined with other filters when applied to " + "variable length string attributes")); + } } filters_ = *pipeline; diff --git a/tiledb/sm/array_schema/dimension.cc b/tiledb/sm/array_schema/dimension.cc index d30ddd130ed..e4985f3dae6 100644 --- a/tiledb/sm/array_schema/dimension.cc +++ b/tiledb/sm/array_schema/dimension.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -212,7 +212,7 @@ tuple>> Dimension::deserialize( // Load filter pipeline auto&& [st_filterpipeline, filterpipeline]{ - FilterPipeline::deserialize(buff)}; + FilterPipeline::deserialize(buff, version)}; if (!st_filterpipeline.ok()) { return {st_filterpipeline, nullopt}; } @@ -1430,6 +1430,14 @@ Status Dimension::set_filter_pipeline(const FilterPipeline* pipeline) { "dimension with a real datatype")); } + if (type_ == Datatype::STRING_ASCII && var_size() && pipeline->size() > 1) { + if (pipeline->has_filter(FilterType::FILTER_RLE)) { + return LOG_STATUS(Status_DimensionError( + "RLE filter cannot be combined with other filters when applied to " + "variable length string dimensions")); + } + } + filters_ = *pipeline; return Status::Ok(); diff --git a/tiledb/sm/compressors/rle_compressor.h b/tiledb/sm/compressors/rle_compressor.h index a9f88d79002..686b6768ad5 100644 --- a/tiledb/sm/compressors/rle_compressor.h +++ b/tiledb/sm/compressors/rle_compressor.h @@ -143,7 +143,8 @@ class RLE { * Memory is allocated and owned by the caller */ template - static void decompress(const span input, span output) { + static void decompress( + const span input, span output) { if (input.empty() || output.empty()) return; @@ -181,8 +182,6 @@ class RLE { * Compress numbers in contiguous memory to RLE format * * @tparam T Type of integer in input - * @tparam P Type of integer to store run legths, must fit max num of - * repetitions in input * @param input Input in form of a memory contiguous sequence of numbers * @param output RLE-encoded as a series of [run length|value] items. Memory * is allocated and owned by the caller @@ -217,8 +216,6 @@ class RLE { * Decompress numbers in contiguous memory encoded in RLE format * * @tparam T Type of integer in input - * @tparam P Type of integer to store run legths, must be the same used for - * encoding * @param input Input in [run length|value] RLE format to decompress * @param output Decoded output as a series of values in contiguous memory. * Memory is allocated and owned by the caller diff --git a/tiledb/sm/cpp_api/query.h b/tiledb/sm/cpp_api/query.h index 4ed30f89603..e4da1344c4e 100644 --- a/tiledb/sm/cpp_api/query.h +++ b/tiledb/sm/cpp_api/query.h @@ -7,7 +7,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -582,9 +582,9 @@ class Query { * **Example:** * * @code{.cpp} - * // Set a 1D range on dimension 0, assuming the domain type is int64. - * int64_t start = 10; - * int64_t end = 20; + * // Set a 1D range on variable-sized string dimension "rows" + * std::string start = "ab""; + * std::string end = "d"; * // Stride is optional * subarray.add_range(0, start, end); * @endcode @@ -618,9 +618,9 @@ class Query { * **Example:** * * @code{.cpp} - * // Set a 1D range on dimension "rows", assuming the domain type is int64. - * int64_t start = 10; - * int64_t end = 20; + * // Set a 1D range on variable-sized string dimension "rows" + * std::string start = "ab""; + * std::string end = "d"; * const std::string dim_name = "rows"; * // Stride is optional * subarray.add_range(dim_name, start, end); diff --git a/tiledb/sm/filter/CMakeLists.txt b/tiledb/sm/filter/CMakeLists.txt index 472c7830f45..578ae72884e 100644 --- a/tiledb/sm/filter/CMakeLists.txt +++ b/tiledb/sm/filter/CMakeLists.txt @@ -3,7 +3,7 @@ # # The MIT License # -# Copyright (c) 2021 TileDB, Inc. +# Copyright (c) 2021-2022 TileDB, Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -185,37 +185,33 @@ target_sources(compile_filter_pipeline PRIVATE test/compile_filter_pipeline_main if (TILEDB_TESTS) - add_executable(unit_filter EXCLUDE_FROM_ALL) - target_link_libraries(unit_filter PUBLIC filter_pipeline) + add_executable(unit_filter_create EXCLUDE_FROM_ALL) + target_link_libraries(unit_filter_create PUBLIC all_filters) find_package(Catch_EP REQUIRED) - target_link_libraries(unit_filter PUBLIC Catch2::Catch2) + target_link_libraries(unit_filter_create PUBLIC Catch2::Catch2) - # Sources for tests - target_sources(unit_filter PUBLIC - test/main.cc - test/unit_filter_create.cc - test/unit_filter_pipeline.cc - ) - - add_test( - NAME "unit_filter" - COMMAND $ - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - ) -endif() - -if (TILEDB_TESTS) add_executable(unit_filter_pipeline EXCLUDE_FROM_ALL) target_link_libraries(unit_filter_pipeline PUBLIC filter_pipeline) find_package(Catch_EP REQUIRED) target_link_libraries(unit_filter_pipeline PUBLIC Catch2::Catch2) # Sources for tests + target_sources(unit_filter_create PUBLIC + test/main.cc + test/unit_filter_create.cc + ) + target_sources(unit_filter_pipeline PUBLIC test/main.cc test/unit_filter_pipeline.cc ) + add_test( + NAME "unit_filter_create" + COMMAND $ + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + add_test( NAME "unit_filter_pipeline" COMMAND $ diff --git a/tiledb/sm/filter/bit_width_reduction_filter.cc b/tiledb/sm/filter/bit_width_reduction_filter.cc index 00c1f1fc3c7..0faeb95f49a 100644 --- a/tiledb/sm/filter/bit_width_reduction_filter.cc +++ b/tiledb/sm/filter/bit_width_reduction_filter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -99,6 +99,7 @@ void BitWidthReductionFilter::dump(FILE* out) const { Status BitWidthReductionFilter::run_forward( const Tile& tile, + Tile* const offsets_tile, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -119,29 +120,29 @@ Status BitWidthReductionFilter::run_forward( switch (tile_type) { case Datatype::INT8: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::BLOB: case Datatype::UINT8: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::INT16: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::UINT16: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::INT32: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::UINT32: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::INT64: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::UINT64: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::DATETIME_YEAR: case Datatype::DATETIME_MONTH: case Datatype::DATETIME_WEEK: @@ -165,7 +166,7 @@ Status BitWidthReductionFilter::run_forward( case Datatype::TIME_FS: case Datatype::TIME_AS: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); default: return LOG_STATUS( Status_FilterError("Cannot filter; Unsupported input type")); @@ -175,6 +176,7 @@ Status BitWidthReductionFilter::run_forward( template Status BitWidthReductionFilter::run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/bit_width_reduction_filter.h b/tiledb/sm/filter/bit_width_reduction_filter.h index fb866ffb174..370ef1c02e6 100644 --- a/tiledb/sm/filter/bit_width_reduction_filter.h +++ b/tiledb/sm/filter/bit_width_reduction_filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -102,6 +102,7 @@ class BitWidthReductionFilter : public Filter { */ Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -179,6 +180,7 @@ class BitWidthReductionFilter : public Filter { template Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/bitshuffle_filter.cc b/tiledb/sm/filter/bitshuffle_filter.cc index 66f2b25e8b3..20171d28e52 100644 --- a/tiledb/sm/filter/bitshuffle_filter.cc +++ b/tiledb/sm/filter/bitshuffle_filter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -60,6 +60,7 @@ void BitshuffleFilter::dump(FILE* out) const { Status BitshuffleFilter::run_forward( const Tile& tile, + Tile* const, // offsets_tile FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/bitshuffle_filter.h b/tiledb/sm/filter/bitshuffle_filter.h index 3eb1459eebc..b625f126b62 100644 --- a/tiledb/sm/filter/bitshuffle_filter.h +++ b/tiledb/sm/filter/bitshuffle_filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -87,6 +87,7 @@ class BitshuffleFilter : public Filter { */ Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/byteshuffle_filter.cc b/tiledb/sm/filter/byteshuffle_filter.cc index d121e93202b..4b54650cac8 100644 --- a/tiledb/sm/filter/byteshuffle_filter.cc +++ b/tiledb/sm/filter/byteshuffle_filter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -60,6 +60,7 @@ void ByteshuffleFilter::dump(FILE* out) const { Status ByteshuffleFilter::run_forward( const Tile& tile, + Tile* const, // offsets_tile, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/byteshuffle_filter.h b/tiledb/sm/filter/byteshuffle_filter.h index dcb5bd3596e..132a7a632db 100644 --- a/tiledb/sm/filter/byteshuffle_filter.h +++ b/tiledb/sm/filter/byteshuffle_filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -79,6 +79,7 @@ class ByteshuffleFilter : public Filter { */ Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/checksum_md5_filter.cc b/tiledb/sm/filter/checksum_md5_filter.cc index 2019f359851..4e2cdb16375 100644 --- a/tiledb/sm/filter/checksum_md5_filter.cc +++ b/tiledb/sm/filter/checksum_md5_filter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -62,6 +62,7 @@ void ChecksumMD5Filter::dump(FILE* out) const { Status ChecksumMD5Filter::run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/checksum_md5_filter.h b/tiledb/sm/filter/checksum_md5_filter.h index 1b2af166bee..fb46449fb5a 100644 --- a/tiledb/sm/filter/checksum_md5_filter.h +++ b/tiledb/sm/filter/checksum_md5_filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -85,6 +85,7 @@ class ChecksumMD5Filter : public Filter { */ Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/checksum_sha256_filter.cc b/tiledb/sm/filter/checksum_sha256_filter.cc index bfaea24da25..1f71ecd29c4 100644 --- a/tiledb/sm/filter/checksum_sha256_filter.cc +++ b/tiledb/sm/filter/checksum_sha256_filter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -62,6 +62,7 @@ void ChecksumSHA256Filter::dump(FILE* out) const { Status ChecksumSHA256Filter::run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/checksum_sha256_filter.h b/tiledb/sm/filter/checksum_sha256_filter.h index fc068e512df..49ed2026d63 100644 --- a/tiledb/sm/filter/checksum_sha256_filter.h +++ b/tiledb/sm/filter/checksum_sha256_filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -85,6 +85,7 @@ class ChecksumSHA256Filter : public Filter { */ Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/compression_filter.cc b/tiledb/sm/filter/compression_filter.cc index 14d71dd5678..87795d9b253 100644 --- a/tiledb/sm/filter/compression_filter.cc +++ b/tiledb/sm/filter/compression_filter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -53,18 +53,22 @@ using namespace tiledb::common; namespace tiledb { namespace sm { -CompressionFilter::CompressionFilter(FilterType compressor, int level) +CompressionFilter::CompressionFilter( + FilterType compressor, int level, const uint32_t version) : Filter(compressor) , compressor_(filter_to_compressor(compressor)) , level_(level) + , version_(version) , zstd_compress_ctx_pool_(nullptr) , zstd_decompress_ctx_pool_(nullptr) { } -CompressionFilter::CompressionFilter(Compressor compressor, int level) +CompressionFilter::CompressionFilter( + Compressor compressor, int level, const uint32_t version) : Filter(compressor_to_filter(compressor)) , compressor_(compressor) , level_(level) + , version_(version) , zstd_compress_ctx_pool_(nullptr) , zstd_decompress_ctx_pool_(nullptr) { } @@ -112,7 +116,7 @@ void CompressionFilter::dump(FILE* out) const { } CompressionFilter* CompressionFilter::clone_impl() const { - return tdb_new(CompressionFilter, compressor_, level_); + return tdb_new(CompressionFilter, compressor_, level_, version_); } void CompressionFilter::set_compressor(Compressor compressor) { @@ -196,8 +200,28 @@ Status CompressionFilter::get_option_impl( } } +size_t CompressionFilter::calculate_output_metadata_size( + const Tile& tile, + const std::vector& data_parts, + const std::vector& metadata_parts) const { + auto total_num_parts = data_parts.size() + metadata_parts.size(); + auto metadata_size = + 2 * sizeof(uint32_t) + total_num_parts * 2 * sizeof(uint32_t); + + if (compressor_ == Compressor::RLE) { + if (datatype_is_string(tile.type())) { + // Add two extra metadata bytes that store run length datasize and + // string length datasize + metadata_size += 2 * sizeof(uint8_t); + } + } + + return metadata_size; +} + Status CompressionFilter::run_forward( const Tile& tile, + Tile* const offsets_tile, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -213,12 +237,27 @@ Status CompressionFilter::run_forward( return LOG_STATUS( Status_FilterError("Input is too large to be compressed.")); - // Compute the upper bound on the size of the output. std::vector data_parts = input->buffers(), metadata_parts = input_metadata->buffers(); - auto num_data_parts = (uint32_t)data_parts.size(), - num_metadata_parts = (uint32_t)metadata_parts.size(), - total_num_parts = num_data_parts + num_metadata_parts; + + // Allocate output metadata + auto metadata_size = + calculate_output_metadata_size(tile, data_parts, metadata_parts); + auto num_metadata_parts = static_cast(metadata_parts.size()); + auto num_data_parts = static_cast(data_parts.size()); + RETURN_NOT_OK(output_metadata->prepend_buffer(metadata_size)); + RETURN_NOT_OK(output_metadata->write(&num_metadata_parts, sizeof(uint32_t))); + RETURN_NOT_OK(output_metadata->write(&num_data_parts, sizeof(uint32_t))); + + if (compressor_ == Compressor::RLE && tile.type() == Datatype::STRING_ASCII && + offsets_tile) { + // String RLE is supported but only allowed on single filter + assert(num_data_parts == 1); + return compress_var_string_coords( + *input, offsets_tile, *output, *output_metadata); + } + + // Allocate output data uint64_t output_size_ub = 0; for (const auto& part : metadata_parts) output_size_ub += part.size() + overhead(tile, part.size()); @@ -231,13 +270,6 @@ Status CompressionFilter::run_forward( assert(buffer_ptr != nullptr); buffer_ptr->reset_offset(); - // Allocate a buffer for this filter's metadata and write the number of parts. - auto metadata_size = - 2 * sizeof(uint32_t) + total_num_parts * 2 * sizeof(uint32_t); - RETURN_NOT_OK(output_metadata->prepend_buffer(metadata_size)); - RETURN_NOT_OK(output_metadata->write(&num_metadata_parts, sizeof(uint32_t))); - RETURN_NOT_OK(output_metadata->write(&num_data_parts, sizeof(uint32_t))); - // Compress all parts. for (auto& part : metadata_parts) RETURN_NOT_OK(compress_part(tile, &part, buffer_ptr, output_metadata)); @@ -276,6 +308,13 @@ Status CompressionFilter::run_reverse( Buffer* metadata_buffer = output_metadata->buffer_ptr(0); assert(metadata_buffer != nullptr); + if (compressor_ == Compressor::RLE && tile.type() == Datatype::STRING_ASCII && + version_ >= 12) { + // String RLE is only allowed on first/single filter + assert(num_data_parts == 1); + return decompress_var_string_coords(*input, *input_metadata, *output); + } + for (uint32_t i = 0; i < num_metadata_parts; i++) RETURN_NOT_OK( decompress_part(tile, input, metadata_buffer, input_metadata)); @@ -396,6 +435,128 @@ Status CompressionFilter::decompress_part( return st; } +std::vector CompressionFilter::create_input_view( + const FilterBuffer& input, Tile* const offsets_tile) { + auto input_buf = static_cast(input.buffers()[0].data()); + auto offsets_data = static_cast(offsets_tile->data()); + auto offsets_size = offsets_tile->size() / constants::cell_var_offset_size; + std::vector input_view(offsets_size); + + size_t i = 0; + for (i = 0; i < offsets_size - 1; i++) { + input_view[i] = std::string_view( + input_buf + offsets_data[i], offsets_data[i + 1] - offsets_data[i]); + } + + // special case for the last string + input_view[i] = std::string_view( + input_buf + offsets_data[i], input.size() - offsets_data[i]); + + return input_view; +} + +Status CompressionFilter::compress_var_string_coords( + const FilterBuffer& input, + Tile* const offsets_tile, + FilterBuffer& output, + FilterBuffer& output_metadata) const { + if (input.num_buffers() != 1) { + return LOG_STATUS( + Status_FilterError("Var-sized string input has to be in single " + "buffer format to be compressed with RLE")); + } + + // Construct string view of input + auto input_view = create_input_view(input, offsets_tile); + + // Estimate and allocate output size + uint64_t output_size_ub = 0; + uint8_t rle_len_bytesize = 0, string_len_bytesize = 0; + auto [max_rle_len, max_string_size, num_of_runs, output_strings_size] = + RLE::calculate_compression_params(input_view); + // TODO: calculate smallest datatypes to fit max_string_size and + // max_rle_len -> hardcode for now + rle_len_bytesize = 4; + string_len_bytesize = 2; + output_size_ub = num_of_runs * (rle_len_bytesize + string_len_bytesize) + + output_strings_size; + + // Allocate output data buffer + RETURN_NOT_OK(output.prepend_buffer(output_size_ub)); + Buffer* data_buffer = output.buffer_ptr(0); + assert(data_buffer != nullptr); + data_buffer->reset_offset(); + auto output_view = span( + reinterpret_cast(data_buffer->data()), output_size_ub); + + switch (compressor_) { + case Compressor::RLE: { + // TODO : same as above, hardcode for now + RLE::compress(input_view, output_view); + break; + } + default: + break; + } + + data_buffer->set_size(output_size_ub); + + // Note: assumes single buffer (holds when RLE is the first/only filter) + auto input_size = input.buffers()[0].size(); + RETURN_NOT_OK(output_metadata.write(&input_size, sizeof(uint32_t))); + RETURN_NOT_OK(output_metadata.write(&output_size_ub, sizeof(uint32_t))); + RETURN_NOT_OK(output_metadata.write(&rle_len_bytesize, sizeof(uint8_t))); + RETURN_NOT_OK(output_metadata.write(&string_len_bytesize, sizeof(uint8_t))); + + return Status::Ok(); +} + +Status CompressionFilter::decompress_var_string_coords( + FilterBuffer& input, + FilterBuffer& input_metadata, + FilterBuffer& output) const { + if (input.num_buffers() != 1) { + return LOG_STATUS( + Status_FilterError("Var-sized string input has to be in single " + "buffer format to get decompressed with RLE")); + } + + // Read the part metadata + uint32_t compressed_size, uncompressed_size; + RETURN_NOT_OK(input_metadata.read(&uncompressed_size, sizeof(uint32_t))); + RETURN_NOT_OK(input_metadata.read(&compressed_size, sizeof(uint32_t))); + + uint8_t rle_len_bytesize, string_len_bytesize; + RETURN_NOT_OK(input_metadata.read(&rle_len_bytesize, sizeof(uint8_t))); + RETURN_NOT_OK(input_metadata.read(&string_len_bytesize, sizeof(uint8_t))); + + // Get views of input and output + auto input_buffer = input.buffers()[0]; + auto input_view = span( + reinterpret_cast(input_buffer.data()), compressed_size); + Buffer* output_buffer = output.buffer_ptr(0); + auto output_view = span( + reinterpret_cast(output_buffer->data()), uncompressed_size); + + switch (compressor_) { + case Compressor::RLE: { + // TODO: Use input metadata to calculate the datatypes used to write rle + // lenghts and string sizes, hardcode for now + RLE::decompress(input_view, output_view); + break; + } + default: + break; + } + + if (output_buffer->owns_data()) + output_buffer->advance_size(uncompressed_size); + output_buffer->advance_offset(uncompressed_size); + input.advance_offset(compressed_size); + + return Status::Ok(); +} + uint64_t CompressionFilter::overhead(const Tile& tile, uint64_t nbytes) const { auto cell_size = tile.cell_size(); diff --git a/tiledb/sm/filter/compression_filter.h b/tiledb/sm/filter/compression_filter.h index a21fa144a83..59e796cddab 100644 --- a/tiledb/sm/filter/compression_filter.h +++ b/tiledb/sm/filter/compression_filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -36,6 +36,7 @@ #include "tiledb/common/status.h" #include "tiledb/sm/compressors/zstd_compressor.h" #include "tiledb/sm/filter/filter.h" +#include "tiledb/sm/misc/constants.h" #include "tiledb/sm/misc/resource_pool.h" using namespace tiledb::common; @@ -82,16 +83,24 @@ class CompressionFilter : public Filter { * * @param compressor Compressor to use * @param level Compression level to use + * @param version Format version */ - CompressionFilter(Compressor compressor, int level); + CompressionFilter( + Compressor compressor, + int level, + const uint32_t version = constants::format_version); /** * Constructor. * * @param compressor Compressor to use * @param level Compression level to use + * @param version Format version */ - CompressionFilter(FilterType compressor, int level); + CompressionFilter( + FilterType compressor, + int level, + const uint32_t version = constants::format_version); /** Return the compressor used by this filter instance. */ Compressor compressor() const; @@ -107,6 +116,7 @@ class CompressionFilter : public Filter { */ Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -136,6 +146,9 @@ class CompressionFilter : public Filter { /** The compression level. */ int level_; + /** The format version. */ + int version_; + /** The default filter compression level. */ static constexpr int default_level_ = -30000; @@ -177,6 +190,31 @@ class CompressionFilter : public Filter { Buffer* output, FilterBuffer* input_metadata) const; + /** Calculate the size of the output metadata to allocate */ + size_t calculate_output_metadata_size( + const Tile& tile, + const std::vector& data_parts, + const std::vector& metadata_parts) const; + + /** + * Helper function to compress a buffer of variable-sized strings for certain + * algorithms where this is a special case + */ + Status compress_var_string_coords( + const FilterBuffer& input, + Tile* const offsets_tile, + FilterBuffer& output, + FilterBuffer& output_metadata) const; + + /** + * Helper function to decompress a buffer of variable-sized strings for + * certain algorithms where this is a special case + */ + Status decompress_var_string_coords( + FilterBuffer& input, + FilterBuffer& input_metadata, + FilterBuffer& output) const; + /** Gets an option from this filter. */ Status get_option_impl(FilterOption option, void* value) const override; @@ -197,6 +235,10 @@ class CompressionFilter : public Filter { /** Initializes the decompression resource pool */ void init_decompression_resource_pool(uint64_t size) override; + + /** Creates a vector of views of the input strings */ + static std::vector create_input_view( + const FilterBuffer& input, Tile* const offsets_tile); }; } // namespace sm diff --git a/tiledb/sm/filter/encryption_aes256gcm_filter.cc b/tiledb/sm/filter/encryption_aes256gcm_filter.cc index 64295510430..898c3d7df0c 100644 --- a/tiledb/sm/filter/encryption_aes256gcm_filter.cc +++ b/tiledb/sm/filter/encryption_aes256gcm_filter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -72,6 +72,7 @@ void EncryptionAES256GCMFilter::dump(FILE* out) const { Status EncryptionAES256GCMFilter::run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/encryption_aes256gcm_filter.h b/tiledb/sm/filter/encryption_aes256gcm_filter.h index d050a1b40f1..418658709a8 100644 --- a/tiledb/sm/filter/encryption_aes256gcm_filter.h +++ b/tiledb/sm/filter/encryption_aes256gcm_filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -98,6 +98,7 @@ class EncryptionAES256GCMFilter : public Filter { */ Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/filter.h b/tiledb/sm/filter/filter.h index 558cf04d4f1..04ec8b897e0 100644 --- a/tiledb/sm/filter/filter.h +++ b/tiledb/sm/filter/filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -93,12 +93,15 @@ class Filter { * * @param input_metadata Buffer with metadata for `input` * @param input Buffer with data to be filtered. + * @param input_offsets Buffer with the offsets (if any) of the data to be + * filtered. * @param output_metadata Buffer with metadata for filtered data * @param output Buffer with filtered data (unused by in-place filters). * @return */ virtual Status run_forward( const Tile& tile, + Tile* const offsets_tile, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/filter_create.cc b/tiledb/sm/filter/filter_create.cc index 513be7d8093..de5bfb6f553 100644 --- a/tiledb/sm/filter/filter_create.cc +++ b/tiledb/sm/filter/filter_create.cc @@ -80,7 +80,9 @@ tiledb::sm::Filter* tiledb::sm::FilterCreate::make(FilterType type) { tuple>> tiledb::sm::FilterCreate::deserialize( - ConstBuffer* buff, const EncryptionKey& encryption_key) { + ConstBuffer* buff, + const EncryptionKey& encryption_key, + const uint32_t version) { Status st; uint8_t type; st = buff->read(&type, sizeof(uint8_t)); @@ -123,7 +125,7 @@ tiledb::sm::FilterCreate::deserialize( } return {Status::Ok(), tiledb::common::make_shared( - HERE(), compressor, compression_level)}; + HERE(), compressor, compression_level, version)}; } case FilterType::FILTER_BIT_WIDTH_REDUCTION: { uint32_t max_window_size; @@ -176,7 +178,8 @@ tiledb::sm::FilterCreate::deserialize( } tuple>> -tiledb::sm::FilterCreate::deserialize(ConstBuffer* buff) { +tiledb::sm::FilterCreate::deserialize( + ConstBuffer* buff, const uint32_t version) { EncryptionKey encryption_key; - return tiledb::sm::FilterCreate::deserialize(buff, encryption_key); + return tiledb::sm::FilterCreate::deserialize(buff, encryption_key, version); } \ No newline at end of file diff --git a/tiledb/sm/filter/filter_create.h b/tiledb/sm/filter/filter_create.h index c4de721cf00..bf65f04f2c2 100644 --- a/tiledb/sm/filter/filter_create.h +++ b/tiledb/sm/filter/filter_create.h @@ -55,19 +55,23 @@ class FilterCreate { * * @param buff The buffer to deserialize from. * @param encryption_key. + * @param version Array schema version * @return Status and Filter */ static tuple>> deserialize( - ConstBuffer* buff, const EncryptionKey& encryption_key); + ConstBuffer* buff, + const EncryptionKey& encryption_key, + const uint32_t version); /** * Deserializes a new Filter instance from the data in the given buffer. * * @param buff The buffer to deserialize from. + * @param version Array schema version * @return Status and Filter */ static tuple>> deserialize( - ConstBuffer* buff); + ConstBuffer* buff, const uint32_t version); }; } // namespace tiledb::sm diff --git a/tiledb/sm/filter/filter_pipeline.cc b/tiledb/sm/filter/filter_pipeline.cc index dcdda518272..74bcd0e4be7 100644 --- a/tiledb/sm/filter/filter_pipeline.cc +++ b/tiledb/sm/filter/filter_pipeline.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -97,10 +97,7 @@ void FilterPipeline::clear() { tuple>> FilterPipeline::get_var_chunk_sizes( - uint32_t chunk_size, - Tile* const tile, - Tile* const offsets_tile, - bool chunking) const { + uint32_t chunk_size, Tile* const tile, Tile* const offsets_tile) const { std::vector chunk_offsets; if (offsets_tile != nullptr) { uint64_t num_offsets = @@ -117,7 +114,7 @@ FilterPipeline::get_var_chunk_sizes( // Time for a new chunk? auto new_size = current_size + cell_size; - if (new_size > chunk_size || !chunking) { + if (new_size > chunk_size) { // Do we add this cell to this chunk? if (current_size <= min_size || new_size <= max_size) { if (new_size > std::numeric_limits::max()) { @@ -156,21 +153,25 @@ FilterPipeline::get_var_chunk_sizes( Status FilterPipeline::filter_chunks_forward( const Tile& tile, + Tile* const offsets_tile, uint32_t chunk_size, std::vector& chunk_offsets, FilteredBuffer& output, ThreadPool* const compute_tp) const { bool var_sizes = chunk_offsets.size() > 0; - uint64_t nchunks = - var_sizes ? chunk_offsets.size() : tile.size() / chunk_size; - uint64_t last_buffer_size = var_sizes ? - tile.size() - chunk_offsets[nchunks - 1] : - tile.size() % chunk_size; - if (!var_sizes) { - if (last_buffer_size != 0) { - nchunks++; - } else { - last_buffer_size = chunk_size; + uint64_t last_buffer_size = chunk_size; + uint64_t nchunks = 1; + // if chunking will be used + if (tile.size() != chunk_size) { + nchunks = var_sizes ? chunk_offsets.size() : tile.size() / chunk_size; + last_buffer_size = var_sizes ? tile.size() - chunk_offsets[nchunks - 1] : + tile.size() % chunk_size; + if (!var_sizes) { + if (last_buffer_size != 0) { + nchunks++; + } else { + last_buffer_size = chunk_size; + } } } @@ -212,7 +213,12 @@ Status FilterPipeline::filter_chunks_forward( f->init_compression_resource_pool(compute_tp->concurrency_level()); RETURN_NOT_OK(f->run_forward( - tile, &input_metadata, &input_data, &output_metadata, &output_data)); + tile, + offsets_tile, + &input_metadata, + &input_data, + &output_metadata, + &output_data)); input_data.set_read_only(false); input_data.swap(output_data); @@ -433,14 +439,17 @@ Status FilterPipeline::run_forward( uint32_t chunk_size = 0; if (use_chunking) { RETURN_NOT_OK(Tile::compute_chunk_size( - tile->size(), tile->dim_num(), tile->cell_size(), &chunk_size)); + tile->size(), + tile->zipped_coords_dim_num(), + tile->cell_size(), + &chunk_size)); } else { chunk_size = tile->size(); } // Get the chunk sizes for var size attributes. auto&& [st, chunk_offsets] = - get_var_chunk_sizes(chunk_size, tile, offsets_tile, use_chunking); + get_var_chunk_sizes(chunk_size, tile, offsets_tile); RETURN_NOT_OK_ELSE(st, tile->filtered_buffer().clear()); // Run the filters over all the chunks and store the result in @@ -448,6 +457,7 @@ Status FilterPipeline::run_forward( RETURN_NOT_OK_ELSE( filter_chunks_forward( *tile, + offsets_tile, chunk_size, *chunk_offsets, tile->filtered_buffer(), @@ -637,7 +647,7 @@ Status FilterPipeline::serialize(Buffer* buff) const { } tuple> FilterPipeline::deserialize( - ConstBuffer* buff) { + ConstBuffer* buff, const uint32_t version) { Status st; uint32_t max_chunk_size; std::vector> filters; @@ -648,7 +658,7 @@ tuple> FilterPipeline::deserialize( RETURN_NOT_OK_TUPLE(buff->read(&num_filters, sizeof(uint32_t)), nullopt); for (uint32_t i = 0; i < num_filters; i++) { - auto&& [st_filter, filter]{FilterCreate::deserialize(buff)}; + auto&& [st_filter, filter]{FilterCreate::deserialize(buff, version)}; if (!st_filter.ok()) { return {st_filter, nullopt}; } @@ -669,9 +679,9 @@ void FilterPipeline::dump(FILE* out) const { } } -bool FilterPipeline::has_filter(const Filter& filter) const { +bool FilterPipeline::has_filter(const FilterType& filter_type) const { for (auto& f : filters_) { - if (f->type() == filter.type()) + if (f->type() == filter_type) return true; } return false; @@ -707,10 +717,9 @@ Status FilterPipeline::append_encryption_filter( } } -bool FilterPipeline::use_tile_chunking( - bool is_dim, bool is_var, Datatype type) const { - if (is_dim && is_var && datatype_is_string(type)) { - if (has_filter(CompressionFilter(Compressor::RLE, 0))) { +bool FilterPipeline::use_tile_chunking(bool is_var, Datatype type) const { + if (is_var && type == Datatype::STRING_ASCII) { + if (has_filter(FilterType::FILTER_RLE)) { return false; } } diff --git a/tiledb/sm/filter/filter_pipeline.h b/tiledb/sm/filter/filter_pipeline.h index 455f6db2c3b..a50741a2fd5 100644 --- a/tiledb/sm/filter/filter_pipeline.h +++ b/tiledb/sm/filter/filter_pipeline.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -105,9 +105,11 @@ class FilterPipeline { * Populates the filter pipeline from the data in the input binary buffer. * * @param buff The buffer to deserialize from. + * @param version Array schema version * @return Status and FilterPipeline */ - static tuple> deserialize(ConstBuffer* buff); + static tuple> deserialize( + ConstBuffer* buff, const uint32_t version); /** * Dumps the filter pipeline details in ASCII format in the selected @@ -137,12 +139,12 @@ class FilterPipeline { } /** - * Checks if a certain filter exists in the filter pipeline + * Checks if a certain filter type exists in the filter pipeline * - * @param filter The filter to search for + * @param filter_type The filter type to search for * @return True if found, false otherwise */ - bool has_filter(const Filter& filter) const; + bool has_filter(const FilterType& filter_type) const; /** * Returns a pointer to the filter in the pipeline at the given index. @@ -303,13 +305,12 @@ class FilterPipeline { * Checks if an attribute/dimension needs to be filtered in chunks or as a * whole * - * @param is_dim True if checking for dimension, false if attribute * @param is_var True if checking for a var-sized attribute/dimension, false * if not * @param type Datatype of the input attribute/dimension * @return True if chunking needs to be used, false if not */ - bool use_tile_chunking(bool is_dim, bool is_var, const Datatype type) const; + bool use_tile_chunking(bool is_var, const Datatype type) const; private: /** A pair of FilterBuffers. */ @@ -333,20 +334,16 @@ class FilterPipeline { * @param chunk_size Target chunk size. * @param tile Var tile. * @param offsets_tile Offsets tile. - * @param chunking True if the tile is filtered in chunks * @return Status, chunk offsets vector. */ - tuple>> get_var_chunk_sizes( - uint32_t chunk_size, - Tile* const tile, - Tile* const offsets_tile, - bool chunking) const; + uint32_t chunk_size, Tile* const tile, Tile* const offsets_tile) const; /** * Run the given buffer forward through the pipeline. * - * @param tile Current tile on which the filter pipeline is being run + * @param tile Current tile on which the filter pipeline is being run. + * @param offsets_tile Current offsets tile for var sized attributes. * @param input buffer to process. * @param chunk_size chunk size. * @param chunk_offsets chunk offsets computed for var sized attributes. @@ -357,6 +354,7 @@ class FilterPipeline { */ Status filter_chunks_forward( const Tile& tile, + Tile* const offsets_tile, uint32_t chunk_size, std::vector& chunk_offsets, FilteredBuffer& output, diff --git a/tiledb/sm/filter/noop_filter.cc b/tiledb/sm/filter/noop_filter.cc index c445e7330b1..1de1996dd42 100644 --- a/tiledb/sm/filter/noop_filter.cc +++ b/tiledb/sm/filter/noop_filter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -57,6 +57,7 @@ void NoopFilter::dump(FILE* out) const { Status NoopFilter::run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/noop_filter.h b/tiledb/sm/filter/noop_filter.h index 320cc03340a..45ada3823ba 100644 --- a/tiledb/sm/filter/noop_filter.h +++ b/tiledb/sm/filter/noop_filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -59,6 +59,7 @@ class NoopFilter : public Filter { */ Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/positive_delta_filter.cc b/tiledb/sm/filter/positive_delta_filter.cc index 44654190c6c..ff767f4f835 100644 --- a/tiledb/sm/filter/positive_delta_filter.cc +++ b/tiledb/sm/filter/positive_delta_filter.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -62,6 +62,7 @@ void PositiveDeltaFilter::dump(FILE* out) const { Status PositiveDeltaFilter::run_forward( const Tile& tile, + Tile* const offsets_tile, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -81,29 +82,29 @@ Status PositiveDeltaFilter::run_forward( switch (tile_type) { case Datatype::INT8: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::BLOB: case Datatype::UINT8: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::INT16: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::UINT16: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::INT32: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::UINT32: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::INT64: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::UINT64: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); case Datatype::DATETIME_YEAR: case Datatype::DATETIME_MONTH: case Datatype::DATETIME_WEEK: @@ -127,7 +128,7 @@ Status PositiveDeltaFilter::run_forward( case Datatype::TIME_FS: case Datatype::TIME_AS: return run_forward( - tile, input_metadata, input, output_metadata, output); + tile, offsets_tile, input_metadata, input, output_metadata, output); default: return LOG_STATUS( Status_FilterError("Cannot filter; Unsupported input type")); @@ -137,6 +138,7 @@ Status PositiveDeltaFilter::run_forward( template Status PositiveDeltaFilter::run_forward( const Tile&, + Tile* const, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/positive_delta_filter.h b/tiledb/sm/filter/positive_delta_filter.h index 6b58dc7bd4e..4da958fbfef 100644 --- a/tiledb/sm/filter/positive_delta_filter.h +++ b/tiledb/sm/filter/positive_delta_filter.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -93,6 +93,7 @@ class PositiveDeltaFilter : public Filter { */ Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, @@ -141,6 +142,7 @@ class PositiveDeltaFilter : public Filter { template Status run_forward( const Tile& tile, + Tile* const tile_offsets, FilterBuffer* input_metadata, FilterBuffer* input, FilterBuffer* output_metadata, diff --git a/tiledb/sm/filter/test/unit_filter_create.cc b/tiledb/sm/filter/test/unit_filter_create.cc index da2febd0c99..de49947a66a 100644 --- a/tiledb/sm/filter/test/unit_filter_create.cc +++ b/tiledb/sm/filter/test/unit_filter_create.cc @@ -70,7 +70,8 @@ TEST_CASE( buffer_offset(p) = max_window_size0; ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -94,7 +95,8 @@ TEST_CASE( buffer_offset(p) = 0; // metadata_length ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -111,7 +113,8 @@ TEST_CASE( buffer_offset(p) = 0; // metadata_length ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -128,7 +131,8 @@ TEST_CASE( buffer_offset(p) = 0; // metadata_length ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -145,7 +149,8 @@ TEST_CASE( buffer_offset(p) = 0; // metadata_length ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -162,7 +167,8 @@ TEST_CASE( buffer_offset(p) = 0; // metadata_length ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -196,7 +202,8 @@ TEST_CASE( buffer_offset(p) = static_cast(compressor0); ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -217,7 +224,8 @@ TEST_CASE( buffer_offset(p) = level0; ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -246,7 +254,8 @@ TEST_CASE( buffer_offset(p) = level0; ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -275,7 +284,8 @@ TEST_CASE( buffer_offset(p) = level0; ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -304,7 +314,8 @@ TEST_CASE( buffer_offset(p) = level0; ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -328,7 +339,8 @@ TEST_CASE("Filter: Test noop filter deserialization", "[filter][noop]") { buffer_offset(p) = 0; // metadata_length ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type @@ -346,7 +358,8 @@ TEST_CASE( buffer_offset(p) = sizeof(uint32_t); // metadata_length buffer_offset(p) = max_window_size0; ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filter, filter1]{FilterCreate::deserialize(&constbuffer)}; + auto&& [st_filter, filter1]{ + FilterCreate::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filter.ok()); // Check type diff --git a/tiledb/sm/filter/test/unit_filter_pipeline.cc b/tiledb/sm/filter/test/unit_filter_pipeline.cc index e7161f6f45b..18ef785f1f4 100644 --- a/tiledb/sm/filter/test/unit_filter_pipeline.cc +++ b/tiledb/sm/filter/test/unit_filter_pipeline.cc @@ -29,6 +29,7 @@ */ #include + #include "../bit_width_reduction_filter.h" #include "../bitshuffle_filter.h" #include "../byteshuffle_filter.h" @@ -102,7 +103,8 @@ TEST_CASE( filters_buffer_offset(p) = compressor_level3; ConstBuffer constbuffer(&serialized_buffer, sizeof(serialized_buffer)); - auto&& [st_filters, filters]{FilterPipeline::deserialize(&constbuffer)}; + auto&& [st_filters, filters]{ + FilterPipeline::deserialize(&constbuffer, constants::format_version)}; REQUIRE(st_filters.ok()); CHECK(filters.value().max_chunk_size() == max_chunk_size); @@ -133,14 +135,14 @@ TEST_CASE( fp.add_filter(CompressionFilter(Compressor::LZ4, 1)); // Check that filters are searched correctly - CHECK(fp.has_filter(CompressionFilter(Compressor::RLE, 0))); - CHECK(fp.has_filter(BitWidthReductionFilter())); - CHECK_FALSE(fp.has_filter(CompressionFilter(Compressor::GZIP, 0))); - CHECK_FALSE(fp.has_filter(BitshuffleFilter())); + CHECK(fp.has_filter(FilterType::FILTER_RLE)); + CHECK(fp.has_filter(FilterType::FILTER_BIT_WIDTH_REDUCTION)); + CHECK_FALSE(fp.has_filter(FilterType::FILTER_GZIP)); + CHECK_FALSE(fp.has_filter(FilterType::FILTER_BITSHUFFLE)); // Check no error when pipeline empty FilterPipeline fp2; - CHECK_FALSE(fp2.has_filter(CompressionFilter(Compressor::RLE, 0))); + CHECK_FALSE(fp2.has_filter(FilterType::FILTER_RLE)); } TEST_CASE( @@ -157,31 +159,17 @@ TEST_CASE( fp_without_rle.add_filter(CompressionFilter(Compressor::ZSTD, 2)); fp_without_rle.add_filter(BitWidthReductionFilter()); - bool is_dimension = true; bool is_var_sized = true; - // Do not chunk the Tile for filtering if RLE is used for var-sized string - // dimensions - CHECK_FALSE(fp_with_rle.use_tile_chunking( - is_dimension, is_var_sized, Datatype::STRING_ASCII)); + // Do not chunk the Tile for filtering if RLE is used for var-sized strings + CHECK_FALSE( + fp_with_rle.use_tile_chunking(is_var_sized, Datatype::STRING_ASCII)); // Chunk in any other case - CHECK(fp_without_rle.use_tile_chunking( - is_dimension, is_var_sized, Datatype::STRING_ASCII)); - CHECK(fp_with_rle.use_tile_chunking( - !is_dimension, is_var_sized, Datatype::STRING_ASCII)); - CHECK(fp_with_rle.use_tile_chunking( - is_dimension, !is_var_sized, Datatype::STRING_ASCII)); - CHECK(fp_with_rle.use_tile_chunking( - !is_dimension, !is_var_sized, Datatype::STRING_ASCII)); - CHECK(fp_with_rle.use_tile_chunking( - is_dimension, is_var_sized, Datatype::TIME_MS)); - CHECK(fp_with_rle.use_tile_chunking( - is_dimension, is_var_sized, Datatype::DATETIME_AS)); - CHECK(fp_with_rle.use_tile_chunking( - is_dimension, is_var_sized, Datatype::BLOB)); - CHECK(fp_with_rle.use_tile_chunking( - is_dimension, is_var_sized, Datatype::INT32)); - CHECK(fp_with_rle.use_tile_chunking( - is_dimension, is_var_sized, Datatype::FLOAT64)); -} + CHECK(fp_with_rle.use_tile_chunking(!is_var_sized, Datatype::STRING_ASCII)); + CHECK(fp_with_rle.use_tile_chunking(is_var_sized, Datatype::TIME_MS)); + CHECK(fp_with_rle.use_tile_chunking(is_var_sized, Datatype::DATETIME_AS)); + CHECK(fp_with_rle.use_tile_chunking(is_var_sized, Datatype::BLOB)); + CHECK(fp_with_rle.use_tile_chunking(is_var_sized, Datatype::INT32)); + CHECK(fp_with_rle.use_tile_chunking(is_var_sized, Datatype::FLOAT64)); +} \ No newline at end of file diff --git a/tiledb/sm/fragment/fragment_info.h b/tiledb/sm/fragment/fragment_info.h index 5ee3d3ece62..a89191dd0c0 100644 --- a/tiledb/sm/fragment/fragment_info.h +++ b/tiledb/sm/fragment/fragment_info.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2020-2021 TileDB, Inc. + * @copyright Copyright (c) 2020-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -331,13 +331,7 @@ class FragmentInfo { /** * All the array schemas relevant to the loaded fragment metadata - * keyed by their file name. These schemas are also stored inside - * fragment metadata objects in `fragment_metadata_`, but as pointers, - * not shared pointers. Therefore, we need to store the shared pointers - * in a separate place here. - * - * TODO: when we transition to using a shared pointer for ArraySchema - * objects everywhere, we will not need to store this here. + * keyed by their file name. */ std::unordered_map> array_schemas_all_; diff --git a/tiledb/sm/query/result_tile.cc b/tiledb/sm/query/result_tile.cc index b2f93bea4ff..01c8321d949 100644 --- a/tiledb/sm/query/result_tile.cc +++ b/tiledb/sm/query/result_tile.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -206,7 +206,8 @@ const void* ResultTile::unzipped_coord(uint64_t pos, unsigned dim_idx) const { const void* ResultTile::zipped_coord(uint64_t pos, unsigned dim_idx) const { auto coords_size = std::get<0>(coords_tile_).cell_size(); - auto coord_size = coords_size / std::get<0>(coords_tile_).dim_num(); + auto coord_size = + coords_size / std::get<0>(coords_tile_).zipped_coords_dim_num(); const uint64_t offset = pos * coords_size + dim_idx * coord_size; void* const ret = static_cast(std::get<0>(coords_tile_).data()) + offset; @@ -244,7 +245,7 @@ uint64_t ResultTile::coord_size(unsigned dim_idx) const { // Handle zipped coordinate tiles if (!std::get<0>(coords_tile_).empty()) return std::get<0>(coords_tile_).cell_size() / - std::get<0>(coords_tile_).dim_num(); + std::get<0>(coords_tile_).zipped_coords_dim_num(); // Handle separate coordinate tiles assert(dim_idx < coord_tiles_.size()); diff --git a/tiledb/sm/query/writer_base.cc b/tiledb/sm/query/writer_base.cc index 3b2f1161efd..613f549b1f1 100644 --- a/tiledb/sm/query/writer_base.cc +++ b/tiledb/sm/query/writer_base.cc @@ -759,8 +759,8 @@ Status WriterBase::filter_tile( &filters, array_->get_encryption_key())); // Check if chunk or tile level filtering/unfiltering is appropriate - bool use_chunking = filters.use_tile_chunking( - array_schema_.is_dim(name), array_schema_.var_size(name), tile->type()); + bool use_chunking = + filters.use_tile_chunking(array_schema_.var_size(name), tile->type()); assert(!tile->filtered()); RETURN_NOT_OK(filters.run_forward( diff --git a/tiledb/sm/tile/generic_tile_io.cc b/tiledb/sm/tile/generic_tile_io.cc index 14cbf387055..0fb84cff5cb 100644 --- a/tiledb/sm/tile/generic_tile_io.cc +++ b/tiledb/sm/tile/generic_tile_io.cc @@ -137,7 +137,7 @@ Status GenericTileIO::read_generic_tile_header( header->filter_pipeline_size)); ConstBuffer cbuf(header_buff->data(), header_buff->size()); auto&& [st_filterpipeline, filterpipeline]{ - FilterPipeline::deserialize(&cbuf)}; + FilterPipeline::deserialize(&cbuf, header->version_number)}; if (!st_filterpipeline.ok()) { return st_filterpipeline; } diff --git a/tiledb/sm/tile/tile.cc b/tiledb/sm/tile/tile.cc index ef3dbcf1318..d280aec233c 100644 --- a/tiledb/sm/tile/tile.cc +++ b/tiledb/sm/tile/tile.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -83,7 +83,7 @@ Tile::Tile() : data_(nullptr, tiledb_free) , size_(0) , cell_size_(0) - , dim_num_(0) + , zipped_coords_dim_num_(0) , format_version_(0) , type_(Datatype::INT32) , filtered_buffer_(0) { @@ -92,13 +92,13 @@ Tile::Tile() Tile::Tile( const Datatype type, const uint64_t cell_size, - const unsigned int dim_num, + const unsigned int zipped_coords_dim_num, void* const buffer, uint64_t size) : data_(static_cast(buffer), tiledb_free) , size_(size) , cell_size_(cell_size) - , dim_num_(dim_num) + , zipped_coords_dim_num_(zipped_coords_dim_num) , format_version_(0) , type_(type) , filtered_buffer_(0) { @@ -133,7 +133,7 @@ Status Tile::init_unfiltered( unsigned int dim_num, bool fill_with_zeros) { cell_size_ = cell_size; - dim_num_ = dim_num; + zipped_coords_dim_num_ = dim_num; type_ = type; format_version_ = format_version; @@ -158,9 +158,9 @@ Status Tile::init_filtered( uint32_t format_version, Datatype type, uint64_t cell_size, - unsigned int dim_num) { + unsigned int zipped_coords_dim_num) { cell_size_ = cell_size; - dim_num_ = dim_num; + zipped_coords_dim_num_ = zipped_coords_dim_num; type_ = type; format_version_ = format_version; size_ = 0; @@ -215,11 +215,11 @@ Status Tile::write(const void* data, uint64_t offset, uint64_t nbytes) { } Status Tile::zip_coordinates() { - assert(dim_num_ > 0); + assert(zipped_coords_dim_num_ > 0); // For easy reference const uint64_t tile_size = size_; - const uint64_t coord_size = cell_size_ / dim_num_; + const uint64_t coord_size = cell_size_ / zipped_coords_dim_num_; const uint64_t cell_num = tile_size / cell_size_; // Create a tile clone @@ -229,7 +229,7 @@ Status Tile::zip_coordinates() { // Zip coordinates uint64_t ptr_tmp = 0; - for (unsigned int j = 0; j < dim_num_; ++j) { + for (unsigned int j = 0; j < zipped_coords_dim_num_; ++j) { uint64_t ptr = j * coord_size; for (uint64_t i = 0; i < cell_num; ++i) { std::memcpy(data_.get() + ptr, tile_tmp + ptr_tmp, coord_size); @@ -250,7 +250,7 @@ void Tile::swap(Tile& tile) { std::swap(size_, tile.size_); std::swap(data_, tile.data_); std::swap(cell_size_, tile.cell_size_); - std::swap(dim_num_, tile.dim_num_); + std::swap(zipped_coords_dim_num_, tile.zipped_coords_dim_num_); std::swap(format_version_, tile.format_version_); std::swap(type_, tile.type_); } diff --git a/tiledb/sm/tile/tile.h b/tiledb/sm/tile/tile.h index 5b2d3bff80d..8f8c56924ed 100644 --- a/tiledb/sm/tile/tile.h +++ b/tiledb/sm/tile/tile.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB, Inc. + * @copyright Copyright (c) 2017-2022 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -188,9 +188,10 @@ class Tile { return cell_size_; } - /** Returns the number of dimensions (0 if this is an attribute tile). */ - inline unsigned int dim_num() const { - return dim_num_; + /** Returns the number of zipped coordinates (0 if this is an attribute tile). + */ + inline unsigned int zipped_coords_dim_num() const { + return zipped_coords_dim_num_; } /** Checks if the tile is empty. */ @@ -228,9 +229,9 @@ class Tile { return (data_ == nullptr) ? 0 : size_; } - /** Returns *true* if the tile stores coordinates. */ + /** Returns *true* if the tile stores zipped coordinates. */ inline bool stores_coords() const { - return dim_num_ > 0; + return zipped_coords_dim_num_ > 0; } /** Returns the tile data type. */ @@ -275,10 +276,10 @@ class Tile { uint64_t cell_size_; /** - * The number of dimensions, in case the tile stores coordinates. It is 0 - * in case the tile stores attributes. + * The number of dimensions, in case the tile stores zipped coordinates. + * It is 0 in case the tile stores attributes or other type of dimensions */ - unsigned int dim_num_; + unsigned int zipped_coords_dim_num_; /** The format version of the data in this tile. */ uint32_t format_version_;