Skip to content

Commit

Permalink
Integrate RLE for var-length string dimensions (#2938)
Browse files Browse the repository at this point in the history
* Integrate RLE for var-length string dimensions

* Fix windows warning

* Fix dates in headers

* Various correctness fixes

* Fix backwards compatibility issues

* Add basic test for string attributes and fix filter UTs

* Fix merge issues
  • Loading branch information
ypatia committed Mar 15, 2022
1 parent 814c570 commit 023cb53
Show file tree
Hide file tree
Showing 44 changed files with 729 additions and 264 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Expand Up @@ -268,7 +268,7 @@ if (TILEDB_TESTS)
add_custom_target(tests)
add_dependencies(tests tiledb_unit)
add_dependencies(tests unit_interval unit_datum unit_dynamic_memory unit_thread_pool)
add_dependencies(tests unit_array_schema unit_filter unit_filter_pipeline unit_metadata)
add_dependencies(tests unit_array_schema unit_filter_create unit_filter_pipeline unit_metadata)
add_dependencies(tests unit_compressors)
add_dependencies(tests unit_range_subset)
endif()
Expand Down
6 changes: 3 additions & 3 deletions test/src/unit-Tile.cc
Expand Up @@ -5,7 +5,7 @@
*
* The MIT License
*
* @copyright Copyright (c) 2017-2021 TileDB, Inc.
* @copyright Copyright (c) 2017-2022 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -155,7 +155,7 @@ TEST_CASE("Tile: Test move constructor", "[Tile][move_constructor]") {
// Verify all public attributes are identical.
CHECK(tile2.cell_size() == cell_size);
CHECK(tile2.cell_num() == buffer_len);
CHECK(tile2.dim_num() == dim_num);
CHECK(tile2.zipped_coords_dim_num() == dim_num);
CHECK(tile2.empty() == false);
CHECK(tile2.filtered() == false);
CHECK(tile2.format_version() == format_version);
Expand Down Expand Up @@ -204,7 +204,7 @@ TEST_CASE("Tile: Test move-assignment", "[Tile][move_assignment]") {
// Verify all public attributes are identical.
CHECK(tile2.cell_size() == cell_size);
CHECK(tile2.cell_num() == buffer_len);
CHECK(tile2.dim_num() == dim_num);
CHECK(tile2.zipped_coords_dim_num() == dim_num);
CHECK(tile2.empty() == false);
CHECK(tile2.filtered() == false);
CHECK(tile2.format_version() == format_version);
Expand Down
1 change: 0 additions & 1 deletion test/src/unit-compression-rle.cc
Expand Up @@ -538,7 +538,6 @@ TEMPLATE_LIST_TEST_CASE(
127};

// Compress the input array
// TBD: how to calculate exp_size, maybe an overhead function?
const auto num_of_unique_runs = 6;
const auto exp_size = num_of_unique_runs * 2;
std::vector<T> compressed(exp_size);
Expand Down
69 changes: 69 additions & 0 deletions test/src/unit-cppapi-filter.cc
Expand Up @@ -229,3 +229,72 @@ TEST_CASE("C++ API: Filter lists on array", "[cppapi][filter]") {
if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);
}

TEST_CASE("C++ API: Filter strings with RLE", "[cppapi][filter][rle-strings]") {
  using namespace tiledb;

  // Round-trips a var-length string attribute that carries an RLE filter
  // through a sparse 2D array and verifies the data read back.
  Context ctx;
  VFS vfs(ctx);
  std::string uri = "cpp_unit_array";

  // Removes the array directory if it exists; used before and after the test.
  auto remove_array_if_present = [&vfs, &uri]() {
    if (vfs.is_dir(uri)) {
      vfs.remove_dir(uri);
    }
  };
  remove_array_if_present();

  // String attribute "a1" filtered with RLE only.
  FilterList rle_filters(ctx);
  rle_filters.add_filter({ctx, TILEDB_FILTER_RLE});
  auto string_attr = Attribute::create<std::string>(ctx, "a1");
  string_attr.set_filter_list(rle_filters);

  // Two integer dimensions "d1" and "d2" over [0, 100] with tile extent 10.
  Domain domain(ctx);
  domain.add_dimensions(
      Dimension::create<int>(ctx, "d1", {{0, 100}}, 10),
      Dimension::create<int>(ctx, "d2", {{0, 100}}, 10));

  // Sparse schema holding the domain and the filtered attribute.
  ArraySchema schema(ctx, TILEDB_SPARSE);
  schema.set_domain(domain);
  schema.add_attributes(string_attr);
  Array::create(uri, schema);

  // Write seven cells whose string values contain repeated runs.
  std::vector<std::string> cell_values = {
      "foo", "foo", "foobar", "bar", "bar", "bar", "bar"};
  auto flattened = ungroup_var_buffer(cell_values);
  auto& write_offsets = flattened.first;
  auto& write_bytes = flattened.second;
  // Zipped (d1, d2) coordinates, one pair per cell.
  std::vector<int> zipped_coords = {
      0, 0, 10, 10, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40};

  Array array(ctx, uri, TILEDB_WRITE);
  Query write_query(ctx, array);
  write_query.set_data_buffer("a1", write_bytes);
  write_query.set_offsets_buffer("a1", write_offsets);
  write_query.set_coordinates(zipped_coords);
  write_query.set_layout(TILEDB_UNORDERED);
  REQUIRE(write_query.submit() == Query::Status::COMPLETE);
  array.close();

  // Read everything back in row-major order.
  array.open(TILEDB_READ);
  std::vector<int> read_region = {0, 40, 0, 40};
  std::vector<uint64_t> read_offsets(7);
  std::string read_bytes(24, '\0');

  Query read_query(ctx, array);
  read_query.set_subarray(read_region);
  read_query.set_layout(TILEDB_ROW_MAJOR);
  read_query.set_data_buffer("a1", read_bytes);
  read_query.set_offsets_buffer("a1", read_offsets);
  REQUIRE(read_query.submit() == Query::Status::COMPLETE);
  array.close();

  // Exactly one buffer is reported: 7 offsets and 24 bytes of string data.
  auto element_counts = read_query.result_buffer_elements();
  REQUIRE(element_counts.size() == 1);
  REQUIRE(element_counts["a1"].first == 7);
  REQUIRE(element_counts["a1"].second == 24);

  // Offsets and concatenated bytes must match what was written.
  std::vector<uint64_t> expected_offsets = {0, 3, 6, 12, 15, 18, 21};
  REQUIRE(read_offsets == expected_offsets);
  REQUIRE(read_bytes.substr(0, 24) == "foofoofoobarbarbarbarbar");

  // Clean up
  remove_array_if_present();
}
176 changes: 141 additions & 35 deletions test/src/unit-cppapi-string-dims.cc
Expand Up @@ -5,7 +5,7 @@
*
* The MIT License
*
* @copyright Copyright (c) 2021 TileDB Inc.
* @copyright Copyright (c) 2021-2022 TileDB Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -1446,51 +1446,157 @@ TEST_CASE(
}

TEST_CASE(
"C++ API: Test filtering of string dimension",
"C++ API: Test filtering of string dimensions",
"[cppapi][string-dim][rle-strings][sparse]") {
std::string array_name = "test_rle_string_dim";

/*
* Write an array with string dimension and add RLE filter. This will result
* in tile filtering instead of chunk filtering. For now make sure we don't
* fail. In the future check filtering/unfiltering is done correctly
*/

// Create data buffer to use
std::string data = "aabbbcdddd";
std::vector<uint64_t> data_elem_offsets = {0, 2, 5, 6};

Context ctx;
Domain domain(ctx);
auto dim =
Dimension::create(ctx, "dim1", TILEDB_STRING_ASCII, nullptr, nullptr);
std::stringstream repetitions;
size_t repetition_num = 100;
for (size_t i = 0; i < repetition_num; i++)
repetitions << "GLSD987JHY";
std::string data =
"ATSD987JIO" + std::string(repetitions.str()) + "TGSD987JPO";
// Create the corresponding offsets buffer
std::vector<uint64_t> data_elem_offsets(repetition_num + 2);
int start = -10;
std::generate(data_elem_offsets.begin(), data_elem_offsets.end(), [&] {
return start += 10;
});

// Create compressor as a filter
Filter filter(ctx, TILEDB_FILTER_RLE);
// Create filter list
FilterList filter_list(ctx);
// Add compressor to filter list
filter_list.add_filter(filter);
dim.set_filter_list(filter_list);
domain.add_dimension(dim);
{
Context ctx;
Domain domain(ctx);
auto dim =
Dimension::create(ctx, "dim1", TILEDB_STRING_ASCII, nullptr, nullptr);

// Create compressor as a filter
Filter filter(ctx, TILEDB_FILTER_RLE);
// Create filter list
FilterList filter_list(ctx);
// Add compressor to filter list
filter_list.add_filter(filter);
dim.set_filter_list(filter_list);

domain.add_dimension(dim);

ArraySchema schema(ctx, TILEDB_SPARSE);
schema.set_domain(domain);
schema.set_allows_dups(true);

ArraySchema schema(ctx, TILEDB_SPARSE);
schema.set_domain(domain);
tiledb::Array::create(array_name, schema);

tiledb::Array::create(array_name, schema);
auto array = tiledb::Array(ctx, array_name, TILEDB_WRITE);
Query query(ctx, array, TILEDB_WRITE);
query.set_data_buffer("dim1", (char*)data.data(), data.size());
query.set_offsets_buffer(
"dim1", data_elem_offsets.data(), data_elem_offsets.size());

auto array = tiledb::Array(ctx, array_name, TILEDB_WRITE);
Query query(ctx, array, TILEDB_WRITE);
query.set_data_buffer("dim1", (char*)data.data(), data.size());
query.set_offsets_buffer(
"dim1", data_elem_offsets.data(), data_elem_offsets.size());
query.set_layout(TILEDB_UNORDERED);
query.submit();
query.finalize();
array.close();
}

query.set_layout(TILEDB_UNORDERED);
query.submit();
query.finalize();
array.close();
{
Context ctx;
std::vector<uint64_t> offsets_back(data_elem_offsets.size());
std::string data_back;
data_back.resize(data.size());

auto array = tiledb::Array(ctx, array_name, TILEDB_READ);
Query query(ctx, array, TILEDB_READ);
query.add_range(
"dim1", std::string("ATSD987JIO"), std::string("TGSD987JPO"));
query.set_data_buffer("dim1", (char*)data_back.data(), data_back.size());
query.set_offsets_buffer("dim1", offsets_back.data(), offsets_back.size());

query.submit();

CHECK(query.query_status() == Query::Status::COMPLETE);
CHECK(offsets_back == data_elem_offsets);
CHECK(data_back == data);
}

Context ctx;
VFS vfs(ctx);
if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);
}

TEST_CASE(
    "C++ API: Test adding RLE filter of string dimensions",
    "[cppapi][string-dim][rle-strings][sparse]") {
  // Exercises which filter-list / dimension combinations are accepted or
  // rejected when the RLE filter is involved. The test only manipulates
  // in-memory schema objects; no array is created.
  Context ctx;
  Domain domain(ctx);

  // One var-length string dimension and one fixed-size integer dimension.
  auto dim_var_string =
      Dimension::create(ctx, "dim1", TILEDB_STRING_ASCII, nullptr, nullptr);
  auto dim_not_var_string =
      tiledb::Dimension::create<int32_t>(ctx, "id", {{1, 100}}, 10);

  // Create filters
  Filter rle_filter(ctx, TILEDB_FILTER_RLE);
  Filter another_filter(ctx, TILEDB_FILTER_CHECKSUM_MD5);

  // Create filter list with RLE only
  FilterList filter_list_rle_only(ctx);
  filter_list_rle_only.add_filter(rle_filter);

  // Create filter list with RLE and other filters
  FilterList filter_list_with_others(ctx);
  filter_list_with_others.add_filter(another_filter);
  filter_list_with_others.add_filter(rle_filter);

  {
    // Add dimension that is not var length string
    domain.add_dimension(dim_not_var_string);
    ArraySchema schema(ctx, TILEDB_SPARSE);
    schema.set_domain(domain);

    // With only a fixed-size dimension, both filter lists are accepted as
    // coordinate filters.
    CHECK_NOTHROW(schema.set_coords_filter_list(filter_list_rle_only));
    CHECK_NOTHROW(schema.set_coords_filter_list(filter_list_with_others));

    // Add var length string dimension
    domain.add_dimension(dim_var_string);
    schema.set_domain(domain);

    // Test set_coords_filter_list
    {
      // Case 1: There is no more specific filter list for this dimension.
      // Adding RLE with other filters to a var-length string dimension is not
      // allowed
      CHECK_THROWS(schema.set_coords_filter_list(filter_list_with_others));
      // If only RLE is used, it's allowed
      CHECK_NOTHROW(schema.set_coords_filter_list(filter_list_rle_only));
    }

    {
      // Case 2: The dimension has its own, more specific filter list, so
      // whatever we set with set_coords_filter_list should not matter
      CHECK_NOTHROW(dim_var_string.set_filter_list(filter_list_rle_only));

      // We need to use another domain, as adding a dimension with the same
      // name doesn't replace the old one
      Domain domain2(ctx);
      domain2.add_dimension(dim_var_string);
      schema.set_domain(domain2);
      CHECK_NOTHROW(schema.set_coords_filter_list(filter_list_with_others));
    }

    // Test set_filter_list
    {
      // Adding RLE with other filters to a var-length string dimension is not
      // allowed
      CHECK_THROWS(dim_var_string.set_filter_list(filter_list_with_others));

      // The rest of the cases are allowed
      CHECK_NOTHROW(dim_var_string.set_filter_list(filter_list_rle_only));
      CHECK_NOTHROW(dim_not_var_string.set_filter_list(filter_list_rle_only));
      CHECK_NOTHROW(
          dim_not_var_string.set_filter_list(filter_list_with_others));
    }
  }
}
7 changes: 6 additions & 1 deletion test/src/unit-filter-pipeline.cc
Expand Up @@ -5,7 +5,7 @@
*
* The MIT License
*
* @copyright Copyright (c) 2017-2021 TileDB, Inc.
* @copyright Copyright (c) 2017-2022 TileDB, Inc.
* @copyright Copyright (c) 2016 MIT and Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
Expand Down Expand Up @@ -79,6 +79,7 @@ class Add1InPlace : public tiledb::sm::Filter {

Status run_forward(
const Tile&,
Tile* const,
FilterBuffer* input_metadata,
FilterBuffer* input,
FilterBuffer* output_metadata,
Expand Down Expand Up @@ -148,6 +149,7 @@ class Add1OutOfPlace : public tiledb::sm::Filter {

Status run_forward(
const Tile&,
Tile* const,
FilterBuffer* input_metadata,
FilterBuffer* input,
FilterBuffer* output_metadata,
Expand Down Expand Up @@ -239,6 +241,7 @@ class AddNInPlace : public tiledb::sm::Filter {

Status run_forward(
const Tile&,
Tile* const,
FilterBuffer* input_metadata,
FilterBuffer* input,
FilterBuffer* output_metadata,
Expand Down Expand Up @@ -320,6 +323,7 @@ class PseudoChecksumFilter : public tiledb::sm::Filter {

Status run_forward(
const Tile&,
Tile* const,
FilterBuffer* input_metadata,
FilterBuffer* input,
FilterBuffer* output_metadata,
Expand Down Expand Up @@ -409,6 +413,7 @@ class Add1IncludingMetadataFilter : public tiledb::sm::Filter {

Status run_forward(
const Tile&,
Tile* const,
FilterBuffer* input_metadata,
FilterBuffer* input,
FilterBuffer* output_metadata,
Expand Down

0 comments on commit 023cb53

Please sign in to comment.