Skip to content

Commit

Permalink
Use sparse global ordered reader for unordered queries with no dups.
Browse files Browse the repository at this point in the history
This change modifies the sparse global order reader so that it also
processes unordered queries made on arrays with duplicates. Because the
order of the returned data does not matter for these queries, the
constraint that global order reads support only one range can be ignored.

This also fixes an issue in bitmap computation for multiplicities,
where the relevant ranges were not computed properly for count bitmaps.

---
TYPE: IMPROVEMENT
DESC: Use sparse global ordered reader for unordered queries with no dups.
  • Loading branch information
KiterLuc committed May 24, 2022
1 parent 8bb17c9 commit 9252f1f
Show file tree
Hide file tree
Showing 14 changed files with 375 additions and 139 deletions.
10 changes: 5 additions & 5 deletions test/src/unit-result-coords.cc
Expand Up @@ -58,7 +58,7 @@ struct CResultCoordsFx {
CResultCoordsFx();
~CResultCoordsFx();

GlobalOrderResultTile make_tile_with_num_cells(uint64_t num_cells);
GlobalOrderResultTile<uint8_t> make_tile_with_num_cells(uint64_t num_cells);
};

CResultCoordsFx::CResultCoordsFx() {
Expand Down Expand Up @@ -116,9 +116,9 @@ CResultCoordsFx::~CResultCoordsFx() {
tiledb_vfs_free(&vfs_);
}

GlobalOrderResultTile CResultCoordsFx::make_tile_with_num_cells(
GlobalOrderResultTile<uint8_t> CResultCoordsFx::make_tile_with_num_cells(
uint64_t num_cells) {
GlobalOrderResultTile result_tile(
GlobalOrderResultTile<uint8_t> result_tile(
0, 0, array_->array_->array_schema_latest());
auto tile_tuple = result_tile.tile_tuple(constants::coords);
Tile* const tile = &std::get<0>(*tile_tuple);
Expand All @@ -145,8 +145,8 @@ class Cmp {
}

bool operator()(
const GlobalOrderResultCoords& a,
const GlobalOrderResultCoords& b) const {
const GlobalOrderResultCoords<uint8_t>& a,
const GlobalOrderResultCoords<uint8_t>& b) const {
if (a.pos_ == b.pos_) {
return true;
}
Expand Down
6 changes: 4 additions & 2 deletions test/src/unit-sparse-global-order-reader.cc
Expand Up @@ -451,7 +451,8 @@ TEST_CASE_METHOD(

// Check the internal loop count against expected value.
auto stats =
((sm::SparseGlobalOrderReader*)query->query_->strategy())->stats();
((sm::SparseGlobalOrderReader<uint8_t>*)query->query_->strategy())
->stats();
REQUIRE(stats != nullptr);
auto counters = stats->counters();
REQUIRE(counters != nullptr);
Expand Down Expand Up @@ -646,7 +647,8 @@ TEST_CASE_METHOD(

// Check the internal loop count against expected value.
auto stats =
((sm::SparseGlobalOrderReader*)query->query_->strategy())->stats();
((sm::SparseGlobalOrderReader<uint8_t>*)query->query_->strategy())
->stats();
REQUIRE(stats != nullptr);
auto counters = stats->counters();
REQUIRE(counters != nullptr);
Expand Down
39 changes: 19 additions & 20 deletions tiledb/sm/array_schema/dimension.cc
Expand Up @@ -782,35 +782,34 @@ void Dimension::relevant_ranges<char>(
});

// If we have no ranges just exit early
if (it == ranges.end())
if (it == ranges.end()) {
return;
}

// Set start index
const uint64_t start_range_idx = std::distance(ranges.begin(), it);
const uint64_t start_range = std::distance(ranges.begin(), it);

// Binary search to find the last range containing the start mbr.
// Find upper bound to end comparisons. Finding this early allows avoiding the
// conditional exit in the for loop below
auto it2 = std::lower_bound(
it,
ranges.end(),
mbr_end,
[&](const Range& a, const std::string_view& value) {
return a.start_str() < value;
return a.start_str() <= value;
});

// If the upper bound isn't the end add +1 to the index
uint64_t offset = 0;
if (it2 != ranges.end())
offset = 1;
const uint64_t end_range_idx =
std::distance(it, it2) + start_range_idx + offset;
// Set end index
const uint64_t end_range = std::distance(it, it2) + start_range;

// Loop over only potential relevant ranges
for (uint64_t r = start_range_idx; r < end_range_idx; ++r) {
for (uint64_t r = start_range; r < end_range; ++r) {
const auto& r1_start = ranges[r].start_str();
const auto& r1_end = ranges[r].end_str();

if (r1_start <= mbr_end && mbr_start <= r1_end)
if (r1_start <= mbr_end && mbr_start <= r1_end) {
relevant_ranges.emplace_back(r);
}
}
}

Expand All @@ -832,30 +831,30 @@ void Dimension::relevant_ranges(
return ((const T*)a.start_fixed())[1] < value;
});

if (it == ranges.end())
if (it == ranges.end()) {
return;
}

// Set start index
const uint64_t start_range = std::distance(ranges.begin(), it);

// Find upper bound to end comparisons. Finding this early allows avoiding the
// conditional exit in the for loop below
auto it2 = std::lower_bound(
it, ranges.end(), mbr_end, [&](const Range& a, const T value) {
return ((const T*)a.start_fixed())[0] < value;
return ((const T*)a.start_fixed())[0] <= value;
});

// If the upper bound isn't the end add +1 to the index
uint64_t offset = 0;
if (it2 != ranges.end())
offset = 1;
const uint64_t end_range = std::distance(it, it2) + start_range + offset;
// Set end index
const uint64_t end_range = std::distance(it, it2) + start_range;

// Loop over only potential relevant ranges
for (uint64_t r = start_range; r < end_range; ++r) {
const auto d1 = (const T*)ranges[r].start_fixed();

if ((d1[0] <= mbr_end && d1[1] >= mbr_start))
if ((d1[0] <= mbr_end && d1[1] >= mbr_start)) {
relevant_ranges.emplace_back(r);
}
}
}

Expand Down
141 changes: 141 additions & 0 deletions tiledb/sm/array_schema/test/unit_dimension.cc
Expand Up @@ -38,6 +38,63 @@ using namespace tiledb;
using namespace tiledb::common;
using namespace tiledb::sm;

using Datatype = tiledb::sm::Datatype;

/**
 * Compile-time mapping from a C++ arithmetic type to the corresponding
 * `Datatype` enumerator, exposed as the static member `datatype`.
 *
 * The primary template is deliberately left undefined: only the explicit
 * specializations below provide a `datatype` member. Asking for a type that
 * has no specialization is therefore rejected by the compiler (incomplete
 * type) instead of surfacing later as an undefined-symbol link error.
 *
 * @tparam T The C++ type to map.
 */
template <class T>
struct type_to_datatype;

/** Maps `int8_t` to `Datatype::INT8`. */
template <>
struct type_to_datatype<int8_t> {
  static constexpr Datatype datatype = Datatype::INT8;
};

/** Maps `int16_t` to `Datatype::INT16`. */
template <>
struct type_to_datatype<int16_t> {
  static constexpr Datatype datatype = Datatype::INT16;
};

/** Maps `int32_t` to `Datatype::INT32`. */
template <>
struct type_to_datatype<int32_t> {
  static constexpr Datatype datatype = Datatype::INT32;
};

/** Maps `int64_t` to `Datatype::INT64`. */
template <>
struct type_to_datatype<int64_t> {
  static constexpr Datatype datatype = Datatype::INT64;
};

/** Maps `uint8_t` to `Datatype::UINT8`. */
template <>
struct type_to_datatype<uint8_t> {
  static constexpr Datatype datatype = Datatype::UINT8;
};

/** Maps `uint16_t` to `Datatype::UINT16`. */
template <>
struct type_to_datatype<uint16_t> {
  static constexpr Datatype datatype = Datatype::UINT16;
};

/** Maps `uint32_t` to `Datatype::UINT32`. */
template <>
struct type_to_datatype<uint32_t> {
  static constexpr Datatype datatype = Datatype::UINT32;
};

/** Maps `uint64_t` to `Datatype::UINT64`. */
template <>
struct type_to_datatype<uint64_t> {
  static constexpr Datatype datatype = Datatype::UINT64;
};

/** Maps `float` to `Datatype::FLOAT32`. */
template <>
struct type_to_datatype<float> {
  static constexpr Datatype datatype = Datatype::FLOAT32;
};

/** Maps `double` to `Datatype::FLOAT64`. */
template <>
struct type_to_datatype<double> {
  static constexpr Datatype datatype = Datatype::FLOAT64;
};

template <class T, int n>
inline T& dim_buffer_offset(void* p) {
return *static_cast<T*>(static_cast<void*>(static_cast<char*>(p) + n));
Expand Down Expand Up @@ -213,3 +270,87 @@ TEMPLATE_LIST_TEST_CASE(
}
}
}

/**
 * Checks that the computed relevant range indices match the expected ones.
 *
 * The sizes are compared first, then the elements are compared pairwise.
 *
 * @param relevant_ranges Range indices produced by the code under test.
 * @param expected Range indices the test expects.
 */
void check_relevant_ranges(
    std::vector<uint64_t>& relevant_ranges, std::vector<uint64_t>& expected) {
  CHECK(relevant_ranges.size() == expected.size());

  // CHECK records a failure but lets the test keep running, so a size
  // mismatch must not lead to indexing past the end of the shorter vector.
  // Only the common prefix is compared element by element.
  const uint64_t num_comparable = relevant_ranges.size() < expected.size() ?
                                      relevant_ranges.size() :
                                      expected.size();
  for (uint64_t r = 0; r < num_comparable; r++) {
    CHECK(relevant_ranges[r] == expected[r]);
  }
}

/** Fixed-size integer types the templated relevant-ranges test runs over. */
using FixedTypes = tuple<
    int8_t,
    int16_t,
    int32_t,
    int64_t,
    uint8_t,
    uint16_t,
    uint32_t,
    uint64_t>;
/**
 * Runs `Dimension::relevant_ranges` once per type in `FixedTypes`.
 *
 * Eight [start, end] ranges are built from a flat list of bounds:
 * [1,1] [1,1] [2,2] [3,4] [5,6] [5,7] [8,9] [50,56]. For each scenario the
 * MBR is handed to the dimension together with those ranges, and the indices
 * it reports are compared against the expected indices.
 */
TEMPLATE_LIST_TEST_CASE(
    "test relevant_ranges", "[dimension][relevant_ranges][fixed]", FixedTypes) {
  using T = TestType;

  const auto datatype = type_to_datatype<T>::datatype;
  Dimension dim{"", datatype};

  // Flat list of range bounds: every consecutive pair is one [start, end].
  std::vector<T> bounds = {1, 1, 1, 1, 2, 2, 3, 4, 5, 6, 5, 7, 8, 9, 50, 56};
  NDRange ranges;
  for (uint64_t first = 0; first + 1 < bounds.size(); first += 2) {
    ranges.emplace_back(&bounds[first], 2 * sizeof(T));
  }

  // One MBR and the range indices expected to be reported for it.
  struct Scenario {
    std::vector<T> mbr;
    std::vector<uint64_t> expected;
  };
  std::vector<Scenario> scenarios = {
      {{1, 1}, {0, 1}}, {{2, 6}, {2, 3, 4, 5}}, {{7, 8}, {5, 6}}};

  // Compute the relevant ranges of each scenario and validate them.
  for (auto& scenario : scenarios) {
    Range mbr(scenario.mbr.data(), 2 * sizeof(T));

    std::vector<uint64_t> relevant_ranges;
    dim.relevant_ranges(ranges, mbr, relevant_ranges);
    check_relevant_ranges(relevant_ranges, scenario.expected);
  }
}

/**
 * Runs `Dimension::relevant_ranges` on a `STRING_ASCII` dimension.
 *
 * Eight single-character [start, end] ranges are built from a flat list of
 * bounds: [a,a] [a,a] [b,b] [c,d] [e,f] [e,g] [h,i] [y,z]. For each scenario
 * the MBR is handed to the dimension together with those ranges, and the
 * indices it reports are compared against the expected indices.
 */
TEST_CASE("test relevant_ranges", "[dimension][relevant_ranges][string]") {
  Dimension dim{"", Datatype::STRING_ASCII};

  // Flat list of range bounds: every consecutive pair is one [start, end].
  std::vector<char> bounds = {'a', 'a', 'a', 'a', 'b', 'b', 'c', 'd',
                              'e', 'f', 'e', 'g', 'h', 'i', 'y', 'z'};
  NDRange ranges;
  for (uint64_t first = 0; first + 1 < bounds.size(); first += 2) {
    // Two bytes in total, of which the first byte is the range start.
    ranges.emplace_back(&bounds[first], 2, 1);
  }

  // One MBR and the range indices expected to be reported for it.
  struct Scenario {
    std::vector<char> mbr;
    std::vector<uint64_t> expected;
  };
  std::vector<Scenario> scenarios = {
      {{'a', 'a'}, {0, 1}}, {{'b', 'f'}, {2, 3, 4, 5}}, {{'g', 'h'}, {5, 6}}};

  // Compute the relevant ranges of each scenario and validate them.
  for (auto& scenario : scenarios) {
    Range mbr(scenario.mbr.data(), 2, 1);

    std::vector<uint64_t> relevant_ranges;
    dim.relevant_ranges(ranges, mbr, relevant_ranges);
    check_relevant_ranges(relevant_ranges, scenario.expected);
  }
}
5 changes: 3 additions & 2 deletions tiledb/sm/misc/comparators.h
Expand Up @@ -150,9 +150,10 @@ class HilbertCmp : protected CellCmpBase {
* @param b The second coordinate.
* @return `true` if `a` precedes `b` and `false` otherwise.
*/
template <class BitmapType>
bool operator()(
const GlobalOrderResultCoords& a,
const GlobalOrderResultCoords& b) const {
const GlobalOrderResultCoords<BitmapType>& a,
const GlobalOrderResultCoords<BitmapType>& b) const {
auto hilbert_a = a.tile_->hilbert_value(a.pos_);
auto hilbert_b = b.tile_->hilbert_value(b.pos_);
if (hilbert_a < hilbert_b)
Expand Down
14 changes: 12 additions & 2 deletions tiledb/sm/query/hilbert_order.cc
Expand Up @@ -56,8 +56,18 @@ uint64_t map_to_uint64(
return dim.map_to_uint64(d.content(), d.size(), bits, max_bucket_val);
}

template uint64_t map_to_uint64<GlobalOrderResultCoords>(
const Dimension&, const GlobalOrderResultCoords&, uint32_t, int, uint64_t);
template uint64_t map_to_uint64<GlobalOrderResultCoords<uint8_t>>(
const Dimension&,
const GlobalOrderResultCoords<uint8_t>&,
uint32_t,
int,
uint64_t);
template uint64_t map_to_uint64<GlobalOrderResultCoords<uint64_t>>(
const Dimension&,
const GlobalOrderResultCoords<uint64_t>&,
uint32_t,
int,
uint64_t);
template uint64_t map_to_uint64<ResultCoords>(
const Dimension&, const ResultCoords&, uint32_t, int, uint64_t);

Expand Down
47 changes: 33 additions & 14 deletions tiledb/sm/query/query.cc
Expand Up @@ -1073,22 +1073,41 @@ Status Query::create_strategy() {
} else if (
use_refactored_sparse_global_order_reader() &&
!array_schema_->dense() &&
(layout_ == Layout::GLOBAL_ORDER ||
(layout_ == Layout::UNORDERED && subarray_.range_num() <= 1))) {
(layout_ == Layout::GLOBAL_ORDER || layout_ == Layout::UNORDERED)) {
// Using the reader for unordered queries to do deduplication.
use_default = false;
strategy_ = tdb_unique_ptr<IQueryStrategy>(tdb_new(
SparseGlobalOrderReader,
stats_->create_child("Reader"),
logger_,
storage_manager_,
array_,
config_,
buffers_,
subarray_,
layout_,
condition_,
consolidation_with_timestamps_));

auto&& [st, non_overlapping_ranges]{Query::non_overlapping_ranges()};
RETURN_NOT_OK(st);

if (*non_overlapping_ranges || !subarray_.is_set() ||
subarray_.range_num() == 1) {
strategy_ = tdb_unique_ptr<IQueryStrategy>(tdb_new(
SparseGlobalOrderReader<uint8_t>,
stats_->create_child("Reader"),
logger_,
storage_manager_,
array_,
config_,
buffers_,
subarray_,
layout_,
condition_,
consolidation_with_timestamps_));
} else {
strategy_ = tdb_unique_ptr<IQueryStrategy>(tdb_new(
SparseGlobalOrderReader<uint64_t>,
stats_->create_child("Reader"),
logger_,
storage_manager_,
array_,
config_,
buffers_,
subarray_,
layout_,
condition_,
consolidation_with_timestamps_));
}
} else if (use_refactored_dense_reader() && array_schema_->dense()) {
bool all_dense = true;
for (auto& frag_md : fragment_metadata_)
Expand Down

0 comments on commit 9252f1f

Please sign in to comment.