Skip to content

Commit

Permalink
Optimize sparse reads when full non empty domain is requested. (#4710)
Browse files Browse the repository at this point in the history
This change optimizes sparse reads where the only range set
covers the non-empty domain of the fragments to be read. This is the
case for Python.

---
TYPE: IMPROVEMENT
DESC: Optimize sparse reads when full non empty domain is requested.
  • Loading branch information
KiterLuc committed Feb 9, 2024
1 parent 2f03f0c commit 437340d
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 3 deletions.
1 change: 1 addition & 0 deletions tiledb/sm/array/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,7 @@ class Array {
* has not been computed or loaded it will be loaded first
*/
const NDRange non_empty_domain();

/**
* Retrieves the array metadata object that is already loaded. If it's not yet
* loaded it will be empty.
Expand Down
2 changes: 2 additions & 0 deletions tiledb/sm/query/readers/sparse_global_order_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ Status SparseGlobalOrderReader<BitmapType>::dowork() {
auto timer_se = stats_->start_timer("dowork");
stats_->add_counter("loop_num", 1);

subarray_.reset_default_ranges();

// Check that the query condition is valid.
if (condition_.has_value()) {
throw_if_not_ok(condition_->check(array_schema_));
Expand Down
7 changes: 4 additions & 3 deletions tiledb/sm/query/readers/sparse_unordered_with_dups_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,13 @@ void SparseUnorderedWithDupsReader<BitmapType>::refresh_config() {

template <class BitmapType>
Status SparseUnorderedWithDupsReader<BitmapType>::dowork() {
// Subarray is not known to be explicitly set until buffers are deserialized
include_coords_ = subarray_.is_set();

auto timer_se = stats_->start_timer("dowork");
stats_->add_counter("loop_num", 1);

// Subarray is not known to be explicitly set until buffers are deserialized
subarray_.reset_default_ranges();
include_coords_ = subarray_.is_set();

// Make sure user didn't request delete timestamps.
if (buffers_.count(constants::delete_timestamps) != 0) {
return logger_->status(Status_SparseUnorderedWithDupsReaderError(
Expand Down
23 changes: 23 additions & 0 deletions tiledb/sm/subarray/subarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2131,6 +2131,29 @@ void Subarray::add_default_label_ranges(dimension_size_type dim_num) {
label_range_subset_.resize(dim_num, nullopt);
}

/**
 * Resets a dimension back to its default range when the single range set on
 * it is equal to the array's non empty domain for that dimension.
 *
 * Nothing is done unless the array reports that its non empty domain has
 * already been computed. Dimensions that are already default, or that carry
 * zero or more than one range, are left untouched.
 */
void Subarray::reset_default_ranges() {
  // Without a computed non empty domain there is nothing to compare against.
  if (!array_->non_empty_domain_computed()) {
    return;
  }

  auto dim_num = array_->array_schema_latest().dim_num();
  auto& domain{array_->array_schema_latest().domain()};

  // Fetch the non empty domain once and keep it alive for the whole loop.
  // Binding the full result to a const reference is safe whether the accessor
  // returns by value (the temporary's lifetime is extended) or by reference;
  // binding a reference to a single element of a by-value result would leave
  // it dangling as soon as the statement ends.
  const auto& ned = array_->non_empty_domain();

  // Process all dimensions one by one.
  for (unsigned d = 0; d < dim_num; d++) {
    // Only consider dimensions with exactly one explicitly set range.
    if (is_default_[d] || range_subset_[d].num_ranges() != 1) {
      continue;
    }

    // If the range set is the same as the non empty domain.
    if (ned[d] == range_subset_[d][0]) {
      // Reset the default flag and reset the range subset to be default.
      is_default_[d] = true;
      auto dim{domain.dimension_ptr(d)};
      range_subset_[d] = RangeSetAndSuperset(
          dim->type(), dim->domain(), true, coalesce_ranges_);
    }
  }
}

void Subarray::compute_range_offsets() {
range_offsets_.clear();

Expand Down
11 changes: 11 additions & 0 deletions tiledb/sm/subarray/subarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -1306,6 +1306,17 @@ class Subarray {
*/
void add_default_label_ranges(dimension_size_type dim_num);

/**
 * Reset ranges to default, where possible, before a read operation for sparse
 * reads. The sparse readers have many optimizations that apply only when no
 * ranges are specified. Python sets ranges that are equal to the non empty
 * domain, which negates those optimizations. When the non empty domain has
 * already been computed for the array, it is cheap to check whether the
 * ranges set are equal to the non empty domain. For each dimension that has
 * exactly one range set and whose range equals the non empty domain on that
 * dimension, the range is reset to be default.
 */
void reset_default_ranges();

private:
/* ********************************* */
/* PRIVATE DATA TYPES */
Expand Down

0 comments on commit 437340d

Please sign in to comment.