Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport release-2.20] Optimize sparse reads when full non empty domain is requested. #4715

Merged
merged 1 commit into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions tiledb/sm/array/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,7 @@ class Array {
* has not been computed or loaded it will be loaded first
*/
const NDRange non_empty_domain();

/**
* Retrieves the array metadata object that is already loaded. If it's not yet
* loaded it will be empty.
Expand Down
2 changes: 2 additions & 0 deletions tiledb/sm/query/readers/sparse_global_order_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ Status SparseGlobalOrderReader<BitmapType>::dowork() {
auto timer_se = stats_->start_timer("dowork");
stats_->add_counter("loop_num", 1);

subarray_.reset_default_ranges();

// Check that the query condition is valid.
if (condition_.has_value()) {
throw_if_not_ok(condition_->check(array_schema_));
Expand Down
7 changes: 4 additions & 3 deletions tiledb/sm/query/readers/sparse_unordered_with_dups_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,13 @@ void SparseUnorderedWithDupsReader<BitmapType>::refresh_config() {

template <class BitmapType>
Status SparseUnorderedWithDupsReader<BitmapType>::dowork() {
// Subarray is not known to be explicitly set until buffers are deserialized
include_coords_ = subarray_.is_set();

auto timer_se = stats_->start_timer("dowork");
stats_->add_counter("loop_num", 1);

// Subarray is not known to be explicitly set until buffers are deserialized
subarray_.reset_default_ranges();
include_coords_ = subarray_.is_set();

// Make sure user didn't request delete timestamps.
if (buffers_.count(constants::delete_timestamps) != 0) {
return logger_->status(Status_SparseUnorderedWithDupsReaderError(
Expand Down
23 changes: 23 additions & 0 deletions tiledb/sm/subarray/subarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2131,6 +2131,29 @@ void Subarray::add_default_label_ranges(dimension_size_type dim_num) {
label_range_subset_.resize(dim_num, nullopt);
}

/**
 * Resets explicitly set ranges back to the default range when they are
 * equivalent to requesting the array's whole non-empty domain.
 *
 * This is a no-op unless the array reports that its non-empty domain has
 * already been computed. Otherwise every dimension that is not flagged as
 * default and holds exactly one range is compared against the non-empty
 * domain of that dimension; on a match the dimension is flagged as default
 * again and its range subset is rebuilt from the dimension's full domain.
 */
void Subarray::reset_default_ranges() {
  // Without a computed non-empty domain there is nothing to compare against.
  if (!array_->non_empty_domain_computed()) {
    return;
  }

  auto dim_num = array_->array_schema_latest().dim_num();
  auto& domain{array_->array_schema_latest().domain()};

  // Fetch the non-empty domain once, outside the loop. Binding the result to
  // a const reference keeps it alive for the whole loop even if the accessor
  // returns by value; taking a reference to `non_empty_domain()[d]` directly
  // would leave a dangling reference into a destroyed temporary in that case.
  const auto& ned = array_->non_empty_domain();

  // Process all dimensions one by one.
  for (unsigned d = 0; d < dim_num; d++) {
    // Only dimensions with exactly one explicitly set range are candidates.
    if (is_default_[d] || range_subset_[d].num_ranges() != 1) {
      continue;
    }

    // Leave the dimension alone unless its range is the non-empty domain.
    if (!(ned[d] == range_subset_[d][0])) {
      continue;
    }

    // Reset the default flag and reset the range subset to be default.
    is_default_[d] = true;
    auto dim{domain.dimension_ptr(d)};
    range_subset_[d] = RangeSetAndSuperset(
        dim->type(), dim->domain(), true, coalesce_ranges_);
  }
}

void Subarray::compute_range_offsets() {
range_offsets_.clear();

Expand Down
11 changes: 11 additions & 0 deletions tiledb/sm/subarray/subarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -1306,6 +1306,17 @@ class Subarray {
*/
void add_default_label_ranges(dimension_size_type dim_num);

/**
* Resets ranges to default, where possible, before a sparse read.
*
* The sparse readers have many optimizations that only apply when no ranges
* are specified. Python sets ranges that are equal to the non empty domain,
* which negates those optimizations. When the non empty domain has already
* been computed for the array, it is cheap to check whether the ranges that
* were set are equal to it. If they are, they can be reset to default.
*
* The check is done per dimension: only a dimension that is not already
* default and that holds exactly one range equal to the non empty domain of
* that dimension is reset. If the non empty domain has not been computed,
* this function does nothing.
*/
void reset_default_ranges();

private:
/* ********************************* */
/* PRIVATE DATA TYPES */
Expand Down