Skip to content

Commit

Permalink
Rewrite Dimension::overlap_ratio to remove overflow, underflow, and…
Browse files Browse the repository at this point in the history
… divide by zero (#2304) (#2349)

* Add tests for overflow defects in overlap_ratio.
Minor (non-fix) changes to overlap_ratio. Most importantly, add an assert() to verify the postcondition.
Add helpers-dimension.h with some utility functions.
Fix an iteration error in a caller of overlap_ratio. Fix typos.

* Rewrote Dimension::overlap_ratio to pass all the tests it was failing before.
Added some test tags.
Reverted a change in subarray.cc

* clang format

* More clang-format

* Use the limits function that returns the actual minimum element for floating point, not the minimum-positive element.

* Fix Hilbert test now that size estimate is correct

Now that the overlap ratio is correct, we get proper size estimation
and the partitioner doesn't do an unnecessary split of the results
anymore.

* Comment changes.
clang-format

Co-authored-by: Luc Rancourt <lucrancourt@gmail.com>

Co-authored-by: eric-hughes-tiledb <82400964+eric-hughes-tiledb@users.noreply.github.com>
Co-authored-by: Luc Rancourt <lucrancourt@gmail.com>
  • Loading branch information
3 people committed Jun 14, 2021
1 parent 0b00402 commit 4a0d431
Show file tree
Hide file tree
Showing 6 changed files with 286 additions and 55 deletions.
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Expand Up @@ -98,6 +98,7 @@ list(APPEND TILEDB_CORE_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../tiledb/sm/c_
set(TILEDB_TEST_SOURCES
src/helpers.h
src/helpers.cc
src/helpers-dimension.h
src/unit-azure.cc
src/unit-backwards_compat.cc
src/unit-buffer.cc
Expand Down
107 changes: 107 additions & 0 deletions test/src/helpers-dimension.h
@@ -0,0 +1,107 @@
/**
* @file helpers-dimension.h
*
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2017-2021 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* @section DESCRIPTION
*
* Helpers for tests involving dimensions.
*/

#include "tiledb/sm/enums/datatype.h"
#include "tiledb/sm/misc/types.h"

#ifndef TILEDB_HELPERS_DIMENSION_H
#define TILEDB_HELPERS_DIMENSION_H

namespace tiledb {
namespace test {

/**
* A typed version of the standard range class. Constructs Range objects without
* requiring the caller to know anything about its data structures.
*
* @tparam T
*/
template <class T>
class TypedRange : public tiledb::sm::Range {
public:
/**
* Construct a Range from a single interval.
* @param low Lower bound of interval
* @param high Upper bound of interval
* @pre low <= high
*/
TypedRange(T low, T high) {
T x[2] = {low, high};
set_range(&x[0], 2 * sizeof(T));
/* Commentary: There's no way of initializing a Range without copying from
* existing memory. As a result any value-initializing constructor first
* needs to build an array and then copy it.
*/
}
};

using Datatype = tiledb::sm::Datatype;

template <class T>
struct RangeTraits {
static Datatype datatype;
};

template <>
struct RangeTraits<int32_t> {
static constexpr Datatype datatype = Datatype::INT32;
};

template <>
struct RangeTraits<int64_t> {
static constexpr Datatype datatype = Datatype::INT64;
};

template <>
struct RangeTraits<uint32_t> {
static constexpr Datatype datatype = Datatype::UINT32;
};

template <>
struct RangeTraits<uint64_t> {
static constexpr Datatype datatype = Datatype::UINT64;
};

template <>
struct RangeTraits<float> {
static constexpr Datatype datatype = Datatype::FLOAT32;
};

template <>
struct RangeTraits<double> {
static constexpr Datatype datatype = Datatype::FLOAT64;
};

} // namespace test
} // namespace tiledb

#endif // TILEDB_HELPERS_DIMENSION_H
42 changes: 8 additions & 34 deletions test/src/unit-cppapi-hilbert.cc
Expand Up @@ -1250,23 +1250,10 @@ TEST_CASE(

// Check results
CHECK(query_r.query_status() == Query::Status::INCOMPLETE);
CHECK(query_r.result_buffer_elements()["a"].second == 1);
std::vector<int32_t> c_buff_a = {3};
std::vector<float> c_buff_d1 = {0.1f};
std::vector<float> c_buff_d2 = {0.1f};
CHECK(r_buff_a[0] == c_buff_a[0]);
CHECK(r_buff_d1[0] == c_buff_d1[0]);
CHECK(r_buff_d2[0] == c_buff_d2[0]);

// Read again
CHECK_NOTHROW(query_r.submit());

// Check results again
CHECK(query_r.query_status() == Query::Status::INCOMPLETE);
CHECK(query_r.result_buffer_elements()["a"].second == 1);
c_buff_a = {2};
c_buff_d1 = {0.1f};
c_buff_d2 = {0.3f};
CHECK(query_r.result_buffer_elements()["a"].second == 2);
std::vector<int32_t> c_buff_a = {3, 2};
std::vector<float> c_buff_d1 = {0.1f, 0.1f};
std::vector<float> c_buff_d2 = {0.1f, 0.3f};
CHECK(r_buff_a[0] == c_buff_a[0]);
CHECK(r_buff_d1[0] == c_buff_d1[0]);
CHECK(r_buff_d2[0] == c_buff_d2[0]);
Expand Down Expand Up @@ -1308,23 +1295,10 @@ TEST_CASE(

// Check results
CHECK(query_r.query_status() == Query::Status::INCOMPLETE);
CHECK(query_r.result_buffer_elements()["a"].second == 1);
std::vector<int32_t> c_buff_a = {3};
std::vector<float> c_buff_d1 = {0.1f};
std::vector<float> c_buff_d2 = {0.1f};
CHECK(r_buff_a[0] == c_buff_a[0]);
CHECK(r_buff_d1[0] == c_buff_d1[0]);
CHECK(r_buff_d2[0] == c_buff_d2[0]);

// Read again
CHECK_NOTHROW(query_r.submit());

// Check results again
CHECK(query_r.query_status() == Query::Status::INCOMPLETE);
CHECK(query_r.result_buffer_elements()["a"].second == 1);
c_buff_a = {2};
c_buff_d1 = {0.1f};
c_buff_d2 = {0.3f};
CHECK(query_r.result_buffer_elements()["a"].second == 2);
std::vector<int32_t> c_buff_a = {3, 2};
std::vector<float> c_buff_d1 = {0.1f, 0.1f};
std::vector<float> c_buff_d2 = {0.1f, 0.3f};
CHECK(r_buff_a[0] == c_buff_a[0]);
CHECK(r_buff_d1[0] == c_buff_d1[0]);
CHECK(r_buff_d2[0] == c_buff_d2[0]);
Expand Down
86 changes: 86 additions & 0 deletions test/src/unit-dimension.cc
Expand Up @@ -30,6 +30,7 @@
* Tests the `Dimension` class.
*/

#include "test/src/helpers-dimension.h"
#include "tiledb/sm/array_schema/dimension.h"
#include "tiledb/sm/enums/datatype.h"
#include "tiledb/sm/misc/hilbert.h"
Expand All @@ -40,6 +41,7 @@
using namespace tiledb;
using namespace tiledb::common;
using namespace tiledb::sm;
using namespace tiledb::test;

TEST_CASE(
"Dimension: Test map_to_uint64, integers",
Expand Down Expand Up @@ -603,3 +605,87 @@ TEST_CASE(
val_str = std::string((const char*)(&val[0]), 4);
CHECK(val_str == std::string("yell", 4));
}

template <class T>
double basic_verify_overlap_ratio(
T range1_low, T range1_high, T range2_low, T range2_high) {
auto r1 = TypedRange<T>(range1_low, range1_high);
auto r2 = TypedRange<T>(range2_low, range2_high);
Dimension d("foo", RangeTraits<T>::datatype);
auto ratio = d.overlap_ratio(r1, r2);
CHECK(0.0 <= ratio);
CHECK(ratio <= 1.0);
return ratio;
}

/**
* The denominator of the ratio is computed as range2_high - range2_low. For a
* k-bit signed integer, the largest this value can take is 2^k-1, which is
* larger than the maximum signed value of 2^(k-1)-1.
*/
TEST_CASE(
"Range<int32_t>.overlap_ratio: denominator overflow",
"[dimension][overlap_ratio][signed][integral]") {
typedef int32_t T;
auto min = std::numeric_limits<T>::min();
auto max = std::numeric_limits<T>::max();
basic_verify_overlap_ratio<T>(min / 3, 1, min + 1, max);
}

/**
* Denominator overflow for an unsigned integral type.
*/
TEST_CASE(
"Range<uint32_t>.overlap_ratio: denominator overflow",
"[dimension][overlap_ratio][unsigned][integral]") {
typedef uint32_t T;
auto min = std::numeric_limits<T>::min();
auto max = std::numeric_limits<T>::max();
basic_verify_overlap_ratio<T>(0, 1, min + 1, max);
}

/**
* Denominator overflow for double.
*/
TEST_CASE(
"Range<double>.overlap_ratio: denominator overflow",
"[dimension][overlap_ratio][signed][floating]") {
typedef double T;
auto min = std::numeric_limits<T>::min();
auto max = std::numeric_limits<T>::max();
basic_verify_overlap_ratio<T>(0, 1, min + 1, max);
}

TEST_CASE(
"Range<uint32_t>.overlap_ratio: max base range",
"[dimension][overlap_ratio][unsigned][integral]") {
typedef uint32_t T;
auto min = std::numeric_limits<T>::min();
auto max = std::numeric_limits<T>::max();
basic_verify_overlap_ratio<T>(min, min + 1, min, max);
}

TEST_CASE(
"Range<uint32_t>.overlap_ratio: max both ranges",
"[dimension][overlap_ratio][unsigned][integral]") {
typedef uint32_t T;
auto min = std::numeric_limits<T>::min();
auto max = std::numeric_limits<T>::max();
basic_verify_overlap_ratio<T>(min, max, min, max);
}

TEST_CASE(
"Range<int32_t>.overlap_ratio: over-by-1 legit-max base range",
"[dimension][overlap_ratio][signed][integral]") {
typedef int32_t T;
auto max = std::numeric_limits<T>::max();
basic_verify_overlap_ratio<T>(0, 1, -1, max);
}

TEST_CASE(
"Range<int32_t>.overlap_ratio: legit-max base range",
"[dimension][overlap_ratio][signed][integral]") {
typedef int32_t T;
auto max = std::numeric_limits<T>::max();
basic_verify_overlap_ratio<T>(0, 1, -2, max - 2);
}
101 changes: 82 additions & 19 deletions tiledb/sm/array_schema/dimension.cc
Expand Up @@ -734,35 +734,98 @@ double Dimension::overlap_ratio<char>(const Range& r1, const Range& r2) {

template <class T>
double Dimension::overlap_ratio(const Range& r1, const Range& r2) {
// Verify that we have two intervals
assert(!r1.empty());
assert(!r2.empty());

auto d1 = (const T*)r1.data();
auto d2 = (const T*)r2.data();
assert(d1[0] <= d1[1]);
assert(d2[0] <= d2[1]);

// No overlap
if (d1[0] > d2[1] || d1[1] < d2[0])
const auto r1_low = d1[0];
const auto r1_high = d1[1];
auto r2_low = d2[0];
auto r2_high = d2[1];
assert(r1_low <= r1_high);
assert(r2_low <= r2_high);

// Special case: No overlap, intervals are disjoint
if (r1_low > r2_high || r1_high < r2_low)
return 0.0;
// Special case: All overlap, interval r2 is a subset of interval r1
if (r1_low <= r2_low && r2_high <= r1_high)
return 1.0;
/*
* At this point we know that r2_low < r2_high, because we would have returned
* by now otherwise. Note that for floating point types, however, we cannot
* conclude that r2_high - r2_low > 0 because of the possibility of underflow.
*/
auto intersection_low = std::max(r1_low, r2_low);
auto intersection_high = std::min(r1_high, r2_high);
/*
* The intersection is a proper subset of interval r1, either the upper bounds
* are distinct, or lower bounds are distinct, or both. This means that any
* result has to be strictly greater than zero and strictly less than 1.
*/
/*
* Guard against overflow. If the outer interval has a bound with an absolute
* value that's "too large", meaning that it might lead to an overflow, we
* apply a shrinking transformation that preserves the ratio. We only need to
* check the outer interval because the intersection is strictly smaller. For
* unsigned types we only need to check the upper bound, since the lower bound
* is zero. For signed types we need to check both.
*/
const T safe_upper = std::nextafter(std::numeric_limits<T>::max() / 2, 0);
bool unsafe = safe_upper < r2_high;
if (std::numeric_limits<T>::is_signed) {
const T safe_lower =
std::nextafter(std::numeric_limits<T>::lowest() / 2, 0);
unsafe = unsafe || r2_low < safe_lower;
}
if (unsafe) {
r2_low /= 2;
r2_high /= 2;
intersection_low /= 2;
intersection_high /= 2;
}

// Compute ratio
auto overlap_start = std::max(d1[0], d2[0]);
auto overlap_end = std::min(d1[1], d2[1]);
auto overlap_range = overlap_end - overlap_start;
auto mbr_range = d2[1] - d2[0];
auto max = std::numeric_limits<double>::max();
auto numerator = intersection_high - intersection_low;
auto denominator = r2_high - r2_low;
if (std::numeric_limits<T>::is_integer) {
overlap_range += 1;
mbr_range += 1;
// integer types count elements; they don't measure length
numerator += 1;
denominator += 1;
} else {
if (overlap_range == 0)
overlap_range = std::nextafter(overlap_range, max);
if (mbr_range == 0)
mbr_range = std::nextafter(mbr_range, max);
if (denominator == 0) {
/*
* If the variable `denominator` is zero, it's the result of rounding,
* since checking the "disjoint" and "subset" cases above has ensured that
* the endpoints of the denominator interval are distinct. Thus we have no
* easy-to-access information about the true quotient value, but
* mathematically it must be between 0 and 1, so we need either to return
* some value between 0 and 1 or to throw an exception stating that we
* don't support this case.
*
* The variable `denominator` is larger than the variable `numerator`, so
* the numerator is also be zero. We could extract some information from
* the floating-point representation itself, but that's a lot of work that
* doesn't seem justified at present. Throwing an exception would be
* better on principle, but the code in its current state now would not
* deal with that gracefully. As a compromise, therefore, we return a
* valid but non-computed value. It's a defect, but one that will rarely
* if ever arise in practice.
*/
return 0.5;
}
// The rounded-to-zero numerator is handled in the general case below.
}

return (double)overlap_range / mbr_range;
auto ratio = (double)numerator / denominator;
// Round away from the endpoints if needed.
if (ratio == 0.0) {
ratio = std::nextafter(0, 1);
} else if (ratio == 1.0) {
ratio = std::nextafter(1, 0);
}
assert(0.0 < ratio && ratio < 1.0);
return ratio;
}

double Dimension::overlap_ratio(const Range& r1, const Range& r2) const {
Expand Down

0 comments on commit 4a0d431

Please sign in to comment.