Rewrite Dimension::overlap_ratio to remove overflow, underflow, and…

… divide by zero (#2304) (#2349) * Add tests for overflow defects in overlap_ratio. Minor (non-fix) changes to overlap_ratio. Most importantly, add an assert() to verify the postcondition. Add helpers-dimension.h with some utility functions. Fix an iteration error in a caller of overlap_ratio. Fix typos. * Rewrote Dimension::overlap_ratio to pass all the tests it was failing before. Added some test tags. Reverted a change in subarray.cc * clang format * More clang-format * Use the limits function that returns the actual minimum element for floating point, not the minimum-positive element. * Fix Hilbert test now that size estimate is correct Now that the overlap ratio is correct, we get proper size estimation and the partitioner doesn't do an unnecessary split of the results anymore. * Comment changes. clang-format Co-authored-by: Luc Rancourt <lucrancourt@gmail.com> Co-authored-by: eric-hughes-tiledb <82400964+eric-hughes-tiledb@users.noreply.github.com> Co-authored-by: Luc Rancourt <lucrancourt@gmail.com>
TileDB-Inc · Jun 14, 2021 · 4a0d431 · 4a0d431
1 parent 0b00402
commit 4a0d431
Show file tree

Hide file tree

Showing 6 changed files with 286 additions and 55 deletions.
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -98,6 +98,7 @@ list(APPEND TILEDB_CORE_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../tiledb/sm/c_
 set(TILEDB_TEST_SOURCES
   src/helpers.h
   src/helpers.cc
+  src/helpers-dimension.h
   src/unit-azure.cc
   src/unit-backwards_compat.cc
   src/unit-buffer.cc

diff --git a/test/src/helpers-dimension.h b/test/src/helpers-dimension.h
@@ -0,0 +1,107 @@
+/**
+ * @file   helpers-dimension.h
+ *
+ * @section LICENSE
+ *
+ * The MIT License
+ *
+ * @copyright Copyright (c) 2017-2021 TileDB, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * @section DESCRIPTION
+ *
+ * Helpers for tests involving dimensions.
+ */
+
+#include "tiledb/sm/enums/datatype.h"
+#include "tiledb/sm/misc/types.h"
+
+#ifndef TILEDB_HELPERS_DIMENSION_H
+#define TILEDB_HELPERS_DIMENSION_H
+
+namespace tiledb {
+namespace test {
+
+/**
+ * A typed version of the standard range class. Constructs Range objects without
+ * requiring the caller to know anything about its data structures.
+ *
+ * @tparam T
+ */
+template <class T>
+class TypedRange : public tiledb::sm::Range {
+ public:
+  /**
+   * Construct a Range from a single interval.
+   * @param low Lower bound of interval
+   * @param high Upper bound of interval
+   * @pre low <= high
+   */
+  TypedRange(T low, T high) {
+    T x[2] = {low, high};
+    set_range(&x[0], 2 * sizeof(T));
+    /* Commentary: There's no way of initializing a Range without copying from
+     * existing memory. As a result any value-initializing constructor first
+     * needs to build an array and then copy it.
+     */
+  }
+};
+
+using Datatype = tiledb::sm::Datatype;
+
+template <class T>
+struct RangeTraits {
+  static Datatype datatype;
+};
+
+template <>
+struct RangeTraits<int32_t> {
+  static constexpr Datatype datatype = Datatype::INT32;
+};
+
+template <>
+struct RangeTraits<int64_t> {
+  static constexpr Datatype datatype = Datatype::INT64;
+};
+
+template <>
+struct RangeTraits<uint32_t> {
+  static constexpr Datatype datatype = Datatype::UINT32;
+};
+
+template <>
+struct RangeTraits<uint64_t> {
+  static constexpr Datatype datatype = Datatype::UINT64;
+};
+
+template <>
+struct RangeTraits<float> {
+  static constexpr Datatype datatype = Datatype::FLOAT32;
+};
+
+template <>
+struct RangeTraits<double> {
+  static constexpr Datatype datatype = Datatype::FLOAT64;
+};
+
+}  // namespace test
+}  // namespace tiledb
+
+#endif  // TILEDB_HELPERS_DIMENSION_H
diff --git a/test/src/unit-cppapi-hilbert.cc b/test/src/unit-cppapi-hilbert.cc
@@ -1250,23 +1250,10 @@ TEST_CASE(
 
     // Check results
     CHECK(query_r.query_status() == Query::Status::INCOMPLETE);
-    CHECK(query_r.result_buffer_elements()["a"].second == 1);
-    std::vector<int32_t> c_buff_a = {3};
-    std::vector<float> c_buff_d1 = {0.1f};
-    std::vector<float> c_buff_d2 = {0.1f};
-    CHECK(r_buff_a[0] == c_buff_a[0]);
-    CHECK(r_buff_d1[0] == c_buff_d1[0]);
-    CHECK(r_buff_d2[0] == c_buff_d2[0]);
-
-    // Read again
-    CHECK_NOTHROW(query_r.submit());
-
-    // Check results again
-    CHECK(query_r.query_status() == Query::Status::INCOMPLETE);
-    CHECK(query_r.result_buffer_elements()["a"].second == 1);
-    c_buff_a = {2};
-    c_buff_d1 = {0.1f};
-    c_buff_d2 = {0.3f};
+    CHECK(query_r.result_buffer_elements()["a"].second == 2);
+    std::vector<int32_t> c_buff_a = {3, 2};
+    std::vector<float> c_buff_d1 = {0.1f, 0.1f};
+    std::vector<float> c_buff_d2 = {0.1f, 0.3f};
     CHECK(r_buff_a[0] == c_buff_a[0]);
     CHECK(r_buff_d1[0] == c_buff_d1[0]);
     CHECK(r_buff_d2[0] == c_buff_d2[0]);
@@ -1308,23 +1295,10 @@ TEST_CASE(
 
     // Check results
     CHECK(query_r.query_status() == Query::Status::INCOMPLETE);
-    CHECK(query_r.result_buffer_elements()["a"].second == 1);
-    std::vector<int32_t> c_buff_a = {3};
-    std::vector<float> c_buff_d1 = {0.1f};
-    std::vector<float> c_buff_d2 = {0.1f};
-    CHECK(r_buff_a[0] == c_buff_a[0]);
-    CHECK(r_buff_d1[0] == c_buff_d1[0]);
-    CHECK(r_buff_d2[0] == c_buff_d2[0]);
-
-    // Read again
-    CHECK_NOTHROW(query_r.submit());
-
-    // Check results again
-    CHECK(query_r.query_status() == Query::Status::INCOMPLETE);
-    CHECK(query_r.result_buffer_elements()["a"].second == 1);
-    c_buff_a = {2};
-    c_buff_d1 = {0.1f};
-    c_buff_d2 = {0.3f};
+    CHECK(query_r.result_buffer_elements()["a"].second == 2);
+    std::vector<int32_t> c_buff_a = {3, 2};
+    std::vector<float> c_buff_d1 = {0.1f, 0.1f};
+    std::vector<float> c_buff_d2 = {0.1f, 0.3f};
     CHECK(r_buff_a[0] == c_buff_a[0]);
     CHECK(r_buff_d1[0] == c_buff_d1[0]);
     CHECK(r_buff_d2[0] == c_buff_d2[0]);

diff --git a/test/src/unit-dimension.cc b/test/src/unit-dimension.cc
@@ -30,6 +30,7 @@
  * Tests the `Dimension` class.
  */
 
+#include "test/src/helpers-dimension.h"
 #include "tiledb/sm/array_schema/dimension.h"
 #include "tiledb/sm/enums/datatype.h"
 #include "tiledb/sm/misc/hilbert.h"
@@ -40,6 +41,7 @@
 using namespace tiledb;
 using namespace tiledb::common;
 using namespace tiledb::sm;
+using namespace tiledb::test;
 
 TEST_CASE(
     "Dimension: Test map_to_uint64, integers",
@@ -603,3 +605,87 @@ TEST_CASE(
   val_str = std::string((const char*)(&val[0]), 4);
   CHECK(val_str == std::string("yell", 4));
 }
+
+template <class T>
+double basic_verify_overlap_ratio(
+    T range1_low, T range1_high, T range2_low, T range2_high) {
+  auto r1 = TypedRange<T>(range1_low, range1_high);
+  auto r2 = TypedRange<T>(range2_low, range2_high);
+  Dimension d("foo", RangeTraits<T>::datatype);
+  auto ratio = d.overlap_ratio(r1, r2);
+  CHECK(0.0 <= ratio);
+  CHECK(ratio <= 1.0);
+  return ratio;
+}
+
+/**
+ * The denominator of the ratio is computed as range2_high - range2_low. For a
+ * k-bit signed integer, the largest this value can take is 2^k-1, which is
+ * larger than the maximum signed value of 2^(k-1)-1.
+ */
+TEST_CASE(
+    "Range<int32_t>.overlap_ratio: denominator overflow",
+    "[dimension][overlap_ratio][signed][integral]") {
+  typedef int32_t T;
+  auto min = std::numeric_limits<T>::min();
+  auto max = std::numeric_limits<T>::max();
+  basic_verify_overlap_ratio<T>(min / 3, 1, min + 1, max);
+}
+
+/**
+ * Denominator overflow for an unsigned integral type.
+ */
+TEST_CASE(
+    "Range<uint32_t>.overlap_ratio: denominator overflow",
+    "[dimension][overlap_ratio][unsigned][integral]") {
+  typedef uint32_t T;
+  auto min = std::numeric_limits<T>::min();
+  auto max = std::numeric_limits<T>::max();
+  basic_verify_overlap_ratio<T>(0, 1, min + 1, max);
+}
+
+/**
+ * Denominator overflow for double.
+ */
+TEST_CASE(
+    "Range<double>.overlap_ratio: denominator overflow",
+    "[dimension][overlap_ratio][signed][floating]") {
+  typedef double T;
+  auto min = std::numeric_limits<T>::min();
+  auto max = std::numeric_limits<T>::max();
+  basic_verify_overlap_ratio<T>(0, 1, min + 1, max);
+}
+
+TEST_CASE(
+    "Range<uint32_t>.overlap_ratio: max base range",
+    "[dimension][overlap_ratio][unsigned][integral]") {
+  typedef uint32_t T;
+  auto min = std::numeric_limits<T>::min();
+  auto max = std::numeric_limits<T>::max();
+  basic_verify_overlap_ratio<T>(min, min + 1, min, max);
+}
+
+TEST_CASE(
+    "Range<uint32_t>.overlap_ratio: max both ranges",
+    "[dimension][overlap_ratio][unsigned][integral]") {
+  typedef uint32_t T;
+  auto min = std::numeric_limits<T>::min();
+  auto max = std::numeric_limits<T>::max();
+  basic_verify_overlap_ratio<T>(min, max, min, max);
+}
+
+TEST_CASE(
+    "Range<int32_t>.overlap_ratio: over-by-1 legit-max base range",
+    "[dimension][overlap_ratio][signed][integral]") {
+  typedef int32_t T;
+  auto max = std::numeric_limits<T>::max();
+  basic_verify_overlap_ratio<T>(0, 1, -1, max);
+}
+
+TEST_CASE(
+    "Range<int32_t>.overlap_ratio: legit-max base range",
+    "[dimension][overlap_ratio][signed][integral]") {
+  typedef int32_t T;
+  auto max = std::numeric_limits<T>::max();
+  basic_verify_overlap_ratio<T>(0, 1, -2, max - 2);
+}
diff --git a/tiledb/sm/array_schema/dimension.cc b/tiledb/sm/array_schema/dimension.cc
@@ -734,35 +734,98 @@ double Dimension::overlap_ratio<char>(const Range& r1, const Range& r2) {
 
 template <class T>
 double Dimension::overlap_ratio(const Range& r1, const Range& r2) {
+  // Verify that we have two intervals
   assert(!r1.empty());
   assert(!r2.empty());
-
   auto d1 = (const T*)r1.data();
   auto d2 = (const T*)r2.data();
-  assert(d1[0] <= d1[1]);
-  assert(d2[0] <= d2[1]);
-
-  // No overlap
-  if (d1[0] > d2[1] || d1[1] < d2[0])
+  const auto r1_low = d1[0];
+  const auto r1_high = d1[1];
+  auto r2_low = d2[0];
+  auto r2_high = d2[1];
+  assert(r1_low <= r1_high);
+  assert(r2_low <= r2_high);
+
+  // Special case: No overlap, intervals are disjoint
+  if (r1_low > r2_high || r1_high < r2_low)
     return 0.0;
+  // Special case: All overlap, interval r2 is a subset of interval r1
+  if (r1_low <= r2_low && r2_high <= r1_high)
+    return 1.0;
+  /*
+   * At this point we know that r2_low < r2_high, because we would have returned
+   * by now otherwise. Note that for floating point types, however, we cannot
+   * conclude that r2_high - r2_low > 0 because of the possibility of underflow.
+   */
+  auto intersection_low = std::max(r1_low, r2_low);
+  auto intersection_high = std::min(r1_high, r2_high);
+  /*
+   * The intersection is a proper subset of interval r1, either the upper bounds
+   * are distinct, or lower bounds are distinct, or both. This means that any
+   * result has to be strictly greater than zero and strictly less than 1.
+   */
+  /*
+   * Guard against overflow. If the outer interval has a bound with an absolute
+   * value that's "too large", meaning that it might lead to an overflow, we
+   * apply a shrinking transformation that preserves the ratio. We only need to
+   * check the outer interval because the intersection is strictly smaller. For
+   * unsigned types we only need to check the upper bound, since the lower bound
+   * is zero. For signed types we need to check both.
+   */
+  const T safe_upper = std::nextafter(std::numeric_limits<T>::max() / 2, 0);
+  bool unsafe = safe_upper < r2_high;
+  if (std::numeric_limits<T>::is_signed) {
+    const T safe_lower =
+        std::nextafter(std::numeric_limits<T>::lowest() / 2, 0);
+    unsafe = unsafe || r2_low < safe_lower;
+  }
+  if (unsafe) {
+    r2_low /= 2;
+    r2_high /= 2;
+    intersection_low /= 2;
+    intersection_high /= 2;
+  }
 
   // Compute ratio
-  auto overlap_start = std::max(d1[0], d2[0]);
-  auto overlap_end = std::min(d1[1], d2[1]);
-  auto overlap_range = overlap_end - overlap_start;
-  auto mbr_range = d2[1] - d2[0];
-  auto max = std::numeric_limits<double>::max();
+  auto numerator = intersection_high - intersection_low;
+  auto denominator = r2_high - r2_low;
   if (std::numeric_limits<T>::is_integer) {
-    overlap_range += 1;
-    mbr_range += 1;
+    // integer types count elements; they don't measure length
+    numerator += 1;
+    denominator += 1;
   } else {
-    if (overlap_range == 0)
-      overlap_range = std::nextafter(overlap_range, max);
-    if (mbr_range == 0)
-      mbr_range = std::nextafter(mbr_range, max);
+    if (denominator == 0) {
+      /*
+       * If the variable `denominator` is zero, it's the result of rounding,
+       * since checking the "disjoint" and "subset" cases above has ensured that
+       * the endpoints of the denominator interval are distinct. Thus we have no
+       * easy-to-access information about the true quotient value, but
+       * mathematically it must be between 0 and 1, so we need either to return
+       * some value between 0 and 1 or to throw an exception stating that we
+       * don't support this case.
+       *
+       * The variable `denominator` is larger than the variable `numerator`, so
+       * the numerator is also be zero. We could extract some information from
+       * the floating-point representation itself, but that's a lot of work that
+       * doesn't seem justified at present. Throwing an exception would be
+       * better on principle, but the code in its current state now would not
+       * deal with that gracefully. As a compromise, therefore, we return a
+       * valid but non-computed value. It's a defect, but one that will rarely
+       * if ever arise in practice.
+       */
+      return 0.5;
+    }
+    // The rounded-to-zero numerator is handled in the general case below.
   }
-
-  return (double)overlap_range / mbr_range;
+  auto ratio = (double)numerator / denominator;
+  // Round away from the endpoints if needed.
+  if (ratio == 0.0) {
+    ratio = std::nextafter(0, 1);
+  } else if (ratio == 1.0) {
+    ratio = std::nextafter(1, 0);
+  }
+  assert(0.0 < ratio && ratio < 1.0);
+  return ratio;
 }
 
 double Dimension::overlap_ratio(const Range& r1, const Range& r2) const {