Include explict example inputs for denoise

Some branches of the code were not being hit - this should ensure good coverage. In the process, some errors were found and corrected related to parsing numbers with Python types.
SethMMorton · Nov 29, 2023 · fabe113 · fabe113
1 parent c6db7ff
commit fabe113
Show file tree

Hide file tree

Showing 5 changed files with 129 additions and 44 deletions.
diff --git a/include/fastnumbers/buffer.hpp b/include/fastnumbers/buffer.hpp
@@ -112,6 +112,24 @@ class Buffer {
         }
     }
 
+    /// Mark the first '.', 'e', or 'E' as '\0'.
+    void mark_integer_end() noexcept
+    {
+        // Technically, we set the exponent character to '\0'
+        // and then adjust the length. We are not actually removing data.
+        // Look for '.' or 'e'/'E'. Instead of searching simulataneously
+        // use memchr because it is so darn fast.
+        char* exp_loc = nullptr;
+        for (const char c : { '.', 'e', 'E' }) {
+            exp_loc = static_cast<char*>(std::memchr(m_buffer, c, m_len));
+            if (exp_loc != nullptr) {
+                *exp_loc = '\0';
+                m_len = m_size = exp_loc - m_buffer;
+                return;
+            }
+        }
+    }
+
     /// The largest amount of data the buffer can contain
     std::size_t max_size() const noexcept
     {

diff --git a/include/fastnumbers/c_str_parsing.hpp b/include/fastnumbers/c_str_parsing.hpp
@@ -109,7 +109,10 @@ class StringChecker {
     const char* integer_end() const { return m_decimal_start; }
 
     /// The length of the integer component of the contained number.
-    uint32_t integer_length() const { return integer_end() - integer_start(); }
+    uint32_t integer_length() const
+    {
+        return std::max(static_cast<uint32_t>(integer_end() - integer_start()), 0U);
+    }
 
     /// The number of zeros that trail the integer part of the contained number.
     uint32_t integer_trailing_zeros() const { return m_int_trailing_zeros; }
@@ -125,7 +128,10 @@ class StringChecker {
     const char* decimal_end() const { return m_decimal_end; }
 
     /// The length of the decimal component of the contained number.
-    uint32_t decimal_length() const { return decimal_end() - decimal_start(); }
+    uint32_t decimal_length() const
+    {
+        return std::max(static_cast<uint32_t>(decimal_end() - decimal_start()), 0U);
+    }
 
     /// The number of zeros that trail the decimal part of the contained number.
     uint32_t decimal_trailing_zeros() const { return m_dec_trailing_zeros; }
@@ -140,28 +146,34 @@ class StringChecker {
     bool is_exponent_negative() const { return m_exp_negative; }
 
     /// The total length of the integer plus decimal components.
-    uint32_t total_length() const { return integer_length() + decimal_length(); }
+    uint32_t digit_length() const { return integer_length() + decimal_length(); }
 
     /// The decimal length after removing trailing zeros.
     uint32_t truncated_decimal_length() const
     {
-        return decimal_length() - decimal_trailing_zeros();
+        return std::max(decimal_length() - decimal_trailing_zeros(), 0U);
     }
 
     /// The exponent after taking into account the decimal digits.
     uint32_t adjusted_exponent_value() const
     {
-        if (is_adjusted_exponent_negative()) {
-            return truncated_decimal_length() - exponent_value();
+        if (is_exponent_negative()) {
+            return exponent_value();
         } else {
-            return exponent_value() - truncated_decimal_length();
+            return std::max(exponent_value() - truncated_decimal_length(), 0U);
         }
     }
 
-    /// Is the adjusted exponent negative?
-    bool is_adjusted_exponent_negative() const
+    /// The total length of the entire number.
+    uint32_t total_length() const
     {
-        return truncated_decimal_length() > exponent_value();
+        return std::max(static_cast<uint32_t>(m_total_end - integer_start()), 0U);
+    }
+
+    /// The length of the start of the decimal component to the end of the number.
+    uint32_t decimal_and_exponent_length() const
+    {
+        return std::max(static_cast<uint32_t>(m_total_end - decimal_start()), 0U);
     }
 
 private:
@@ -177,6 +189,9 @@ class StringChecker {
     /// Set the decimal end.
     void set_decimal_end(const char* val) { m_decimal_end = val; }
 
+    /// Set the total end.
+    void set_total_end(const char* val) { m_total_end = val; }
+
     /// Set the exponent value.
     void set_exponent(const uint32_t val) { m_expon = val; }
 
@@ -198,6 +213,9 @@ class StringChecker {
     /// The end of the decimal component of the contained number.
     const char* m_decimal_end;
 
+    /// The end of the contained number.
+    const char* m_total_end;
+
     /// The value of the exponent of the nubmer.
     uint32_t m_expon;
 

diff --git a/src/cpp/c_str_parsing.cpp b/src/cpp/c_str_parsing.cpp
@@ -20,6 +20,7 @@ StringChecker::StringChecker(const char* str, const char* end, int base) noexcep
     : m_integer_start(nullptr)
     , m_decimal_start(nullptr)
     , m_decimal_end(nullptr)
+    , m_total_end(nullptr)
     , m_expon(0U)
     , m_exp_negative(false)
     , m_int_trailing_zeros(0U)
@@ -38,6 +39,7 @@ StringChecker::StringChecker(const char* str, const char* end, int base) noexcep
         set_integer_start(str);
         set_decimal_start(str);
         set_decimal_end(str);
+        set_total_end(str);
         return;
     }
 
@@ -55,6 +57,7 @@ StringChecker::StringChecker(const char* str, const char* end, int base) noexcep
         }
         set_decimal_start(str);
         set_decimal_end(str);
+        set_total_end(str);
         set_type(
             (str == end && str != integer_start()) ? StringType::INTEGER
                                                    : StringType::INVALID
@@ -119,6 +122,7 @@ StringChecker::StringChecker(const char* str, const char* end, int base) noexcep
             set_type(StringType::FLOAT);
         }
     }
+    set_total_end(str);
 
     // If the parsing was not valid or we are not at the end of the string
     // then the string is invalid.

diff --git a/src/cpp/parser.cpp b/src/cpp/parser.cpp
@@ -19,15 +19,17 @@
 // C++ version of https://docs.python.org/3/library/math.html#math.ulp
 static double ulp(const double x)
 {
-    if (x > 0)
-        return std::nexttoward(x, std::numeric_limits<double>::infinity()) - x;
-    else
-        return x - std::nexttoward(x, -std::numeric_limits<double>::infinity());
+    // Assumes positive input
+    return std::nexttoward(x, std::numeric_limits<double>::infinity()) - x;
 }
 
 // Parse a sequence of characters as a Python long
-static PyObject*
-parse_long_helper(const char* start, const char* end, const std::size_t length)
+static PyObject* parse_long_helper(
+    const char* start,
+    const char* end,
+    const std::size_t length,
+    const std::size_t length_to_end
+)
 {
     // We know by construction that this correctly stores a number, so we skip
     // the error checking.
@@ -38,11 +40,41 @@ parse_long_helper(const char* start, const char* end, const std::size_t length)
             length ? parse_int<uint64_t>(start, end, 10, error, overflow) : 0ULL
         );
     } else {
-        char* this_end = const_cast<char*>(end);
-        return length ? PyLong_FromString(start, &this_end, 10) : pyobject_from_int(0);
+        // This Python parse won't let us specify the end of the string, and it errors
+        // out on non-numeric characters, so we have to copy our data to a buffer
+        // and set the exponent character to '\0' so that Python can parse it.
+        Buffer buffer(start, length_to_end);
+        buffer.mark_integer_end();
+        return PyLong_FromString(buffer.start(), nullptr, 10);
     }
 }
 
+// Convert a value into a power of ten Python long in the most efficient method
+// depending on the value of the exponent.
+static PyObject* exponent_creation_helper(const uint32_t exp_val)
+{
+    if (exp_val < overflow_cutoff<uint64_t>()) {
+        return pyobject_from_int(ipow::ipow(10ULL, exp_val));
+    } else {
+        PyObject* py_expon = pyobject_from_int(10);
+        PyObject* py_exp_component = pyobject_from_int(exp_val);
+        in_place_pow(py_expon, py_exp_component);
+        Py_DECREF(py_exp_component);
+        return py_expon;
+    }
+}
+
+// Convert a PyObject to a negative number if needed.
+static PyObject* do_negative(PyObject* obj, const bool negative)
+{
+    if (negative) {
+        PyObject* temp = obj;
+        obj = PyNumber_Negative(temp);
+        Py_DECREF(temp);
+    }
+    return obj;
+}
+
 PyObject* Parser::float_as_int_without_noise(PyObject* obj) noexcept
 {
     const double val = PyFloat_AsDouble(obj);
@@ -82,6 +114,8 @@ PyObject* Parser::float_as_int_without_noise(PyObject* obj) noexcept
 
     // If the number of digits is less than 1 (e.g. the float is already noiselessly
     // converted to an integer) just return the integer directly.
+    // Because of the floor check above, it is unlikely this will ever be true, but
+    // include it for completeness' sake.
     if (digits < 1) {
         return val_int;
     }
@@ -103,8 +137,8 @@ PyObject* Parser::float_as_int_without_noise(
 
     // As a special case, if the number of digits in the integer and decimal
     // parts of the float are small enough that we know it could fit into a unsigned
-    // 65-bit integer, parse using C++ methods in the name of efficiency.
-    if (checker.total_length() < overflow_cutoff<uint64_t>()) {
+    // 64-bit integer, parse using C++ methods in the name of efficiency.
+    if (checker.digit_length() < overflow_cutoff<uint64_t>()) {
         bool error = false;
         bool overflow = false;
         uint64_t integer = 0;
@@ -144,7 +178,10 @@ PyObject* Parser::float_as_int_without_noise(
     else {
         // First parse the integer components of the floating point number.
         py_integer = parse_long_helper(
-            checker.integer_start(), checker.integer_end(), checker.integer_length()
+            checker.integer_start(),
+            checker.integer_end(),
+            checker.integer_length(),
+            checker.total_length()
         );
         if (py_integer == nullptr) {
             return py_integer;
@@ -156,17 +193,19 @@ PyObject* Parser::float_as_int_without_noise(
         if (checker.truncated_decimal_length()) {
             // Parse the decimal component...
             PyObject* py_decimal = parse_long_helper(
-                checker.decimal_start(), checker.decimal_end(), checker.decimal_length()
+                checker.decimal_start(),
+                checker.decimal_end(),
+                checker.decimal_length(),
+                checker.decimal_and_exponent_length()
             );
             if (py_decimal == nullptr) {
                 return py_decimal;
             }
 
             // ... and then "remove" trailing zeros if they exist...
             if (checker.decimal_trailing_zeros()) {
-                PyObject* divisor = pyobject_from_int(
-                    ipow::ipow(10ULL, checker.decimal_trailing_zeros())
-                );
+                PyObject* divisor
+                    = exponent_creation_helper(checker.decimal_trailing_zeros());
                 if (divisor == nullptr) {
                     return divisor;
                 }
@@ -181,9 +220,8 @@ PyObject* Parser::float_as_int_without_noise(
             // ... and then "shift" the integer py powers of ten so we can
             //     add in the decimal component...
             {
-                PyObject* offset = pyobject_from_int(
-                    ipow::ipow(10ULL, checker.truncated_decimal_length())
-                );
+                PyObject* offset
+                    = exponent_creation_helper(checker.truncated_decimal_length());
                 if (offset == nullptr) {
                     Py_DECREF(py_integer);
                     return offset;
@@ -212,19 +250,15 @@ PyObject* Parser::float_as_int_without_noise(
     // of ten. Because this value could be *huge*, we cannot do power nor the
     // mutliplication in the C++ space - it must be all done with Python's types.
     if (checker.adjusted_exponent_value() > 0) {
-        PyObject* py_expon = pyobject_from_int(10);
-        PyObject* py_exp_component
-            = pyobject_from_int(checker.adjusted_exponent_value());
-        in_place_pow(py_expon, py_exp_component);
-        Py_DECREF(py_exp_component);
+        PyObject* py_expon = exponent_creation_helper(checker.adjusted_exponent_value());
         if (py_expon == nullptr) {
             Py_DECREF(py_integer);
             return py_expon;
         }
 
         // We are carrying around the exponent as an unsigned integer,
         // so we handle negatives by dividing, and positives by multiplying.
-        if (checker.is_adjusted_exponent_negative()) {
+        if (checker.is_exponent_negative()) {
             in_place_divide(py_integer, py_expon);
         } else {
             in_place_multiply(py_integer, py_expon);
@@ -233,12 +267,7 @@ PyObject* Parser::float_as_int_without_noise(
     }
 
     // Finally, we negate the result if it is supposed to be negative.
-    if (is_negative) {
-        PyObject* temp = py_integer;
-        py_integer = PyNumber_Negative(temp);
-        Py_DECREF(temp);
-    }
-    return py_integer;
+    return do_negative(py_integer, is_negative);
 }
 
 /**
@@ -336,10 +365,10 @@ RawPayload<PyObject*> CharacterParser::as_pyint() const noexcept(false)
         return pyobject_from_int(result);
     }
 
-    // Parse and record the location where parsing ended (including trailing whitespace)
-    // No need to do input validation with the second argument because we already know
-    // the input is valid from above.
-    // Return the value without checking python's error state.
+    // Parse and record the location where parsing ended (including trailing
+    // whitespace) No need to do input validation with the second argument because we
+    // already know the input is valid from above. Return the value without checking
+    // python's error state.
     return PyLong_FromString(m_start_orig, nullptr, options().get_base());
 }
 

diff --git a/tests/test_fastnumbers.py b/tests/test_fastnumbers.py
@@ -1104,6 +1104,14 @@ def test_given_float_returns_int_matching_decimal_object_with_denoise(
         assert_integers_close(result, expected)
 
     @given(floats(allow_nan=False).filter(an_integer).map(repr))
+    @example("1234.56E56")
+    @example("12345.60000E56")
+    @example("1234560000E-3")
+    @example("1234.56789012345678901234567890123456789e56")
+    @example("1234.5678901234567890123456789012345678900000e56")
+    @example("1234567890123456789012345678901234567890000000000000.")
+    @example("123456789012345678901234567890123456789.0000000000000000000000000000")
+    @example("1234567890123456789012345678901234567890000000000000e-5")
     def test_given_float_str_returns_int_matching_decimal_object_with_denoise(
         self, x: str
     ) -> None:
@@ -1268,6 +1276,14 @@ def test_given_float_returns_int_matching_decimal_object_with_denoise(
         assert_integers_close(result, expected)
 
     @given(floats(allow_nan=False, allow_infinity=False).map(repr))
+    @example("1234.56E56")
+    @example("12345.60000E56")
+    @example("1234560000E-3")
+    @example("1234.56789012345678901234567890123456789e56")
+    @example("1234.5678901234567890123456789012345678900000e56")
+    @example("1234567890123456789012345678901234567890000000000000.")
+    @example("123456789012345678901234567890123456789.0000000000000000000000000000")
+    @example("1234567890123456789012345678901234567890000000000000e-5")
     def test_given_float_str_returns_int_matching_decimal_object_with_denoise(
         self, x: str
     ) -> None: