Merge pull request #77 from SethMMorton/significant-digits

Introduce the "denoise" option to `try_real` and `try_forceint` (and `real`)
SethMMorton · Nov 29, 2023 · 8af03d0 · 8af03d0
2 parents c970c09 + 3258e0f
commit 8af03d0
Show file tree

Hide file tree

Showing 28 changed files with 1,851 additions and 734 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,10 @@ Unreleased
 
 ### Added
 
+- Added the ``denoise`` option to convert floats to int without
+  numerical noise (issues
+  [#69](https://github.com/SethMMorton/fastnumbers/issues/69) and
+  [#77](https://github.com/SethMMorton/fastnumbers/pull/77))
 - Support for Python 3.12 (issues
   [#73](https://github.com/SethMMorton/fastnumbers/issues/73) and
   [#75](https://github.com/SethMMorton/fastnumbers/pull/75))

diff --git a/README.rst b/README.rst
@@ -95,6 +95,7 @@ Error-Handling Functions
 - `Error-handling function API <https://fastnumbers.readthedocs.io/en/stable/api.html#the-error-handling-functions>`_
 - `Fast operations on lists and other iterables`_
 - `About the on_fail option`_
+- `About the denoise option`_
 
 ``try_float`` will be used to demonstrate the functionality of the
 ``try_*`` functions.
@@ -311,6 +312,57 @@ invalid type and b) the default value is ``fastnumbers.RAISE``, not
     The input 'invalid input' is not valid!
     nan
 
+About the ``denoise`` option
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``denoise`` option is available on the ``try_real`` and ``try_forceint`` functions.
+To best understand its usage, consider the following native Python behavior:
+
+.. code-block:: python
+
+    >>> int(3.453e21)
+    3452999999999999737856
+    >>> int(float("3.453e21"))
+    3452999999999999737856
+    >>> # Most users would likely expect this result from decimal.Decimal
+    >>> import decimal
+    >>> int(decimal.Decimal("3.453e21"))
+    3453000000000000000000
+    >>> # But watch out, even decimal.Decimal doesn't help for float input
+    >>> import decimal
+    >>> int(decimal.Decimal(3.453e21))
+    3452999999999999737856
+
+Because the conversion of a float to an int goes through the C ``double`` data type, which
+has inherent limitations on accuracy (see
+`this Stack Overflow question for examples <https://stackoverflow.com/questions/588004/is-floating-point-math-broken>`_),
+the resulting ``int`` has "noise" digits that are not part of the original float
+representation.
+
+For functions where this makes sense, ``fastnumbers`` provides the ``denoise`` option to
+give you the results that ``decimal.Decimal`` would give for strings containing floats.
+
+.. code-block:: python
+
+    >>> from fastnumbers import try_real
+    >>> try_real(3.453e21)
+    3452999999999999737856
+    >>> try_real("3.453e21")
+    3452999999999999737856
+    >>> try_real(3.453e21, denoise=True)
+    3453000000000000000000
+    >>> try_real("3.453e21", denoise=True)
+    3453000000000000000000
+
+Two things to keep in mind:
+
+1. The ``denoise`` option adds additional overhead to the conversion calculation, so please consider
+   the trade-offs between speed and accuracy when determining whether or not to use it. It is
+   *significantly* faster than using ``decimal.Decimal``, but much slower than not using it at all.
+2. For string input, ``denoise`` will return results identical to ``decimal.Decimal``. For float
+   input, ``denoise`` will return results that are accurate to about 15 digits (C ``double`` can
+   only store 16 decimal digits, so this means that only the last possible digit may not be accurate).
+
 Checking Functions
 ++++++++++++++++++
 

diff --git a/include/fastnumbers/buffer.hpp b/include/fastnumbers/buffer.hpp
@@ -112,6 +112,24 @@ class Buffer {
         }
     }
 
+    /// Terminate the buffer at the first '.', else the first 'e', else the first 'E'.
+    void mark_integer_end() noexcept
+    {
+        // Nothing is erased: a '\0' is written over the matched character
+        // and the recorded length is shortened to that position.
+        // Each candidate is looked up with its own memchr call instead of
+        // scanning for all three simultaneously, because memchr is very fast.
+        constexpr char terminators[] = { '.', 'e', 'E' };
+        for (const char candidate : terminators) {
+            void* const hit = std::memchr(m_buffer, candidate, m_len);
+            if (hit == nullptr) {
+                continue;
+            }
+            char* const cut = static_cast<char*>(hit);
+            *cut = '\0';
+            m_len = m_size = cut - m_buffer;
+            return;
+        }
+    }
+
     /// The largest amount of data the buffer can contain
     std::size_t max_size() const noexcept
     {

diff --git a/include/fastnumbers/c_str_parsing.hpp b/include/fastnumbers/c_str_parsing.hpp
@@ -55,21 +55,182 @@ constexpr int8_t DIGIT_TABLE_ARBITRARY_BASE[]
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
 
+/// The kinds of numeric content that a string can be found to hold
+enum class StringType {
+    INVALID = 0, ///< The string is not a valid number
+    INTEGER = 1, ///< The string holds an integer
+    FLOAT = 2, ///< The string holds a float
+    INTLIKE_FLOAT = 3, ///< The string holds an integer-like float
+};
+
 /**
- * \brief Check if a string could be converted to some numeric type
- *
- * Assumes no sign or whitespace.
- *
- * \param str The string to parse, assumed to be non-NULL
- * \param end The end of the string being checked
- * \param base The base to assume when checking an integer, set to 10
- *             unless you know it *must* be an integer.
- * \return 0 - invalid
- *         1 - integer
- *         2 - float
- *         3 - "intlike" float
+ * \class StringChecker
+ * \brief Assess the type of number that is contained in a string
  */
-int string_contains_what(const char* str, const char* end, int base) noexcept;
+class StringChecker {
+public:
+    /**
+     * \brief Check if a string could be converted to some numeric type
+     *
+     * Assumes no sign or whitespace.
+     *
+     * \param str The string to parse, assumed to be non-NULL
+     * \param end The end of the string being checked
+     * \param base The base to assume when checking an integer, set to 10
+     *             unless you know it *must* be an integer.
+     */
+    StringChecker(const char* str, const char* end, int base) noexcept;
+
+    // Default copy/move/assign/destruct
+    StringChecker(const StringChecker&) = default;
+    StringChecker(StringChecker&&) = default;
+    StringChecker& operator=(const StringChecker&) = default;
+    StringChecker& operator=(StringChecker&&) = default;
+    ~StringChecker() = default;
+
+    /// Return the contained type of the string
+    StringType get_type() const { return m_contained_type; }
+
+    /// Is the contained string an invalid number?
+    bool is_invalid() const { return get_type() == StringType::INVALID; }
+
+    /// Is the contained string an integer?
+    bool is_integer() const { return get_type() == StringType::INTEGER; }
+
+    /// Is the contained string a float?
+    bool is_float() const { return get_type() == StringType::FLOAT; }
+
+    /// Is the contained string an int-like float?
+    bool is_intlike_float() const { return get_type() == StringType::INTLIKE_FLOAT; }
+
+    /// The start of the integer component of the contained number.
+    const char* integer_start() const { return m_integer_start; }
+
+    /// The end of the integer component of the contained number.
+    const char* integer_end() const { return m_decimal_start; }
+
+    /// The length of the integer component of the contained number.
+    /// Returns 0 if the end does not lie past the start.
+    uint32_t integer_length() const
+    {
+        return span_length(integer_start(), integer_end());
+    }
+
+    /// The number of zeros that trail the integer part of the contained number.
+    uint32_t integer_trailing_zeros() const { return m_int_trailing_zeros; }
+
+    /// The start of the decimal component of the contained number.
+    /// When decimal data exists this is one past the stored decimal start.
+    const char* decimal_start() const
+    {
+        return (m_decimal_start == m_decimal_end) ? m_decimal_start
+                                                  : (m_decimal_start + 1);
+    }
+
+    /// The end of the decimal component of the contained number.
+    const char* decimal_end() const { return m_decimal_end; }
+
+    /// The length of the decimal component of the contained number.
+    /// Returns 0 if the end does not lie past the start.
+    uint32_t decimal_length() const
+    {
+        return span_length(decimal_start(), decimal_end());
+    }
+
+    /// The number of zeros that trail the decimal part of the contained number.
+    uint32_t decimal_trailing_zeros() const { return m_dec_trailing_zeros; }
+
+    /// Was any decimal data found in the string?
+    bool has_decimal_data() const { return m_decimal_start != m_decimal_end; }
+
+    /// The value of the exponent of the number.
+    uint32_t exponent_value() const { return m_expon; }
+
+    /// Is the exponent negative?
+    bool is_exponent_negative() const { return m_exp_negative; }
+
+    /// The total length of the integer plus decimal components.
+    uint32_t digit_length() const { return integer_length() + decimal_length(); }
+
+    /// The decimal length after removing trailing zeros (never below 0).
+    uint32_t truncated_decimal_length() const
+    {
+        return saturating_subtract(decimal_length(), decimal_trailing_zeros());
+    }
+
+    /// The exponent after taking into account the decimal digits.
+    /// A negative exponent is returned as-is; a non-negative exponent is
+    /// reduced by the truncated decimal length, stopping at 0.
+    uint32_t adjusted_exponent_value() const
+    {
+        if (is_exponent_negative()) {
+            return exponent_value();
+        } else {
+            return saturating_subtract(exponent_value(), truncated_decimal_length());
+        }
+    }
+
+    /// The total length of the entire number.
+    /// Returns 0 if the end does not lie past the start.
+    uint32_t total_length() const
+    {
+        return span_length(integer_start(), m_total_end);
+    }
+
+    /// The length of the start of the decimal component to the end of the number.
+    /// Returns 0 if the end does not lie past the start.
+    uint32_t decimal_and_exponent_length() const
+    {
+        return span_length(decimal_start(), m_total_end);
+    }
+
+private:
+    /// Number of characters in [first, last), or 0 if last is not past first.
+    /// The comparison happens before the conversion to unsigned because
+    /// clamping an already-converted value with std::max(x, 0U) has no effect.
+    static uint32_t span_length(const char* first, const char* last)
+    {
+        return (last > first) ? static_cast<uint32_t>(last - first) : 0U;
+    }
+
+    /// Compute lhs - rhs, yielding 0 instead of wrapping when rhs >= lhs.
+    static uint32_t saturating_subtract(const uint32_t lhs, const uint32_t rhs)
+    {
+        return (lhs > rhs) ? (lhs - rhs) : 0U;
+    }
+
+    /// Set the contained type.
+    void set_type(StringType val) { m_contained_type = val; }
+
+    /// Set the integer start.
+    void set_integer_start(const char* val) { m_integer_start = val; }
+
+    /// Set the decimal start.
+    void set_decimal_start(const char* val) { m_decimal_start = val; }
+
+    /// Set the decimal end.
+    void set_decimal_end(const char* val) { m_decimal_end = val; }
+
+    /// Set the total end.
+    void set_total_end(const char* val) { m_total_end = val; }
+
+    /// Set the exponent value.
+    void set_exponent(const uint32_t val) { m_expon = val; }
+
+    /// Set whether or not the exponent is negative.
+    void set_exponent_negative(const bool val) { m_exp_negative = val; }
+
+    /// Set the number of trailing zeros on the integer part.
+    void set_int_trailing_zeros(const uint32_t val) { m_int_trailing_zeros = val; }
+
+    /// Set the number of trailing zeros on the decimal part.
+    void set_dec_trailing_zeros(const uint32_t val) { m_dec_trailing_zeros = val; }
+
+    /// The start of the integer component of the contained number.
+    const char* m_integer_start;
+
+    /// The start of the decimal component of the contained number.
+    const char* m_decimal_start;
+
+    /// The end of the decimal component of the contained number.
+    const char* m_decimal_end;
+
+    /// The end of the contained number.
+    const char* m_total_end;
+
+    /// The value of the exponent of the number.
+    uint32_t m_expon;
+
+    /// Whether or not the exponent is negative.
+    bool m_exp_negative;
+
+    /// The number of zeros that trail the integer part of the contained number.
+    uint32_t m_int_trailing_zeros;
+
+    /// The number of zeros that trail the decimal part of the contained number.
+    uint32_t m_dec_trailing_zeros;
+
+    /// The contained type of the string.
+    StringType m_contained_type;
+};
 
 /**
  * \brief Remove underscores in a numeric-representing string
@@ -207,11 +368,6 @@ constexpr inline bool is_sign(const char c) noexcept
  */
 constexpr inline bool is_base_prefix(const char c) noexcept
 {
-    // The ASCII standard was quite clever... upper- and lower-case
-    // letters only differ from each other by the 32 bit, otherwise
-    // they are identical.
-    // So, we can OR the 32 bit to force the character to be
-    // lowercase and then just check against the lowercase characters.
     const char lowered = lowercase(c); // presumably folds ASCII case; lowercase() is defined outside this view
     return (lowered == 'x') || (lowered == 'o') || (lowered == 'b'); // true only when the result is x, o, or b
 }
@@ -370,7 +526,7 @@ constexpr inline int detect_base(const char* str, const char* end) noexcept
  * \brief Return the number of digits an integer type can safely parse without overflow
  */
 template <typename T, typename std::enable_if_t<std::is_integral_v<T>, bool> = true>
-constexpr inline int8_t overflow_cutoff() noexcept
+constexpr inline uint8_t overflow_cutoff() noexcept
 {
     // len('std::numeric_limits<T>::max()') - 1 == return value
     constexpr uint64_t limit = static_cast<uint64_t>(std::numeric_limits<T>::max());
@@ -408,7 +564,7 @@ constexpr inline int8_t overflow_cutoff() noexcept
  * \param always_convert
  */
 template <typename T, typename std::enable_if_t<std::is_integral_v<T>, bool> = true>
-T parse_int(
+inline T parse_int(
     const char* str,
     const char* end,
     int base,
@@ -528,7 +684,7 @@ T parse_int(
 template <
     typename T,
     typename std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
-T parse_float(const char* str, const char* end, bool& error) noexcept
+inline T parse_float(const char* str, const char* end, bool& error) noexcept
 {
     // Use a very fast and accurate string-to-floating point parser
     T value;

diff --git a/include/fastnumbers/compatibility.hpp b/include/fastnumbers/compatibility.hpp
@@ -9,7 +9,7 @@
 // in obtaining the name of a type.
 // The implementation was basically copied from the 3.11 source code,
 // but adjustments were made to make it C++.
-PyObject* PyType_GetName(PyTypeObject* type)
+inline PyObject* PyType_GetName(PyTypeObject* type)
 {
     auto _PyType_Name = [](PyTypeObject* type) -> const char* {
         assert(type->tp_name != nullptr);

diff --git a/include/fastnumbers/docstrings.hpp b/include/fastnumbers/docstrings.hpp
@@ -49,6 +49,10 @@ PyDoc_STRVAR(
     "    If the input can be converted to an *int* without loss of precision\n"
     "    (even if the input was a *float* or float-containing *str*)\n"
     "    coerce to an *int* rather than returning a *float*.\n"
+    "denoise : bool, optional\n"
+    "    When coercing large floating point numbers to an integer, ensure that\n"
+    "    \"noise\" digits are not included. See README for more details.\n"
+    "    Ignored if *coerce* is *False*. The default is *False*.\n"
     "allow_underscores : bool, optional\n"
     "    Underscores are allowed in numeric literals and in strings passed to *int*\n"
     "    or *float* (see PEP 515 for details on what is and is not allowed). You can\n"
@@ -371,6 +375,10 @@ PyDoc_STRVAR(
     "    Control what happens when the input is neither numeric nor string. Behavior\n"
     "    matches that of `on_fail` except that the default value is *RAISE* and a\n"
     "    *TypeError* is raised instead of *ValueError*.\n"
+    "denoise : bool, optional\n"
+    "    When converting large floating point numbers to an integer, ensure that\n"
+    "    \"noise\" digits are not included. See README for more details.\n"
+    "    The default is *False*.\n"
     "allow_underscores : bool, optional\n"
     "    Underscores are allowed in numeric literals and in strings passed to *int*\n"
     "    or *float* (see PEP 515 for details on what is and is not allowed). You can\n"