From dab96fc84b7d5a72555618a02737faf8aa51c973 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Fri, 16 Jul 2021 12:07:50 -0600
Subject: [PATCH 01/14] Add PERL_GCC_VERSION_GE and kin

This saves typing and potential mistakes

The idea and implementation are from Tomasz Konojacki
---
 perl.h | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/perl.h b/perl.h
index 917ed5fcbb31..4f51442d0f0d 100644
--- a/perl.h
+++ b/perl.h
@@ -285,6 +285,19 @@ Now a no-op.
 #  define PERL_IS_GCC 1
 #endif
 
+#define PERL_GCC_VERSION_GE(major,minor,patch)                              \
+    (((100000 * __GNUC__) + (1000 * __GNUC_MINOR__) + __GNUC_PATCHLEVEL__)  \
+        >= ((100000 * (major)) + (1000 * (minor)) + (patch)))
+#define PERL_GCC_VERSION_GT(major,minor,patch)                              \
+    (((100000 * __GNUC__) + (1000 * __GNUC_MINOR__) + __GNUC_PATCHLEVEL__)  \
+        > ((100000 * (major)) + (1000 * (minor)) + (patch)))
+#define PERL_GCC_VERSION_LE(major,minor,patch)                              \
+    (((100000 * __GNUC__) + (1000 * __GNUC_MINOR__) + __GNUC_PATCHLEVEL__)  \
+        <= ((100000 * (major)) + (1000 * (minor)) + (patch)))
+#define PERL_GCC_VERSION_LT(major,minor,patch)                              \
+    (((100000 * __GNUC__) + (1000 * __GNUC_MINOR__) + __GNUC_PATCHLEVEL__)  \
+        < ((100000 * (major)) + (1000 * (minor)) + (patch)))
+
 /* In case Configure was not used (we are using a "canned config"
  * such as Win32, or a cross-compilation setup, for example) try going
  * by the gcc major and minor versions.  One useful URL is
@@ -301,38 +314,38 @@ Now a no-op.
 
 #ifndef PERL_MICRO
 #  if defined __GNUC__ && !defined(__INTEL_COMPILER)
-#    if __GNUC__ == 3 && __GNUC_MINOR__ >= 1 || __GNUC__ > 3 /* 3.1 -> */
+#    if PERL_GCC_VERSION_GE(3,1,0)
 #      define HASATTRIBUTE_DEPRECATED
 #    endif
-#    if __GNUC__ >= 3 /* 3.0 -> */ /* XXX Verify this version */
+#    if PERL_GCC_VERSION_GE(3,0,0)  /* XXX Verify this version */
 #      define HASATTRIBUTE_FORMAT
 #      if defined __MINGW32__
 #        define PRINTF_FORMAT_NULL_OK
 #      endif
 #    endif
-#    if __GNUC__ >= 3 /* 3.0 -> */
+#    if PERL_GCC_VERSION_GE(3,0,0)
 #      define HASATTRIBUTE_MALLOC
 #    endif
-#    if __GNUC__ == 3 && __GNUC_MINOR__ >= 3 || __GNUC__ > 3 /* 3.3 -> */
+#    if PERL_GCC_VERSION_GE(3,3,0)
 #      define HASATTRIBUTE_NONNULL
 #    endif
-#    if __GNUC__ == 2 && __GNUC_MINOR__ >= 5 || __GNUC__ > 2 /* 2.5 -> */
+#    if PERL_GCC_VERSION_GE(2,5,0)
 #      define HASATTRIBUTE_NORETURN
 #    endif
-#    if __GNUC__ >= 3 /* gcc 3.0 -> */
+#    if PERL_GCC_VERSION_GE(3,0,0)
 #      define HASATTRIBUTE_PURE
 #    endif
-#    if __GNUC__ == 3 && __GNUC_MINOR__ >= 4 || __GNUC__ > 3 /* 3.4 -> */
+#    if PERL_GCC_VERSION_GE(3,4,0)
 #      define HASATTRIBUTE_UNUSED
 #    endif
 #    if __GNUC__ == 3 && __GNUC_MINOR__ == 3 && !defined(__cplusplus)
 #      define HASATTRIBUTE_UNUSED /* gcc-3.3, but not g++-3.3. */
 #    endif
-#    if __GNUC__ == 3 && __GNUC_MINOR__ >= 4 || __GNUC__ > 3 /* 3.4 -> */
+#    if PERL_GCC_VERSION_GE(3,4,0)
 #      define HASATTRIBUTE_WARN_UNUSED_RESULT
 #    endif
      /* always_inline is buggy in gcc <= 4.6 and causes compilation errors */
-#    if __GNUC__ == 4 && __GNUC_MINOR__ >= 7 || __GNUC__ > 4 /* 4.7 -> */
+#    if PERL_GCC_VERSION_GE(4,7,0)
 #      define HASATTRIBUTE_ALWAYS_INLINE
 #    endif
 #  endif
@@ -364,7 +377,7 @@ Now a no-op.
 #endif
 #ifdef HASATTRIBUTE_ALWAYS_INLINE
 /* always_inline is buggy in gcc <= 4.6 and causes compilation errors */
-#  if !defined(PERL_IS_GCC) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7 || __GNUC__ > 4)
+#  if !defined(PERL_IS_GCC) || PERL_GCC_VERSION_GE(4,7,0)
 #    define __attribute__always_inline__      __attribute__((always_inline))
 #  endif
 #endif
@@ -482,7 +495,7 @@ compilation causes it be used just some times.
  */
 #if defined(PERL_GCC_PEDANTIC) || \
     (defined(__GNUC__) && defined(__cplusplus) && \
-        ((__GNUC__ < 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 2))))
+        (PERL_GCC_VERSION_LT(4,2,0)))
 #  ifndef PERL_GCC_BRACE_GROUPS_FORBIDDEN
 #    define PERL_GCC_BRACE_GROUPS_FORBIDDEN
 #  endif
@@ -558,8 +571,7 @@ __typeof__ and nothing else.
  *
  */
 
-#if defined(__clang__) || defined(__clang) || \
-       (defined( __GNUC__) && ((__GNUC__ * 100) + __GNUC_MINOR__) >= 406)
+#if defined(__clang__) || defined(__clang) || PERL_GCC_VERSION_GE(4,6,0)
 #  define GCC_DIAG_PRAGMA(x) _Pragma (#x)
 /* clang has "clang diagnostic" pragmas, but also understands gcc. */
 #  define GCC_DIAG_IGNORE(x) _Pragma("GCC diagnostic push") \
@@ -3873,8 +3885,7 @@ intrinsic function, see its documents for more details.
 
 #if __has_builtin(__builtin_unreachable)
 #    define HAS_BUILTIN_UNREACHABLE
-#elif (defined(__GNUC__) && (   __GNUC__ > 4                              \
-                             || __GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+#elif PERL_GCC_VERSION_GE(4,5,0)
 #    define HAS_BUILTIN_UNREACHABLE
 #endif
 

From 8ca9ebc9fee56a9daa4937d33169ddd04204d6c1 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Sat, 5 Jun 2021 10:22:42 -0600
Subject: [PATCH 02/14] Create and use single_1bit_pos32()

This moves the code from regcomp.c to inline.h that calculates the
position of the lone set bit in a U32.  This is in preparation for use
by other call sites.
---
 embed.fnc   |  1 +
 embed.h     |  1 +
 globvar.sym |  1 +
 inline.h    | 19 +++++++++++++++++++
 perl.h      | 12 ++++++++++++
 proto.h     |  6 ++++++
 regcomp.c   | 11 +----------
 7 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 55096fd9ea96..77e0d2cc5d05 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1141,6 +1141,7 @@ ATdmoR	|bool	|is_utf8_invariant_string|NN const U8* const s		    \
 ATidRp	|bool	|is_utf8_invariant_string_loc|NN const U8* const s	    \
 		|STRLEN len						    \
 		|NULLOK const U8 ** ep
+CTiRp	|unsigned|single_1bit_pos32|U32 word
 #ifndef EBCDIC
 CTiRp	|unsigned int|variant_byte_number|PERL_UINTMAX_T word
 #endif
diff --git a/embed.h b/embed.h
index 62c9e68455ad..9418b6751a3f 100644
--- a/embed.h
+++ b/embed.h
@@ -561,6 +561,7 @@
 #define set_context		Perl_set_context
 #define setdefout(a)		Perl_setdefout(aTHX_ a)
 #define share_hek(a,b,c)	Perl_share_hek(aTHX_ a,b,c)
+#define single_1bit_pos32	Perl_single_1bit_pos32
 #define sortsv(a,b,c)		Perl_sortsv(aTHX_ a,b,c)
 #define sortsv_flags(a,b,c,d)	Perl_sortsv_flags(aTHX_ a,b,c,d)
 #define stack_grow(a,b,c)	Perl_stack_grow(aTHX_ a,b,c)
diff --git a/globvar.sym b/globvar.sym
index 03c324868ba4..a3c7c1043b96 100644
--- a/globvar.sym
+++ b/globvar.sym
@@ -12,6 +12,7 @@ PL_c9_utf8_dfa_tab
 PL_charclass
 PL_check
 PL_core_reg_engine
+PL_deBruijn_bitpos_tab32
 PL_EXACTFish_bitmask
 PL_EXACT_REQ8_bitmask
 PL_extended_utf8_dfa_tab
diff --git a/inline.h b/inline.h
index 55b26e0d7810..9ec02265bdc3 100644
--- a/inline.h
+++ b/inline.h
@@ -664,6 +664,25 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
     return TRUE;
 }
 
+PERL_STATIC_INLINE unsigned
+Perl_single_1bit_pos32(U32 word)
+{
+    /* Given a 32-bit word known to contain all zero bits except one 1 bit,
+     * find and return the 1's position: 0..31 */
+
+#ifdef PERL_CORE    /* macro not exported */
+    ASSUME(isPOWER_OF_2(word));
+#else
+    ASSUME(word && (word & (word-1)) == 0);
+#endif
+
+    /* The position of the only set bit in a word can be quickly calculated
+     * using deBruijn sequences.  See for example
+     * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
+    return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
+                                                    >> PERL_deBruijnShift32_];
+}
+
 #ifndef EBCDIC
 
 PERL_STATIC_INLINE unsigned int
diff --git a/perl.h b/perl.h
index 4f51442d0f0d..12dfa367d9fe 100644
--- a/perl.h
+++ b/perl.h
@@ -5842,6 +5842,13 @@ PL_valid_types_IV_set[] = { 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1 };
 EXTCONST bool
 PL_valid_types_NV_set[] = { 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
 
+EXTCONST U8
+PL_deBruijn_bitpos_tab32[] = {
+    /* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn */
+    0,   1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
+    31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
+};
+
 #else
 
 EXTCONST bool PL_valid_types_IVX[];
@@ -5850,9 +5857,14 @@ EXTCONST bool PL_valid_types_PVX[];
 EXTCONST bool PL_valid_types_RV[];
 EXTCONST bool PL_valid_types_IV_set[];
 EXTCONST bool PL_valid_types_NV_set[];
+EXTCONST U8   PL_deBruijn_bitpos_tab32[];
 
 #endif
 
+/* The constants for using PL_deBruijn_bitpos_tab */
+#define PERL_deBruijnMagic32_  0x077CB531
+#define PERL_deBruijnShift32_  27
+
 /* In C99 we could use designated (named field) union initializers.
  * In C89 we need to initialize the member declared first.
  * In C++ we need extern C initializers.
diff --git a/proto.h b/proto.h
index 3900f0671736..adc720278717 100644
--- a/proto.h
+++ b/proto.h
@@ -3246,6 +3246,12 @@ PERL_CALLCONV Signal_t	Perl_sighandler1(int sig);
 #define PERL_ARGS_ASSERT_SIGHANDLER1
 PERL_CALLCONV Signal_t	Perl_sighandler3(int sig, Siginfo_t *info, void *uap);
 #define PERL_ARGS_ASSERT_SIGHANDLER3
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE unsigned	Perl_single_1bit_pos32(U32 word)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_SINGLE_1BIT_POS32
+#endif
+
 PERL_CALLCONV char*	Perl_skipspace_flags(pTHX_ char *s, U32 flags)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_SKIPSPACE_FLAGS	\
diff --git a/regcomp.c b/regcomp.c
index c5e54cc69a29..5de0aab1b698 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -19398,17 +19398,8 @@ S_optimize_regclass(pTHX_
             bool already_inverted;
             bool are_equivalent;
 
-            /* Compute which bit is set, which is the same thing as, e.g.,
-             * ANYOF_CNTRL.  From
-             * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
-             * */
-            static const int MultiplyDeBruijnBitPosition2[32] = {
-                0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
-                31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
-                };
 
-            namedclass = MultiplyDeBruijnBitPosition2[(posixl
-                                                      * 0x077CB531U) >> 27];
+            namedclass = single_1bit_pos32(posixl);
             classnum = namedclass_to_classnum(namedclass);
 
             /* The named classes are such that the inverted number is one

From e532c22cc4692d40a55f04138726c36f222f7d5a Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 20 Jul 2021 07:22:00 -0600
Subject: [PATCH 03/14] Add 64bit single-1bit_pos()

This will prove useful in future commits on platforms that have 64 bit
capability.

The deBruijn sequence used here, taken from the internet, differs
from the 32 bit one in how they treat a word with no set bits.  But this
is considered undefined behavior, so that difference is immaterial.

Apparently figuring this out uses brute force methods, and so I decided
to live with this difference, rather than to expend the time needed to
bring them into sync.
---
 embed.fnc   |  3 +++
 embed.h     |  3 +++
 globvar.sym |  1 +
 inline.h    | 26 +++++++++++++++++++++++---
 perl.h      | 12 ++++++++++++
 proto.h     |  8 ++++++++
 6 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 77e0d2cc5d05..ddc814656e9a 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1142,6 +1142,9 @@ ATidRp	|bool	|is_utf8_invariant_string_loc|NN const U8* const s	    \
 		|STRLEN len						    \
 		|NULLOK const U8 ** ep
 CTiRp	|unsigned|single_1bit_pos32|U32 word
+#ifdef U64TYPE	/* HAS_QUAD undefined outside of core */
+CTiRp	|unsigned|single_1bit_pos64|U64 word
+#endif
 #ifndef EBCDIC
 CTiRp	|unsigned int|variant_byte_number|PERL_UINTMAX_T word
 #endif
diff --git a/embed.h b/embed.h
index 9418b6751a3f..416dbf3002a1 100644
--- a/embed.h
+++ b/embed.h
@@ -870,6 +870,9 @@
 #if defined(PERL_USE_3ARG_SIGHANDLER)
 #define csighandler		Perl_csighandler
 #endif
+#if defined(U64TYPE)	/* HAS_QUAD undefined outside of core */
+#define single_1bit_pos64	Perl_single_1bit_pos64
+#endif
 #if defined(UNLINK_ALL_VERSIONS)
 #define unlnk(a)		Perl_unlnk(aTHX_ a)
 #endif
diff --git a/globvar.sym b/globvar.sym
index a3c7c1043b96..22e353f34d33 100644
--- a/globvar.sym
+++ b/globvar.sym
@@ -13,6 +13,7 @@ PL_charclass
 PL_check
 PL_core_reg_engine
 PL_deBruijn_bitpos_tab32
+PL_deBruijn_bitpos_tab64
 PL_EXACTFish_bitmask
 PL_EXACT_REQ8_bitmask
 PL_extended_utf8_dfa_tab
diff --git a/inline.h b/inline.h
index 9ec02265bdc3..bde3c4c48c42 100644
--- a/inline.h
+++ b/inline.h
@@ -664,6 +664,29 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
     return TRUE;
 }
 
+#ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
+
+PERL_STATIC_INLINE unsigned
+Perl_single_1bit_pos64(U64 word)
+{
+    /* Given a 64-bit word known to contain all zero bits except one 1 bit,
+     * find and return the 1's position: 0..63 */
+
+#  ifdef PERL_CORE    /* macro not exported */
+    ASSUME(isPOWER_OF_2(word));
+#  else
+    ASSUME(word && (word & (word-1)) == 0);
+#  endif
+
+    /* The position of the only set bit in a word can be quickly calculated
+     * using deBruijn sequences.  See for example
+     * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
+    return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
+                                                    >> PERL_deBruijnShift64_];
+}
+
+#endif
+
 PERL_STATIC_INLINE unsigned
 Perl_single_1bit_pos32(U32 word)
 {
@@ -676,9 +699,6 @@ Perl_single_1bit_pos32(U32 word)
     ASSUME(word && (word & (word-1)) == 0);
 #endif
 
-    /* The position of the only set bit in a word can be quickly calculated
-     * using deBruijn sequences.  See for example
-     * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
     return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
                                                     >> PERL_deBruijnShift32_];
 }
diff --git a/perl.h b/perl.h
index 12dfa367d9fe..aa2f6c7fcba0 100644
--- a/perl.h
+++ b/perl.h
@@ -5849,6 +5849,15 @@ PL_deBruijn_bitpos_tab32[] = {
     31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
 };
 
+EXTCONST U8
+PL_deBruijn_bitpos_tab64[] = {
+    /* https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers */
+    63,  0, 58,  1, 59, 47, 53,  2, 60, 39, 48, 27, 54, 33, 42,  3,
+    61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22,  4,
+    62, 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21,
+    56, 45, 25, 31, 35, 16,  9, 12, 44, 24, 15,  8, 23,  7,  6,  5
+};
+
 #else
 
 EXTCONST bool PL_valid_types_IVX[];
@@ -5858,12 +5867,15 @@ EXTCONST bool PL_valid_types_RV[];
 EXTCONST bool PL_valid_types_IV_set[];
 EXTCONST bool PL_valid_types_NV_set[];
 EXTCONST U8   PL_deBruijn_bitpos_tab32[];
+EXTCONST U8   PL_deBruijn_bitpos_tab64[];
 
 #endif
 
 /* The constants for using PL_deBruijn_bitpos_tab */
 #define PERL_deBruijnMagic32_  0x077CB531
 #define PERL_deBruijnShift32_  27
+#define PERL_deBruijnMagic64_  0x07EDD5E59A4E28C2
+#define PERL_deBruijnShift64_  58
 
 /* In C99 we could use designated (named field) union initializers.
  * In C89 we need to initialize the member declared first.
diff --git a/proto.h b/proto.h
index adc720278717..9b3e207b2a61 100644
--- a/proto.h
+++ b/proto.h
@@ -6709,6 +6709,14 @@ PERL_CALLCONV Signal_t	Perl_csighandler(int sig, Siginfo_t *info, void *uap);
 #define PERL_ARGS_ASSERT_CSIGHANDLER
 PERL_CALLCONV Signal_t	Perl_sighandler(int sig, Siginfo_t *info, void *uap);
 #define PERL_ARGS_ASSERT_SIGHANDLER
+#endif
+#if defined(U64TYPE)	/* HAS_QUAD undefined outside of core */
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE unsigned	Perl_single_1bit_pos64(U64 word)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_SINGLE_1BIT_POS64
+#endif
+
 #endif
 #if defined(UNLINK_ALL_VERSIONS)
 PERL_CALLCONV I32	Perl_unlnk(pTHX_ const char* f);

From 66614becbf2e33f90cfd9b174cdfdacd0ce04570 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Sat, 5 Jun 2021 01:15:37 -0600
Subject: [PATCH 04/14] Perl_variant_byte_number: Generalize

The current mechanism doesn't work if the lowest bit is the one set.  At
the moment that doesn't matter as we aren't looking at that bit anyway.
But a future commit will refactor things so that bit will be looked at.
So prepare for that.  The new expression is simpler, besides.
---
 inline.h | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/inline.h b/inline.h
index bde3c4c48c42..42e1ae7bf4a8 100644
--- a/inline.h
+++ b/inline.h
@@ -727,22 +727,18 @@ Perl_variant_byte_number(PERL_UINTMAX_T word)
      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
      *
      * The word will look like this, with a rightmost set bit in position 's':
-     * ('x's are don't cares)
+     * ('x's are don't cares, and 'y's are their complements)
      *      s
-     *  x..x100..0
-     *  x..xx10..0      Right shift (rightmost 0 is shifted off)
-     *  x..xx01..1      Subtract 1, turns all the trailing zeros into 1's and
-     *                  the 1 just to their left into a 0; the remainder is
-     *                  untouched
-     *  0..0011..1      The xor with the original, x..xx10..0, clears that
-     *                  remainder, sets the bottom to all 1
-     *  0..0100..0      Add 1 to clear the word except for the bit in 's'
+     *  x..x100..00
+     *  y..y011..11      Complement
+     *  y..y100..00      Add 1
+     *  0..0100..00      AND with the original
      *
-     * Another method is to do 'word &= -word'; but it generates a compiler
-     * message on some platforms about taking the negative of an unsigned */
-
-    word >>= 1;
-    word = 1 + (word ^ (word - 1));
+     *  (Yes, complementing and adding 1 is just taking the negative on 2's
+     *  complement machines, but not on 1's complement ones, and some compilers
+     *  complain about negating an unsigned.)
+     */
+    word &= (~word + 1);
 
 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 

From 0b364ded344cc0770d218155a24b56f3de61290b Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Sat, 5 Jun 2021 08:57:37 -0600
Subject: [PATCH 05/14] Perl_variant_byte_number: Move assert()

This should be called only when it is known there is a variant byte.
The assert() previously wasn't checking that precisely
---
 inline.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/inline.h b/inline.h
index 42e1ae7bf4a8..2a2cfb3a955e 100644
--- a/inline.h
+++ b/inline.h
@@ -712,11 +712,13 @@ Perl_variant_byte_number(PERL_UINTMAX_T word)
     /* This returns the position in a word (0..7) of the first variant byte in
      * it.  This is a helper function.  Note that there are no branches */
 
-    assert(word);
-
     /* Get just the msb bits of each byte */
     word &= PERL_VARIANTS_WORD_MASK;
 
+    /* This should only be called if we know there is a variant byte in the
+     * word */
+    assert(word);
+
 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
 
     /* Bytes are stored like

From 23de7dfac9a5a7716b05e54c1dd385b5962c91f9 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 20 Jul 2021 11:51:59 -0600
Subject: [PATCH 06/14] Create and use 32 and 64 bit lsbit_pos() fcns

The existing code to determine the position of the least significant 1
bit in a word is extracted from variant_byte_number() and moved to a new
function in preparation for being called from other places.

A U32 version is created, and on 64 bit platforms, a second, parallel,
version taking a U64 argument is also created.  This is because future
commits may care about the word size differences.
---
 embed.fnc |  2 ++
 embed.h   |  2 ++
 inline.h  | 73 ++++++++++++++++++++++++++++++++++++++++++-------------
 proto.h   | 12 +++++++++
 4 files changed, 72 insertions(+), 17 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index ddc814656e9a..b4e58701185d 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1142,8 +1142,10 @@ ATidRp	|bool	|is_utf8_invariant_string_loc|NN const U8* const s	    \
 		|STRLEN len						    \
 		|NULLOK const U8 ** ep
 CTiRp	|unsigned|single_1bit_pos32|U32 word
+CTiRp	|unsigned|lsbit_pos32|U32 word
 #ifdef U64TYPE	/* HAS_QUAD undefined outside of core */
 CTiRp	|unsigned|single_1bit_pos64|U64 word
+CTiRp	|unsigned|lsbit_pos64|U64 word
 #endif
 #ifndef EBCDIC
 CTiRp	|unsigned int|variant_byte_number|PERL_UINTMAX_T word
diff --git a/embed.h b/embed.h
index 416dbf3002a1..a2fc31f42fec 100644
--- a/embed.h
+++ b/embed.h
@@ -302,6 +302,7 @@
 #define load_module		Perl_load_module
 #endif
 #define looks_like_number(a)	Perl_looks_like_number(aTHX_ a)
+#define lsbit_pos32		Perl_lsbit_pos32
 #define magic_dump(a)		Perl_magic_dump(aTHX_ a)
 #define markstack_grow()	Perl_markstack_grow(aTHX)
 #ifndef MULTIPLICITY
@@ -871,6 +872,7 @@
 #define csighandler		Perl_csighandler
 #endif
 #if defined(U64TYPE)	/* HAS_QUAD undefined outside of core */
+#define lsbit_pos64		Perl_lsbit_pos64
 #define single_1bit_pos64	Perl_single_1bit_pos64
 #endif
 #if defined(UNLINK_ALL_VERSIONS)
diff --git a/inline.h b/inline.h
index 2a2cfb3a955e..eeb2096d36be 100644
--- a/inline.h
+++ b/inline.h
@@ -664,6 +664,55 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
     return TRUE;
 }
 
+/* Below are functions to find the final or only set bit in a word.  On
+ * platforms with 64-bit capability, there is a pair for each operation; the
+ * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
+ * the same in each pair, so the second is stripped of most comments. */
+
+#ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
+
+PERL_STATIC_INLINE unsigned
+Perl_lsbit_pos64(U64 word)
+{
+    /* Find the position (0..63) of the least significant set bit in the input
+     * word */
+
+    ASSUME(word != 0);
+
+    /*  Isolate the lsb;
+     * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
+     *
+     * The word will look like this, with a rightmost set bit in position 's':
+     * ('x's are don't cares, and 'y's are their complements)
+     *      s
+     *  x..x100..00
+     *  y..y011..11      Complement
+     *  y..y100..00      Add 1
+     *  0..0100..00      And with the original
+     *
+     *  (Yes, complementing and adding 1 is just taking the negative on 2's
+     *  complement machines, but not on 1's complement ones, and some compilers
+     *  complain about negating an unsigned.)
+     */
+    return single_1bit_pos64(word & (~word + 1));
+}
+
+#  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
+#else   /* ! QUAD */
+#  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
+#endif
+
+PERL_STATIC_INLINE unsigned     /* Like above for 32 bit word */
+Perl_lsbit_pos32(U32 word)
+{
+    /* Find the position (0..31) of the least significant set bit in the input
+     * word */
+
+    ASSUME(word != 0);
+
+    return single_1bit_pos32(word & (~word + 1));
+}
+
 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 
 PERL_STATIC_INLINE unsigned
@@ -724,23 +773,13 @@ Perl_variant_byte_number(PERL_UINTMAX_T word)
     /* Bytes are stored like
      *  Byte8 ... Byte2 Byte1
      *  63..56...15...8 7...0
-     *
-     *  Isolate the lsb;
-     * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
-     *
-     * The word will look like this, with a rightmost set bit in position 's':
-     * ('x's are don't cares, and 'y's are their complements)
-     *      s
-     *  x..x100..00
-     *  y..y011..11      Complement
-     *  y..y100..00      Add 1
-     *  0..0100..00      AND with the original
-     *
-     *  (Yes, complementing and adding 1 is just taking the negative on 2's
-     *  complement machines, but not on 1's complement ones, and some compilers
-     *  complain about negating an unsigned.)
-     */
-    word &= (~word + 1);
+     * so getting the lsb of the whole modified word is getting the msb of the
+     * first byte that has its msb set */
+    word = lsbit_pos_uintmax_(word);
+
+    /* Here, word contains the position 7,15,23,...55,63 of that bit.  Convert
+     * to 0..7 */
+    return (unsigned int) ((word + 1) >> 3) - 1;
 
 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 
diff --git a/proto.h b/proto.h
index 9b3e207b2a61..1ced39d14bbd 100644
--- a/proto.h
+++ b/proto.h
@@ -1861,6 +1861,12 @@ PERL_CALLCONV I32	Perl_looks_like_number(pTHX_ SV *const sv)
 #define PERL_ARGS_ASSERT_LOOKS_LIKE_NUMBER	\
 	assert(sv)
 
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE unsigned	Perl_lsbit_pos32(U32 word)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_LSBIT_POS32
+#endif
+
 PERL_CALLCONV int	Perl_magic_clear_all_env(pTHX_ SV* sv, MAGIC* mg);
 #define PERL_ARGS_ASSERT_MAGIC_CLEAR_ALL_ENV	\
 	assert(sv); assert(mg)
@@ -6711,6 +6717,12 @@ PERL_CALLCONV Signal_t	Perl_sighandler(int sig, Siginfo_t *info, void *uap);
 #define PERL_ARGS_ASSERT_SIGHANDLER
 #endif
 #if defined(U64TYPE)	/* HAS_QUAD undefined outside of core */
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE unsigned	Perl_lsbit_pos64(U64 word)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_LSBIT_POS64
+#endif
+
 #ifndef PERL_NO_INLINE_FUNCTIONS
 PERL_STATIC_INLINE unsigned	Perl_single_1bit_pos64(U64 word)
 			__attribute__warn_unused_result__;

From 495905f328045027e352090474be5492068976b2 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 20 Jul 2021 06:03:16 -0600
Subject: [PATCH 07/14] regcomp.h: Add internal macro

This returns the locale bitmap field from an ANYOF node.

In this commit, it just tidies up the code, omitting lengthy casts.  But
a future commit will use it on its own.
---
 regcomp.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/regcomp.h b/regcomp.h
index b92a08820f97..ec3c756148bc 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -680,30 +680,32 @@ struct regnode_ssc {
 
 #define ANYOF_BIT(c)		(1U << ((c) & 7))
 
+#define ANYOF_POSIXL_BITMAP(p)  (((regnode_charclass_posixl*) (p))->classflags)
+
 #define POSIXL_SET(field, c)	((field) |= (1U << (c)))
-#define ANYOF_POSIXL_SET(p, c)	POSIXL_SET(((regnode_charclass_posixl*) (p))->classflags, (c))
+#define ANYOF_POSIXL_SET(p, c)	POSIXL_SET(ANYOF_POSIXL_BITMAP(p), (c))
 
 #define POSIXL_CLEAR(field, c) ((field) &= ~ (1U <<(c)))
-#define ANYOF_POSIXL_CLEAR(p, c) POSIXL_CLEAR(((regnode_charclass_posixl*) (p))->classflags, (c))
+#define ANYOF_POSIXL_CLEAR(p, c) POSIXL_CLEAR(ANYOF_POSIXL_BITMAP(p), (c))
 
 #define POSIXL_TEST(field, c)	((field) & (1U << (c)))
-#define ANYOF_POSIXL_TEST(p, c)	POSIXL_TEST(((regnode_charclass_posixl*) (p))->classflags, (c))
+#define ANYOF_POSIXL_TEST(p, c)	POSIXL_TEST(ANYOF_POSIXL_BITMAP(p), (c))
 
 #define POSIXL_ZERO(field)	STMT_START { (field) = 0; } STMT_END
-#define ANYOF_POSIXL_ZERO(ret)	POSIXL_ZERO(((regnode_charclass_posixl*) (ret))->classflags)
+#define ANYOF_POSIXL_ZERO(ret)	POSIXL_ZERO(ANYOF_POSIXL_BITMAP(ret))
 
 #define ANYOF_POSIXL_SET_TO_BITMAP(p, bits)                                 \
-     STMT_START {                                                           \
-                    ((regnode_charclass_posixl*) (p))->classflags = (bits); \
-     } STMT_END
+                STMT_START { ANYOF_POSIXL_BITMAP(p) = (bits); } STMT_END
 
 /* Shifts a bit to get, eg. 0x4000_0000, then subtracts 1 to get 0x3FFF_FFFF */
-#define ANYOF_POSIXL_SETALL(ret) STMT_START { ((regnode_charclass_posixl*) (ret))->classflags = nBIT_MASK(ANYOF_POSIXL_MAX); } STMT_END
+#define ANYOF_POSIXL_SETALL(ret)                                            \
+                STMT_START {                                                \
+                    ANYOF_POSIXL_BITMAP(ret) = nBIT_MASK(ANYOF_POSIXL_MAX); \
+                } STMT_END
 #define ANYOF_CLASS_SETALL(ret) ANYOF_POSIXL_SETALL(ret)
 
 #define ANYOF_POSIXL_TEST_ANY_SET(p)                               \
-        ((ANYOF_FLAGS(p) & ANYOF_MATCHES_POSIXL)                           \
-         && (((regnode_charclass_posixl*)(p))->classflags))
+        ((ANYOF_FLAGS(p) & ANYOF_MATCHES_POSIXL) && ANYOF_POSIXL_BITMAP(p))
 #define ANYOF_CLASS_TEST_ANY_SET(p) ANYOF_POSIXL_TEST_ANY_SET(p)
 
 /* Since an SSC always has this field, we don't have to test for that; nor do
@@ -716,8 +718,7 @@ struct regnode_ssc {
 
 #define ANYOF_POSIXL_TEST_ALL_SET(p)                                   \
         ((ANYOF_FLAGS(p) & ANYOF_MATCHES_POSIXL)                       \
-         && ((regnode_charclass_posixl*) (p))->classflags              \
-                                    == nBIT_MASK(ANYOF_POSIXL_MAX))
+         && ANYOF_POSIXL_BITMAP(p) == nBIT_MASK(ANYOF_POSIXL_MAX))
 
 #define ANYOF_POSIXL_OR(source, dest) STMT_START { (dest)->classflags |= (source)->classflags ; } STMT_END
 #define ANYOF_CLASS_OR(source, dest) ANYOF_POSIXL_OR((source), (dest))

From 92bb8c75d81436e128d589a59c7e7d4e022e5604 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Sat, 5 Jun 2021 20:43:17 -0600
Subject: [PATCH 08/14] regexec.c: Use lsbit_pos32()  to avoid iterations

Before this commit, the code looped through a bitmap looking for a set
bit.  Now that we have a fast way to find where a set bit is, use it,
and avoid the fruitless iterations.
---
 regexec.c | 39 +++++++++++++--------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/regexec.c b/regexec.c
index b06b6b0ea37a..512da8b77c62 100644
--- a/regexec.c
+++ b/regexec.c
@@ -10561,26 +10561,11 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
                      && c <= U8_MAX  /* param to isFOO_lc() */
             ) {
-
                 /* The data structure is arranged so bits 0, 2, 4, ... are set
                  * if the class includes the Posix character class given by
                  * bit/2; and 1, 3, 5, ... are set if the class includes the
-                 * complemented Posix class given by int(bit/2).  So we loop
-                 * through the bits, each time changing whether we complement
-                 * the result or not.  Suppose for the sake of illustration
-                 * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
-                 * is set, it means there is a match for this ANYOF node if the
-                 * character is in the class given by the expression (0 / 2 = 0
-                 * = \w).  If it is in that class, isFOO_lc() will return 1,
-                 * and since 'to_complement' is 0, the result will stay TRUE,
-                 * and we exit the loop.  Suppose instead that bit 0 is 0, but
-                 * bit 1 is 1.  That means there is a match if the character
-                 * matches \W.  We won't bother to call isFOO_lc() on bit 0,
-                 * but will on bit 1.  On the second iteration 'to_complement'
-                 * will be 1, so the exclusive or will reverse things, so we
-                 * are testing for \W.  On the third iteration, 'to_complement'
-                 * will be 0, and we would be testing for \s; the fourth
-                 * iteration would test for \S, etc.
+                 * complemented Posix class given by int(bit/2), so the
+                 * remainder modulo 2 tells us if to complement or not.
                  *
                  * Note that this code assumes that all the classes are closed
                  * under folding.  For example, if a character matches \w, then
@@ -10592,19 +10577,21 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
                  * loop could be used below to iterate over both the source
                  * character, and its fold (if different) */
 
-                int count = 0;
-                int to_complement = 0;
+                U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
 
-                while (count < ANYOF_MAX) {
-                    if (ANYOF_POSIXL_TEST(n, count)
-                        && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
-                    {
+                do {
+                    /* Find the next set bit indicating a class to try matching
+                     * against */
+                    U8 bit_pos = lsbit_pos32(posixl_bits);
+
+                    if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
                         match = TRUE;
                         break;
                     }
-                    count++;
-                    to_complement ^= 1;
-                }
+
+                    /* Remove this class from consideration; repeat */
+                    POSIXL_CLEAR(posixl_bits, bit_pos);
+                } while(posixl_bits != 0);
             }
         }
     }

From ae8593dedd742b937c8aa4de1f0cc2f8a5c6ba2e Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 20 Jul 2021 11:57:39 -0600
Subject: [PATCH 09/14] Create and use 32 and 64 bit msbit_pos() fcns

The existing code to determine the position of the most significant 1
bit in a word is extracted from variant_byte_number(), and generalized
to use the deBruijn method previously added that works on any bit in the
word, rather than the existing method which looks just at the msb of
each byte.  The code is moved to a new function in preparation for being
called from other places.

A U32 version is created, and on 64 bit platforms, a second, parallel,
version taking a U64 argument is also created.  This is because future
commits may care about the word size differences.
---
 embed.fnc |   2 ++
 embed.h   |   2 ++
 inline.h  | 104 ++++++++++++++++++++++++++++++++++--------------------
 proto.h   |  12 +++++++
 4 files changed, 81 insertions(+), 39 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index b4e58701185d..a883f0cfacb2 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1143,9 +1143,11 @@ ATidRp	|bool	|is_utf8_invariant_string_loc|NN const U8* const s	    \
 		|NULLOK const U8 ** ep
 CTiRp	|unsigned|single_1bit_pos32|U32 word
 CTiRp	|unsigned|lsbit_pos32|U32 word
+CTiRp	|unsigned|msbit_pos32|U32 word
 #ifdef U64TYPE	/* HAS_QUAD undefined outside of core */
 CTiRp	|unsigned|single_1bit_pos64|U64 word
 CTiRp	|unsigned|lsbit_pos64|U64 word
+CTiRp	|unsigned|msbit_pos64|U64 word
 #endif
 #ifndef EBCDIC
 CTiRp	|unsigned int|variant_byte_number|PERL_UINTMAX_T word
diff --git a/embed.h b/embed.h
index a2fc31f42fec..e32b06076cd5 100644
--- a/embed.h
+++ b/embed.h
@@ -326,6 +326,7 @@
 #define mortal_getenv		Perl_mortal_getenv
 #define mro_get_linear_isa(a)	Perl_mro_get_linear_isa(aTHX_ a)
 #define mro_method_changed_in(a)	Perl_mro_method_changed_in(aTHX_ a)
+#define msbit_pos32		Perl_msbit_pos32
 #define my_atof(a)		Perl_my_atof(aTHX_ a)
 #define my_atof3(a,b,c)		Perl_my_atof3(aTHX_ a,b,c)
 #define my_dirfd		Perl_my_dirfd
@@ -873,6 +874,7 @@
 #endif
 #if defined(U64TYPE)	/* HAS_QUAD undefined outside of core */
 #define lsbit_pos64		Perl_lsbit_pos64
+#define msbit_pos64		Perl_msbit_pos64
 #define single_1bit_pos64	Perl_single_1bit_pos64
 #endif
 #if defined(UNLINK_ALL_VERSIONS)
diff --git a/inline.h b/inline.h
index eeb2096d36be..0255995cc8d2 100644
--- a/inline.h
+++ b/inline.h
@@ -664,7 +664,7 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
     return TRUE;
 }
 
-/* Below are functions to find the final or only set bit in a word.  On
+/* Below are functions to find the first, last, or only set bit in a word.  On
  * platforms with 64-bit capability, there is a pair for each operation; the
  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
  * the same in each pair, so the second is stripped of most comments. */
@@ -712,6 +712,59 @@ Perl_lsbit_pos32(U32 word)
 
     return single_1bit_pos32(word & (~word + 1));
 }
+ 
+#ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
+
+PERL_STATIC_INLINE unsigned
+Perl_msbit_pos64(U64 word)
+{
+    /* Find the position (0..63) of the most significant set bit in the input
+     * word */
+
+    ASSUME(word != 0);
+
+    /* Isolate the msb; http://codeforces.com/blog/entry/10330
+     *
+     * Only the most significant set bit matters.  Or'ing word with its right
+     * shift of 1 makes that bit and the next one to its right both 1.
+     * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
+     * ...  We end with the msb and all to the right being 1. */
+    word |= (word >>  1);
+    word |= (word >>  2);
+    word |= (word >>  4);
+    word |= (word >>  8);
+    word |= (word >> 16);
+    word |= (word >> 32);
+
+    /* Then subtracting the right shift by 1 clears all but the left-most of
+     * the 1 bits, which is our desired result */
+    word -= (word >> 1);
+
+    /* Now we have a single bit set */
+    return single_1bit_pos64(word);
+}
+
+#  define msbit_pos_uintmax_(word) msbit_pos64(word)
+#else   /* ! QUAD */
+#  define msbit_pos_uintmax_(word) msbit_pos32(word)
+#endif
+
+PERL_STATIC_INLINE unsigned
+Perl_msbit_pos32(U32 word)
+{
+    /* Find the position (0..31) of the most significant set bit in the input
+     * word */
+
+    ASSUME(word != 0);
+
+    word |= (word >>  1);
+    word |= (word >>  2);
+    word |= (word >>  4);
+    word |= (word >>  8);
+    word |= (word >> 16);
+    word -= (word >> 1);
+    return single_1bit_pos32(word);
+}
 
 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 
@@ -786,51 +839,24 @@ Perl_variant_byte_number(PERL_UINTMAX_T word)
     /* Bytes are stored like
      *  Byte1 Byte2  ... Byte8
      * 63..56 55..47 ... 7...0
-     *
-     * Isolate the msb; http://codeforces.com/blog/entry/10330
-     *
-     * Only the most significant set bit matters.  Or'ing word with its right
-     * shift of 1 makes that bit and the next one to its right both 1.  Then
-     * right shifting by 2 makes for 4 1-bits in a row. ...  We end with the
-     * msb and all to the right being 1. */
-    word |= word >>  1;
-    word |= word >>  2;
-    word |= word >>  4;
-    word |= word >>  8;
-    word |= word >> 16;
-    word |= word >> 32;  /* This should get optimized out on 32-bit systems. */
-
-    /* Then subtracting the right shift by 1 clears all but the left-most of
-     * the 1 bits, which is our desired result */
-    word -= (word >> 1);
-
-#  else
-#    error Unexpected byte order
-#  endif
+     * so getting the msb of the whole modified word is getting the msb of the
+     * first byte that has its msb set */
+    word = msbit_pos_uintmax_(word);
 
-    /* Here 'word' has a single bit set: the  msb of the first byte in which it
-     * is set.  Calculate that position in the word.  We can use this
-     * specialized solution: https://stackoverflow.com/a/32339674/1626653,
-     * assumes an 8-bit byte.  (On a 32-bit machine, the larger numbers should
-     * just get shifted off at compile time) */
-    word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
-                        | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
-                        |           (39 <<  24) |           (47 <<  16)
-                        |           (55 <<   8) |           (63 <<   0));
-    word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
-
-    /* Here, word contains the position 7,15,23,...,63 of that bit.  Convert to
-     * 0..7 */
+    /* Here, word contains the position 63,55,...,23,15,7 of that bit.  Convert
+     * to 0..7 */
     word = ((word + 1) >> 3) - 1;
 
-#  if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
-
-    /* And invert the result */
+    /* And invert the result because of the reversed byte order on this
+     * platform */
     word = CHARBITS - word - 1;
 
+    return (unsigned int) word;
+
+#  else
+#    error Unexpected byte order
 #  endif
 
-    return (unsigned int) word;
 }
 
 #endif
diff --git a/proto.h b/proto.h
index 1ced39d14bbd..92c371a6493f 100644
--- a/proto.h
+++ b/proto.h
@@ -2152,6 +2152,12 @@ PERL_CALLCONV void	Perl_mro_set_mro(pTHX_ struct mro_meta *const meta, SV *const
 PERL_CALLCONV SV*	Perl_mro_set_private_data(pTHX_ struct mro_meta *const smeta, const struct mro_alg *const which, SV *const data);
 #define PERL_ARGS_ASSERT_MRO_SET_PRIVATE_DATA	\
 	assert(smeta); assert(which); assert(data)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE unsigned	Perl_msbit_pos32(U32 word)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_MSBIT_POS32
+#endif
+
 PERL_CALLCONV SV*	Perl_multiconcat_stringify(pTHX_ const OP* o);
 #define PERL_ARGS_ASSERT_MULTICONCAT_STRINGIFY	\
 	assert(o)
@@ -6723,6 +6729,12 @@ PERL_STATIC_INLINE unsigned	Perl_lsbit_pos64(U64 word)
 #define PERL_ARGS_ASSERT_LSBIT_POS64
 #endif
 
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE unsigned	Perl_msbit_pos64(U64 word)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_MSBIT_POS64
+#endif
+
 #ifndef PERL_NO_INLINE_FUNCTIONS
 PERL_STATIC_INLINE unsigned	Perl_single_1bit_pos64(U64 word)
 			__attribute__warn_unused_result__;

From 69d2a073c9da7b006b672d261287e76c5a4e0f65 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 20 Jul 2021 10:07:42 -0600
Subject: [PATCH 10/14]  Use clz, ctz for msb_pos, lsb_pos, if available

On many modern platforms these functions can be replaced by a single
machine instruction or two.  This commit looks for this possibility and
uses it if possible.
---
 inline.h | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 110 insertions(+), 3 deletions(-)

diff --git a/inline.h b/inline.h
index 0255995cc8d2..d75e079b070a 100644
--- a/inline.h
+++ b/inline.h
@@ -664,6 +664,59 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
     return TRUE;
 }
 
+/* See if the platform has builtins for finding the most/least significant bit,
+ * and which one is right for using on 32 and 64 bit operands */
+#if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
+#  if U32SIZE == INTSIZE
+#    define PERL_CLZ_32 __builtin_clz
+#  endif
+#  if defined(U64TYPE) && U64SIZE == INTSIZE
+#    define PERL_CLZ_64 __builtin_clz
+#  endif
+#endif
+#if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
+#  if U32SIZE == INTSIZE
+#    define PERL_CTZ_32 __builtin_ctz
+#  endif
+#  if defined(U64TYPE) && U64SIZE == INTSIZE
+#    define PERL_CTZ_64 __builtin_ctz
+#  endif
+#endif
+
+#if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
+#  if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
+#    define PERL_CLZ_32 __builtin_clzl
+#  endif
+#  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
+#    define PERL_CLZ_64 __builtin_clzl
+#  endif
+#endif
+#if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
+#  if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
+#    define PERL_CTZ_32 __builtin_ctzl
+#  endif
+#  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
+#    define PERL_CTZ_64 __builtin_ctzl
+#  endif
+#endif
+
+#if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
+#  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
+#    define PERL_CLZ_32 __builtin_clzll
+#  endif
+#  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
+#    define PERL_CLZ_64 __builtin_clzll
+#  endif
+#endif
+#if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
+#  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
+#    define PERL_CTZ_32 __builtin_ctzll
+#  endif
+#  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
+#    define PERL_CTZ_64 __builtin_ctzll
+#  endif
+#endif
+
 /* Below are functions to find the first, last, or only set bit in a word.  On
  * platforms with 64-bit capability, there is a pair for each operation; the
  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
@@ -679,7 +732,20 @@ Perl_lsbit_pos64(U64 word)
 
     ASSUME(word != 0);
 
-    /*  Isolate the lsb;
+    /* If we can determine that the platform has a usable fast method to get
+     * this info, use that */
+
+#  if defined(PERL_CTZ_64)
+
+    return (unsigned) PERL_CTZ_64(word);
+
+#  else
+
+    /* Here, we didn't find a fast method for finding the lsb.  Fall back to
+     * making the lsb the only set bit in the word, and use our function that
+     * works on words with a single bit set.
+     *
+     * Isolate the lsb;
      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
      *
      * The word will look like this, with a rightmost set bit in position 's':
@@ -695,6 +761,9 @@ Perl_lsbit_pos64(U64 word)
      *  complain about negating an unsigned.)
      */
     return single_1bit_pos64(word & (~word + 1));
+
+#  endif
+
 }
 
 #  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
@@ -710,9 +779,22 @@ Perl_lsbit_pos32(U32 word)
 
     ASSUME(word != 0);
 
+#if defined(PERL_CTZ_32)
+
+    return (unsigned) PERL_CTZ_32(word);
+
+#else
+
     return single_1bit_pos32(word & (~word + 1));
-}
  
+#endif
+
+}
+
+/* Convert the leading zeros count to the bit position of the first set bit.
+ * This just subtracts from the highest position, 31 or 63 */
+#define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) - (lzc))
+
 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 
 PERL_STATIC_INLINE unsigned
@@ -723,7 +805,20 @@ Perl_msbit_pos64(U64 word)
 
     ASSUME(word != 0);
 
-    /* Isolate the msb; http://codeforces.com/blog/entry/10330
+    /* If we can determine that the platform has a usable fast method to get
+     * this, use that */
+
+#  if defined(PERL_CLZ_64)
+
+    return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
+
+#  else
+
+    /* Here, we didn't find a fast method for finding the msb.  Fall back to
+     * making the msb the only set bit in the word, and use our function that
+     * works on words with a single bit set.
+     *
+     * Isolate the msb; http://codeforces.com/blog/entry/10330
      *
      * Only the most significant set bit matters.  Or'ing word with its right
      * shift of 1 makes that bit and the next one to its right both 1.
@@ -742,6 +837,9 @@ Perl_msbit_pos64(U64 word)
 
     /* Now we have a single bit set */
     return single_1bit_pos64(word);
+
+#  endif
+
 }
 
 #  define msbit_pos_uintmax_(word) msbit_pos64(word)
@@ -757,6 +855,12 @@ Perl_msbit_pos32(U32 word)
 
     ASSUME(word != 0);
 
+#if defined(PERL_CLZ_32)
+
+    return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
+
+#else
+
     word |= (word >>  1);
     word |= (word >>  2);
     word |= (word >>  4);
@@ -764,6 +868,9 @@ Perl_msbit_pos32(U32 word)
     word |= (word >> 16);
     word -= (word >> 1);
     return single_1bit_pos32(word);
+
+#endif
+
 }
 
 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */

From bdad1ec22420be781248c6bb8d236271b905c91e Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 20 Jul 2021 10:22:37 -0600
Subject: [PATCH 11/14] Use windows builtins for msb_pos, lsb_pos, if avail

Windows has different intrinsics than the previous commit added, with a
different API for counting leading/trailing zeros
---
 inline.h | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/inline.h b/inline.h
index d75e079b070a..d9d1360d6e27 100644
--- a/inline.h
+++ b/inline.h
@@ -717,6 +717,16 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
 #  endif
 #endif
 
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+#  include <intrin.h>
+#  pragma intrinsic(_BitScanForward)
+#  pragma intrinsic(_BitScanReverse)
+#  ifdef _WIN64
+#    pragma intrinsic(_BitScanForward64)
+#    pragma intrinsic(_BitScanReverse64)
+#  endif
+#endif
+
 /* Below are functions to find the first, last, or only set bit in a word.  On
  * platforms with 64-bit capability, there is a pair for each operation; the
  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
@@ -739,6 +749,14 @@ Perl_lsbit_pos64(U64 word)
 
     return (unsigned) PERL_CTZ_64(word);
 
+#  elif U64SIZE == 8 && defined(_MSC_VER) && _MSC_VER >= 1400
+
+    {
+        unsigned long index;
+        _BitScanForward64(&index, word);
+        return (unsigned)index;
+    }
+
 #  else
 
     /* Here, we didn't find a fast method for finding the lsb.  Fall back to
@@ -783,6 +801,14 @@ Perl_lsbit_pos32(U32 word)
 
     return (unsigned) PERL_CTZ_32(word);
 
+#elif U32SIZE == 4 && defined(_MSC_VER) && _MSC_VER >= 1400
+
+    {
+        unsigned long index;
+        _BitScanForward(&index, word);
+        return (unsigned)index;
+    }
+
 #else
 
     return single_1bit_pos32(word & (~word + 1));
@@ -812,6 +838,14 @@ Perl_msbit_pos64(U64 word)
 
     return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
 
+#  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) && _MSC_VER >= 1400
+
+    {
+        unsigned long index;
+        _BitScanReverse64(&index, word);
+        return (unsigned)index;
+    }
+
 #  else
 
     /* Here, we didn't find a fast method for finding the msb.  Fall back to
@@ -859,6 +893,14 @@ Perl_msbit_pos32(U32 word)
 
     return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
 
+#elif U32SIZE == 4 && defined(_MSC_VER) && _MSC_VER >= 1400
+
+    {
+        unsigned long index;
+        _BitScanReverse(&index, word);
+        return (unsigned)index;
+    }
+
 #else
 
     word |= (word >>  1);
@@ -917,7 +959,6 @@ Perl_single_1bit_pos32(U32 word)
 PERL_STATIC_INLINE unsigned int
 Perl_variant_byte_number(PERL_UINTMAX_T word)
 {
-
     /* This returns the position in a word (0..7) of the first variant byte in
      * it.  This is a helper function.  Note that there are no branches */
 

From eafc5ae1deca0267da226f51e81659bce37579e2 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 20 Jul 2021 10:41:39 -0600
Subject: [PATCH 12/14] Comment why ffs() isn't used for lsbit_pos()

---
 inline.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/inline.h b/inline.h
index d9d1360d6e27..b477e838568f 100644
--- a/inline.h
+++ b/inline.h
@@ -727,6 +727,15 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
 #  endif
 #endif
 
+/* The reason there are not checks to see if ffs() and ffsl() are available for
+ * determining the lsb, is because these don't improve on the deBruijn method
+ * fallback, which is just a branchless integer multiply, array element
+ * retrieval, and shift.  The others, even if the function call overhead is
+ * optimized out, have to cope with the possibility of the input being all
+ * zeroes, and almost certainly will have conditionals for this eventuality.
+ * khw, at the time of this commit, looked at the source for both gcc and clang
+ * to verify this.  (gcc used a method inferior to deBruijn.) */
+
 /* Below are functions to find the first, last, or only set bit in a word.  On
  * platforms with 64-bit capability, there is a pair for each operation; the
  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is

From 78251027929c41329c2d43458dc65929341e0b62 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 20 Jul 2021 12:14:35 -0600
Subject: [PATCH 13/14] Always use any fast available msb/lsb method

Some platforms have a fast way to get the msb but not the lsb; others,
more rarely, have the reverse.  But using a few shift and the like
instructions allows us to reduce either instance to terms of the other.

This commit causes any available fast method to be used by turning the
non-available case into the available one
---
 inline.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/inline.h b/inline.h
index b477e838568f..597a6a40e104 100644
--- a/inline.h
+++ b/inline.h
@@ -755,10 +755,12 @@ Perl_lsbit_pos64(U64 word)
      * this info, use that */
 
 #  if defined(PERL_CTZ_64)
+#    define PERL_HAS_FAST_GET_LSB_POS64
 
     return (unsigned) PERL_CTZ_64(word);
 
 #  elif U64SIZE == 8 && defined(_MSC_VER) && _MSC_VER >= 1400
+#    define PERL_HAS_FAST_GET_LSB_POS64
 
     {
         unsigned long index;
@@ -807,10 +809,12 @@ Perl_lsbit_pos32(U32 word)
     ASSUME(word != 0);
 
 #if defined(PERL_CTZ_32)
+#  define PERL_HAS_FAST_GET_LSB_POS32
 
     return (unsigned) PERL_CTZ_32(word);
 
 #elif U32SIZE == 4 && defined(_MSC_VER) && _MSC_VER >= 1400
+#  define PERL_HAS_FAST_GET_LSB_POS32
 
     {
         unsigned long index;
@@ -844,10 +848,12 @@ Perl_msbit_pos64(U64 word)
      * this, use that */
 
 #  if defined(PERL_CLZ_64)
+#    define PERL_HAS_FAST_GET_MSB_POS64
 
     return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
 
 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) && _MSC_VER >= 1400
+#    define PERL_HAS_FAST_GET_MSB_POS64
 
     {
         unsigned long index;
@@ -899,10 +905,12 @@ Perl_msbit_pos32(U32 word)
     ASSUME(word != 0);
 
 #if defined(PERL_CLZ_32)
+#  define PERL_HAS_FAST_GET_MSB_POS32
 
     return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
 
 #elif U32SIZE == 4 && defined(_MSC_VER) && _MSC_VER >= 1400
+#  define PERL_HAS_FAST_GET_MSB_POS32
 
     {
         unsigned long index;
@@ -938,11 +946,29 @@ Perl_single_1bit_pos64(U64 word)
     ASSUME(word && (word & (word-1)) == 0);
 #  endif
 
+    /* The only set bit is both the most and least significant bit.  If we have
+     * a fast way of finding either one, use that.
+     *
+     * It may appear at first glance that those functions call this one, but
+     * they don't if the corresponding #define is set */
+
+#  ifdef PERL_HAS_FAST_GET_MSB_POS64
+
+    return msbit_pos64(word);
+
+#  elif defined(PERL_HAS_FAST_GET_LSB_POS64)
+
+    return lsbit_pos64(word);
+
+#  else
+
     /* The position of the only set bit in a word can be quickly calculated
      * using deBruijn sequences.  See for example
      * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
     return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
                                                     >> PERL_deBruijnShift64_];
+#  endif
+
 }
 
 #endif
@@ -958,9 +984,32 @@ Perl_single_1bit_pos32(U32 word)
 #else
     ASSUME(word && (word & (word-1)) == 0);
 #endif
+#ifdef PERL_HAS_FAST_GET_MSB_POS32
+
+    return msbit_pos32(word);
+
+#elif defined(PERL_HAS_FAST_GET_LSB_POS32)
+
+    return lsbit_pos32(word);
+
+/* Unlikely, but possible for the platform to have a wider fast operation but
+ * not a narrower one.  But easy enough to handle the case by widening the
+ * parameter size.  (Going the other way, emulating 64 bit by two 32 bit ops
+ * would be slower than the deBruijn method.) */
+#elif defined(PERL_HAS_FAST_GET_MSB_POS64)
+
+    return msbit_pos64(word);
+
+#elif defined(PERL_HAS_FAST_GET_LSB_POS64)
+
+    return lsbit_pos64(word);
+
+#else
 
     return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
                                                     >> PERL_deBruijnShift32_];
+#endif
+
 }
 
 #ifndef EBCDIC

From 20b3432099486e8d0f09aa290881cb49980443d5 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 20 Jul 2021 13:43:39 -0600
Subject: [PATCH 14/14] msb_pos(): Bit twiddle a subtraction into an xor

Experiments by Tomasz Konojacki indicated that gcc, for one, doesn't
optimally optimize a subtraction from 2**n-1.  This commit tells the
compiler the optimization.
---
 inline.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/inline.h b/inline.h
index 597a6a40e104..5ed7692984e6 100644
--- a/inline.h
+++ b/inline.h
@@ -830,9 +830,16 @@ Perl_lsbit_pos32(U32 word)
 
 }
 
+
 /* Convert the leading zeros count to the bit position of the first set bit.
- * This just subtracts from the highest position, 31 or 63 */
-#define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) - (lzc))
+ * This just subtracts from the highest position, 31 or 63.  But some compilers
+ * don't optimize this optimally, and so a bit of bit twiddling encourages them
+ * to do the right thing.  It turns out that subracting a smaller non-negative
+ * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of the
+ * two numbers.  This is because '11111...1  ^ x'  is just the complement of x,
+ * call it 'y'.  Adding a number and its complement yields all ones.  So y is
+ * actually the same as subtracting x from the ones, our desired value */
+#define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) ^ (lzc))
 
 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */