Reimplement OFFUNISKIP

Now that previous commits have made it fast to find the position of the first set bit in a word, we can use a formula to find how many bytes the UTF-8 encoding of that word will occupy. This allows this macro to be simplified, removing several conditionals.
Perl · Jun 14, 2021 · 7d67d9e · 7d67d9e
1 parent 02bd4e7
commit 7d67d9e
Showing 1 changed file with 28 additions and 46 deletions.
diff --git a/utf8.h b/utf8.h
@@ -313,55 +313,37 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
  */
 #define UVCHR_IS_INVARIANT(cp)  (OFFUNI_IS_INVARIANT(NATIVE_TO_UNI(cp)))
 
-/* Internal macro to be used only in this file to aid in constructing other
- * publicly accessible macros.
- * The number of bytes required to express this uv in UTF-8, for just those
- * uv's requiring 2 through 6 bytes, as these are common to all platforms and
- * word sizes.  The number of bytes needed is given by the number of leading 1
- * bits in the start byte.  There are 32 start bytes that have 2 initial 1 bits
- * (C0-DF); there are 16 that have 3 initial 1 bits (E0-EF); 8 that have 4
- * initial 1 bits (F0-F8); 4 that have 5 initial 1 bits (F9-FB), and 2 that
- * have 6 initial 1 bits (FC-FD).  The largest number a string of n bytes can
- * represent is       (the number of possible start bytes for 'n')
- *                  * (the number of possiblities for each start byte
- * The latter in turn is
- *                  2  ** (  (how many continuation bytes there are)
- *                         * (the number of bits of information each
- *                            continuation byte holds))
+/* The arithmetic below breaks down for small code points, and even if it
+ * didn't, the algorithms in my_msbit_pos() generally require the input to be
+ * non-zero, so would need to special case NUL.  By using all the invariants,
+ * no extra conditionals are used, and we get past the point where the
+ * algorithms break, and on EBCDIC boxes, the fact that the invariants/two-byte
+ * code points are mixed falls out automatically.
  *
- * If we were on a platform where we could use a fast find first set bit
- * instruction (or count leading zeros instruction) this could be replaced by
- * using that to find the log2 of the uv, and divide that by the number of bits
- * of information in each continuation byte, adjusting for large cases and how
- * much information is in a start byte for that length */
-#define __COMMON_UNI_SKIP(uv)                                               \
-          (UV) (uv) < (32 * (1U << (    UTF_ACCUMULATION_SHIFT))) ? 2 :     \
-          (UV) (uv) < (16 * (1U << (2 * UTF_ACCUMULATION_SHIFT))) ? 3 :     \
-          (UV) (uv) < ( 8 * (1U << (3 * UTF_ACCUMULATION_SHIFT))) ? 4 :     \
-          (UV) (uv) < ( 4 * (1U << (4 * UTF_ACCUMULATION_SHIFT))) ? 5 :     \
-          (UV) (uv) < ( 2 * (1U << (5 * UTF_ACCUMULATION_SHIFT))) ? 6 :
-
-/* Internal macro to be used only in this file.
- * This adds to __COMMON_UNI_SKIP the details at this platform's upper range.
- * For any-sized EBCDIC platforms, or 64-bit ASCII ones, we need one more test
- * to see if just 7 bytes is needed, or if the maximum is needed.  For 32-bit
- * ASCII platforms, everything is representable by 7 bytes */
-#if defined(UV_IS_QUAD) || defined(EBCDIC)
-#   define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv)                       \
-     LIKELY((UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)))          \
-     ? 7                                                                    \
-     : UTF8_MAXBYTES)
+ * We need a new continuation byte each time the number of bits in 'uv' grows
+ * by (UTF_CONTINUATION_BYTE_INFO_BITS - 1).  The -1 is because each new byte
+ * removes one bit of information from the start byte.
+ */
+#define OFFUNISKIP(uv)            /* NB: may evaluate 'uv' repeatedly */    \
+    ((OFFUNI_IS_INVARIANT(uv))                                              \
+     ? 1                          /* invariants occupy a single byte */     \
+     : ((OFFUNISKIP_helper_(uv))  /* too big for 6 continuation bytes? */   \
+        ? UTF8_MAXBYTES           /* otherwise ceiling division: */         \
+        : (my_msbit_pos(uv) + (UTF_CONTINUATION_BYTE_INFO_BITS - 1) - 1)    \
+                            / (UTF_CONTINUATION_BYTE_INFO_BITS - 1)))
+
+/* We need to go to MAXBYTES when we can't represent 'uv' by the number of
+ * information bits in 6 continuation bytes (when we get to 6, the start byte
+ * has no information bits to add to the total).  But on 32-bit ASCII
+ * platforms, that doesn't happen until 6*6 bits, which is more than the word
+ * size, so on those platforms, this will always be false */
+#if UVSIZE * CHARBITS > (6 * UTF_CONTINUATION_BYTE_INFO_BITS)
+#  define OFFUNISKIP_helper_(uv)                                            \
+            UNLIKELY((uv) > nBIT_UMAX(6 * UTF_CONTINUATION_BYTE_INFO_BITS))
 #else
-#   define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) 7)
+#  define OFFUNISKIP_helper_(uv)  0
 #endif
 
-/* The next two macros use the base macro defined above, and add in the tests
- * at the low-end of the range, for just 1 byte, yielding complete macros,
- * publicly accessible. */
-
-/* Input is a true Unicode (not-native) code point */
-#define OFFUNISKIP(uv) (OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
-
 /*
 
 =for apidoc Am|STRLEN|UVCHR_SKIP|UV cp
@@ -371,7 +353,7 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
 
 =cut
  */
-#define UVCHR_SKIP(uv) ( UVCHR_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
+#define UVCHR_SKIP(uv)  OFFUNISKIP(NATIVE_TO_UNI(uv))  /* 'uv' is native */
 
 #define UTF_MIN_START_BYTE                                                  \
      ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))