valid_utf8_to_uvchr() unroll loop; avoid array lookup

This partially unrolls the loop in this function to achieve a slight
speedup for UTF-8 characters containing multiple bytes.  The loop is
retained for non-Unicode code points, as those should rarely be seen.

The commit also avoids the array lookup in the UTF8SKIP macro; the array
may not be in the cache.  Since we know the start byte is well-formed, we
can use a few shift/mask instructions along with the clz instruction to
calculate this value.  If the platform doesn't have a fast clz method, the
array lookup is cheaper and is retained.

Benchmarks are:

Key:
    Ir   Instruction read
    Dr   Data read
    Dw   Data write
    COND conditional branches
    IND  indirect branches
    _m   branch predict miss

The numbers represent raw counts per loop iteration.

code point 127; single byte UTF-8:

            blead     new  Ratio %
           ------  ------  -------
    Ir     1850.0  1866.0     99.1
    Dr      613.0   614.0     99.8
    Dw      428.0   429.0     99.8
    COND    194.0   194.0    100.0
    IND      22.0    23.0     95.7
    COND_m    3.0     2.0    150.0
    IND_m     9.0     9.0    100.0

code point 2047; 2 byte UTF-8:

            blead     new  Ratio %
           ------  ------  -------
    Ir     1865.0  1870.0     99.7
    Dr      614.0   615.0     99.8
    Dw      428.0   429.0     99.8
    COND    196.0   194.0    101.0
    IND      22.0    23.0     95.7
    COND_m    3.0     2.0    150.0
    IND_m     9.0     9.0    100.0

code point 65535; 3 byte UTF-8:

            blead     new  Ratio %
           ------  ------  -------
    Ir     1872.0  1876.0     99.8
    Dr      615.0   616.0     99.8
    Dw      428.0   429.0     99.8
    COND    197.0   194.0    101.5
    IND      22.0    23.0     95.7
    COND_m    3.0     2.0    150.0
    IND_m     9.0     9.0    100.0

code point 2097151; 4 byte UTF-8:

            blead     new  Ratio %
           ------  ------  -------
    Ir     1879.0  1880.0     99.9
    Dr      616.0   617.0     99.8
    Dw      428.0   429.0     99.8
    COND    198.0   194.0    102.1
    IND      22.0    23.0     95.7
    COND_m    3.0     2.0    150.0
    IND_m     9.0     9.0    100.0

code point 67108863; 5 byte UTF-8:

            blead     new  Ratio %
           ------  ------  -------
    Ir     1886.0  1897.0     99.4
    Dr      617.0   620.0     99.5
    Dw      428.0   429.0     99.8
    COND    199.0   195.0    102.1
    IND      22.0    23.0     95.7
    COND_m    3.0     2.0    150.0
    IND_m     9.0     9.0    100.0

code point 134217727; 6 byte UTF-8:

            blead     new  Ratio %
           ------  ------  -------
    Ir     1893.0  1904.0     99.4
    Dr      618.0   621.0     99.5
    Dw      428.0   429.0     99.8
    COND    200.0   196.0    102.0
    IND      22.0    23.0     95.7
    COND_m    3.0     2.0    150.0
    IND_m     9.0     9.0    100.0
Perl · Aug 27, 2021 · 2f2a723 · 2f2a723
1 parent 33c60d5
commit 2f2a723
Showing 1 changed file with 111 additions and 18 deletions.
diff --git a/inline.h b/inline.h
@@ -997,34 +997,127 @@ points, and non-Unicode code points are allowed.
 PERL_STATIC_INLINE UV
 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 {
-    const UV expectlen = UTF8SKIP(s);
-    const U8* send = s + expectlen;
-    UV uv = *s;
 
-    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
-
-    if (retlen) {
-        *retlen = expectlen;
-    }
+#ifndef EBCDIC
+#  ifdef PERL_HAS_FAST_GET_MSB_POS32
+#    define PERL_USE_MSB_FOR_VALID_UTF8_ msbit_pos32
+#  elif defined(PERL_HAS_FAST_GET_MSB_POS64)
+#    define PERL_USE_MSB_FOR_VALID_UTF8_ msbit_pos64
+#  endif
+#endif
+#ifdef PERL_USE_MSB_FOR_VALID_UTF8_
 
-    /* An invariant is trivially returned */
-    if (expectlen == 1) {
-        return uv;
-    }
+    /* Given that *s is known to be a legal start byte, the following returns
+     * its UTF8SKIP, avoiding an array lookup.  This only makes sense to do if
+     * we know that the platform does clz effectively with a single machine
+     * instruction; otherwise the lookup is cheaper.
+     *
+     * The code takes the byte, left shifts it by one, discarding the new upper
+     * bit, retaining the original one.  The position of its complement's first
+     * set bit, subtracted from the total bits, yields UTF8SKIP.  The shift
+     * accomplishes two things.  It removes the discontinuity that a single
+     * leading 1 is illegal: 0 leading 1's mean length 1, but 2 leading 1's
+     * mean length 2.  And it keeps the input to the clz instruction from
+     * ever being all 0's, which would yield undefined behavior.
+     *
+     * To illustrate, where 'x' is a don't care:
+     *  0xxxxxxx
+     *  0xxxxxx0    << 1, retaining original top bit
+     *  1xxxxxx1    complement
+     *  7 => 1      msb of complement => subtracted from 8
+     * -------------------
+     *  11110xxx
+     *  1110xxx0    << 1, retaining original top bit
+     *  0001xxx1    complement
+     *  4 => 4      msb => subtracted from 8
+     * -------------------
+     *  11111111
+     *  11111110    << 1, retaining original top bit
+     *  00000001    complement
+     *  0 => 8      msb => subtracted from 8
+     */
+     PERL_UINT_FAST8_T expectlen
+         = CHARBITS
+         - PERL_USE_MSB_FOR_VALID_UTF8_(
+                                    /* Left shift 1, discarding new top bit */
+                          (U8) ~(  ((*s << 1) & nBIT_MASK(CHARBITS - 1))
+                                    /* Add back the original top bit */
+                                  | (*s & (1 << (CHARBITS - 1)))));
+#else
+     PERL_UINT_FAST8_T expectlen = UTF8SKIP(s);
+#endif
 
     /* Remove the leading bits that indicate the number of bytes, leaving just
      * the bits that are part of the value */
-    uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
+    UV uv = NATIVE_UTF8_TO_I8(*s) & UTF_START_MASK(expectlen);
+
+    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
+    assert(! UTF8_IS_CONTINUATION(*s));
+
+    switch (expectlen) {
+        PERL_UINT_FAST8_T countdown;
+
+      case 8:   /* Start byte FF is special */
+        expectlen = UTF8_MAXBYTES;
+        /*FALLTHROUGH*/
+
+      default: /* Don't unroll the loop for rarely encountered high code points
+                */
+        countdown = expectlen;
+        while (countdown-- > 4) {
+            s++;
+            uv = UTF8_ACCUMULATE(uv, *s);
+        }
+        /*FALLTHROUGH*/
 
-    /* Now, loop through the remaining bytes, accumulating each into the
-     * working total as we go.  (I khw tried unrolling the loop for up to 4
-     * bytes, but there was no performance improvement) */
-    for (++s; s < send; s++) {
+      case 4:
+        s++;
         uv = UTF8_ACCUMULATE(uv, *s);
+        /*FALLTHROUGH*/
+
+      case 3:
+        s++;
+        uv = UTF8_ACCUMULATE(uv, *s);
+        /*FALLTHROUGH*/
+
+      case 2:
+        s++;
+        uv = UTF8_ACCUMULATE(uv, *s);
+
+#ifndef EBCDIC
+
+        /*FALLTHROUGH*/
+
+        /* On ASCII platforms, UTF_START_MASK() works on all UTF-8 invariants,
+         * so the call to it before this switch() already set up the length 1
+         * case, so no further action should be done. */
+      case 1:
+        break;
+
+#else
+        break;  /* EBCDIC: For case 2: */
+
+        /* There are length 1 characters on EBCDIC platforms for which
+         * UTF_START_MASK() doesn't work properly.  Instead we have to handle
+         * that case specially.
+         *
+         * An invariant is trivially returned; throw away our earlier
+         * calculation */
+      case 1:
+        if (retlen) {
+            *retlen = 1;
+        }
+        return *(s - 1);
+
+#endif
+
     }
 
-    return UNI_TO_NATIVE(uv);
+    if (retlen) {
+        *retlen = expectlen;
+    }
 
+    return UNI_TO_NATIVE(uv);
 }
 
 #ifndef EBCDIC