Add core function valid_utf8_length()

This function assumes that the input is valid UTF-8. It uses a different algorithm which does counting word-at-a-time, very much like variant_under_utf8_count(), leading to significant performance improvements, with longer strings getting more relative improvement. On a 32-bit system, the number of failed branch predictions declines to half as many, with everything else staying about equal. 32-bit UV's; string length 24 characters; 2 bytes per character bytecount wordcount --------- --------- Ir 100.00 100.72 Dr 100.00 100.82 Dw 100.00 101.10 COND 100.00 100.00 IND 100.00 100.00 COND_m 100.00 200.00 IND_m 100.00 100.00 Ir_m1 100.00 100.00 Dr_m1 100.00 100.00 Dw_m1 100.00 100.00 Ir_mm 100.00 100.00 Dr_mm 100.00 100.00 Dw_mm 100.00 100.00 The results are similar for longer strings, and for code points represented by different numbers of bytes. The results on a 64-bit platform also have the branch prediction improve by 200%, but at some short string lengths, the number of branches worsens slightly: 64-bit UV's; string length 4 characters; 3 bytes per character byteutf8_length wordutf8_length --------------- --------------- Ir 100.00 96.08 Dr 100.00 100.88 Dw 100.00 100.00 COND 100.00 97.24 IND 100.00 100.00 COND_m 100.00 200.00 IND_m 100.00 100.00 Ir_m1 100.00 100.00 Dr_m1 100.00 100.00 Dw_m1 100.00 100.00 Ir_mm 100.00 100.00 Dr_mm 100.00 100.00 Dw_mm 100.00 100.00 For longer strings things improve: 64-bit UV's; string length 24 characters; 2 bytes per character byteutf8_length wordutf8_length --------------- --------------- Ir 100.00 103.97 Dr 100.00 112.35 Dw 100.00 100.00 COND 100.00 110.27 IND 100.00 100.00 COND_m 100.00 300.00 IND_m 100.00 100.00 Ir_m1 100.00 100.00 Dr_m1 100.00 100.00 Dw_m1 100.00 100.00 Ir_mm 100.00 100.00 Dr_mm 100.00 100.00 Dw_mm 100.00 100.00 64-bit UV's; string length 24 characters; 3 bytes per character byteutf8_length wordutf8_length --------------- --------------- Ir 100.00 99.73 Dr 100.00 111.37 Dw 100.00 100.00 COND 100.00 108.05 IND 100.00 100.00 COND_m 100.00 150.00 IND_m 100.00 100.00 Ir_m1 100.00 100.00 Dr_m1 100.00 100.00 Dw_m1 100.00 100.00 Ir_mm 100.00 100.00 Dr_mm 100.00 100.00 Dw_mm 100.00 100.00 At very long strings 64-bit UV's; string length 10000000 characters; 2 bytes per character byteutf8_length wordutf8_length --------------- --------------- Ir 100.00 160.00 Dr 100.00 799.91 Dw 100.00 100.00 COND 100.00 399.98 IND 100.00 100.00 COND_m 100.00 150.00 IND_m 100.00 100.00 Ir_m1 100.00 100.00 Dr_m1 100.00 100.00 Dw_m1 100.00 100.00 Ir_mm 100.00 100.00 Dr_mm 100.00 100.00 Dw_mm 100.00 100.00 Performance actually worsens on strings with code points that occupy 7 or 13 bytes per code point. These are not in common use, as the maximum that Unicode recognizes occupies 4 bytes.
Perl · Dec 8, 2017 · ee18a6a · ee18a6a
1 parent ff51532
commit ee18a6a
Show file tree

Hide file tree

Showing 5 changed files with 138 additions and 18 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -1806,6 +1806,7 @@ ApM	|void	|_force_out_malformed_utf8_message			    \
 EXp	|U8*	|utf16_to_utf8	|NN U8* p|NN U8 *d|I32 bytelen|NN I32 *newlen
 EXp	|U8*	|utf16_to_utf8_reversed|NN U8* p|NN U8 *d|I32 bytelen|NN I32 *newlen
 AdpR	|STRLEN	|utf8_length	|NN const U8* s|NN const U8 *e
+AdinR	|STRLEN	|valid_utf8_length|NN const U8* s|NN const U8 *e
 AipdR	|IV	|utf8_distance	|NN const U8 *a|NN const U8 *b
 AipdRn	|U8*	|utf8_hop	|NN const U8 *s|SSize_t off
 AipdRn	|U8*	|utf8_hop_back|NN const U8 *s|SSize_t off|NN const U8 *start

diff --git a/embed.h b/embed.h
@@ -741,6 +741,7 @@
 #define uvoffuni_to_utf8_flags(a,b,c)	Perl_uvoffuni_to_utf8_flags(aTHX_ a,b,c)
 #define uvuni_to_utf8(a,b)	Perl_uvuni_to_utf8(aTHX_ a,b)
 #define uvuni_to_utf8_flags(a,b,c)	Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
+#define valid_utf8_length	S_valid_utf8_length
 #define valid_utf8_to_uvchr	Perl_valid_utf8_to_uvchr
 #define valid_utf8_to_uvuni(a,b)	Perl_valid_utf8_to_uvuni(aTHX_ a,b)
 #define vcmp(a,b)		Perl_vcmp(aTHX_ a,b)

diff --git a/inline.h b/inline.h
@@ -575,6 +575,109 @@ S_variant_under_utf8_count(const U8* const s, const U8* const e)
 
 #endif
 
+/*
+=for apidoc valid_utf8_length
+
+Returns the number of characters in the sequence of UTF-8-encoded bytes starting
+at C<s> and ending at the byte just before C<e>.  this function may assume the
+sequence is valid Perl-extended UTF-8, without checking.
+
+If <s> and <e> point to the same place, it returns 0 with no warning raised.
+
+=cut
+
+On ASCII machines, this does word-at-a-time lookups, counting the number of
+continuation bytes, which subtracted from the total number of bytes gives the
+number of starter bytes, hence the number of characters.
+
+*/
+
+STRLEN
+S_valid_utf8_length(const U8 *s, const U8 *e)
+{
+    STRLEN continuations = 0;
+    const U8 * x = s;
+
+    PERL_ARGS_ASSERT_VALID_UTF8_LENGTH;
+
+#ifdef EBCDIC
+
+    /* This could count continuations instead, and use the final loop of the
+     * ASCII case below.  But it's faster to use UTF8SKIP() since it has to be
+     * #ifdef'd anyway */
+
+    while (x < e) {
+        x += UTF8SKIP(s);
+
+        /* misnamed in EBCDIC, should be char_count, but khw doesn't feel like
+         * having more #ifdef's */
+	continuations++;
+    }
+
+    return continuations;
+
+#else
+
+    /* Process per-byte until reach word boundary.  XXX This loop could be
+     * eliminated if we knew that this platform had fast unaligned reads */
+    while (x < e && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
+	continuations += UTF8_IS_CONTINUATION(*x++);
+    }
+
+    /* Process per-word as long as we have at least a full word left */
+    while (x + PERL_WORDSIZE <= e) {
+
+        /* The idea for counting continuation bytes came from
+         * http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
+         * One thing it does that this doesn't is to prefetch the buffer
+         *      __builtin_prefetch(&s[256], 0, 0);
+         *
+         * A continuation byte has the upper 2 bits be '10', and the rest
+         * dont-cares.  The VARIANTS mask zeroes out all but the upper bit of
+         * each byte in the word.  That gets shifted to the byte's lowest bit,
+         * and 'anded' with the complement of the 2nd highest bit of the byte,
+         * which has also been shifted to that position.  The result will be
+         * that position will be 1 iff the upper bit is 1 and the next one is
+         * 0.  We then use the same integer multiplcation and shifting that are
+         * used in variant_under_utf8_count() to count how many of those are
+         * set in the word. */
+
+        continuations += (((((* (uintmax_t *) x) & PERL_VARIANTS_WORD_MASK)
+                                                                        >> 7)
+                      & (((~ (* (uintmax_t *) x))) >> 6))
+                  * PERL_COUNT_MULTIPLIER)
+                >> ((PERL_WORDSIZE - 1) * CHARBITS);
+        x += PERL_WORDSIZE;
+    }
+
+    /* Process per-byte */
+    while (x < e) {
+	if (UTF8_IS_CONTINUATION(*x)) {
+            continuations++;
+            x++;
+            continue;
+        }
+
+        /* Here is a starter byte.  We may be able to save some iterations by
+         * using UTF8SKIP from now on */
+        do {
+            const Size_t skip = UTF8SKIP(x);
+
+            continuations += skip - 1;
+            x += skip;
+        } while (x < e);
+
+        goto done;  /* This avoids an extra test in the enclosing loop
+                       condition */
+    }
+
+  done:
+    return x - s - continuations;
+
+#  endif
+
+}
+
 #undef PERL_WORDSIZE
 #undef PERL_COUNT_MULTIPLIER
 #undef PERL_WORD_BOUNDARY_MASK

diff --git a/proto.h b/proto.h
@@ -3655,6 +3655,13 @@ PERL_CALLCONV U8*	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv);
 PERL_CALLCONV U8*	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
 #define PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS	\
 	assert(d)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE STRLEN	S_valid_utf8_length(const U8* s, const U8 *e)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_VALID_UTF8_LENGTH	\
+	assert(s); assert(e)
+#endif
+
 #ifndef PERL_NO_INLINE_FUNCTIONS
 PERL_STATIC_INLINE UV	Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 			__attribute__warn_unused_result__;

diff --git a/utf8.c b/utf8.c
@@ -2012,37 +2012,45 @@ If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
 and returns the number of valid characters.
 
 =cut
+
 */
 
 STRLEN
 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 {
-    STRLEN len = 0;
+    STRLEN count;
+    const U8 * x = e - 1;
 
     PERL_ARGS_ASSERT_UTF8_LENGTH;
 
-    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
-     * the bitops (especially ~) can create illegal UTF-8.
-     * In other words: in Perl UTF-8 is not just for Unicode. */
+    /* Assume the input is valid, and get the character count.  That function
+     * will not read off the end of the buffer with invalid input */
+    count =  valid_utf8_length(s, e);
 
-    if (e < s)
-	goto warn_and_return;
-    while (s < e) {
-        s += UTF8SKIP(s);
-	len++;
+    if (count == 0) {
+        return 0;
     }
 
-    if (e != s) {
-	len--;
-        warn_and_return:
-	if (PL_op)
-	    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
-			     "%s in %s", unees, OP_DESC(PL_op));
-	else
-	    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
+    /* The only checking we do (which is to preserve backward compatibility) is
+     * to make sure that the final character isn't partial.  We do this by
+     * backtracking until we get a non-continuation.  'x' already points to the
+     * byte before 'e' */
+
+    while (x >= s && UTF8_IS_CONTINUATION(*x)) {
+        x--;
+    }
+
+    if (x >= s && x + UTF8SKIP(x) == e) {
+        return count;
     }
 
-    return len;
+    if (PL_op)
+        Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                            "%s in %s", unees, OP_DESC(PL_op));
+    else
+        Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
+
+    return (x < s) ? 0 : count - 1;
 }
 
 /*