Add variant_under_utf8_count() core function

This function takes a string that isn't encoded in UTF-8 (hence is assumed to be in Latin1), and counts how many of the bytes therein would change if it were to be translated into UTF-8. Each such byte will occupy two UTF-8 bytes. This function is useful for calculating the expansion factor precisely when converting to UTF-8, so as to know how much to malloc. This function uses a non-obvious method to do the calculations word-at-a-time, as opposed to the byte-at-a-time method used now, and hence should be much faster than the current methods. The function is slightly more costly for strings that have fewer bytes per word, with approximately 1.5% more conditionals. But once the string is at least one word long, there is a savings which increases proportionately to the length of the string. On a 64-bit machine, the number of conditional approaches 1/8 of the per-byte algorithm (in other words, a 800% improvement). Here are results from Porting/bench.pl for a 10,000 byte string: per-byte per-word ------ ------ Ir 100.00 434.45 Dr 100.00 785.11 Dw 100.00 102.22 COND 100.00 793.81 IND 100.00 100.00 COND_m 100.00 100.00 IND_m 100.00 100.00 Ir_m1 100.00 100.00 Dr_m1 100.00 99.81 Dw_m1 100.00 133.33 Ir_mm 100.00 100.00 Dr_mm 100.00 100.00 Dw_mm 100.00 100.00 whereas the savings are less for a 24-byte string: per-byte per-word ------ ------ Ir 100.00 112.50 Dr 100.00 108.36 Dw 100.00 102.22 COND 100.00 115.79 IND 100.00 100.00 COND_m 100.00 - IND_m 100.00 100.00 Ir_m1 100.00 100.00 Dr_m1 100.00 100.00 Dw_m1 100.00 100.00 but rising with the length of the string. Here are 96-byte results per-byte per-word ------ ------ Ir 100.00 147.54 Dr 100.00 130.28 Dw 100.00 102.22 COND 100.00 165.85 IND 100.00 100.00 COND_m 100.00 100.00 IND_m 100.00 100.00 Ir_m1 100.00 100.00 Dr_m1 100.00 100.00 Dw_m1 100.00 100.00 Ir_mm 100.00 100.00 Dr_mm 100.00 100.00 Dw_mm 100.00 100.00 The timings are slightly worse for strings that aren't multiples of the word length, but not appreciably so. I found this trick on the internet many years ago, but I can't seem to find it again to give them credit.
Perl · Dec 8, 2017 · 47c620c · 47c620c
1 parent 7a4b369
commit 47c620c
Show file tree

Hide file tree

Showing 6 changed files with 222 additions and 4 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -782,6 +782,10 @@ AndmoR	|bool	|is_utf8_invariant_string|NN const U8* const s		    \
 AnidR	|bool	|is_utf8_invariant_string_loc|NN const U8* const s	    \
 		|STRLEN len						    \
 		|NULLOK const U8 ** ep
+#if defined(PERL_CORE) || defined(PERL_EXT)
+EinR	|Size_t	|variant_under_utf8_count|NN const U8* const s		    \
+		|NN const U8* const e
+#endif
 AmnpdRP	|bool	|is_ascii_string|NN const U8* const s|const STRLEN len
 AmnpdRP	|bool	|is_invariant_string|NN const U8* const s|STRLEN len
 #if defined(PERL_CORE) || defined (PERL_EXT)

diff --git a/embed.h b/embed.h
@@ -994,6 +994,9 @@
 #define is_utf8_non_invariant_string	S_is_utf8_non_invariant_string
 #define sv_or_pv_pos_u2b(a,b,c,d)	S_sv_or_pv_pos_u2b(aTHX_ a,b,c,d)
 #  endif
+#  if defined(PERL_CORE) || defined(PERL_EXT)
+#define variant_under_utf8_count	S_variant_under_utf8_count
+#  endif
 #  if defined(PERL_IN_REGCOMP_C)
 #define _make_exactf_invlist(a,b)	S__make_exactf_invlist(aTHX_ a,b)
 #define add_above_Latin1_folds(a,b,c)	S_add_above_Latin1_folds(aTHX_ a,b,c)

diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
@@ -6035,6 +6035,18 @@ test_is_utf8_invariant_string_loc(unsigned char *s, STRLEN offset, STRLEN len)
     OUTPUT:
         RETVAL
 
+STRLEN
+test_variant_under_utf8_count(unsigned char *s, STRLEN offset, STRLEN len)
+    PREINIT:
+        PERL_UINTMAX_T * copy;
+    CODE:
+        Newx(copy, (len + WORDSIZE - 1) / WORDSIZE, PERL_UINTMAX_T);
+        Copy(s, (U8 *) copy + offset, len, U8);
+        RETVAL = variant_under_utf8_count((U8 *) copy + offset, (U8 *) copy + offset + len);
+        Safefree(copy);
+    OUTPUT:
+        RETVAL
+
 STRLEN
 test_utf8_length(unsigned char *s, STRLEN offset, STRLEN len)
 CODE:

diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
@@ -67,6 +67,63 @@ for my $pos (0.. length($all_invariants) - 1) {
     }
 }
 
+# Now work on variant_under_utf8_count().
+pass("The tests below are for variant_under_utf8_count() with string"
+   . " starting $offset bytes after a word boundary");
+is(test_variant_under_utf8_count($all_invariants, $offset,
+                                length $all_invariants),
+                                0,
+                                "$display_all_invariants contains 0 variants");
+
+# First, put a variant in each possible position in the flanking partial words
+for my $pos (0 .. $word_length - $offset,
+             2 * $word_length .. length($all_invariants) - 1)
+{
+    my $test_string = $all_invariants;
+    my $test_display = $display_all_invariants;
+
+    substr($test_string, $pos, 1) = $variant;
+    substr($test_display, $pos * 2, 2) = $display_variant;
+    is(test_variant_under_utf8_count($test_string, $offset, length $test_string),
+                                     1,
+                                     "$test_display contains 1 variant");
+}
+
+# Then try all possible combinations of variant/invariant in the full word in
+# the middle  (We've already tested the case with 0 variants, so start at 1.)
+for my $bit_pattern (1 .. (1 << $word_length) - 1) {
+    my $bits = $bit_pattern;
+    my $display_word = "";
+    my $test_word = "";
+    my $count = 0;
+
+    # Every 1 bit gets the variant for this particular $bit_pattern.
+    for my $bit (0 .. 7) {
+        if ($bits & 1) {
+            $count++;
+            $test_word .= $variant;
+            $display_word .= $display_variant;
+        }
+        else {
+            $test_word .= $invariant;
+            $display_word .= $display_invariant;
+        }
+        $bits >>= 1;
+    }
+
+    my $test_string = $variant x ($word_length - 1)
+                    . $test_word
+                    . $variant x ($word_length - 1);
+    my $display_string = $display_variant x ($word_length - 1)
+                        . $display_word
+                        . $display_variant x ($word_length - 1);
+    my $expected_count = $count + 2 * $word_length - 2;
+    is(test_variant_under_utf8_count($test_string, $offset,
+                        length $test_string), $expected_count,
+                        "$display_string contains $expected_count variants");
+}
+
+
 my $pound_sign = chr utf8::unicode_to_native(163);
 
 # This  test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl

diff --git a/inline.h b/inline.h
@@ -425,10 +425,6 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
         x += PERL_WORDSIZE;
     }
 
-#  undef PERL_WORDCAST
-#  undef PERL_WORDSIZE
-#  undef PERL_WORD_BOUNDARY_MASK
-#  undef PERL_VARIANTS_WORD_MASK
 #endif
 
     /* Process per-byte */
@@ -447,6 +443,143 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
     return TRUE;
 }
 
+#if defined(PERL_CORE) || defined(PERL_EXT)
+
+/*
+=for apidoc variant_under_utf8_count
+
+This function looks at the sequence of bytes between C<s> and C<e>, which are
+assumed to be encoded in ASCII/Latin1, and returns how many of them would
+change should the string be translated into UTF-8.  Due to the nature of UTF-8,
+each of these would occupy two bytes instead of the single one in the input
+string.  Thus, this function returns the precise number of bytes the string
+would expand by when translated to UTF-8.
+
+Unlike most of the other functions that have C<utf8> in their name, the input
+to this function is NOT a UTF-8-encoded string.  The function name is slightly
+I<odd> to emphasize this.
+
+This function is internal to Perl because khw thinks that any XS code that
+would want this is probably operating too close to the internals.  Presenting a
+valid use case could change that.
+
+See also
+C<L<perlapi/is_utf8_invariant_string>>
+and
+C<L<perlapi/is_utf8_invariant_string_loc>>,
+
+=cut
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_variant_under_utf8_count(const U8* const s, const U8* const e)
+{
+    const U8* x = s;
+    Size_t count = 0;
+
+    PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
+
+#  ifndef EBCDIC
+
+    /* Process per-byte until reach word boundary.  XXX This loop could be
+     * eliminated if we knew that this platform had fast unaligned reads */
+    while (x < e && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
+        if (! UTF8_IS_INVARIANT(*x)) {
+            count++;
+        }
+        x++;
+    }
+
+    /* Process per-word as long as we have at least a full word left */
+    while (x + PERL_WORDSIZE <= e) {
+
+        /* It's easier to look at a 16-bit word size to see how this works.
+         * The expression would be:
+         *
+         *  (((*x & 0x8080) >> 7) * 0x0101) >> 8;
+         *
+         * Suppose the value of *x is the 16 bits
+         *
+         *      0by_______z_______
+         *
+         * where the 14 bits represented by '_' could be any combination of 0's
+         * or 1's (we don't care), and 'y' is the high bit of one byte, and 'z'
+         * is the high bit for the other (endianness doesn't matter).  On ASCII
+         * platforms a byte is variant if the high bit is set; invariant
+         * otherwise.  Thus, our goal, the count of variants in this 2-byte
+         * word is
+         *
+         *      y + z
+         *
+         * To turn 0by_______z_______ into (y + z) we mask the intial value
+         * with 0x8080 to turn it into
+         *
+         *      0by0000000z0000000
+         *
+         * Then right shifting by 7 yields
+         *
+         *      0by0000000z
+         *
+         * Viewed as a number, this is
+         *
+         *      2**8 * y + z
+         *
+         * We then multiply by 0x0101 (which is = 2**8 + 1), so
+         *
+         *       (2**8 * y + z) * (2**8 + 1)
+         *     = (2**8 * y * 2**8) + (z * 2**8) + (2**8 * y * 1) + (z * 1)
+         *     = (2**16 * y) + (2**8 * (y + z)) + z
+         *
+         * However (2**16 * y) doesn't fit in a 16-bit word (unless 'y' is zero
+         * in which case it is 0), and since this is unsigned multiplication,
+         * the C standard says that this component just gets ignored, so we are
+         * left with
+         *
+         *     =  2**8 * (y + z) + z
+         *
+         * We then shift right by 8 bits, which divides by 2**8, and gets rid
+         * of the lone 'z', leaving us with
+         *
+         *     =  y + z
+         *
+         * The same principles apply for longer word sizes.  For 32 bit words
+         * we end up with
+         *
+         *     =  2**24 * (w + x + y + z) + (lots of other expressions below
+         *                                   2**24)
+         *
+         * with anything above 2**24 having overflowed and been chopped off.
+         * Shifting right by 24 yields (w + x + y + z)
+         */
+
+        count += ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
+                  * PERL_COUNT_MULTIPLIER)
+                >> ((PERL_WORDSIZE - 1) * CHARBITS);
+        x += PERL_WORDSIZE;
+    }
+
+#  endif
+
+    /* Process per-byte */
+    while (x < e) {
+	if (! UTF8_IS_INVARIANT(*x)) {
+            count++;
+        }
+
+        x++;
+    }
+
+    return count;
+}
+
+#endif
+
+#undef PERL_WORDSIZE
+#undef PERL_COUNT_MULTIPLIER
+#undef PERL_WORD_BOUNDARY_MASK
+#undef PERL_VARIANTS_WORD_MASK
+
 /*
 =for apidoc is_utf8_string
 

diff --git a/proto.h b/proto.h
@@ -4313,6 +4313,15 @@ PERL_STATIC_INLINE STRLEN	S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLE
 #define PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B	\
 	assert(sv); assert(pv)
 #endif
+#endif
+#if defined(PERL_CORE) || defined(PERL_EXT)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE Size_t	S_variant_under_utf8_count(const U8* const s, const U8* const e)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT	\
+	assert(s); assert(e)
+#endif
+
 #endif
 #if defined(PERL_CR_FILTER)
 #  if defined(PERL_IN_TOKE_C)