Skip to content

Commit

Permalink
valid_utf8_to_uvchr() unroll loop; avoid array lookup
Browse files Browse the repository at this point in the history
This partially unrolls the loop in this function to achieve a slight
speedup for UTF-8 characters containing multiple bytes.  The loop is
retained for non-Unicode code points, as those should rarely be seen.

The commit also avoids the array lookup in the UTF8SKIP macro; the array
may not be in the cache.  Since we know the start byte is well-formed,
we can use a few shift/mask instructions along with the clz instruction
to calculate this value.  If the platform doesn't have a fast clz
method, the array lookup is cheaper and is retained.

Benchmarks are:

Key:
    Ir   Instruction read
    Dr   Data read
    Dw   Data write
    COND conditional branches
    IND  indirect branches
    _m   branch predict miss
The numbers represent raw counts per loop iteration.  "Ratio %" is
blead / new * 100, so a value above 100 means the new code did less.

code point 127; single byte UTF-8:

        blead    new Ratio %
       ------ ------ -------
    Ir 1850.0 1866.0    99.1
    Dr  613.0  614.0    99.8
    Dw  428.0  429.0    99.8
  COND  194.0  194.0   100.0
   IND   22.0   23.0    95.7

COND_m    3.0    2.0   150.0
 IND_m    9.0    9.0   100.0

code point 2047; 2 byte UTF-8:

        blead    new Ratio %
       ------ ------ -------
    Ir 1865.0 1870.0    99.7
    Dr  614.0  615.0    99.8
    Dw  428.0  429.0    99.8
  COND  196.0  194.0   101.0
   IND   22.0   23.0    95.7

COND_m    3.0    2.0   150.0
 IND_m    9.0    9.0   100.0

code point 65535; 3 byte UTF-8:

        blead    new Ratio %
       ------ ------ -------
    Ir 1872.0 1876.0    99.8
    Dr  615.0  616.0    99.8
    Dw  428.0  429.0    99.8
  COND  197.0  194.0   101.5
   IND   22.0   23.0    95.7

COND_m    3.0    2.0   150.0
 IND_m    9.0    9.0   100.0

code point 2097151; 4 byte UTF-8:

        blead    new Ratio %
       ------ ------ -------
    Ir 1879.0 1880.0    99.9
    Dr  616.0  617.0    99.8
    Dw  428.0  429.0    99.8
  COND  198.0  194.0   102.1
   IND   22.0   23.0    95.7

COND_m    3.0    2.0   150.0
 IND_m    9.0    9.0   100.0

code point 67108863; 5 byte UTF-8:

        blead    new Ratio %
       ------ ------ -------
    Ir 1886.0 1897.0    99.4
    Dr  617.0  620.0    99.5
    Dw  428.0  429.0    99.8
  COND  199.0  195.0   102.1
   IND   22.0   23.0    95.7

COND_m    3.0    2.0   150.0
 IND_m    9.0    9.0   100.0

code point 134217727; 6 byte UTF-8:

        blead    new Ratio %
       ------ ------ -------
    Ir 1893.0 1904.0    99.4
    Dr  618.0  621.0    99.5
    Dw  428.0  429.0    99.8
  COND  200.0  196.0   102.0
   IND   22.0   23.0    95.7

COND_m    3.0    2.0   150.0
 IND_m    9.0    9.0   100.0
  • Loading branch information
khwilliamson committed Aug 27, 2021
1 parent 33c60d5 commit 2f2a723
Showing 1 changed file with 111 additions and 18 deletions.
129 changes: 111 additions & 18 deletions inline.h
Expand Up @@ -997,34 +997,127 @@ points, and non-Unicode code points are allowed.
/* Decode the UTF-8 encoded character whose start byte is at 's' and return
 * the accumulated value (passed through UNI_TO_NATIVE).  When 'retlen' is
 * non-NULL, the number of bytes the character occupies is stored through it.
 * The input is taken to be well-formed: the start byte is only asserted not
 * to be a continuation byte, and the continuation bytes are not checked.
 *
 * NOTE(review): this listing comes from a rendered diff and appears to
 * contain both the removed and the added lines without +/- markers (e.g.
 * 'expectlen' and 'uv' are each declared more than once, and there are two
 * final returns).  Read it as a change record rather than as compilable
 * code. */
PERL_STATIC_INLINE UV
Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
{
const UV expectlen = UTF8SKIP(s);
const U8* send = s + expectlen;
UV uv = *s;

PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;

if (retlen) {
*retlen = expectlen;
}
/* Select a most-significant-bit-position helper, but only on non-EBCDIC
 * platforms that advertise a fast 32- or 64-bit one.  If neither is
 * available the macro stays undefined and the UTF8SKIP() lookup is used. */
#ifndef EBCDIC
# ifdef PERL_HAS_FAST_GET_MSB_POS32
# define PERL_USE_MSB_FOR_VALID_UTF8_ msbit_pos32
# elif defined(PERL_HAS_FAST_GET_MSB_POS64)
# define PERL_USE_MSB_FOR_VALID_UTF8_ msbit_pos64
# endif
#endif
#ifdef PERL_USE_MSB_FOR_VALID_UTF8_

/* An invariant is trivially returned */
if (expectlen == 1) {
return uv;
}
/* Given that *s is known to be a legal start byte, the following returns
* its UTF8SKIP, avoiding an array lookup. This only makes sense to do if
* we know that the platform does clz effectively with a single machine
* instruction; otherwise the lookup is cheaper.
*
* The code takes the byte, left shifts it by one, discarding the new upper
* bit, retaining the original one. The position of its complement's first
* set bit, subtracted from the total bits, yields UTF8SKIP. The shift
* accomplishes two things. It eliminates the discontinuity caused by a
* single leading 1 being illegal: 0 leading 1's mean length 1, whereas 2
* leading 1's mean length 2. And it keeps the input to the clz instruction
* from ever being all 0's, which would yield undefined behavior.
*
* To illustrate, where 'x' is a don't care:
* 0xxxxxxx
* 0xxxxxx0 << 1, retaining original top bit
* 1xxxxxx1 complement
* msb of complement is bit 7; subtracted from 8 => length 1
* -------------------
* 11110xxx
* 1110xxx0 << 1, retaining original top bit
* 0001xxx1 complement
* msb of complement is bit 4; subtracted from 8 => length 4
* -------------------
* 11111111
* 11111110 << 1, retaining original top bit
* 00000001 complement
* msb of complement is bit 0; subtracted from 8 => 8, which the switch
* below treats as the special FF start byte
*/
PERL_UINT_FAST8_T expectlen
= CHARBITS
- PERL_USE_MSB_FOR_VALID_UTF8_(
/* Left shift 1, discarding new top bit */
(U8) ~( ((*s << 1) & nBIT_MASK(CHARBITS - 1))
/* Add back the original top bit */
| (*s & (1 << (CHARBITS - 1)))));
#else
PERL_UINT_FAST8_T expectlen = UTF8SKIP(s);
#endif

/* Remove the leading bits that indicate the number of bytes, leaving just
* the bits that are part of the value */
uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
UV uv = NATIVE_UTF8_TO_I8(*s) & UTF_START_MASK(expectlen);

PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
assert(! UTF8_IS_CONTINUATION(*s));

/* Dispatch on the character's length.  Each of cases 4, 3 and 2 consumes one
 * continuation byte and falls through to the next, so entering at case N
 * (N <= 4) accumulates exactly N - 1 continuation bytes. */
switch (expectlen) {
PERL_UINT_FAST8_T countdown;

case 8: /* Start byte FF is special */
expectlen = UTF8_MAXBYTES;
/*FALLTHROUGH*/

default: /* Don't unroll the loop for rarely encountered high code points
*/
/* Consume continuation bytes one at a time until only the three handled
 * by the unrolled cases below remain (expectlen - 4 iterations). */
countdown = expectlen;
while (countdown-- > 4) {
s++;
uv = UTF8_ACCUMULATE(uv, *s);
}
/*FALLTHROUGH*/

/* Now, loop through the remaining bytes, accumulating each into the
* working total as we go. (I khw tried unrolling the loop for up to 4
* bytes, but there was no performance improvement) */
for (++s; s < send; s++) {
case 4:
s++;
uv = UTF8_ACCUMULATE(uv, *s);
/*FALLTHROUGH*/

case 3:
s++;
uv = UTF8_ACCUMULATE(uv, *s);
/*FALLTHROUGH*/

case 2:
s++;
uv = UTF8_ACCUMULATE(uv, *s);

#ifndef EBCDIC

/*FALLTHROUGH*/

/* On ASCII platforms, UTF_START_MASK() works on all UTF-8 invariants,
* so the call to it before this switch() already set up the length 1
* case, so no further action should be done. */
case 1:
break;

#else
break; /* EBCDIC: For case 2: */

/* There are length 1 characters on EBCDIC platforms for which
* UTF_START_MASK() doesn't work properly. Instead we have to handle
* that case specially.
*
* An invariant is trivially returned; throw away our earlier
* calculation */
case 1:
if (retlen) {
*retlen = 1;
}
/* NOTE(review): this case is entered directly from the switch, and no
 * statement on that path advances s, so *(s - 1) looks like it reads the
 * byte preceding the start byte.  Presumably *s was intended -- confirm
 * on an EBCDIC build. */
return *(s - 1);

#endif

}

return UNI_TO_NATIVE(uv);
/* Report the character's length to the caller, if requested.  By this point
 * a start byte of FF has had its length replaced by UTF8_MAXBYTES. */
if (retlen) {
*retlen = expectlen;
}

return UNI_TO_NATIVE(uv);
}

#ifndef EBCDIC
Expand Down

0 comments on commit 2f2a723

Please sign in to comment.