Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -1864,6 +1864,9 @@ CTopr |void |locale_panic |NN const char *msg \
: Used in perly.y
p |OP * |localize |NN OP *o \
|I32 lex
CTp |UV |long_valid_utf8_to_uv \
|NN const U8 * const s \
|NN const U8 * const e
ARdp |I32 |looks_like_number \
|NN SV * const sv
CRTip |unsigned|lsbit_pos32 |U32 word
Expand Down
1 change: 1 addition & 0 deletions embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@
# define lex_stuff_pvn(a,b,c) Perl_lex_stuff_pvn(aTHX_ a,b,c)
# define lex_stuff_sv(a,b) Perl_lex_stuff_sv(aTHX_ a,b)
# define lex_unstuff(a) Perl_lex_unstuff(aTHX_ a)
# define long_valid_utf8_to_uv Perl_long_valid_utf8_to_uv
# define looks_like_number(a) Perl_looks_like_number(aTHX_ a)
# define lsbit_pos32 Perl_lsbit_pos32
# define magic_dump(a) Perl_magic_dump(aTHX_ a)
Expand Down
108 changes: 88 additions & 20 deletions inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -1334,31 +1334,99 @@ Perl_valid_utf8_to_uv(const U8 *s, STRLEN *retlen)

const UV expectlen = UTF8SKIP(s);
ASSUME(inRANGE(expectlen, 1, UTF8_MAXBYTES));
const U8* send = s + expectlen;
UV uv = *s;
UV uv = 0;

if (retlen) {
*retlen = expectlen;
}

/* An invariant is trivially returned */
if (expectlen == 1) {
return uv;
/* Note that this is branchless except for the switch() jump table, and
* checking that the caller wants a *retlen returned.
*
* There is wasted effort for length 1 inputs of initializing 'uv' to 0
* and calculating 'full_shift' (unless the compiler optimizes that out).
* Benchmarks indicate this is acceptable.
* See GH #23690 */

/* Consider a 4-byte UTF-8-encoded character. On ASCII platforms it looks
* like:
* 1st Byte 2nd Byte 3rd Byte 4th Byte
* 1111 0ddd 10cc cccc 10bb bbbb 10aa aaaa
*
* And the code point it represents is dddccccccbbbbbbaaaaaa
* Each continuation byte contributes its lower 6 bits to the total. For
* generality call that number 'L'.
*
* You get that code point by masking off the top bits of each byte, then
* or'ing together:
* the start byte shifted left by 3*L bits,
* with byte [1] shifted left by 2*L bits
* with byte [2] shifted left by 1*L bits
* with byte [3] shifted left by 0*L bits
*
* The order is immaterial, so we can rewrite that as
* 'or' together byte [3] shifted left by 0*L bits
* with byte [2] shifted left by 1*L bits
* with byte [1] shifted left by 2*L bits
* with byte [0] shifted left by 3*L bits,
*
* All share the paradigm that for byte n you mask off the top bits and
* shift the remainder left by (4 - 1 - n) * L bits. So we get
* (s[n] & mask) << (4 - 1 - n) * L
* For a three-byte character it would be
* (s[n] & mask) << (3 - 1 - n) * L
* and generally
* (s[n] & mask) << (expectlen - 1 - n) * L
* which can be rewritten
* (s[n] & mask) << (expectlen - 1) * L - nL
* Calculate the term once that isn't compile-time constant and is the same
* for all n */
U8 full_shift = (expectlen - 1) * UTF_ACCUMULATION_SHIFT;

/* Then create a macro that does the full calculation given n. For EBCDIC,
* we need to transform s[n] to I8 */
#define PERL_VALID_UTF8_NEXT_ACCUMULATION(n) \
(( (UV) ( NATIVE_UTF8_TO_I8( s[n] ) & UTF_CONTINUATION_MASK)) \
<< (full_shift - (n) * UTF_ACCUMULATION_SHIFT))

switch (expectlen) {
default:
uv = long_valid_utf8_to_uv(s, s + expectlen);
break;

#if 0 /* See GH #23690 */
/* These cases give the correct results, but the extra memory used lowers
* the chances of the compiler actually inlining this, and we only care
* about performance for Unicode code points, all of which can be
* expressed with 4 bytes (5 on EBCDIC). Experiments with clang showed
* no difference between 4,5,6, but a huge drop off with 7. */
case 7: uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(6);
/* FALLTHROUGH */
case 6: uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(5);
/* FALLTHROUGH */
#endif
case 5: uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(4);
/* FALLTHROUGH */
case 4:
uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(3);
/* FALLTHROUGH */
case 3:
uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(2);
/* FALLTHROUGH */
case 2:
uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(1);

uv = UNI_TO_NATIVE(uv | ( ((UV)( NATIVE_UTF8_TO_I8(s[0])
& UTF_START_MASK(expectlen))
<< full_shift)));
break;

case 1:
uv = s[0];
break;
}

/* Remove the leading bits that indicate the number of bytes, leaving just
* the bits that are part of the value */
uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);

/* Now, loop through the remaining bytes, accumulating each into the
* working total as we go. (I khw tried unrolling the loop for up to 4
* bytes, but there was no performance improvement) */
for (++s; s < send; s++) {
uv = UTF8_ACCUMULATE(uv, *s);
if (retlen) {
*retlen = expectlen;
}

return UNI_TO_NATIVE(uv);

return uv;
}

/* This looks like 0x010101... */
Expand Down
5 changes: 5 additions & 0 deletions proto.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,21 @@ static const char malformed_text[] = "Malformed UTF-8 character";
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";

UV
Perl_long_valid_utf8_to_uv(const U8 * const s, const U8 * const e)
{
    /* Out-of-line intermediary between the inlined 'valid_utf8_to_uv' and
     * utf8_to_uv_or_die().
     *
     * 'valid_utf8_to_uv' hands the code points it doesn't decode itself to
     * this function.  utf8_to_uv_or_die() is itself inlined, so calling it
     * straight from 'valid_utf8_to_uv' would have the compiler attempt to
     * inline both, and the combined result would be too large to inline.
     * Keeping this hop as a real, non-inlined function keeps
     * 'valid_utf8_to_uv' small, which raises the odds of it actually being
     * inlined.
     *
     * 's' and 'e' are passed through unchanged, with NULL as the third
     * argument; the value returned is whatever utf8_to_uv_or_die() yields. */

    UV code_point;

    PERL_ARGS_ASSERT_LONG_VALID_UTF8_TO_UV;

    code_point = utf8_to_uv_or_die(s, e, NULL);

    return code_point;
}

/*
These are various utility functions for manipulating UTF8-encoded
strings. For the uninitiated, this is a method of representing arbitrary
Expand Down
Loading