From f1d3134b5007a4ff67b1d69997425169ef0d5eab Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 13 Aug 2025 07:48:29 -0600 Subject: [PATCH] Add ASSUMEs for UTF-8 byte lengths The maximum number of bytes in a Perl extended UTF-8 character is 13 on ASCII platforms; 14 on EBCDIC. Yet the variable that returns that number is a Size_t in the cases changed by this commit. By adding these ASSUMEs to these functions, the compiler may be able to do some optimizations. I looked through the code base, and found no other instances where such a small value could be stored in a fully wide variable. With link-time optimization, an ASSUME may be helpful even in non-inline functions. --- inline.h | 8 +++++++- utf8.c | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/inline.h b/inline.h index 0c952c582f6a..fa132e2cb2e3 100644 --- a/inline.h +++ b/inline.h @@ -1333,6 +1333,7 @@ Perl_valid_utf8_to_uv(const U8 *s, STRLEN *retlen) PERL_ARGS_ASSERT_VALID_UTF8_TO_UV; const UV expectlen = UTF8SKIP(s); + ASSUME(inRANGE(expectlen, 1, UTF8_MAXBYTES)); const U8* send = s + expectlen; UV uv = *s; @@ -3213,6 +3214,7 @@ Perl_utf8_to_uv_msgs(const U8 * const s0, if (LIKELY(state == 0)) { if (advance_p) { *advance_p = s - s0 + 1; + ASSUME(*advance_p <= UTF8_MAXBYTES); } *cp_p = UNI_TO_NATIVE(uv); @@ -3221,7 +3223,10 @@ Perl_utf8_to_uv_msgs(const U8 * const s0, } /* Here is potentially problematic. 
Use the full mechanism */ - return utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, flags, errors, msgs); + bool success = utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, + flags, errors, msgs); + ASSUME(advance_p == NULL || inRANGE(*advance_p, 1, UTF8_MAXBYTES)); + return success; } PERL_STATIC_INLINE UV @@ -3231,6 +3236,7 @@ Perl_utf8_to_uv_or_die(const U8 *s, const U8 *e, STRLEN *advance_p) UV cp; (void) utf8_to_uv_flags(s, e, &cp, advance_p, UTF8_DIE_IF_MALFORMED); + ASSUME(advance_p == NULL || inRANGE(*advance_p, 1, UTF8_MAXBYTES)); return cp; } diff --git a/utf8.c b/utf8.c index d72de4e60c96..94ae8ab17e90 100644 --- a/utf8.c +++ b/utf8.c @@ -2568,6 +2568,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, if (advance_p) { *advance_p = curlen; + ASSUME(inRANGE(*advance_p, 1, UTF8_MAXBYTES)); } *cp_p = UNI_TO_NATIVE(uv);