From 592b07989742a430085037172d210461e15e978c Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 07:11:40 -0600 Subject: [PATCH 01/11] handy.h: Swap order of conditionals for clarity This moves the trivial case to before the complicated one, which is easier to comprehend. And instead of complementing the conditional, use a different name (that evaluates to that complement) which makes it clearer what's going on. --- handy.h | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/handy.h b/handy.h index 7b91b99a1c7c..05ffddf4cd07 100644 --- a/handy.h +++ b/handy.h @@ -2259,23 +2259,23 @@ END_EXTERN_C ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)\ : (UTF8_IS_INVARIANT(*(p))) \ ? generic_isCC_(*(p), classnum) \ - : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \ - ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ + : (UTF8_IS_ABOVE_LATIN1_START(*(p)) \ + ? above_latin1 \ + : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ ? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \ classnum) \ : (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \ - : above_latin1)) + (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)))) /* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value. * 'above_latin1' can be a macro */ #define generic_func_utf8_safe_(classnum, above_latin1, p, e) \ generic_utf8_safe_(classnum, p, e, above_latin1(p, e)) #define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \ generic_utf8_safe_(classnum, p, e, \ - (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ - ? (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \ - : above_latin1(p))) + (LIKELY((e) - (p) >= UTF8SKIP(p)) \ + ? 
above_latin1(p) \ + : (force_out_malformed_utf8_message_( \ + (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))) /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an * 'above_latin1' parameter */ #define generic_invlist_utf8_safe_(classnum, p, e) \ @@ -2289,10 +2289,9 @@ END_EXTERN_C (assert(utf8_safe_assert_(p, e)), \ (isASCII(*(p))) \ ? generic_isCC_(*(p), classnum) \ - : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ - ? 0 /* Note that doesn't check validity for latin1 */ \ - : above_latin1) - + : (UTF8_IS_ABOVE_LATIN1_START(*(p)) \ + ? above_latin1 \ + : 0)) /* Note that doesn't check validity for latin1 */ #define isALPHA_utf8(p, e) isALPHA_utf8_safe(p, e) #define isALPHANUMERIC_utf8(p, e) isALPHANUMERIC_utf8_safe(p, e) @@ -2409,12 +2408,12 @@ END_EXTERN_C (assert_(utf8_safe_assert_(p, e)) \ (UTF8_IS_INVARIANT(*(p))) \ ? macro(*(p)) \ - : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \ - ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ + : (UTF8_IS_ABOVE_LATIN1_START(*(p)) \ + ? above_latin1 \ + : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \ : (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \ - : above_latin1)) + (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)))) #define generic_LC_invlist_utf8_safe_(macro, classnum, p, e) \ generic_LC_utf8_safe_(macro, p, e, \ @@ -2425,10 +2424,10 @@ END_EXTERN_C #define generic_LC_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \ generic_LC_utf8_safe_(classnum, p, e, \ - (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ - ? (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \ - : above_latin1(p))) + (LIKELY((e) - (p) >= UTF8SKIP(p)) \ + ? 
above_latin1(p) \ + : (force_out_malformed_utf8_message_( \ + (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))) #define isALPHANUMERIC_LC_utf8_safe(p, e) \ generic_LC_invlist_utf8_safe_(isALPHANUMERIC_LC, \ From fd90e73dc106320553eb8035da107911fdb0ee57 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 07:32:52 -0600 Subject: [PATCH 02/11] handy.h: White space only This cleans up some ragged edges, makes things fit in 80 columns --- handy.h | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/handy.h b/handy.h index 05ffddf4cd07..30ba859ca841 100644 --- a/handy.h +++ b/handy.h @@ -2256,7 +2256,8 @@ END_EXTERN_C #define generic_utf8_safe_(classnum, p, e, above_latin1) \ ((! utf8_safe_assert_(p, e)) \ - ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)\ + ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, \ + MALFORMED_UTF8_DIE), 0) \ : (UTF8_IS_INVARIANT(*(p))) \ ? generic_isCC_(*(p), classnum) \ : (UTF8_IS_ABOVE_LATIN1_START(*(p)) \ @@ -2264,18 +2265,25 @@ END_EXTERN_C : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ ? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \ classnum) \ - : (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)))) + : (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e),\ + 0, MALFORMED_UTF8_DIE),\ + 0)))) + /* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value. * 'above_latin1' can be a macro */ #define generic_func_utf8_safe_(classnum, above_latin1, p, e) \ generic_utf8_safe_(classnum, p, e, above_latin1(p, e)) + #define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \ - generic_utf8_safe_(classnum, p, e, \ - (LIKELY((e) - (p) >= UTF8SKIP(p)) \ - ? 
above_latin1(p) \ - : (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))) + generic_utf8_safe_(classnum, p, e, \ + (LIKELY((e) - (p) >= UTF8SKIP(p)) \ + ? above_latin1(p) \ + : (force_out_malformed_utf8_message_( \ + (U8 *) (p), \ + (U8 *) (e), \ + 0, \ + MALFORMED_UTF8_DIE),\ + 0))) /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an * 'above_latin1' parameter */ #define generic_invlist_utf8_safe_(classnum, p, e) \ @@ -2405,7 +2413,7 @@ END_EXTERN_C * point in 'p' is within the 0-255 range, it uses locale rules from the * passed-in 'macro' parameter */ #define generic_LC_utf8_safe_(macro, p, e, above_latin1) \ - (assert_(utf8_safe_assert_(p, e)) \ + (assert_(utf8_safe_assert_(p, e)) \ (UTF8_IS_INVARIANT(*(p))) \ ? macro(*(p)) \ : (UTF8_IS_ABOVE_LATIN1_START(*(p)) \ From 04b84bef96af5f486940108b39dda6a250cd02f4 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 07:43:19 -0600 Subject: [PATCH 03/11] handy.h: Remove unnecessary cast The && in this expression already makes the result a boolean; no need to cast it to such. Removing it allows the entire expression to fit on one line. --- handy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/handy.h b/handy.h index 30ba859ca841..fe7eef30240f 100644 --- a/handy.h +++ b/handy.h @@ -1595,8 +1595,8 @@ END_EXTERN_C /* For internal core Perl use only: the base macro for defining macros like * isALPHA */ -# define generic_isCC_(c, classnum) cBOOL(FITS_IN_8_BITS(c) \ - && (PL_charclass[(U8) (c)] & CC_mask_(classnum))) +# define generic_isCC_(c, classnum) \ + (FITS_IN_8_BITS(c) && (PL_charclass[(U8) (c)] & CC_mask_(classnum))) /* The mask for the _A versions of the macros; it just adds in the bit for * ASCII. 
*/ From e5ef7eb18606d03f484aa871661b6dd569ba1a74 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 06:23:54 -0600 Subject: [PATCH 04/11] utf8.c: Replace macro by a static function This will be useful in the next commits --- utf8.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/utf8.c b/utf8.c index b0922d138470..33028dd057ed 100644 --- a/utf8.c +++ b/utf8.c @@ -3903,15 +3903,18 @@ S_warn_on_first_deprecated_use(pTHX_ U32 category, /* returns a boolean giving whether or not the UTF8-encoded character that * starts at

<p>, and extending no further than <e> is in the inversion * list <invlist>. */ -#define IS_UTF8_IN_INVLIST(p, e, invlist) \ - _invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, NULL)) +STATIC bool +S_is_utf8_in_invlist(pTHX_ const U8 * p, const U8 * e, SV * const invlist) +{ + return _invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, NULL)); +} bool Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT_IS_UTF8_FOO_; - return IS_UTF8_IN_INVLIST(p, e, PL_XPosix_ptrs[classnum]); + return S_is_utf8_in_invlist(aTHX_ p, e, PL_XPosix_ptrs[classnum]); } bool @@ -3919,7 +3922,7 @@ Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT_IS_UTF8_PERL_IDSTART_; - return IS_UTF8_IN_INVLIST(p, e, PL_utf8_perl_idstart); + return S_is_utf8_in_invlist(aTHX_ p, e, PL_utf8_perl_idstart); } bool @@ -3927,7 +3930,7 @@ Perl_is_utf8_perl_idcont_(pTHX_ const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT_IS_UTF8_PERL_IDCONT_; - return IS_UTF8_IN_INVLIST(p, e, PL_utf8_perl_idcont); + return S_is_utf8_in_invlist(aTHX_ p, e, PL_utf8_perl_idcont); } STATIC UV From 8994f610a8ee16d489ca58580f97308a3629ea2e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 06:41:01 -0600 Subject: [PATCH 05/11] Generalize 3 functions to return length on success Instead of a bool, they will now return the number of bytes that comprise the character being checked. So the result can be used as a bool, just as before; or the extra information can save recalculations, as done in the future commits.
--- embed.fnc | 6 +++--- proto.h | 6 +++--- utf8.c | 21 +++++++++++++-------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/embed.fnc b/embed.fnc index 99c664b8a5b8..4eb1df93843a 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1753,17 +1753,17 @@ ATdip |bool |is_utf8_fixed_width_buf_loclen_flags \ |NULLOK const U8 **ep \ |NULLOK STRLEN *el \ |const U32 flags -CRp |bool |is_utf8_FOO_ |const U8 classnum \ +CRp |Size_t |is_utf8_FOO_ |const U8 classnum \ |NN const U8 *p \ |NN const U8 * const e ARTdip |bool |is_utf8_invariant_string_loc \ |NN const U8 * const s \ |STRLEN len \ |NULLOK const U8 **ep -CRp |bool |is_utf8_perl_idcont_ \ +CRp |Size_t |is_utf8_perl_idcont_ \ |NN const U8 *p \ |NN const U8 * const e -CRp |bool |is_utf8_perl_idstart_ \ +CRp |Size_t |is_utf8_perl_idstart_ \ |NN const U8 *p \ |NN const U8 * const e ARTdmp |bool |is_utf8_string |NN const U8 *s \ diff --git a/proto.h b/proto.h index 41cd957f7829..2a9e2e05227b 100644 --- a/proto.h +++ b/proto.h @@ -1850,7 +1850,7 @@ Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, const bool requ #define PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_ \ assert(s0); assert(e) -PERL_CALLCONV bool +PERL_CALLCONV Size_t Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_IS_UTF8_FOO_ \ @@ -1872,13 +1872,13 @@ Perl_is_utf8_fixed_width_buf_flags(const U8 * const s, STRLEN len, const U32 fla /* PERL_CALLCONV bool Perl_is_utf8_fixed_width_buf_loc_flags(const U8 * const s, STRLEN len, const U8 **ep, const U32 flags); */ -PERL_CALLCONV bool +PERL_CALLCONV Size_t Perl_is_utf8_perl_idcont_(pTHX_ const U8 *p, const U8 * const e) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_IS_UTF8_PERL_IDCONT_ \ assert(p); assert(e) -PERL_CALLCONV bool +PERL_CALLCONV Size_t Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_IS_UTF8_PERL_IDSTART_ \ diff 
--git a/utf8.c b/utf8.c index 33028dd057ed..e9b84200917c 100644 --- a/utf8.c +++ b/utf8.c @@ -3900,16 +3900,21 @@ S_warn_on_first_deprecated_use(pTHX_ U32 category, } #endif -/* returns a boolean giving whether or not the UTF8-encoded character that - * starts at

<p>, and extending no further than <e> is in the inversion - * list <invlist>. */ -STATIC bool +/* returns the number of bytes comprising the UTF8-encoded character that + * starts at

<p>, and extending no further than <e> if it is in the + * inversion list <invlist>; or 0 if it isn't */ +STATIC Size_t S_is_utf8_in_invlist(pTHX_ const U8 * p, const U8 * e, SV * const invlist) { - return _invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, NULL)); + Size_t advance; + if (_invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, &advance))) { + return advance; + } + + return 0; } -bool +Size_t Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT_IS_UTF8_FOO_; @@ -3917,7 +3922,7 @@ Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) return S_is_utf8_in_invlist(aTHX_ p, e, PL_XPosix_ptrs[classnum]); } -bool +Size_t Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT_IS_UTF8_PERL_IDSTART_; @@ -3925,7 +3930,7 @@ Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e) return S_is_utf8_in_invlist(aTHX_ p, e, PL_utf8_perl_idstart); } -bool +Size_t Perl_is_utf8_perl_idcont_(pTHX_ const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT_IS_UTF8_PERL_IDCONT_; From f6f4bce1257b0f345dacd550f0923a439c5e6e6d Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 08:43:20 -0600 Subject: [PATCH 06/11] Change isFOO_utf8_safe macros to return matched byte length Or 0 when the character isn't of type FOO. This allows these macros to be used as booleans, as previously; or to give you how many bytes there are in the matched UTF-8 character. This was always trivially the case for ASCII-range characters, as the former boolean 0,1 gave you the correct length if they matched. The previous commit extended this to return the length for above-Latin1 characters. This commit is the final piece. Latin1 characters that aren't ASCII always are two bytes. So just multiply the return by 2, yielding 0 if no match or 2 bytes if matched.
--- handy.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/handy.h b/handy.h index fe7eef30240f..fec7d83e3bd5 100644 --- a/handy.h +++ b/handy.h @@ -2263,8 +2263,10 @@ END_EXTERN_C : (UTF8_IS_ABOVE_LATIN1_START(*(p)) \ ? above_latin1 \ : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ - ? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \ - classnum) \ + /* Multiply by 2 to return byte length of matched \ + * character */ \ + ? 2 * generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p),*((p)+1)),\ + classnum) \ : (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e),\ 0, MALFORMED_UTF8_DIE),\ 0)))) From ba197ce98ad8fba8a32ed07a3de13d67cefd2bb2 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 09:05:06 -0600 Subject: [PATCH 07/11] class.c: Avoid UTF8SKIPs This value is now returned from the isSPACE_utf8_safe macro. Use it instead of re-deriving it. --- class.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/class.c b/class.c index f731b986dc00..871a68e752bd 100644 --- a/class.c +++ b/class.c @@ -445,8 +445,9 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers if(SvUTF8(value)) SvUTF8_on(pkgname); - while(*p && isSPACE_utf8_safe(p, end)) - p += UTF8SKIP(p); + Size_t advance; + while(*p && (advance = isSPACE_utf8_safe(p, end))) + p += advance; if(*p) { /* scan_version() gets upset about trailing content. 
We need to extract @@ -463,8 +464,8 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers scan_version(SvPVX(tmpsv), pkgversion, FALSE); } - while(*p && isSPACE_utf8_safe(p, end)) - p += UTF8SKIP(p); + while(*p && (advance = isSPACE_utf8_safe(p, end))) + p += advance; return p; } From b784fff78ecf674fce86b23dc095f1e1c798fac6 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 09:19:11 -0600 Subject: [PATCH 08/11] pp_ctl.c: Avoid UTF8SKIPs This value is now returned from the isID(FIRST|CONT)_utf8_safe macros. Use it instead of re-deriving it. --- pp_ctl.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pp_ctl.c b/pp_ctl.c index 5cfd919e6b6d..d713cdc0cb1c 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -5192,11 +5192,17 @@ S_require_file(pTHX_ SV *sv) S_parse_ident */ c = name; while (c < e) { - if (utf8 && isIDFIRST_utf8_safe(c, e)) { - c += UTF8SKIP(c); - while (c < e && isIDCONT_utf8_safe( - (const U8*) c, (const U8*) e)) - c += UTF8SKIP(c); + Size_t advance; + + if (utf8 && (advance = isIDFIRST_utf8_safe(c, e))) + { + c += advance; + while ( c < e + && (advance = isIDCONT_utf8_safe( + (const U8*) c, (const U8*) e))) + { + c += advance; + } } else if (isWORDCHAR_A(*c)) { while (c < e && isWORDCHAR_A(*c)) From d16ee554ab016dd85c9e13c18802101d3cc3f9e5 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 09:36:00 -0600 Subject: [PATCH 09/11] regcomp.c: Avoid UTF8SKIPs This value is now returned from the isID(FIRST|CONT)_lazy_if_safe macros. Use it instead of re-deriving it. 
This also simplifies the code. The continuation loop goes through isWORDCHAR_lazy_if_safe, so a non-UTF-8 pattern is still scanned one byte at a time instead of being parsed as UTF-8. --- regcomp.c | 13 +++++-------- regcomp_internal.h | 8 ++++++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/regcomp.c b/regcomp.c index c76a467bcd93..3417b3ea0bbb 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2522,19 +2522,16 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) PERL_ARGS_ASSERT_REG_SCAN_NAME; assert (RExC_parse <= RExC_end); + Size_t advance; if (RExC_parse == RExC_end) NOOP; - else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) { + else if ((advance = isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF))) { /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by * using do...while */ - if (UTF) do { - RExC_parse_inc_utf8(); + RExC_parse_advance(advance); } while ( RExC_parse < RExC_end - && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end)); - else - do { - RExC_parse_inc_by(1); - } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse)); + && (advance = isWORDCHAR_lazy_if_safe(RExC_parse, + RExC_end, UTF))); } else { RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending character */ diff --git a/regcomp_internal.h b/regcomp_internal.h index 60ec13777a61..27e543d80be7 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -279,6 +279,14 @@ struct RExC_state_t { * output during the parse process.
*/ +/* RExC_parse_advance(count) + * + * Increment RExC_parse to point at the next codepoint, when we *know* that the + * correct byte count is in the passed parameter */ +#define RExC_parse_advance(count) STMT_START { \ + RExC_parse += (count); \ +} STMT_END + /* RExC_parse_incf(flag) * * Increment RExC_parse to point at the next codepoint, while doing From 70a8504dae5b69e838d20256829d9800024b0fa2 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 09:43:01 -0600 Subject: [PATCH 10/11] regcomp.c: White space only The previous commit removed a surrounding block; outdent correspondingly --- regcomp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/regcomp.c b/regcomp.c index 3417b3ea0bbb..1831effb0dba 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2527,11 +2527,11 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) else if ((advance = isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF))) { /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by * using do...while */ - do { - RExC_parse_advance(advance); - } while ( RExC_parse < RExC_end - && (advance = isWORDCHAR_lazy_if_safe(RExC_parse, - RExC_end, UTF))); + do { + RExC_parse_advance(advance); + } while ( RExC_parse < RExC_end + && (advance = isWORDCHAR_lazy_if_safe(RExC_parse, + RExC_end, UTF))); } else { RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending character */ From 8887d78912ec1e48b1ba17f259316fcbd34fd81e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 10:13:58 -0600 Subject: [PATCH 11/11] toke.c: Avoid UTF8SKIPs This value is now returned from the isID(FIRST|CONT)_lazy_if_safe macros. Use it instead of re-deriving it.
--- toke.c | 81 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/toke.c b/toke.c index 62f427bf67c3..3a3c0e7a2168 100644 --- a/toke.c +++ b/toke.c @@ -2088,8 +2088,10 @@ S_check_uni(pTHX) while (isSPACE(*PL_last_uni)) PL_last_uni++; s = PL_last_uni; - while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-') - s += UTF ? UTF8SKIP(s) : 1; + Size_t advance; + while ( (advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)) + || (advance = (*s == '-'))) + s += advance; if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s)) return; @@ -5193,10 +5195,11 @@ S_check_scalar_slice(pTHX_ char *s) { return; } - while ( isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) - || (*s && memCHRs(" \t$#+-'\"", *s))) + Size_t advance; + while ( (advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)) + || (advance = (*s && memCHRs(" \t$#+-'\"", *s)))) { - s += UTF ? UTF8SKIP(s) : 1; + s += advance; } if (*s == '}' || *s == ']') pl_yylval.ival = OPpSLICEWARNING; @@ -5402,8 +5405,11 @@ yyl_dollar(pTHX_ char *s) while (t < PL_bufend && *t == ' ') t++; /* strip off the name of the var */ - while (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) - t += UTF ? UTF8SKIP(t) : 1; + Size_t advance; + while ((advance = (isWORDCHAR_lazy_if_safe(t, + PL_bufend, + UTF)))) + t += advance; /* consumed a varname */ } else if (isDIGIT(*t)) { /* deal with hex constants like 0x11 */ @@ -6407,6 +6413,7 @@ yyl_leftcurly(pTHX_ char *s, const U8 formbrack) * GSAR 97-07-21 */ t = s; + Size_t advance; if (*s == '\'' || *s == '"' || *s == '`') { /* common case: get past first string, handling escapes */ for (t++; t < PL_bufend && *t != *s;) @@ -6455,20 +6462,24 @@ yyl_leftcurly(pTHX_ char *s, const U8 formbrack) } t++; } - else + else { /* skip plain q word */ - while ( t < PL_bufend - && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) + while ( t < PL_bufend + && (advance = isWORDCHAR_lazy_if_safe(t, + PL_bufend, + UTF))) { - t += UTF ? 
UTF8SKIP(t) : 1; + t += advance; } + } } - else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) { - t += UTF ? UTF8SKIP(t) : 1; + else if ((advance = isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))) { + t += advance; while ( t < PL_bufend - && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) + && (advance = isWORDCHAR_lazy_if_safe(t, PL_bufend, + UTF))) { - t += UTF ? UTF8SKIP(t) : 1; + t += advance; } } while (t < PL_bufend && isSPACE(*t)) @@ -10125,11 +10136,12 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what) s++; while (s < PL_bufend && isSPACE(*s)) s++; - if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { + Size_t advance; + if ((advance = isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))) { const char * const w = s; - s += UTF ? UTF8SKIP(s) : 1; - while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)) - s += UTF ? UTF8SKIP(s) : 1; + s += advance; + while ((advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))) + s += advance; while (s < PL_bufend && isSPACE(*s)) s++; if (*s == ',') { @@ -10281,15 +10293,18 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, while (*s < PL_bufend) { if (*d >= e) croak("%s", ident_too_long); - if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) { + Size_t advance; + if (is_utf8 && (advance = isIDFIRST_utf8_safe(*s, PL_bufend))) { /* The UTF-8 case must come first, otherwise things * like c\N{COMBINING TILDE} would start failing, as the * isWORDCHAR_A case below would gobble the 'c' up. 
*/ - char *t = *s + UTF8SKIP(*s); - while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) { - t += UTF8SKIP(t); + char *t = *s + advance; + while ((advance = isIDCONT_utf8_safe((const U8*) t, + (const U8*) PL_bufend))) + { + t += advance; } if (*d + (t - *s) > e) croak("%s", ident_too_long); @@ -10496,11 +10511,12 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) /* note we have to check for a normal identifier first, * as it handles utf8 symbols, and only after that has * been ruled out can we look at the caret words */ - if (isIDFIRST_lazy_if_safe(d, e, is_utf8) ) { + Size_t advance; + if ((advance = isIDFIRST_lazy_if_safe(d, e, is_utf8) )) { /* if it starts as a valid identifier, assume that it is one. (the later check for } being at the expected point will trap cases where this doesn't pan out.) */ - d += is_utf8 ? UTF8SKIP(d) : 1; + d += advance; parse_ident(&s, &d, e, 1, is_utf8, TRUE); *d = '\0'; } @@ -10998,8 +11014,9 @@ S_scan_heredoc(pTHX_ char *s) peek = s; - while (isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF)) { - peek += UTF ? UTF8SKIP(peek) : 1; + Size_t advance; + while ((advance = isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))) { + peek += advance; } len = (peek - s >= e - d) ? (e - d) : (peek - s); @@ -11442,9 +11459,13 @@ S_scan_inputsymbol(pTHX_ char *start) if (*d == '$' && d[1]) d++; /* allow <Pkg'VALUE> or <Pkg::VALUE> */ - while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == ':' - || (*d == '\'' && FEATURE_APOS_AS_NAME_SEP_IS_ENABLED)) { - d += UTF ? UTF8SKIP(d) : 1; + Size_t advance; + while ( (advance = isWORDCHAR_lazy_if_safe(d, e, UTF)) + || (advance = ( *d == ':' + || ( *d == '\'' + && FEATURE_APOS_AS_NAME_SEP_IS_ENABLED)))) + { + d += advance; } /* If we've tried to read what we allow filehandles to look like, and