From 8bb981763a7a734dc1d026683969aa5780a0434f Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 6 Sep 2025 20:48:51 -0600 Subject: [PATCH 1/5] regcomp.c: Avoid a UTF8SKIP This value is returnable from the called function. Use it instead of re-deriving it --- regcomp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/regcomp.c b/regcomp.c index 700bdd3c2b4b..c76a467bcd93 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8183,10 +8183,11 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, p++; } else { + Size_t advance; input_text[name_len++] = utf8_to_uv_or_die((const U8 *) p, (const U8 *) e, - NULL); - p+= UTF8SKIP(p); + &advance); + p += advance; } /* The declaration of 'input_text' is how long we allow a potential From d0955c88f6e94a165f50abc9e33c3ee7097ac320 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 08:26:01 -0600 Subject: [PATCH 2/5] locale.c: Avoid a UTF8SKIP This value is returnable from the called function. Use it instead of re-deriving it --- locale.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/locale.c b/locale.c index 0810bb0726ba..4992f5e937ad 100644 --- a/locale.c +++ b/locale.c @@ -944,9 +944,16 @@ S_get_displayable_string(pTHX_ SAVEFREEPV(ret); while (t < e) { - UV cp = (is_utf8) - ? utf8_to_uv_or_die((const U8 *) t, (const U8 *) e, NULL) - : * (U8 *) t; + UV cp; + Size_t advance; + if (is_utf8) { + cp = utf8_to_uv_or_die((const U8 *) t, (const U8 *) e, &advance); + } + else { + cp = * (const U8 *) t; /* Not plain '*t': 'char' may be signed */ + advance = 1; + } + if (isPRINT(cp)) { if (! prev_was_printable) { my_strlcat(ret, " ", size); @@ -966,7 +973,7 @@ S_get_displayable_string(pTHX_ my_strlcat(ret, form("%02" UVXf, cp), size); prev_was_printable = FALSE; } - t += (is_utf8) ? 
UTF8SKIP(t) : 1; + t += advance; first_time = FALSE; } From 1529196bdf6ce25b6266c74d77bba798907b5eb9 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 08:57:47 -0600 Subject: [PATCH 3/5] regexec.c: regrepeat: Avoid UTF8SKIPs This value is returnable from the called functions. Use it instead of re-deriving it --- regexec.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/regexec.c b/regexec.c index d258965529c9..1d865a6da3f1 100644 --- a/regexec.c +++ b/regexec.c @@ -10461,6 +10461,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, switch (with_t_UTF8ness(OP(p), utf8_target)) { SV * anyofh_list; + Size_t advance; case REG_ANY_t8: while (scan < this_eol && hardcount < max && *scan != '\n') { @@ -10746,9 +10747,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, && _invlist_contains_cp(anyofh_list, utf8_to_uv_or_die((U8 *) scan, (U8 *) this_eol, - NULL))) + &advance))) { - scan += UTF8SKIP(scan); + scan += advance; hardcount++; } break; @@ -10762,9 +10763,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, && _invlist_contains_cp(anyofh_list, utf8_to_uv_or_die((U8 *) scan, (U8 *) this_eol, - NULL))) + &advance))) { - scan += UTF8SKIP(scan); + scan += advance; hardcount++; } break; @@ -10792,9 +10793,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, && _invlist_contains_cp(anyofh_list, utf8_to_uv_or_die((U8 *) scan, (U8 *) this_eol, - NULL))) + &advance))) { - scan += UTF8SKIP(scan); + scan += advance; hardcount++; } break; @@ -10807,9 +10808,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, && _invlist_contains_cp(anyofh_list, utf8_to_uv_or_die((U8 *) scan, (U8 *) this_eol, - NULL))) + &advance))) { - scan += UTF8SKIP(scan); + scan += advance; hardcount++; } break; @@ -10820,10 +10821,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, && NATIVE_UTF8_TO_I8(*scan) >= 
ANYOF_FLAGS(p) && withinCOUNT(utf8_to_uv_or_die((U8 *) scan, (U8 *) this_eol, - NULL), + &advance), ANYOFRbase(p), ANYOFRdelta(p))) { - scan += UTF8SKIP(scan); + scan += advance; hardcount++; } break; @@ -10844,10 +10845,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, && (U8) *scan == ANYOF_FLAGS(p) && withinCOUNT(utf8_to_uv_or_die((U8 *) scan, (U8 *) this_eol, - NULL), + &advance), ANYOFRbase(p), ANYOFRdelta(p))) { - scan += UTF8SKIP(scan); + scan += advance; hardcount++; } break; From 8b8dbd7aa0d29fc609818a73a55a877c955572d8 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 09:46:55 -0600 Subject: [PATCH 4/5] toke.c: Remove unnecessary code If a character matches isSPACE, it must be an ASCII character, and hence its length is 1. No need to check for and call UTF8SKIP on it. --- toke.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toke.c b/toke.c index 4de39efd1bc3..99316451a606 100644 --- a/toke.c +++ b/toke.c @@ -730,7 +730,7 @@ S_warn_expect_operator(pTHX_ const char *const what, char *s, I32 pop_oldbufptr) const char *t= oldbp; assert(s >= oldbp); while (t < s && isSPACE(*t)) { - t += UTF ? UTF8SKIP(t) : 1; + t++; } sv_catpvf(message, @@ -5390,7 +5390,7 @@ yyl_dollar(pTHX_ char *s) while ( t < PL_bufend ) { if (isSPACE(*t)) { - do { t += UTF ? UTF8SKIP(t) : 1; } while (t < PL_bufend && isSPACE(*t)); + do { t++; } while (t < PL_bufend && isSPACE(*t)); /* consumed one or more space chars */ } else if (*t == '$' || *t == '@') { /* could be more than one '$' like $$ref or @$ref */ From ea90d4213fb553770915e9a952bf474e848478b3 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 8 Sep 2025 09:50:22 -0600 Subject: [PATCH 5/5] toke.c: Avoid some UTF8SKIPs This value is returnable from the called functions. 
Use it instead of re-deriving it --- toke.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/toke.c b/toke.c index 99316451a606..62f427bf67c3 100644 --- a/toke.c +++ b/toke.c @@ -2964,14 +2964,15 @@ Perl_get_and_check_backslash_N_name(pTHX_ const char* s, s += 2; } else { + Size_t advance; if (! _invlist_contains_cp(PL_utf8_charname_begin, utf8_to_uv_or_die((const U8 *) s, (const U8 *) e, - NULL))) + &advance))) { goto bad_charname; } - s += UTF8SKIP(s); + s += advance; } while (s < e) { @@ -2992,14 +2993,15 @@ Perl_get_and_check_backslash_N_name(pTHX_ const char* s, s += 2; } else { + Size_t advance; if (! _invlist_contains_cp(PL_utf8_charname_continue, utf8_to_uv_or_die((const U8 *) s, (const U8 *) e, - NULL))) + &advance))) { goto bad_charname; } - s += UTF8SKIP(s); + s += advance; } } }