From 592b07989742a430085037172d210461e15e978c Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 07:11:40 -0600
Subject: [PATCH 01/11] handy.h: Swap order of conditionals for clarity

This moves the trivial case to before the complicated one, which is
easier to comprehend.  And instead of complementing the conditional, use
a different name (that evaluates to that complement) which makes it
clearer what's going on.
---
 handy.h | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/handy.h b/handy.h
index 7b91b99a1c7c..05ffddf4cd07 100644
--- a/handy.h
+++ b/handy.h
@@ -2259,23 +2259,23 @@ END_EXTERN_C
       ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)\
       : (UTF8_IS_INVARIANT(*(p)))                                           \
           ? generic_isCC_(*(p), classnum)                                   \
-          : (UTF8_IS_DOWNGRADEABLE_START(*(p))                              \
-             ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
+          : (UTF8_IS_ABOVE_LATIN1_START(*(p))                               \
+             ? above_latin1                                                 \
+             : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
                 ? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )),  \
                                 classnum)                                   \
                 : (force_out_malformed_utf8_message_(                       \
-                                        (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))  \
-             : above_latin1))
+                                        (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))))
 /* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value.
  * 'above_latin1' can be a macro */
 #define generic_func_utf8_safe_(classnum, above_latin1, p, e)               \
                     generic_utf8_safe_(classnum, p, e, above_latin1(p, e))
 #define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e)        \
           generic_utf8_safe_(classnum, p, e,                                \
-                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
-                              ? (force_out_malformed_utf8_message_(         \
-                                      (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
-                              : above_latin1(p)))
+                             (LIKELY((e) - (p) >= UTF8SKIP(p))              \
+                              ? above_latin1(p)                             \
+                              : (force_out_malformed_utf8_message_(         \
+                                      (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)))
 /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an
  * 'above_latin1' parameter */
 #define generic_invlist_utf8_safe_(classnum, p, e)                          \
@@ -2289,10 +2289,9 @@ END_EXTERN_C
          (assert(utf8_safe_assert_(p, e)),                                  \
          (isASCII(*(p)))                                                    \
           ? generic_isCC_(*(p), classnum)                                   \
-          : (UTF8_IS_DOWNGRADEABLE_START(*(p)))                             \
-             ? 0 /* Note that doesn't check validity for latin1 */          \
-             : above_latin1)
-
+          : (UTF8_IS_ABOVE_LATIN1_START(*(p))                               \
+             ? above_latin1                                                 \
+             : 0)) /* Note that doesn't check validity for latin1 */
 
 #define isALPHA_utf8(p, e)         isALPHA_utf8_safe(p, e)
 #define isALPHANUMERIC_utf8(p, e)  isALPHANUMERIC_utf8_safe(p, e)
@@ -2409,12 +2408,12 @@ END_EXTERN_C
          (assert_(utf8_safe_assert_(p, e))                                \
          (UTF8_IS_INVARIANT(*(p)))                                          \
           ? macro(*(p))                                                     \
-          : (UTF8_IS_DOWNGRADEABLE_START(*(p))                              \
-             ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
+          : (UTF8_IS_ABOVE_LATIN1_START(*(p))                               \
+             ? above_latin1                                                 \
+             : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
                 ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1)))           \
                 : (force_out_malformed_utf8_message_(                       \
-                                        (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \
-              : above_latin1))
+                                        (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))))
 
 #define generic_LC_invlist_utf8_safe_(macro, classnum, p, e)                  \
             generic_LC_utf8_safe_(macro, p, e,                              \
@@ -2425,10 +2424,10 @@ END_EXTERN_C
 
 #define generic_LC_non_invlist_utf8_safe_(classnum, above_latin1, p, e)       \
           generic_LC_utf8_safe_(classnum, p, e,                             \
-                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
-                              ? (force_out_malformed_utf8_message_(         \
-                                      (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
-                              : above_latin1(p)))
+                             (LIKELY((e) - (p) >= UTF8SKIP(p))              \
+                              ? above_latin1(p)                             \
+                              : (force_out_malformed_utf8_message_(         \
+                                      (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)))
 
 #define isALPHANUMERIC_LC_utf8_safe(p, e)                                   \
             generic_LC_invlist_utf8_safe_(isALPHANUMERIC_LC,                \

From fd90e73dc106320553eb8035da107911fdb0ee57 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 07:32:52 -0600
Subject: [PATCH 02/11] handy.h: White space only

This cleans up some ragged edges and makes things fit in 80 columns.
---
 handy.h | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/handy.h b/handy.h
index 05ffddf4cd07..30ba859ca841 100644
--- a/handy.h
+++ b/handy.h
@@ -2256,7 +2256,8 @@ END_EXTERN_C
 
 #define generic_utf8_safe_(classnum, p, e, above_latin1)                    \
     ((! utf8_safe_assert_(p, e))                                            \
-      ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)\
+      ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0,       \
+                                           MALFORMED_UTF8_DIE), 0)          \
       : (UTF8_IS_INVARIANT(*(p)))                                           \
           ? generic_isCC_(*(p), classnum)                                   \
           : (UTF8_IS_ABOVE_LATIN1_START(*(p))                               \
@@ -2264,18 +2265,25 @@ END_EXTERN_C
              : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
                 ? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )),  \
                                 classnum)                                   \
-                : (force_out_malformed_utf8_message_(                       \
-                                        (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))))
+                : (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e),\
+                                                     0, MALFORMED_UTF8_DIE),\
+                   0))))
+
 /* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value.
  * 'above_latin1' can be a macro */
 #define generic_func_utf8_safe_(classnum, above_latin1, p, e)               \
                     generic_utf8_safe_(classnum, p, e, above_latin1(p, e))
+
 #define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e)        \
-          generic_utf8_safe_(classnum, p, e,                                \
-                             (LIKELY((e) - (p) >= UTF8SKIP(p))              \
-                              ? above_latin1(p)                             \
-                              : (force_out_malformed_utf8_message_(         \
-                                      (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)))
+    generic_utf8_safe_(classnum, p, e,                                      \
+                       (LIKELY((e) - (p) >= UTF8SKIP(p))                    \
+                        ? above_latin1(p)                                   \
+                        : (force_out_malformed_utf8_message_(               \
+                                                        (U8 *) (p),         \
+                                                        (U8 *) (e),         \
+                                                        0,                  \
+                                                        MALFORMED_UTF8_DIE),\
+                           0)))
 /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an
  * 'above_latin1' parameter */
 #define generic_invlist_utf8_safe_(classnum, p, e)                          \
@@ -2405,7 +2413,7 @@ END_EXTERN_C
  * point in 'p' is within the 0-255 range, it uses locale rules from the
  * passed-in 'macro' parameter */
 #define generic_LC_utf8_safe_(macro, p, e, above_latin1)                    \
-         (assert_(utf8_safe_assert_(p, e))                                \
+         (assert_(utf8_safe_assert_(p, e))                                  \
          (UTF8_IS_INVARIANT(*(p)))                                          \
           ? macro(*(p))                                                     \
           : (UTF8_IS_ABOVE_LATIN1_START(*(p))                               \

From 04b84bef96af5f486940108b39dda6a250cd02f4 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 07:43:19 -0600
Subject: [PATCH 03/11] handy.h: Remove unnecessary cast

The && in this expression already makes the result a boolean; no need to
cast it to such.  Removing it allows the entire expression to fit on one
line.
---
 handy.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/handy.h b/handy.h
index 30ba859ca841..fe7eef30240f 100644
--- a/handy.h
+++ b/handy.h
@@ -1595,8 +1595,8 @@ END_EXTERN_C
 
     /* For internal core Perl use only: the base macro for defining macros like
      * isALPHA */
-#   define generic_isCC_(c, classnum) cBOOL(FITS_IN_8_BITS(c)    \
-                && (PL_charclass[(U8) (c)] & CC_mask_(classnum)))
+#   define generic_isCC_(c, classnum)                                       \
+       (FITS_IN_8_BITS(c) && (PL_charclass[(U8) (c)] & CC_mask_(classnum)))
 
     /* The mask for the _A versions of the macros; it just adds in the bit for
      * ASCII. */

From e5ef7eb18606d03f484aa871661b6dd569ba1a74 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 06:23:54 -0600
Subject: [PATCH 04/11] utf8.c: Replace macro by a static function

This will be useful in the next commits.
---
 utf8.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/utf8.c b/utf8.c
index b0922d138470..33028dd057ed 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3903,15 +3903,18 @@ S_warn_on_first_deprecated_use(pTHX_ U32 category,
 /* returns a boolean giving whether or not the UTF8-encoded character that
  * starts at <p>, and extending no further than <e - 1> is in the inversion
  * list <invlist>. */
-#define IS_UTF8_IN_INVLIST(p, e, invlist)                                   \
-            _invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, NULL))
+STATIC bool
+S_is_utf8_in_invlist(pTHX_ const U8 * p, const U8 * e, SV * const invlist)
+{
+    return _invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, NULL));
+}
 
 bool
 Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
 {
     PERL_ARGS_ASSERT_IS_UTF8_FOO_;
 
-    return IS_UTF8_IN_INVLIST(p, e, PL_XPosix_ptrs[classnum]);
+    return S_is_utf8_in_invlist(aTHX_ p, e, PL_XPosix_ptrs[classnum]);
 }
 
 bool
@@ -3919,7 +3922,7 @@ Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e)
 {
     PERL_ARGS_ASSERT_IS_UTF8_PERL_IDSTART_;
 
-    return IS_UTF8_IN_INVLIST(p, e, PL_utf8_perl_idstart);
+    return S_is_utf8_in_invlist(aTHX_ p, e, PL_utf8_perl_idstart);
 }
 
 bool
@@ -3927,7 +3930,7 @@ Perl_is_utf8_perl_idcont_(pTHX_ const U8 *p, const U8 * const e)
 {
     PERL_ARGS_ASSERT_IS_UTF8_PERL_IDCONT_;
 
-    return IS_UTF8_IN_INVLIST(p, e, PL_utf8_perl_idcont);
+    return S_is_utf8_in_invlist(aTHX_ p, e, PL_utf8_perl_idcont);
 }
 
 STATIC UV

From 8994f610a8ee16d489ca58580f97308a3629ea2e Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 06:41:01 -0600
Subject: [PATCH 05/11] Generalize 3 functions to return length on success

Instead of a bool, they now return the number of bytes that comprise the
character being checked when it matches, or 0 when it doesn't.  So the
result can be used as a bool, just as before; or the extra information can
save recalculations, as is done in future commits.
---
 embed.fnc |  6 +++---
 proto.h   |  6 +++---
 utf8.c    | 21 +++++++++++++--------
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 99c664b8a5b8..4eb1df93843a 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1753,17 +1753,17 @@ ATdip	|bool	|is_utf8_fixed_width_buf_loclen_flags			\
 				|NULLOK const U8 **ep			\
 				|NULLOK STRLEN *el			\
 				|const U32 flags
-CRp	|bool	|is_utf8_FOO_	|const U8 classnum			\
+CRp	|Size_t |is_utf8_FOO_	|const U8 classnum			\
 				|NN const U8 *p 			\
 				|NN const U8 * const e
 ARTdip	|bool	|is_utf8_invariant_string_loc				\
 				|NN const U8 * const s			\
 				|STRLEN len				\
 				|NULLOK const U8 **ep
-CRp	|bool	|is_utf8_perl_idcont_					\
+CRp	|Size_t |is_utf8_perl_idcont_					\
 				|NN const U8 *p 			\
 				|NN const U8 * const e
-CRp	|bool	|is_utf8_perl_idstart_					\
+CRp	|Size_t |is_utf8_perl_idstart_					\
 				|NN const U8 *p 			\
 				|NN const U8 * const e
 ARTdmp	|bool	|is_utf8_string |NN const U8 *s 			\
diff --git a/proto.h b/proto.h
index 41cd957f7829..2a9e2e05227b 100644
--- a/proto.h
+++ b/proto.h
@@ -1850,7 +1850,7 @@ Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, const bool requ
 #define PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_     \
         assert(s0); assert(e)
 
-PERL_CALLCONV bool
+PERL_CALLCONV Size_t
 Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
         __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_IS_UTF8_FOO_           \
@@ -1872,13 +1872,13 @@ Perl_is_utf8_fixed_width_buf_flags(const U8 * const s, STRLEN len, const U32 fla
 /* PERL_CALLCONV bool
 Perl_is_utf8_fixed_width_buf_loc_flags(const U8 * const s, STRLEN len, const U8 **ep, const U32 flags); */
 
-PERL_CALLCONV bool
+PERL_CALLCONV Size_t
 Perl_is_utf8_perl_idcont_(pTHX_ const U8 *p, const U8 * const e)
         __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_IS_UTF8_PERL_IDCONT_   \
         assert(p); assert(e)
 
-PERL_CALLCONV bool
+PERL_CALLCONV Size_t
 Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e)
         __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_IS_UTF8_PERL_IDSTART_  \
diff --git a/utf8.c b/utf8.c
index 33028dd057ed..e9b84200917c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3900,16 +3900,21 @@ S_warn_on_first_deprecated_use(pTHX_ U32 category,
 }
 #endif
 
-/* returns a boolean giving whether or not the UTF8-encoded character that
- * starts at <p>, and extending no further than <e - 1> is in the inversion
- * list <invlist>. */
-STATIC bool
+/* Returns the number of bytes comprising the UTF-8 encoded character that
+ * starts at <p> and extends no further than <e - 1>, if that character is in
+ * the inversion list <invlist>; otherwise returns 0. */
+STATIC Size_t
 S_is_utf8_in_invlist(pTHX_ const U8 * p, const U8 * e, SV * const invlist)
 {
-    return _invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, NULL));
+    Size_t advance;
+    if (_invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, &advance))) {
+        return advance;
+    }
+
+    return 0;
 }
 
-bool
+Size_t
 Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
 {
     PERL_ARGS_ASSERT_IS_UTF8_FOO_;
@@ -3917,7 +3922,7 @@ Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
     return S_is_utf8_in_invlist(aTHX_ p, e, PL_XPosix_ptrs[classnum]);
 }
 
-bool
+Size_t
 Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e)
 {
     PERL_ARGS_ASSERT_IS_UTF8_PERL_IDSTART_;
@@ -3925,7 +3930,7 @@ Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e)
     return S_is_utf8_in_invlist(aTHX_ p, e, PL_utf8_perl_idstart);
 }
 
-bool
+Size_t
 Perl_is_utf8_perl_idcont_(pTHX_ const U8 *p, const U8 * const e)
 {
     PERL_ARGS_ASSERT_IS_UTF8_PERL_IDCONT_;

From f6f4bce1257b0f345dacd550f0923a439c5e6e6d Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 08:43:20 -0600
Subject: [PATCH 06/11] Change isFOO_utf8_safe macros to return matched byte
 length

Or 0 when the character isn't of type FOO.  This allows these macros to
be used as booleans, as previously; or to give you how many bytes there
are in the matched UTF-8 character.

This was always trivially the case for ASCII-range characters, as the
former boolean 0,1 gave you the correct length if they matched.

The previous commit extended this to return the length for above-Latin1
characters.

This commit is the final piece.  Latin1 characters that aren't ASCII are
always two bytes long in UTF-8.  So just multiply the boolean result by 2,
yielding 0 if there is no match, or 2 (the byte length) if there is.
---
 handy.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/handy.h b/handy.h
index fe7eef30240f..fec7d83e3bd5 100644
--- a/handy.h
+++ b/handy.h
@@ -2263,8 +2263,10 @@ END_EXTERN_C
           : (UTF8_IS_ABOVE_LATIN1_START(*(p))                               \
              ? above_latin1                                                 \
              : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
-                ? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )),  \
-                                classnum)                                   \
+                  /* Multiply by 2 to return byte length of matched         \
+                   * character */                                           \
+                ? 2 * generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p),*((p)+1)),\
+                                    classnum)                               \
                 : (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e),\
                                                      0, MALFORMED_UTF8_DIE),\
                    0))))

From ba197ce98ad8fba8a32ed07a3de13d67cefd2bb2 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 09:05:06 -0600
Subject: [PATCH 07/11] class.c: Avoid UTF8SKIPs

This value is now returned from the isSPACE_utf8_safe macro.  Use it
instead of re-deriving it.
---
 class.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/class.c b/class.c
index f731b986dc00..871a68e752bd 100644
--- a/class.c
+++ b/class.c
@@ -445,8 +445,9 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers
     if(SvUTF8(value))
         SvUTF8_on(pkgname);
 
-    while(*p && isSPACE_utf8_safe(p, end))
-        p += UTF8SKIP(p);
+    Size_t advance;
+    while(*p && (advance = isSPACE_utf8_safe(p, end)))
+        p += advance;
 
     if(*p) {
         /* scan_version() gets upset about trailing content. We need to extract
@@ -463,8 +464,8 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers
         scan_version(SvPVX(tmpsv), pkgversion, FALSE);
     }
 
-    while(*p && isSPACE_utf8_safe(p, end))
-        p += UTF8SKIP(p);
+    while(*p && (advance = isSPACE_utf8_safe(p, end)))
+        p += advance;
 
     return p;
 }

From b784fff78ecf674fce86b23dc095f1e1c798fac6 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 09:19:11 -0600
Subject: [PATCH 08/11] pp_ctl.c: Avoid UTF8SKIPs

This value is now returned from the isID(FIRST|CONT)_utf8_safe macros.
Use it instead of re-deriving it.
---
 pp_ctl.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/pp_ctl.c b/pp_ctl.c
index 5cfd919e6b6d..d713cdc0cb1c 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -5192,11 +5192,17 @@ S_require_file(pTHX_ SV *sv)
                          S_parse_ident */
                         c = name;
                         while (c < e) {
-                            if (utf8 && isIDFIRST_utf8_safe(c, e)) {
-                                c += UTF8SKIP(c);
-                                while (c < e && isIDCONT_utf8_safe(
-                                            (const U8*) c, (const U8*) e))
-                                    c += UTF8SKIP(c);
+                            Size_t advance;
+
+                            if (utf8 && (advance = isIDFIRST_utf8_safe(c, e)))
+                            {
+                                c += advance;
+                                while (   c < e
+                                       && (advance = isIDCONT_utf8_safe(
+                                                (const U8*) c, (const U8*) e)))
+                                {
+                                    c += advance;
+                                }
                             }
                             else if (isWORDCHAR_A(*c)) {
                                 while (c < e && isWORDCHAR_A(*c))

From d16ee554ab016dd85c9e13c18802101d3cc3f9e5 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 09:36:00 -0600
Subject: [PATCH 09/11] regcomp.c: Avoid UTF8SKIPs

This value is now returned from the isIDFIRST_lazy_if_safe macro and from
the isWORDCHAR macro used for the remainder of the name.  Use it instead of
re-deriving it.  This also simplifies the code.
---
 regcomp.c          | 13 +++++--------
 regcomp_internal.h |  8 ++++++++
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/regcomp.c b/regcomp.c
index c76a467bcd93..3417b3ea0bbb 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2522,19 +2522,16 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
     PERL_ARGS_ASSERT_REG_SCAN_NAME;
 
     assert (RExC_parse <= RExC_end);
+    Size_t advance;
     if (RExC_parse == RExC_end) NOOP;
-    else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
+    else if ((advance = isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF))) {
          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
           * using do...while */
-        if (UTF)
             do {
-                RExC_parse_inc_utf8();
+                RExC_parse_advance(advance);
             } while (   RExC_parse < RExC_end
-                     && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
-        else
-            do {
-                RExC_parse_inc_by(1);
-            } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
+                     && (advance = isWORDCHAR_lazy_if_safe(RExC_parse,
+                                                           RExC_end, UTF)));
     } else {
         RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
                          character */
diff --git a/regcomp_internal.h b/regcomp_internal.h
index 60ec13777a61..27e543d80be7 100644
--- a/regcomp_internal.h
+++ b/regcomp_internal.h
@@ -279,6 +279,14 @@ struct RExC_state_t {
  * output during the parse process.
  */
 
+/* RExC_parse_advance(count)
+ *
+ * Increment RExC_parse to point at the next codepoint, when we *know* that the
+ * correct byte count is in the passed parameter */
+#define RExC_parse_advance(count) STMT_START {          \
+    RExC_parse += (count);                              \
+} STMT_END
+
 /* RExC_parse_incf(flag)
  *
  * Increment RExC_parse to point at the next codepoint, while doing

From 70a8504dae5b69e838d20256829d9800024b0fa2 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 09:43:01 -0600
Subject: [PATCH 10/11] regcomp.c: White space only

The previous commit removed a surrounding block; outdent correspondingly
---
 regcomp.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/regcomp.c b/regcomp.c
index 3417b3ea0bbb..1831effb0dba 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2527,11 +2527,11 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
     else if ((advance = isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF))) {
          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
           * using do...while */
-            do {
-                RExC_parse_advance(advance);
-            } while (   RExC_parse < RExC_end
-                     && (advance = isWORDCHAR_lazy_if_safe(RExC_parse,
-                                                           RExC_end, UTF)));
+        do {
+            RExC_parse_advance(advance);
+        } while (   RExC_parse < RExC_end
+                 && (advance = isWORDCHAR_lazy_if_safe(RExC_parse,
+                                                       RExC_end, UTF)));
     } else {
         RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
                          character */

From 8887d78912ec1e48b1ba17f259316fcbd34fd81e Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 8 Sep 2025 10:13:58 -0600
Subject: [PATCH 11/11] toke.c: Avoid UTF8SKIPs

This value is now returned from the isWORDCHAR_lazy_if_safe,
isIDFIRST_lazy_if_safe, isIDFIRST_utf8_safe, and isIDCONT_utf8_safe macros.
Use it instead of re-deriving it.
---
 toke.c | 81 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 30 deletions(-)

diff --git a/toke.c b/toke.c
index 62f427bf67c3..3a3c0e7a2168 100644
--- a/toke.c
+++ b/toke.c
@@ -2088,8 +2088,10 @@ S_check_uni(pTHX)
     while (isSPACE(*PL_last_uni))
         PL_last_uni++;
     s = PL_last_uni;
-    while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
-        s += UTF ? UTF8SKIP(s) : 1;
+    Size_t advance;
+    while (   (advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
+           || (advance = (*s == '-')))
+        s += advance;
     if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
         return;
 
@@ -5193,10 +5195,11 @@ S_check_scalar_slice(pTHX_ char *s)
     {
         return;
     }
-    while (    isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)
-           || (*s && memCHRs(" \t$#+-'\"", *s)))
+    Size_t advance;
+    while (   (advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
+           || (advance = (*s && memCHRs(" \t$#+-'\"", *s))))
     {
-        s += UTF ? UTF8SKIP(s) : 1;
+        s += advance;
     }
     if (*s == '}' || *s == ']')
         pl_yylval.ival = OPpSLICEWARNING;
@@ -5402,8 +5405,11 @@ yyl_dollar(pTHX_ char *s)
                             while (t < PL_bufend && *t == ' ') t++;
 
                             /* strip off the name of the var */
-                            while (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
-                                t += UTF ? UTF8SKIP(t) : 1;
+                            Size_t advance;
+                            while ((advance = (isWORDCHAR_lazy_if_safe(t,
+                                                                    PL_bufend,
+                                                                    UTF))))
+                                t += advance;
                             /* consumed a varname */
                         } else if (isDIGIT(*t)) {
                             /* deal with hex constants like 0x11 */
@@ -6407,6 +6413,7 @@ yyl_leftcurly(pTHX_ char *s, const U8 formbrack)
              * GSAR 97-07-21
              */
             t = s;
+            Size_t advance;
             if (*s == '\'' || *s == '"' || *s == '`') {
                 /* common case: get past first string, handling escapes */
                 for (t++; t < PL_bufend && *t != *s;)
@@ -6455,20 +6462,24 @@ yyl_leftcurly(pTHX_ char *s, const U8 formbrack)
                     }
                     t++;
                 }
-                else
+                else {
                     /* skip plain q word */
-                    while (   t < PL_bufend
-                           && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
+                    while (   t < PL_bufend
+                           && (advance = isWORDCHAR_lazy_if_safe(t,
+                                                                 PL_bufend,
+                                                                 UTF)))
                     {
-                        t += UTF ? UTF8SKIP(t) : 1;
+                        t += advance;
                     }
+                }
             }
-            else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) {
-                t += UTF ? UTF8SKIP(t) : 1;
+            else if ((advance = isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))) {
+                t += advance;
                 while (   t < PL_bufend
-                       && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
+                       && (advance = isWORDCHAR_lazy_if_safe(t, PL_bufend,
+                                                             UTF)))
                 {
-                    t += UTF ? UTF8SKIP(t) : 1;
+                    t += advance;
                 }
             }
             while (t < PL_bufend && isSPACE(*t))
@@ -10125,11 +10136,12 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
         s++;
     while (s < PL_bufend && isSPACE(*s))
         s++;
-    if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
+    Size_t advance;
+    if ((advance = isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))) {
         const char * const w = s;
-        s += UTF ? UTF8SKIP(s) : 1;
-        while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
-            s += UTF ? UTF8SKIP(s) : 1;
+        s += advance;
+        while ((advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)))
+            s += advance;
         while (s < PL_bufend && isSPACE(*s))
             s++;
         if (*s == ',') {
@@ -10281,15 +10293,18 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
     while (*s < PL_bufend) {
         if (*d >= e)
             croak("%s", ident_too_long);
-        if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) {
+        Size_t advance;
+        if (is_utf8 && (advance = isIDFIRST_utf8_safe(*s, PL_bufend))) {
              /* The UTF-8 case must come first, otherwise things
              * like c\N{COMBINING TILDE} would start failing, as the
              * isWORDCHAR_A case below would gobble the 'c' up.
              */
 
-            char *t = *s + UTF8SKIP(*s);
-            while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) {
-                t += UTF8SKIP(t);
+            char *t = *s + advance;
+            while ((advance = isIDCONT_utf8_safe((const U8*) t,
+                                                 (const U8*) PL_bufend)))
+            {
+                t += advance;
             }
             if (*d + (t - *s) > e)
                 croak("%s", ident_too_long);
@@ -10496,11 +10511,12 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
             /* note we have to check for a normal identifier first,
              * as it handles utf8 symbols, and only after that has
              * been ruled out can we look at the caret words */
-            if (isIDFIRST_lazy_if_safe(d, e, is_utf8) ) {
+            Size_t advance;
+            if ((advance = isIDFIRST_lazy_if_safe(d, e, is_utf8) )) {
                 /* if it starts as a valid identifier, assume that it is one.
                    (the later check for } being at the expected point will trap
                    cases where this doesn't pan out.)  */
-                d += is_utf8 ? UTF8SKIP(d) : 1;
+                d += advance;
                 parse_ident(&s, &d, e, 1, is_utf8, TRUE);
                 *d = '\0';
             }
@@ -10998,8 +11014,9 @@ S_scan_heredoc(pTHX_ char *s)
 
         peek = s;
 
-        while (isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF)) {
-            peek += UTF ? UTF8SKIP(peek) : 1;
+        Size_t advance;
+        while ((advance = isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))) {
+            peek += advance;
         }
 
         len = (peek - s >= e - d) ? (e - d) : (peek - s);
@@ -11442,9 +11459,13 @@ S_scan_inputsymbol(pTHX_ char *start)
     if (*d == '$' && d[1]) d++;
 
     /* allow <Pkg'VALUE> or <Pkg::VALUE> */
-    while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == ':'
-           || (*d == '\'' && FEATURE_APOS_AS_NAME_SEP_IS_ENABLED)) {
-        d += UTF ? UTF8SKIP(d) : 1;
+    Size_t advance;
+    while (   (advance = isWORDCHAR_lazy_if_safe(d, e, UTF))
+           || (advance = (   *d == ':'
+                          || (   *d == '\''
+                              && FEATURE_APOS_AS_NAME_SEP_IS_ENABLED))))
+    {
+        d += advance;
     }
 
     /* If we've tried to read what we allow filehandles to look like, and