Perl · khwilliamson · Sep 15, 2025 · Sep 8, 2025 · Sep 8, 2025 · Sep 8, 2025
diff --git a/class.c b/class.c
@@ -445,8 +445,9 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers
     if(SvUTF8(value))
         SvUTF8_on(pkgname);
 
-    while(*p && isSPACE_utf8_safe(p, end))
-        p += UTF8SKIP(p);
+    Size_t advance;
+    while(*p && (advance = isSPACE_utf8_safe(p, end)))
+        p += advance;
 
     if(*p) {
         /* scan_version() gets upset about trailing content. We need to extract
@@ -463,8 +464,8 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers
         scan_version(SvPVX(tmpsv), pkgversion, FALSE);
     }
 
-    while(*p && isSPACE_utf8_safe(p, end))
-        p += UTF8SKIP(p);
+    while(*p && (advance = isSPACE_utf8_safe(p, end)))
+        p += advance;
 
     return p;
 }

diff --git a/embed.fnc b/embed.fnc
@@ -1753,17 +1753,17 @@ ATdip	|bool	|is_utf8_fixed_width_buf_loclen_flags			\
 				|NULLOK const U8 **ep			\
 				|NULLOK STRLEN *el			\
 				|const U32 flags
-CRp	|bool	|is_utf8_FOO_	|const U8 classnum			\
+CRp	|Size_t |is_utf8_FOO_	|const U8 classnum			\
 				|NN const U8 *p 			\
 				|NN const U8 * const e
 ARTdip	|bool	|is_utf8_invariant_string_loc				\
 				|NN const U8 * const s			\
 				|STRLEN len				\
 				|NULLOK const U8 **ep
-CRp	|bool	|is_utf8_perl_idcont_					\
+CRp	|Size_t |is_utf8_perl_idcont_					\
 				|NN const U8 *p 			\
 				|NN const U8 * const e
-CRp	|bool	|is_utf8_perl_idstart_					\
+CRp	|Size_t |is_utf8_perl_idstart_					\
 				|NN const U8 *p 			\
 				|NN const U8 * const e
 ARTdmp	|bool	|is_utf8_string |NN const U8 *s 			\

diff --git a/handy.h b/handy.h
@@ -1595,8 +1595,8 @@ END_EXTERN_C
 
     /* For internal core Perl use only: the base macro for defining macros like
      * isALPHA */
-#   define generic_isCC_(c, classnum) cBOOL(FITS_IN_8_BITS(c)    \
-                && (PL_charclass[(U8) (c)] & CC_mask_(classnum)))
+#   define generic_isCC_(c, classnum)                                       \
+       (FITS_IN_8_BITS(c) && (PL_charclass[(U8) (c)] & CC_mask_(classnum)))
 
     /* The mask for the _A versions of the macros; it just adds in the bit for
      * ASCII. */
@@ -2256,26 +2256,36 @@ END_EXTERN_C
 
 #define generic_utf8_safe_(classnum, p, e, above_latin1)                    \
     ((! utf8_safe_assert_(p, e))                                            \
-      ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)\
+      ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0,       \
+                                           MALFORMED_UTF8_DIE), 0)          \
       : (UTF8_IS_INVARIANT(*(p)))                                           \
           ? generic_isCC_(*(p), classnum)                                   \
-          : (UTF8_IS_DOWNGRADEABLE_START(*(p))                              \
-             ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
-                ? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )),  \
-                                classnum)                                   \
-                : (force_out_malformed_utf8_message_(                       \
-                                        (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))  \
-             : above_latin1))
+          : (UTF8_IS_ABOVE_LATIN1_START(*(p))                               \
+             ? above_latin1                                                 \
+             : ((LIKELY(   UTF8_IS_DOWNGRADEABLE_START(*(p))                \
+                        && (e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))\
+                  /* A genuine 2-byte start (strays croak below); x2 turns  \
+                   * the 0/1 class test into the matched byte length */     \
+                ? 2 * generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p),*((p)+1)),\
+                                    classnum)                               \
+                : (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e),\
+                                               0, MALFORMED_UTF8_DIE), 0))))
+
 /* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value.
  * 'above_latin1' can be a macro */
 #define generic_func_utf8_safe_(classnum, above_latin1, p, e)               \
                     generic_utf8_safe_(classnum, p, e, above_latin1(p, e))
+
 #define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e)        \
-          generic_utf8_safe_(classnum, p, e,                                \
-                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
-                              ? (force_out_malformed_utf8_message_(         \
-                                      (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
-                              : above_latin1(p)))
+    generic_utf8_safe_(classnum, p, e,                                      \
+                       (LIKELY((e) - (p) >= UTF8SKIP(p))                    \
+                        ? above_latin1(p)                                   \
+                        : (force_out_malformed_utf8_message_(               \
+                                                        (U8 *) (p),         \
+                                                        (U8 *) (e),         \
+                                                        0,                  \
+                                                        MALFORMED_UTF8_DIE),\
+                           0)))
 /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an
  * 'above_latin1' parameter */
 #define generic_invlist_utf8_safe_(classnum, p, e)                          \
@@ -2289,10 +2299,9 @@ END_EXTERN_C
          (assert(utf8_safe_assert_(p, e)),                                  \
          (isASCII(*(p)))                                                    \
           ? generic_isCC_(*(p), classnum)                                   \
-          : (UTF8_IS_DOWNGRADEABLE_START(*(p)))                             \
-             ? 0 /* Note that doesn't check validity for latin1 */          \
-             : above_latin1)
-
+          : (UTF8_IS_ABOVE_LATIN1_START(*(p))                               \
+             ? above_latin1                                                 \
+             : 0)) /* Note that doesn't check validity for latin1 */
 
 #define isALPHA_utf8(p, e)         isALPHA_utf8_safe(p, e)
 #define isALPHANUMERIC_utf8(p, e)  isALPHANUMERIC_utf8_safe(p, e)
@@ -2406,15 +2415,15 @@ END_EXTERN_C
  * point in 'p' is within the 0-255 range, it uses locale rules from the
  * passed-in 'macro' parameter */
 #define generic_LC_utf8_safe_(macro, p, e, above_latin1)                    \
-         (assert_(utf8_safe_assert_(p, e))                                \
+         (assert_(utf8_safe_assert_(p, e))                                  \
          (UTF8_IS_INVARIANT(*(p)))                                          \
           ? macro(*(p))                                                     \
-          : (UTF8_IS_DOWNGRADEABLE_START(*(p))                              \
-             ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1))))   \
+          : (UTF8_IS_ABOVE_LATIN1_START(*(p)) ? above_latin1                \
+             /* Insist on a true 2-byte start so stray bytes croak below */ \
+             : ((LIKELY(UTF8_IS_DOWNGRADEABLE_START(*(p)) && (e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
                 ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1)))           \
                 : (force_out_malformed_utf8_message_(                       \
-                                        (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \
-              : above_latin1))
+                                        (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))))
 
 #define generic_LC_invlist_utf8_safe_(macro, classnum, p, e)                  \
             generic_LC_utf8_safe_(macro, p, e,                              \
@@ -2425,10 +2434,10 @@ END_EXTERN_C
 
 #define generic_LC_non_invlist_utf8_safe_(classnum, above_latin1, p, e)       \
           generic_LC_utf8_safe_(classnum, p, e,                             \
-                             (UNLIKELY((e) - (p) < UTF8SKIP(p))             \
-                              ? (force_out_malformed_utf8_message_(         \
-                                      (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
-                              : above_latin1(p)))
+                             (LIKELY((e) - (p) >= UTF8SKIP(p))              \
+                              ? above_latin1(p)                             \
+                              : (force_out_malformed_utf8_message_(         \
+                                      (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)))
 
 #define isALPHANUMERIC_LC_utf8_safe(p, e)                                   \
             generic_LC_invlist_utf8_safe_(isALPHANUMERIC_LC,                \

diff --git a/pp_ctl.c b/pp_ctl.c
@@ -5192,11 +5192,17 @@ S_require_file(pTHX_ SV *sv)
                          S_parse_ident */
                         c = name;
                         while (c < e) {
-                            if (utf8 && isIDFIRST_utf8_safe(c, e)) {
-                                c += UTF8SKIP(c);
-                                while (c < e && isIDCONT_utf8_safe(
-                                            (const U8*) c, (const U8*) e))
-                                    c += UTF8SKIP(c);
+                            Size_t advance;
+
+                            if (utf8 && (advance = isIDFIRST_utf8_safe(c, e)))
+                            {
+                                c += advance;
+                                while (   c < e
+                                       && (advance = isIDCONT_utf8_safe(
+                                                (const U8*) c, (const U8*) e)))
+                                {
+                                    c += advance;
+                                }
                             }
                             else if (isWORDCHAR_A(*c)) {
                                 while (c < e && isWORDCHAR_A(*c))

diff --git a/proto.h b/proto.h
diff --git a/regcomp.c b/regcomp.c
@@ -2522,19 +2522,16 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
     PERL_ARGS_ASSERT_REG_SCAN_NAME;
 
     assert (RExC_parse <= RExC_end);
+    Size_t advance;
     if (RExC_parse == RExC_end) NOOP;
-    else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
+    else if ((advance = isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF))) {
          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
           * using do...while */
-        if (UTF)
-            do {
-                RExC_parse_inc_utf8();
-            } while (   RExC_parse < RExC_end
-                     && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
-        else
-            do {
-                RExC_parse_inc_by(1);
-            } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
+        do {    /* Non-UTF-8 patterns must keep byte semantics, hence 'lazy' */
+            RExC_parse_advance(advance);
+        } while (   RExC_parse < RExC_end
+                 && (advance = isWORDCHAR_lazy_if_safe(RExC_parse, RExC_end,
+                                                       UTF)));
     } else {
         RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
                          character */

diff --git a/regcomp_internal.h b/regcomp_internal.h
@@ -279,6 +279,14 @@ struct RExC_state_t {
  * output during the parse process.
  */
 
+/* RExC_parse_advance(count)
+ *
+ * Move RExC_parse forward by 'count' bytes, for callers that already know the
+ * byte length to skip (e.g. the value returned by an isFOO_utf8_safe macro) */
+#define RExC_parse_advance(count) STMT_START {          \
+    RExC_parse += (count);                              \
+} STMT_END
+
 /* RExC_parse_incf(flag)
  *
  * Increment RExC_parse to point at the next codepoint, while doing