diff --git a/class.c b/class.c index f731b986dc00..871a68e752bd 100644 --- a/class.c +++ b/class.c @@ -445,8 +445,9 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers if(SvUTF8(value)) SvUTF8_on(pkgname); - while(*p && isSPACE_utf8_safe(p, end)) - p += UTF8SKIP(p); + Size_t advance; + while(*p && (advance = isSPACE_utf8_safe(p, end))) + p += advance; if(*p) { /* scan_version() gets upset about trailing content. We need to extract @@ -463,8 +464,8 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers scan_version(SvPVX(tmpsv), pkgversion, FALSE); } - while(*p && isSPACE_utf8_safe(p, end)) - p += UTF8SKIP(p); + while(*p && (advance = isSPACE_utf8_safe(p, end))) + p += advance; return p; } diff --git a/embed.fnc b/embed.fnc index 99c664b8a5b8..4eb1df93843a 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1753,17 +1753,17 @@ ATdip |bool |is_utf8_fixed_width_buf_loclen_flags \ |NULLOK const U8 **ep \ |NULLOK STRLEN *el \ |const U32 flags -CRp |bool |is_utf8_FOO_ |const U8 classnum \ +CRp |Size_t |is_utf8_FOO_ |const U8 classnum \ |NN const U8 *p \ |NN const U8 * const e ARTdip |bool |is_utf8_invariant_string_loc \ |NN const U8 * const s \ |STRLEN len \ |NULLOK const U8 **ep -CRp |bool |is_utf8_perl_idcont_ \ +CRp |Size_t |is_utf8_perl_idcont_ \ |NN const U8 *p \ |NN const U8 * const e -CRp |bool |is_utf8_perl_idstart_ \ +CRp |Size_t |is_utf8_perl_idstart_ \ |NN const U8 *p \ |NN const U8 * const e ARTdmp |bool |is_utf8_string |NN const U8 *s \ diff --git a/handy.h b/handy.h index 7b91b99a1c7c..fec7d83e3bd5 100644 --- a/handy.h +++ b/handy.h @@ -1595,8 +1595,8 @@ END_EXTERN_C /* For internal core Perl use only: the base macro for defining macros like * isALPHA */ -# define generic_isCC_(c, classnum) cBOOL(FITS_IN_8_BITS(c) \ - && (PL_charclass[(U8) (c)] & CC_mask_(classnum))) +# define generic_isCC_(c, classnum) \ + (FITS_IN_8_BITS(c) && (PL_charclass[(U8) (c)] & CC_mask_(classnum))) /* The mask for the _A versions 
of the macros; it just adds in the bit for * ASCII. */ @@ -2256,26 +2256,36 @@ END_EXTERN_C #define generic_utf8_safe_(classnum, p, e, above_latin1) \ ((! utf8_safe_assert_(p, e)) \ - ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)\ + ? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, \ + MALFORMED_UTF8_DIE), 0) \ : (UTF8_IS_INVARIANT(*(p))) \ ? generic_isCC_(*(p), classnum) \ - : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \ - ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ - ? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \ - classnum) \ - : (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \ - : above_latin1)) + : (UTF8_IS_ABOVE_LATIN1_START(*(p)) \ + ? above_latin1 \ + : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ + /* Multiply by 2 to return byte length of matched \ + * character */ \ + ? 2 * generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p),*((p)+1)),\ + classnum) \ + : (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e),\ + 0, MALFORMED_UTF8_DIE),\ + 0)))) + /* Like the above, but calls 'above_latin1(p, e)' to get the utf8 value. * 'above_latin1' can be a macro */ #define generic_func_utf8_safe_(classnum, above_latin1, p, e) \ generic_utf8_safe_(classnum, p, e, above_latin1(p, e)) + #define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \ - generic_utf8_safe_(classnum, p, e, \ - (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ - ? (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \ - : above_latin1(p))) + generic_utf8_safe_(classnum, p, e, \ + (LIKELY((e) - (p) >= UTF8SKIP(p)) \ + ? 
above_latin1(p) \ + : (force_out_malformed_utf8_message_( \ + (U8 *) (p), \ + (U8 *) (e), \ + 0, \ + MALFORMED_UTF8_DIE),\ + 0))) /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an * 'above_latin1' parameter */ #define generic_invlist_utf8_safe_(classnum, p, e) \ @@ -2289,10 +2299,9 @@ END_EXTERN_C (assert(utf8_safe_assert_(p, e)), \ (isASCII(*(p))) \ ? generic_isCC_(*(p), classnum) \ - : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ - ? 0 /* Note that doesn't check validity for latin1 */ \ - : above_latin1) - + : (UTF8_IS_ABOVE_LATIN1_START(*(p)) \ + ? above_latin1 \ + : 0)) /* Note that doesn't check validity for latin1 */ #define isALPHA_utf8(p, e) isALPHA_utf8_safe(p, e) #define isALPHANUMERIC_utf8(p, e) isALPHANUMERIC_utf8_safe(p, e) @@ -2406,15 +2415,15 @@ END_EXTERN_C * point in 'p' is within the 0-255 range, it uses locale rules from the * passed-in 'macro' parameter */ #define generic_LC_utf8_safe_(macro, p, e, above_latin1) \ - (assert_(utf8_safe_assert_(p, e)) \ + (assert_(utf8_safe_assert_(p, e)) \ (UTF8_IS_INVARIANT(*(p))) \ ? macro(*(p)) \ - : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \ - ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ + : (UTF8_IS_ABOVE_LATIN1_START(*(p)) \ + ? above_latin1 \ + : ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \ : (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \ - : above_latin1)) + (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)))) #define generic_LC_invlist_utf8_safe_(macro, classnum, p, e) \ generic_LC_utf8_safe_(macro, p, e, \ @@ -2425,10 +2434,10 @@ END_EXTERN_C #define generic_LC_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \ generic_LC_utf8_safe_(classnum, p, e, \ - (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ - ? (force_out_malformed_utf8_message_( \ - (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \ - : above_latin1(p))) + (LIKELY((e) - (p) >= UTF8SKIP(p)) \ + ? 
above_latin1(p) \ + : (force_out_malformed_utf8_message_( \ + (U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))) #define isALPHANUMERIC_LC_utf8_safe(p, e) \ generic_LC_invlist_utf8_safe_(isALPHANUMERIC_LC, \ diff --git a/pp_ctl.c b/pp_ctl.c index 5cfd919e6b6d..d713cdc0cb1c 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -5192,11 +5192,17 @@ S_require_file(pTHX_ SV *sv) S_parse_ident */ c = name; while (c < e) { - if (utf8 && isIDFIRST_utf8_safe(c, e)) { - c += UTF8SKIP(c); - while (c < e && isIDCONT_utf8_safe( - (const U8*) c, (const U8*) e)) - c += UTF8SKIP(c); + Size_t advance; + + if (utf8 && (advance = isIDFIRST_utf8_safe(c, e))) + { + c += advance; + while ( c < e + && (advance = isIDCONT_utf8_safe( + (const U8*) c, (const U8*) e))) + { + c += advance; + } } else if (isWORDCHAR_A(*c)) { while (c < e && isWORDCHAR_A(*c)) diff --git a/proto.h b/proto.h index 41cd957f7829..2a9e2e05227b 100644 --- a/proto.h +++ b/proto.h @@ -1850,7 +1850,7 @@ Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, const bool requ #define PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_ \ assert(s0); assert(e) -PERL_CALLCONV bool +PERL_CALLCONV Size_t Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_IS_UTF8_FOO_ \ @@ -1872,13 +1872,13 @@ Perl_is_utf8_fixed_width_buf_flags(const U8 * const s, STRLEN len, const U32 fla /* PERL_CALLCONV bool Perl_is_utf8_fixed_width_buf_loc_flags(const U8 * const s, STRLEN len, const U8 **ep, const U32 flags); */ -PERL_CALLCONV bool +PERL_CALLCONV Size_t Perl_is_utf8_perl_idcont_(pTHX_ const U8 *p, const U8 * const e) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_IS_UTF8_PERL_IDCONT_ \ assert(p); assert(e) -PERL_CALLCONV bool +PERL_CALLCONV Size_t Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_IS_UTF8_PERL_IDSTART_ \ diff --git a/regcomp.c b/regcomp.c index c76a467bcd93..1831effb0dba 
100644 --- a/regcomp.c +++ b/regcomp.c @@ -2522,19 +2522,20 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) PERL_ARGS_ASSERT_REG_SCAN_NAME; assert (RExC_parse <= RExC_end); + Size_t advance; if (RExC_parse == RExC_end) NOOP; - else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) { + else if ((advance = isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF))) { /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by * using do...while */ - if (UTF) - do { - RExC_parse_inc_utf8(); - } while ( RExC_parse < RExC_end - && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end)); - else - do { - RExC_parse_inc_by(1); - } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse)); + /* Only a UTF-8 pattern may go through the _utf8_safe macro; the bytes + * of a non-UTF-8 pattern are classified (and skipped) one at a time */ + do { + RExC_parse_advance(advance); + } while ( RExC_parse < RExC_end + && (advance = (UTF) + ? isWORDCHAR_utf8_safe((U8 *) RExC_parse, + (U8 *) RExC_end) + : cBOOL(isWORDCHAR(*RExC_parse)))); } else { RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending character */ diff --git a/regcomp_internal.h b/regcomp_internal.h index 60ec13777a61..27e543d80be7 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -279,6 +279,14 @@ struct RExC_state_t { * output during the parse process. */ +/* RExC_parse_advance(count) + * + * Increment RExC_parse to point at the next codepoint, when we *know* that the + * correct byte count is in the passed parameter */ +#define RExC_parse_advance(count) STMT_START { \ + RExC_parse += (count); \ +} STMT_END + /* RExC_parse_incf(flag) * * Increment RExC_parse to point at the next codepoint, while doing diff --git a/toke.c b/toke.c index 62f427bf67c3..3a3c0e7a2168 100644 --- a/toke.c +++ b/toke.c @@ -2088,8 +2088,10 @@ S_check_uni(pTHX) while (isSPACE(*PL_last_uni)) PL_last_uni++; s = PL_last_uni; - while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-') - s += UTF ? 
UTF8SKIP(s) : 1; + Size_t advance; + while ( (advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)) + || (advance = (*s == '-'))) + s += advance; if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s)) return; @@ -5193,10 +5195,11 @@ S_check_scalar_slice(pTHX_ char *s) { return; } - while ( isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) - || (*s && memCHRs(" \t$#+-'\"", *s))) + Size_t advance; + while ( (advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)) + || (advance = (*s && memCHRs(" \t$#+-'\"", *s)))) { - s += UTF ? UTF8SKIP(s) : 1; + s += advance; } if (*s == '}' || *s == ']') pl_yylval.ival = OPpSLICEWARNING; @@ -5402,8 +5405,11 @@ yyl_dollar(pTHX_ char *s) while (t < PL_bufend && *t == ' ') t++; /* strip off the name of the var */ - while (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) - t += UTF ? UTF8SKIP(t) : 1; + Size_t advance; + while ((advance = (isWORDCHAR_lazy_if_safe(t, + PL_bufend, + UTF)))) + t += advance; /* consumed a varname */ } else if (isDIGIT(*t)) { /* deal with hex constants like 0x11 */ @@ -6407,6 +6413,7 @@ yyl_leftcurly(pTHX_ char *s, const U8 formbrack) * GSAR 97-07-21 */ t = s; + Size_t advance; if (*s == '\'' || *s == '"' || *s == '`') { /* common case: get past first string, handling escapes */ for (t++; t < PL_bufend && *t != *s;) @@ -6455,20 +6462,24 @@ yyl_leftcurly(pTHX_ char *s, const U8 formbrack) } t++; } - else + else { /* skip plain q word */ - while ( t < PL_bufend - && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) + while ( t < PL_bufend + && (advance = isWORDCHAR_lazy_if_safe(t, + PL_bufend, + UTF))) { - t += UTF ? UTF8SKIP(t) : 1; + t += advance; } + } } - else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) { - t += UTF ? UTF8SKIP(t) : 1; + else if ((advance = isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))) { + t += advance; while ( t < PL_bufend - && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) + && (advance = isWORDCHAR_lazy_if_safe(t, PL_bufend, + UTF))) { - t += UTF ? 
UTF8SKIP(t) : 1; + t += advance; } } while (t < PL_bufend && isSPACE(*t)) @@ -10125,11 +10136,12 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what) s++; while (s < PL_bufend && isSPACE(*s)) s++; - if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { + Size_t advance; + if ((advance = isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))) { const char * const w = s; - s += UTF ? UTF8SKIP(s) : 1; - while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)) - s += UTF ? UTF8SKIP(s) : 1; + s += advance; + while ((advance = isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))) + s += advance; while (s < PL_bufend && isSPACE(*s)) s++; if (*s == ',') { @@ -10281,15 +10293,18 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, while (*s < PL_bufend) { if (*d >= e) croak("%s", ident_too_long); - if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) { + Size_t advance; + if (is_utf8 && (advance = isIDFIRST_utf8_safe(*s, PL_bufend))) { /* The UTF-8 case must come first, otherwise things * like c\N{COMBINING TILDE} would start failing, as the * isWORDCHAR_A case below would gobble the 'c' up. */ - char *t = *s + UTF8SKIP(*s); - while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) { - t += UTF8SKIP(t); + char *t = *s + advance; + while ((advance = isIDCONT_utf8_safe((const U8*) t, + (const U8*) PL_bufend))) + { + t += advance; } if (*d + (t - *s) > e) croak("%s", ident_too_long); @@ -10496,11 +10511,12 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) /* note we have to check for a normal identifier first, * as it handles utf8 symbols, and only after that has * been ruled out can we look at the caret words */ - if (isIDFIRST_lazy_if_safe(d, e, is_utf8) ) { + Size_t advance; + if ((advance = isIDFIRST_lazy_if_safe(d, e, is_utf8) )) { /* if it starts as a valid identifier, assume that it is one. (the later check for } being at the expected point will trap cases where this doesn't pan out.) */ - d += is_utf8 ? 
UTF8SKIP(d) : 1; + d += advance; parse_ident(&s, &d, e, 1, is_utf8, TRUE); *d = '\0'; } @@ -10998,8 +11014,9 @@ S_scan_heredoc(pTHX_ char *s) peek = s; - while (isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF)) { - peek += UTF ? UTF8SKIP(peek) : 1; + Size_t advance; + while ((advance = isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))) { + peek += advance; } len = (peek - s >= e - d) ? (e - d) : (peek - s); @@ -11442,9 +11459,13 @@ S_scan_inputsymbol(pTHX_ char *start) if (*d == '$' && d[1]) d++; /* allow <Pkg'VALUE> or <Pkg::VALUE> */ - while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == ':' - || (*d == '\'' && FEATURE_APOS_AS_NAME_SEP_IS_ENABLED)) { - d += UTF ? UTF8SKIP(d) : 1; + Size_t advance; + while ( (advance = isWORDCHAR_lazy_if_safe(d, e, UTF)) + || (advance = ( *d == ':' + || ( *d == '\'' + && FEATURE_APOS_AS_NAME_SEP_IS_ENABLED)))) + { + d += advance; } /* If we've tried to read what we allow filehandles to look like, and diff --git a/utf8.c b/utf8.c index b0922d138470..e9b84200917c 100644 --- a/utf8.c +++ b/utf8.c @@ -3900,34 +3900,42 @@ S_warn_on_first_deprecated_use(pTHX_ U32 category, } #endif -/* returns a boolean giving whether or not the UTF8-encoded character that - * starts at <p>, and extending no further than <e> is in the inversion - * list <invlist>. */ -#define IS_UTF8_IN_INVLIST(p, e, invlist) \ - _invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, NULL)) +/* returns the number of bytes comprising the UTF8-encoded character that + * starts at <p>, and extending no further than <e> if it is in the + * inversion list <invlist>; or 0 if it isn't */ +STATIC Size_t +S_is_utf8_in_invlist(pTHX_ const U8 * p, const U8 * e, SV * const invlist) +{ + Size_t advance; + if (_invlist_contains_cp(invlist, utf8_to_uv_or_die(p, e, &advance))) { + return advance; + } -bool + return 0; +} + +Size_t Perl_is_utf8_FOO_(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT_IS_UTF8_FOO_; - return IS_UTF8_IN_INVLIST(p, e, PL_XPosix_ptrs[classnum]); + return S_is_utf8_in_invlist(aTHX_ p, e, PL_XPosix_ptrs[classnum]); } -bool +Size_t Perl_is_utf8_perl_idstart_(pTHX_ const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT_IS_UTF8_PERL_IDSTART_; - return IS_UTF8_IN_INVLIST(p, e, PL_utf8_perl_idstart); + return S_is_utf8_in_invlist(aTHX_ p, e, PL_utf8_perl_idstart); } -bool +Size_t Perl_is_utf8_perl_idcont_(pTHX_ const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT_IS_UTF8_PERL_IDCONT_; - return IS_UTF8_IN_INVLIST(p, e, PL_utf8_perl_idcont); + return S_is_utf8_in_invlist(aTHX_ p, e, PL_utf8_perl_idcont); } STATIC UV