Skip to content
9 changes: 5 additions & 4 deletions class.c
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,9 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers
if(SvUTF8(value))
SvUTF8_on(pkgname);

while(*p && isSPACE_utf8_safe(p, end))
p += UTF8SKIP(p);
Size_t advance;
while(*p && (advance = isSPACE_utf8_safe(p, end)))
p += advance;

if(*p) {
/* scan_version() gets upset about trailing content. We need to extract
Expand All @@ -463,8 +464,8 @@ static const char *S_split_package_ver(pTHX_ SV *value, SV *pkgname, SV *pkgvers
scan_version(SvPVX(tmpsv), pkgversion, FALSE);
}

while(*p && isSPACE_utf8_safe(p, end))
p += UTF8SKIP(p);
while(*p && (advance = isSPACE_utf8_safe(p, end)))
p += advance;

return p;
}
Expand Down
6 changes: 3 additions & 3 deletions embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -1753,17 +1753,17 @@ ATdip |bool |is_utf8_fixed_width_buf_loclen_flags \
|NULLOK const U8 **ep \
|NULLOK STRLEN *el \
|const U32 flags
CRp |bool |is_utf8_FOO_ |const U8 classnum \
CRp |Size_t |is_utf8_FOO_ |const U8 classnum \
|NN const U8 *p \
|NN const U8 * const e
ARTdip |bool |is_utf8_invariant_string_loc \
|NN const U8 * const s \
|STRLEN len \
|NULLOK const U8 **ep
CRp |bool |is_utf8_perl_idcont_ \
CRp |Size_t |is_utf8_perl_idcont_ \
|NN const U8 *p \
|NN const U8 * const e
CRp |bool |is_utf8_perl_idstart_ \
CRp |Size_t |is_utf8_perl_idstart_ \
|NN const U8 *p \
|NN const U8 * const e
ARTdmp |bool |is_utf8_string |NN const U8 *s \
Expand Down
65 changes: 37 additions & 28 deletions handy.h
Original file line number Diff line number Diff line change
Expand Up @@ -1595,8 +1595,8 @@ END_EXTERN_C

/* For internal core Perl use only: the base macro for defining macros like
* isALPHA */
# define generic_isCC_(c, classnum) cBOOL(FITS_IN_8_BITS(c) \
&& (PL_charclass[(U8) (c)] & CC_mask_(classnum)))
# define generic_isCC_(c, classnum) \
(FITS_IN_8_BITS(c) && (PL_charclass[(U8) (c)] & CC_mask_(classnum)))

/* The mask for the _A versions of the macros; it just adds in the bit for
* ASCII. */
Expand Down Expand Up @@ -2256,26 +2256,36 @@ END_EXTERN_C

#define generic_utf8_safe_(classnum, p, e, above_latin1) \
((! utf8_safe_assert_(p, e)) \
? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)\
? (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e), 0, \
MALFORMED_UTF8_DIE), 0) \
: (UTF8_IS_INVARIANT(*(p))) \
? generic_isCC_(*(p), classnum) \
: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \
classnum) \
: (force_out_malformed_utf8_message_( \
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \
: above_latin1))
: (UTF8_IS_ABOVE_LATIN1_START(*(p)) \
? above_latin1 \
: ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
/* Multiply by 2 to return byte length of matched \
* character */ \
? 2 * generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p),*((p)+1)),\
classnum) \
: (force_out_malformed_utf8_message_((U8 *) (p), (U8 *) (e),\
0, MALFORMED_UTF8_DIE),\
0))))

/* Variant of generic_utf8_safe_() in which 'above_latin1' is the name of a
 * function (it may also be a function-like macro) instead of a ready-made
 * expression.  It is invoked as 'above_latin1(p, e)' and that call is what
 * gets handed to generic_utf8_safe_() as its 'above_latin1' argument. */
#define generic_func_utf8_safe_(classnum, above_latin1, p, e)               \
                generic_utf8_safe_(classnum, p, e, above_latin1(p, e))

#define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \
generic_utf8_safe_(classnum, p, e, \
(UNLIKELY((e) - (p) < UTF8SKIP(p)) \
? (force_out_malformed_utf8_message_( \
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
: above_latin1(p)))
generic_utf8_safe_(classnum, p, e, \
(LIKELY((e) - (p) >= UTF8SKIP(p)) \
? above_latin1(p) \
: (force_out_malformed_utf8_message_( \
(U8 *) (p), \
(U8 *) (e), \
0, \
MALFORMED_UTF8_DIE),\
0)))
/* Like the above, but passes classnum to _isFOO_utf8(), instead of having an
* 'above_latin1' parameter */
#define generic_invlist_utf8_safe_(classnum, p, e) \
Expand All @@ -2289,10 +2299,9 @@ END_EXTERN_C
(assert(utf8_safe_assert_(p, e)), \
(isASCII(*(p))) \
? generic_isCC_(*(p), classnum) \
: (UTF8_IS_DOWNGRADEABLE_START(*(p))) \
? 0 /* Note that doesn't check validity for latin1 */ \
: above_latin1)

: (UTF8_IS_ABOVE_LATIN1_START(*(p)) \
? above_latin1 \
: 0)) /* Note that doesn't check validity for latin1 */

#define isALPHA_utf8(p, e) isALPHA_utf8_safe(p, e)
#define isALPHANUMERIC_utf8(p, e) isALPHANUMERIC_utf8_safe(p, e)
Expand Down Expand Up @@ -2406,15 +2415,15 @@ END_EXTERN_C
* point in 'p' is within the 0-255 range, it uses locale rules from the
* passed-in 'macro' parameter */
#define generic_LC_utf8_safe_(macro, p, e, above_latin1) \
(assert_(utf8_safe_assert_(p, e)) \
(assert_(utf8_safe_assert_(p, e)) \
(UTF8_IS_INVARIANT(*(p))) \
? macro(*(p)) \
: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
: (UTF8_IS_ABOVE_LATIN1_START(*(p)) \
? above_latin1 \
: ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \
: (force_out_malformed_utf8_message_( \
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \
: above_latin1))
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0))))

#define generic_LC_invlist_utf8_safe_(macro, classnum, p, e) \
generic_LC_utf8_safe_(macro, p, e, \
Expand All @@ -2425,10 +2434,10 @@ END_EXTERN_C

#define generic_LC_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \
generic_LC_utf8_safe_(classnum, p, e, \
(UNLIKELY((e) - (p) < UTF8SKIP(p)) \
? (force_out_malformed_utf8_message_( \
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
: above_latin1(p)))
(LIKELY((e) - (p) >= UTF8SKIP(p)) \
? above_latin1(p) \
: (force_out_malformed_utf8_message_( \
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)))

#define isALPHANUMERIC_LC_utf8_safe(p, e) \
generic_LC_invlist_utf8_safe_(isALPHANUMERIC_LC, \
Expand Down
16 changes: 11 additions & 5 deletions pp_ctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -5192,11 +5192,17 @@ S_require_file(pTHX_ SV *sv)
S_parse_ident */
c = name;
while (c < e) {
if (utf8 && isIDFIRST_utf8_safe(c, e)) {
c += UTF8SKIP(c);
while (c < e && isIDCONT_utf8_safe(
(const U8*) c, (const U8*) e))
c += UTF8SKIP(c);
Size_t advance;

if (utf8 && (advance = isIDFIRST_utf8_safe(c, e)))
{
c += advance;
while ( c < e
&& (advance = isIDCONT_utf8_safe(
(const U8*) c, (const U8*) e)))
{
c += advance;
}
}
else if (isWORDCHAR_A(*c)) {
while (c < e && isWORDCHAR_A(*c))
Expand Down
6 changes: 3 additions & 3 deletions proto.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 7 additions & 10 deletions regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2522,19 +2522,16 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
PERL_ARGS_ASSERT_REG_SCAN_NAME;

assert (RExC_parse <= RExC_end);
Size_t advance;
if (RExC_parse == RExC_end) NOOP;
else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
else if ((advance = isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF))) {
/* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
* using do...while */
if (UTF)
do {
RExC_parse_inc_utf8();
} while ( RExC_parse < RExC_end
&& isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
else
do {
RExC_parse_inc_by(1);
} while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
do {
RExC_parse_advance(advance);
} while ( RExC_parse < RExC_end
&& (advance = isWORDCHAR_utf8_safe( (U8 *) RExC_parse,
(U8 *) RExC_end)));
} else {
RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
character */
Expand Down
8 changes: 8 additions & 0 deletions regcomp_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,14 @@ struct RExC_state_t {
* output during the parse process.
*/

/* RExC_parse_advance(count)
 *
 * Advance RExC_parse by 'count' bytes.  Use this when the caller already
 * *knows* the exact byte length of what is being skipped (for instance
 * because a preceding test returned that length), so nothing needs to be
 * recomputed here.  The macro itself does no bounds or well-formedness
 * checking.  'count' is evaluated exactly once; it is parenthesized in the
 * expansion so that any expression can be passed safely. */
#define RExC_parse_advance(count) STMT_START {                              \
    RExC_parse += (count);                                                  \
} STMT_END

/* RExC_parse_incf(flag)
*
* Increment RExC_parse to point at the next codepoint, while doing
Expand Down
Loading
Loading