Skip to content

Commit

Permalink
Merge 6cf1bd4 into 7e18321
Browse files Browse the repository at this point in the history
  • Loading branch information
khwilliamson committed Aug 10, 2021
2 parents 7e18321 + 6cf1bd4 commit 7db8a44
Show file tree
Hide file tree
Showing 8 changed files with 218 additions and 196 deletions.
9 changes: 6 additions & 3 deletions embed.fnc
Expand Up @@ -1115,7 +1115,7 @@ pR |OP* |cmpchain_finish|NN OP* ch
ApR |I32 |is_lvalue_sub
: Used in cop.h
XopR |I32 |was_lvalue_sub
CpRTP |STRLEN |is_utf8_char_helper|NN const U8 * const s|NN const U8 * e|const U32 flags
CpRTP |STRLEN |is_utf8_char_helper_|NN const U8 * const s|NN const U8 * e|const U32 flags
CpRTP |Size_t |is_utf8_FF_helper_|NN const U8 * const s0 \
|NN const U8 * const e \
|const bool require_partial
Expand Down Expand Up @@ -1164,6 +1164,9 @@ AbTpdD |STRLEN |is_utf8_char |NN const U8 *s
AbMTpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
ATidRp |Size_t |isUTF8_CHAR|NN const U8 * const s0 \
|NN const U8 * const e
ATidRp |Size_t |isUTF8_CHAR_flags|NN const U8 * const s0 \
|NN const U8 * const e \
|const U32 flags
ATidRp |Size_t |isSTRICT_UTF8_CHAR |NN const U8 * const s0 \
|NN const U8 * const e
ATidRp |Size_t |isC9_STRICT_UTF8_CHAR |NN const U8 * const s0 \
Expand Down Expand Up @@ -1203,9 +1206,9 @@ ATidp |bool |is_utf8_fixed_width_buf_loclen_flags \
|NN const U8 * const s|STRLEN len \
|NULLOK const U8 **ep|NULLOK STRLEN *el|const U32 flags
AmTdP |bool |is_utf8_valid_partial_char \
|NN const U8 * const s|NN const U8 * const e
|NN const U8 * const s0|NN const U8 * const e
ATidRp |bool |is_utf8_valid_partial_char_flags \
|NN const U8 * const s|NN const U8 * const e|const U32 flags
|NN const U8 * const s0|NN const U8 * const e|const U32 flags
CpR |bool |_is_uni_FOO|const U8 classnum|const UV c
CpR |bool |_is_utf8_FOO|const U8 classnum|NN const U8 *p \
|NN const U8 * const e
Expand Down
3 changes: 2 additions & 1 deletion embed.h
Expand Up @@ -268,6 +268,7 @@
#define isC9_STRICT_UTF8_CHAR Perl_isC9_STRICT_UTF8_CHAR
#define isSTRICT_UTF8_CHAR Perl_isSTRICT_UTF8_CHAR
#define isUTF8_CHAR Perl_isUTF8_CHAR
#define isUTF8_CHAR_flags Perl_isUTF8_CHAR_flags
#define is_c9strict_utf8_string_loclen Perl_is_c9strict_utf8_string_loclen
#define is_lvalue_sub() Perl_is_lvalue_sub(aTHX)
#define is_safe_syscall(a,b,c,d) Perl_is_safe_syscall(aTHX_ a,b,c,d)
Expand All @@ -276,7 +277,7 @@
#ifndef NO_MATHOMS
#define is_utf8_char Perl_is_utf8_char
#endif
#define is_utf8_char_helper Perl_is_utf8_char_helper
#define is_utf8_char_helper_ Perl_is_utf8_char_helper_
#define is_utf8_fixed_width_buf_loclen_flags Perl_is_utf8_fixed_width_buf_loclen_flags
#define is_utf8_invariant_string_loc Perl_is_utf8_invariant_string_loc
#define is_utf8_string_flags Perl_is_utf8_string_flags
Expand Down
106 changes: 101 additions & 5 deletions inline.h
Expand Up @@ -2169,6 +2169,73 @@ Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)

/*
=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
Evaluates to non-zero if the first few bytes of the string starting at C<s> and
looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
that represents some code point, subject to the restrictions given by C<flags>;
otherwise it evaluates to 0. If non-zero, the value gives how many bytes
starting at C<s> comprise the code point's representation. Any bytes remaining
before C<e>, but beyond the ones needed to form the first code point in C<s>,
are not examined.
If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
as C<L</isSTRICT_UTF8_CHAR>>;
and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
understood by C<L</utf8n_to_uvchr>>, with the same meanings.
The three alternative macros are for the most commonly needed validations; they
are likely to run somewhat faster than this more general one, as they can be
inlined into your code.
Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
L</is_utf8_string_loclen_flags> to check entire strings.
=cut
*/

PERL_STATIC_INLINE STRLEN
Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
{
PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));

PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
goto check_success,
DFA_TEASE_APART_FF_,
DFA_RETURN_FAILURE_);

check_success:

return is_utf8_char_helper_(s0, e, flags);

#ifdef HAS_EXTRA_LONG_UTF8

tease_apart_FF:

/* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
* either malformed, or was for the largest possible start byte, which
* indicates perl extended UTF-8, well above the Unicode maximum */
if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
|| (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
{
return 0;
}

/* Otherwise examine the sequence not inline */
return is_utf8_FF_helper_(s0, e,
FALSE /* require full, not partial char */
);
#endif

}

/*
=for apidoc is_utf8_valid_partial_char
Returns 0 if the sequence of bytes starting at C<s> and looking no further than
Expand Down Expand Up @@ -2217,18 +2284,47 @@ determined from just the first one or two bytes.
*/

PERL_STATIC_INLINE bool
Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
{
PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;

assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));

if (s >= e || s + UTF8SKIP(s) <= e) {
return FALSE;
PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
DFA_RETURN_FAILURE_,
DFA_TEASE_APART_FF_,
NOOP);

/* The NOOP above causes the DFA to drop down here iff the input was a
* partial character. flags=0 => can return TRUE immediately; otherwise we
* need to check (not inline) if the partial character is the beginning of
* a disallowed one */
if (flags == 0) {
return TRUE;
}

return cBOOL(is_utf8_char_helper(s, e, flags));
return cBOOL(is_utf8_char_helper_(s0, e, flags));

#ifdef HAS_EXTRA_LONG_UTF8

tease_apart_FF:

/* Getting here means the input is either malformed, or, in the case of
* PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
* latter case has to be extended UTF-8, so can fail immediately if that is
* forbidden */

if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
|| (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
{
return 0;
}

return is_utf8_FF_helper_(s0, e,
TRUE /* Require to be a partial character */
);
#endif

}

/*
Expand Down
17 changes: 12 additions & 5 deletions proto.h
Expand Up @@ -1665,6 +1665,13 @@ PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const
assert(s0); assert(e)
#endif

#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS \
assert(s0); assert(e)
#endif

/* PERL_CALLCONV bool is_ascii_string(const U8* const s, STRLEN len)
__attribute__warn_unused_result__
__attribute__pure__; */
Expand Down Expand Up @@ -1726,10 +1733,10 @@ PERL_CALLCONV STRLEN Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end);
#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF \
assert(buf); assert(buf_end)
#endif
PERL_CALLCONV STRLEN Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
PERL_CALLCONV STRLEN Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
__attribute__warn_unused_result__
__attribute__pure__;
#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER \
#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_ \
assert(s); assert(e)

/* PERL_CALLCONV bool is_utf8_fixed_width_buf_flags(const U8 * const s, STRLEN len, const U32 flags); */
Expand Down Expand Up @@ -1780,16 +1787,16 @@ PERL_STATIC_INLINE bool Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len
#define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS \
assert(s)
#endif
/* PERL_CALLCONV bool is_utf8_valid_partial_char(const U8 * const s, const U8 * const e)
/* PERL_CALLCONV bool is_utf8_valid_partial_char(const U8 * const s0, const U8 * const e)
__attribute__warn_unused_result__
__attribute__pure__; */
#define PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR

#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
PERL_STATIC_INLINE bool Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS \
assert(s); assert(e)
assert(s0); assert(e)
#endif

PERL_CALLCONV bool Perl_isinfnan(NV nv)
Expand Down
2 changes: 1 addition & 1 deletion regcharclass.h
Expand Up @@ -3765,6 +3765,6 @@
* 3f9b68df38ab90c512f7159658ea01cc6fee76420c5953300a31fb06ac632e62 lib/unicore/mktables
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* 3fb6bafb4c830dd501868e34f550cdad3bf8d2c9eed44756488f36c484969417 regen/regcharclass.pl
* 1aa94679c695efd507b7e4491629dba1021b74c21a5324dfd3a582a5d654bd32 regen/regcharclass.pl
* b2f896452d2b30da3e04800f478c60c1fd0b03d6b668689b020f1e3cf1f1cdd9 regen/regcharclass_multi_char_folds.pl
* ex: set ro: */
2 changes: 2 additions & 0 deletions regen/regcharclass.pl
Expand Up @@ -1805,6 +1805,8 @@ sub make_macro {
0xFFFFE - 0xFFFFF
0x10FFFE - 0x10FFFF
# Note that code in utf8.c is counting on the 'fast' version to look at no
# more than two bytes
SURROGATE: Surrogate code points
=> UTF8 :safe fast
\p{_Perl_Surrogate}
Expand Down

0 comments on commit 7db8a44

Please sign in to comment.