Skip to content

Commit

Permalink
Make macro isUTF8_CHAR_flags an inline fcn
Browse files Browse the repository at this point in the history
This makes it use the fast DFA for this functionality.
  • Loading branch information
khwilliamson committed Aug 7, 2021
1 parent b479e9c commit f57a53a
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 40 deletions.
3 changes: 3 additions & 0 deletions embed.fnc
Expand Up @@ -1164,6 +1164,9 @@ AbTpdD |STRLEN |is_utf8_char |NN const U8 *s
AbMTpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
ATidRp |Size_t |isUTF8_CHAR|NN const U8 * const s0 \
|NN const U8 * const e
ATidRp |Size_t |isUTF8_CHAR_flags|NN const U8 * const s0 \
|NN const U8 * const e \
|const U32 flags
ATidRp |Size_t |isSTRICT_UTF8_CHAR |NN const U8 * const s0 \
|NN const U8 * const e
ATidRp |Size_t |isC9_STRICT_UTF8_CHAR |NN const U8 * const s0 \
Expand Down
1 change: 1 addition & 0 deletions embed.h
Expand Up @@ -268,6 +268,7 @@
#define isC9_STRICT_UTF8_CHAR Perl_isC9_STRICT_UTF8_CHAR
#define isSTRICT_UTF8_CHAR Perl_isSTRICT_UTF8_CHAR
#define isUTF8_CHAR Perl_isUTF8_CHAR
#define isUTF8_CHAR_flags Perl_isUTF8_CHAR_flags
#define is_c9strict_utf8_string_loclen Perl_is_c9strict_utf8_string_loclen
#define is_lvalue_sub() Perl_is_lvalue_sub(aTHX)
#define is_safe_syscall(a,b,c,d) Perl_is_safe_syscall(aTHX_ a,b,c,d)
Expand Down
67 changes: 67 additions & 0 deletions inline.h
Expand Up @@ -2169,6 +2169,73 @@ Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)

/*
=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
Evaluates to non-zero if the first few bytes of the string starting at C<s> and
looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
that represents some code point, subject to the restrictions given by C<flags>;
otherwise it evaluates to 0. If non-zero, the value gives how many bytes
starting at C<s> comprise the code point's representation. Any bytes remaining
before C<e>, but beyond the ones needed to form the first code point in C<s>,
are not examined.
If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
as C<L</isSTRICT_UTF8_CHAR>>;
and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
understood by C<L</utf8n_to_uvchr>>, with the same meanings.
The three alternative macros are for the most commonly needed validations; they
are likely to run somewhat faster than this more general one, as they can be
inlined into your code.
Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
L</is_utf8_string_loclen_flags> to check entire strings.
=cut
*/

/* Inline-function implementation of isUTF8_CHAR_flags().
 *
 * s0     points to the first byte of the candidate character
 * e      points one beyond the last byte that may be examined
 * flags  restrictions on which code points are acceptable; only bits within
 *        UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_DISALLOW_PERL_EXTENDED are
 *        permitted (asserted below)
 *
 * Returns 0 if the bytes starting at s0 are not an acceptable character;
 * otherwise the value of whichever helper accepted the sequence is returned
 * unchanged.
 *
 * The return type is spelled Size_t so that this definition agrees with the
 * prototype generated for it in proto.h and with its embed.fnc entry. */
PERL_STATIC_INLINE Size_t
Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
{
    /* Both pointer arguments must be non-NULL */
    PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;

    /* Reject any flag bits outside the supported DISALLOW set */
    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
                          |UTF8_DISALLOW_PERL_EXTENDED)));

    /* Run the sequence through the DFA for Perl-extended UTF-8.
     * NOTE(review): judging by the argument names, the macro performs the
     * 'goto check_success' action when the DFA accepts, transfers control to
     * the tease_apart_FF label via DFA_TEASE_APART_FF_ when it cannot decide,
     * and returns failure via DFA_RETURN_FAILURE_ otherwise -- confirm against
     * the macro's definition. */
    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
                          goto check_success,
                          DFA_TEASE_APART_FF_,
                          DFA_RETURN_FAILURE_);

  check_success:

    /* The helper applies the restrictions given by 'flags' */
    return is_utf8_char_helper(s0, e, flags);

#ifdef HAS_EXTRA_LONG_UTF8

  tease_apart_FF:

    /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
     * either malformed, or was for the largest possible start byte, which
     * indicates perl extended UTF-8, well above the Unicode maximum */
    if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
        || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
    {
        return 0;
    }

    /* Otherwise examine the sequence not inline */
    return is_utf8_FF_helper_(s0, e,
                              FALSE /* require full, not partial char */
                             );
#endif

}

/*
=for apidoc is_utf8_valid_partial_char
Returns 0 if the sequence of bytes starting at C<s> and looking no further than
Expand Down
7 changes: 7 additions & 0 deletions proto.h
Expand Up @@ -1665,6 +1665,13 @@ PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const
assert(s0); assert(e)
#endif

#ifndef PERL_NO_INLINE_FUNCTIONS
/* Prototype for the inline function Perl_isUTF8_CHAR_flags().
 * NOTE(review): the trailing attribute macro presumably asks the compiler to
 * warn when a caller ignores the return value -- confirm in its definition. */
PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
__attribute__warn_unused_result__;
/* Argument checks for Perl_isUTF8_CHAR_flags(): both pointer parameters must
 * be non-NULL.  Expands to two assert statements with no trailing semicolon,
 * so the user supplies it. */
#define PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS \
assert(s0); assert(e)
#endif

/* PERL_CALLCONV bool is_ascii_string(const U8* const s, STRLEN len)
__attribute__warn_unused_result__
__attribute__pure__; */
Expand Down
5 changes: 4 additions & 1 deletion utf8.c
Expand Up @@ -817,7 +817,10 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)

assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));
assert(! UTF8_IS_INVARIANT(*s));

if (UTF8_IS_INVARIANT(*s)) {
return 1;
}

/* A variant char must begin with a start byte */
if (UNLIKELY(! UTF8_IS_START(*s))) {
Expand Down
39 changes: 0 additions & 39 deletions utf8.h
Expand Up @@ -1219,45 +1219,6 @@ point's representation.
#define bytes_from_utf8(s, lenp, is_utf8p) \
bytes_from_utf8_loc(s, lenp, is_utf8p, 0)

/*
=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
Evaluates to non-zero if the first few bytes of the string starting at C<s> and
looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
that represents some code point, subject to the restrictions given by C<flags>;
otherwise it evaluates to 0. If non-zero, the value gives how many bytes
starting at C<s> comprise the code point's representation. Any bytes remaining
before C<e>, but beyond the ones needed to form the first code point in C<s>,
are not examined.
If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
as C<L</isSTRICT_UTF8_CHAR>>;
and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
understood by C<L</utf8n_to_uvchr>>, with the same meanings.
The three alternative macros are for the most commonly needed validations; they
are likely to run somewhat faster than this more general one, as they can be
inlined into your code.
Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
L</is_utf8_string_loclen_flags> to check entire strings.
=cut
*/

/* Macro form of isUTF8_CHAR_flags.  Evaluates to:
 *   0  if 'e' is not beyond 's' (nothing to examine);
 *   1  if the byte at 's' is an invariant;
 *   0  if fewer bytes remain before 'e' than UTF8SKIP(s) calls for;
 *   otherwise whatever is_utf8_char_helper() returns for the sequence.
 *
 * Each parameter is parenthesized where it is used in an expression, so that
 * an argument such as 'p + 1' is dereferenced as '*(p + 1)' and not as
 * '*p + 1'.
 *
 * 's' and 'e' are evaluated more than once, so the arguments must not have
 * side effects. */
#define isUTF8_CHAR_flags(s, e, flags)                                      \
    (UNLIKELY((e) <= (s))                                                   \
    ? 0                                                                     \
    : (UTF8_IS_INVARIANT(*(s)))                                             \
      ? 1                                                                   \
      : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                 \
        ? 0                                                                 \
        : is_utf8_char_helper((s), (e), (flags)))

/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
* retained solely for backwards compatibility */
#define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == n)
Expand Down

0 comments on commit f57a53a

Please sign in to comment.