Skip to content

Commit

Permalink
is_utf8_valid_partial_char_flags: Use DFA
Browse files Browse the repository at this point in the history
The DFA macro for determining if a sequence is valid UTF-8 was
deliberately made general enough to accommodate this use-case, in which
only a partial character is acceptable.  Change the code to use the DFA.

The helper function's name is changed to indicate it is private.
  • Loading branch information
khwilliamson committed Aug 7, 2021
1 parent 3986cf4 commit b479e9c
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 10 deletions.
4 changes: 2 additions & 2 deletions embed.fnc
Expand Up @@ -1203,9 +1203,9 @@ ATidp |bool |is_utf8_fixed_width_buf_loclen_flags \
|NN const U8 * const s|STRLEN len \
|NULLOK const U8 **ep|NULLOK STRLEN *el|const U32 flags
AmTdP |bool |is_utf8_valid_partial_char \
|NN const U8 * const s|NN const U8 * const e
|NN const U8 * const s0|NN const U8 * const e
ATidRp |bool |is_utf8_valid_partial_char_flags \
|NN const U8 * const s|NN const U8 * const e|const U32 flags
|NN const U8 * const s0|NN const U8 * const e|const U32 flags
CpR |bool |_is_uni_FOO|const U8 classnum|const UV c
CpR |bool |_is_utf8_FOO|const U8 classnum|NN const U8 *p \
|NN const U8 * const e
Expand Down
39 changes: 34 additions & 5 deletions inline.h
Expand Up @@ -2217,18 +2217,47 @@ determined from just the first one or two bytes.
*/

PERL_STATIC_INLINE bool
Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
{
PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;

assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));

if (s >= e || s + UTF8SKIP(s) <= e) {
return FALSE;
PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
DFA_RETURN_FAILURE_,
DFA_TEASE_APART_FF_,
NOOP);

/* The NOOP above causes the DFA to drop down here iff the input was a
* partial character. flags=0 => can return TRUE immediately; otherwise we
* need to check (not inline) if the partial character is the beginning of
* a disallowed one */
if (flags == 0) {
return TRUE;
}

return cBOOL(is_utf8_char_helper(s0, e, flags));

#ifdef HAS_EXTRA_LONG_UTF8

tease_apart_FF:

/* Getting here means the input is either malformed, or, in the case of
* PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
* latter case has to be extended UTF-8, so can fail immediately if that is
* forbidden */

if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
|| (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
{
return 0;
}

return cBOOL(is_utf8_char_helper(s, e, flags));
return is_utf8_FF_helper_(s0, e,
TRUE /* Require to be a partial character */
);
#endif

}

/*
Expand Down
6 changes: 3 additions & 3 deletions proto.h
Expand Up @@ -1780,16 +1780,16 @@ PERL_STATIC_INLINE bool Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len
#define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS \
assert(s)
#endif
/* PERL_CALLCONV bool is_utf8_valid_partial_char(const U8 * const s, const U8 * const e)
/* PERL_CALLCONV bool is_utf8_valid_partial_char(const U8 * const s0, const U8 * const e)
__attribute__warn_unused_result__
__attribute__pure__; */
#define PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR

#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
PERL_STATIC_INLINE bool Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS \
assert(s); assert(e)
assert(s0); assert(e)
#endif

PERL_CALLCONV bool Perl_isinfnan(NV nv)
Expand Down

0 comments on commit b479e9c

Please sign in to comment.