Make macro isUTF8_CHAR_flags an inline fcn

This makes it use the fast DFA for this functionality.
Perl · Aug 7, 2021 · f57a53a · f57a53a
1 parent b479e9c
commit f57a53a
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 40 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -1164,6 +1164,9 @@ AbTpdD	|STRLEN	|is_utf8_char	|NN const U8 *s
 AbMTpd	|STRLEN	|is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
 ATidRp	|Size_t	|isUTF8_CHAR|NN const U8 * const s0			    \
 			    |NN const U8 * const e
+ATidRp	|Size_t	|isUTF8_CHAR_flags|NN const U8 * const s0		    \
+			    |NN const U8 * const e			    \
+			    |const U32 flags
 ATidRp	|Size_t	|isSTRICT_UTF8_CHAR |NN const U8 * const s0		    \
 				    |NN const U8 * const e
 ATidRp	|Size_t	|isC9_STRICT_UTF8_CHAR |NN const U8 * const s0		    \

diff --git a/embed.h b/embed.h
@@ -268,6 +268,7 @@
 #define isC9_STRICT_UTF8_CHAR	Perl_isC9_STRICT_UTF8_CHAR
 #define isSTRICT_UTF8_CHAR	Perl_isSTRICT_UTF8_CHAR
 #define isUTF8_CHAR		Perl_isUTF8_CHAR
+#define isUTF8_CHAR_flags	Perl_isUTF8_CHAR_flags
 #define is_c9strict_utf8_string_loclen	Perl_is_c9strict_utf8_string_loclen
 #define is_lvalue_sub()		Perl_is_lvalue_sub(aTHX)
 #define is_safe_syscall(a,b,c,d)	Perl_is_safe_syscall(aTHX_ a,b,c,d)

diff --git a/inline.h b/inline.h
@@ -2169,6 +2169,73 @@ Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
 
 /*
 
+=for apidoc isUTF8_CHAR_flags
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point, subject to the restrictions given by C<flags>;
+otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
+starting at C<s> comprise the code point's representation.  Any bytes remaining
+before C<e>, but beyond the ones needed to form the first code point in C<s>,
+are not examined.
+
+If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
+if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</isSTRICT_UTF8_CHAR>>;
+and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
+the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
+Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
+understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+
+The three alternative functions are for the most commonly needed validations;
+they are likely to run somewhat faster than this more general one, which must
+additionally call a non-inline helper to apply C<flags>.
+
+Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
+L</is_utf8_string_loclen_flags> to check entire strings.
+
+=cut
+*/
+
+PERL_STATIC_INLINE Size_t
+Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
+{
+    PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_PERL_EXTENDED)));
+
+    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
+                          goto check_success,
+                          DFA_TEASE_APART_FF_,
+                          DFA_RETURN_FAILURE_);
+
+  check_success:
+    /* The DFA macro's 'goto' lands here; the helper is handed 'flags' */
+    return is_utf8_char_helper(s0, e, flags);
+
+#ifdef HAS_EXTRA_LONG_UTF8
+
+  tease_apart_FF:
+
+    /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
+     * either malformed, or was for the largest possible start byte, which
+     * indicates perl extended UTF-8, well above the Unicode maximum */
+    if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
+        || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
+    {
+        return 0;
+    }
+
+    /* Otherwise examine the sequence not inline */
+    return is_utf8_FF_helper_(s0, e,
+                              FALSE /* require full, not partial char */
+                             );
+#endif
+
+}
+
+/*
+
 =for apidoc is_utf8_valid_partial_char
 
 Returns 0 if the sequence of bytes starting at C<s> and looking no further than

diff --git a/proto.h b/proto.h
@@ -1665,6 +1665,13 @@ PERL_STATIC_INLINE Size_t	Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const
 	assert(s0); assert(e)
 #endif
 
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE Size_t	Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS	\
+	assert(s0); assert(e)
+#endif
+
 /* PERL_CALLCONV bool	is_ascii_string(const U8* const s, STRLEN len)
 			__attribute__warn_unused_result__
 			__attribute__pure__; */

diff --git a/utf8.c b/utf8.c
@@ -817,7 +817,10 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
 
     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
                           |UTF8_DISALLOW_PERL_EXTENDED)));
-    assert(! UTF8_IS_INVARIANT(*s));
+
+    if (UTF8_IS_INVARIANT(*s)) {
+        return 1;
+    }
 
     /* A variant char must begin with a start byte */
     if (UNLIKELY(! UTF8_IS_START(*s))) {

diff --git a/utf8.h b/utf8.h
@@ -1219,45 +1219,6 @@ point's representation.
 #define bytes_from_utf8(s, lenp, is_utf8p)                                  \
                             bytes_from_utf8_loc(s, lenp, is_utf8p, 0)
 
-/*
-
-=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
-
-Evaluates to non-zero if the first few bytes of the string starting at C<s> and
-looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
-that represents some code point, subject to the restrictions given by C<flags>;
-otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
-starting at C<s> comprise the code point's representation.  Any bytes remaining
-before C<e>, but beyond the ones needed to form the first code point in C<s>,
-are not examined.
-
-If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
-if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
-as C<L</isSTRICT_UTF8_CHAR>>;
-and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
-the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
-Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
-understood by C<L</utf8n_to_uvchr>>, with the same meanings.
-
-The three alternative macros are for the most commonly needed validations; they
-are likely to run somewhat faster than this more general one, as they can be
-inlined into your code.
-
-Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
-L</is_utf8_string_loclen_flags> to check entire strings.
-
-=cut
-*/
-
-#define isUTF8_CHAR_flags(s, e, flags)                                      \
-    (UNLIKELY((e) <= (s))                                                   \
-    ? 0                                                                     \
-    : (UTF8_IS_INVARIANT(*s))                                               \
-      ? 1                                                                   \
-      : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                 \
-        ? 0                                                                 \
-        : is_utf8_char_helper(s, e, flags))
-
 /* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
  * retained solely for backwards compatibility */
 #define IS_UTF8_CHAR(p, n)      (isUTF8_CHAR(p, (p) + (n)) == n)