Merge 6cf1bd4 into 7e18321

Perl · Aug 10, 2021 · 7db8a44 · 7db8a44
2 parents 7e18321 + 6cf1bd4
commit 7db8a44
Show file tree

Hide file tree

Showing 8 changed files with 218 additions and 196 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -1115,7 +1115,7 @@ pR	|OP*	|cmpchain_finish|NN OP* ch
 ApR	|I32	|is_lvalue_sub
 : Used in cop.h
 XopR	|I32	|was_lvalue_sub
-CpRTP	|STRLEN	|is_utf8_char_helper|NN const U8 * const s|NN const U8 * e|const U32 flags
+CpRTP	|STRLEN	|is_utf8_char_helper_|NN const U8 * const s|NN const U8 * e|const U32 flags
 CpRTP	|Size_t	|is_utf8_FF_helper_|NN const U8 * const s0		    \
 				|NN const U8 * const e			    \
 				|const bool require_partial
@@ -1164,6 +1164,9 @@ AbTpdD	|STRLEN	|is_utf8_char	|NN const U8 *s
 AbMTpd	|STRLEN	|is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
 ATidRp	|Size_t	|isUTF8_CHAR|NN const U8 * const s0			    \
 			    |NN const U8 * const e
+ATidRp	|Size_t	|isUTF8_CHAR_flags|NN const U8 * const s0		    \
+			    |NN const U8 * const e			    \
+			    |const U32 flags
 ATidRp	|Size_t	|isSTRICT_UTF8_CHAR |NN const U8 * const s0		    \
 				    |NN const U8 * const e
 ATidRp	|Size_t	|isC9_STRICT_UTF8_CHAR |NN const U8 * const s0		    \
@@ -1203,9 +1206,9 @@ ATidp	|bool	|is_utf8_fixed_width_buf_loclen_flags			    \
 		|NN const U8 * const s|STRLEN len			    \
 		|NULLOK const U8 **ep|NULLOK STRLEN *el|const U32 flags
 AmTdP	|bool	|is_utf8_valid_partial_char				    \
-		|NN const U8 * const s|NN const U8 * const e
+		|NN const U8 * const s0|NN const U8 * const e
 ATidRp	|bool	|is_utf8_valid_partial_char_flags			    \
-		|NN const U8 * const s|NN const U8 * const e|const U32 flags
+		|NN const U8 * const s0|NN const U8 * const e|const U32 flags
 CpR     |bool   |_is_uni_FOO|const U8 classnum|const UV c
 CpR     |bool   |_is_utf8_FOO|const U8 classnum|NN const U8 *p     \
 		|NN const U8 * const e

diff --git a/embed.h b/embed.h
@@ -268,6 +268,7 @@
 #define isC9_STRICT_UTF8_CHAR	Perl_isC9_STRICT_UTF8_CHAR
 #define isSTRICT_UTF8_CHAR	Perl_isSTRICT_UTF8_CHAR
 #define isUTF8_CHAR		Perl_isUTF8_CHAR
+#define isUTF8_CHAR_flags	Perl_isUTF8_CHAR_flags
 #define is_c9strict_utf8_string_loclen	Perl_is_c9strict_utf8_string_loclen
 #define is_lvalue_sub()		Perl_is_lvalue_sub(aTHX)
 #define is_safe_syscall(a,b,c,d)	Perl_is_safe_syscall(aTHX_ a,b,c,d)
@@ -276,7 +277,7 @@
 #ifndef NO_MATHOMS
 #define is_utf8_char		Perl_is_utf8_char
 #endif
-#define is_utf8_char_helper	Perl_is_utf8_char_helper
+#define is_utf8_char_helper_	Perl_is_utf8_char_helper_
 #define is_utf8_fixed_width_buf_loclen_flags	Perl_is_utf8_fixed_width_buf_loclen_flags
 #define is_utf8_invariant_string_loc	Perl_is_utf8_invariant_string_loc
 #define is_utf8_string_flags	Perl_is_utf8_string_flags

diff --git a/inline.h b/inline.h
@@ -2169,6 +2169,73 @@ Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
 
 /*
 
+=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point, subject to the restrictions given by C<flags>;
+otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
+starting at C<s> comprise the code point's representation.  Any bytes remaining
+before C<e>, but beyond the ones needed to form the first code point in C<s>,
+are not examined.
+
+If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
+if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</isSTRICT_UTF8_CHAR>>;
+and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
+the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
+Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
+understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+
+The three alternative macros are for the most commonly needed validations; they
+are likely to run somewhat faster than this more general one, as they can be
+inlined into your code.
+
+Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
+L</is_utf8_string_loclen_flags> to check entire strings.
+
+=cut
+*/
+
+PERL_STATIC_INLINE STRLEN
+Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
+{
+    PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_PERL_EXTENDED)));
+
+    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
+                          goto check_success,
+                          DFA_TEASE_APART_FF_,
+                          DFA_RETURN_FAILURE_);
+
+  check_success:
+
+    return is_utf8_char_helper_(s0, e, flags);
+
+#ifdef HAS_EXTRA_LONG_UTF8
+
+  tease_apart_FF:
+
+    /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
+     * either malformed, or was for the largest possible start byte, which
+     * indicates perl extended UTF-8, well above the Unicode maximum */
+    if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
+        || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
+    {
+        return 0;
+    }
+
+    /* Otherwise examine the sequence not inline */
+    return is_utf8_FF_helper_(s0, e,
+                              FALSE /* require full, not partial char */
+                             );
+#endif
+
+}
+
+/*
+
 =for apidoc is_utf8_valid_partial_char
 
 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
@@ -2217,18 +2284,47 @@ determined from just the first one or two bytes.
  */
 
 PERL_STATIC_INLINE bool
-Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
+Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
 {
     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
-
     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
                           |UTF8_DISALLOW_PERL_EXTENDED)));
 
-    if (s >= e || s + UTF8SKIP(s) <= e) {
-        return FALSE;
+    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
+                          DFA_RETURN_FAILURE_,
+                          DFA_TEASE_APART_FF_,
+                          NOOP);
+
+    /* The NOOP above causes the DFA to drop down here iff the input was a
+     * partial character.  flags=0 => can return TRUE immediately; otherwise we
+     * need to check (not inline) if the partial character is the beginning of
+     * a disallowed one */
+    if (flags == 0) {
+        return TRUE;
     }
 
-    return cBOOL(is_utf8_char_helper(s, e, flags));
+    return cBOOL(is_utf8_char_helper_(s0, e, flags));
+
+#ifdef HAS_EXTRA_LONG_UTF8
+
+  tease_apart_FF:
+
+    /* Getting here means the input is either malformed, or, in the case of
+     * PL_extended_utf8_dfa_tab, was for the largest possible start byte.  The
+     * latter case has to be extended UTF-8, so can fail immediately if that is
+     * forbidden */
+
+    if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
+        || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
+    {
+        return 0;
+    }
+
+    return is_utf8_FF_helper_(s0, e,
+                              TRUE /* Require to be a partial character */
+                             );
+#endif
+
 }
 
 /*

diff --git a/proto.h b/proto.h
@@ -1665,6 +1665,13 @@ PERL_STATIC_INLINE Size_t	Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const
 	assert(s0); assert(e)
 #endif
 
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE Size_t	Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS	\
+	assert(s0); assert(e)
+#endif
+
 /* PERL_CALLCONV bool	is_ascii_string(const U8* const s, STRLEN len)
 			__attribute__warn_unused_result__
 			__attribute__pure__; */
@@ -1726,10 +1733,10 @@ PERL_CALLCONV STRLEN	Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end);
 #define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF	\
 	assert(buf); assert(buf_end)
 #endif
-PERL_CALLCONV STRLEN	Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+PERL_CALLCONV STRLEN	Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
 			__attribute__warn_unused_result__
 			__attribute__pure__;
-#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER	\
+#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_	\
 	assert(s); assert(e)
 
 /* PERL_CALLCONV bool	is_utf8_fixed_width_buf_flags(const U8 * const s, STRLEN len, const U32 flags); */
@@ -1780,16 +1787,16 @@ PERL_STATIC_INLINE bool	Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len
 #define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS	\
 	assert(s)
 #endif
-/* PERL_CALLCONV bool	is_utf8_valid_partial_char(const U8 * const s, const U8 * const e)
+/* PERL_CALLCONV bool	is_utf8_valid_partial_char(const U8 * const s0, const U8 * const e)
 			__attribute__warn_unused_result__
 			__attribute__pure__; */
 #define PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR
 
 #ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool	Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
+PERL_STATIC_INLINE bool	Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS	\
-	assert(s); assert(e)
+	assert(s0); assert(e)
 #endif
 
 PERL_CALLCONV bool	Perl_isinfnan(NV nv)

diff --git a/regcharclass.h b/regcharclass.h
@@ -3765,6 +3765,6 @@
  * 3f9b68df38ab90c512f7159658ea01cc6fee76420c5953300a31fb06ac632e62 lib/unicore/mktables
  * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
  * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
- * 3fb6bafb4c830dd501868e34f550cdad3bf8d2c9eed44756488f36c484969417 regen/regcharclass.pl
+ * 1aa94679c695efd507b7e4491629dba1021b74c21a5324dfd3a582a5d654bd32 regen/regcharclass.pl
  * b2f896452d2b30da3e04800f478c60c1fd0b03d6b668689b020f1e3cf1f1cdd9 regen/regcharclass_multi_char_folds.pl
  * ex: set ro: */
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
@@ -1805,6 +1805,8 @@ sub make_macro {
 0xFFFFE - 0xFFFFF
 0x10FFFE - 0x10FFFF
 
+# Note that code in utf8.c is counting on the 'fast' version to look at no
+# more than two bytes
 SURROGATE: Surrogate code points
 => UTF8 :safe fast
 \p{_Perl_Surrogate}