isUTF8_CHAR(): Bring UTF-EBCDIC to parity with ASCII

This changes the macro isUTF8_CHAR to have the same number of code points built-in for EBCDIC as ASCII. This obsoletes the IS_UTF8_CHAR_FAST macro, which is removed. Previously, the code generated by regen/regcharclass.pl for ASCII platforms was hand copied into utf8.h, and LIKELY's manually added, then the generating code was commented out. Now this has been done with EBCDIC platforms as well. This makes regenerating regcharclass.h faster. The copied macro in utf8.h is moved by this commit to within the main code section for non-EBCDIC compiles, cutting the number of #ifdef's down, and the comments about it are changed somewhat.
Perl · Sep 17, 2016 · 784d4f3 · 784d4f3
1 parent 21cb232
commit 784d4f3
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 92 deletions.
diff --git a/regcharclass.h b/regcharclass.h
@@ -788,17 +788,6 @@
 #define is_SURROGATE_utf8_safe(s,e)                                         \
 ( ( ( ( ( ( ((e) - (s)) >= 4 ) && ( 0xDD == ((U8*)s)[0] ) ) && ( 0x65 == ((U8*)s)[1] || 0x66 == ((U8*)s)[1] ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) ? 4 : 0 )
 
-/*
-	UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes
-
-	0xA0 - 0x3FFF
-*/
-/*** GENERATED CODE ***/
-#define is_UTF8_CHAR_utf8_no_length_checks(s)                               \
-( ( 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAC ) || ( 0xAE <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB6 ) ) ?\
-    ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ? 2 : 0 )\
-: ( ( ( ( ( ((U8*)s)[0] & 0xFC ) == 0xB8 ) || ((U8*)s)[0] == 0xBC || ( ( ((U8*)s)[0] & 0xFE ) == 0xBE ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) ? 3 : 0 )
-
 /*
 	QUOTEMETA: Meta-characters that \Q should quote
 
@@ -1400,17 +1389,6 @@
 #define is_SURROGATE_utf8_safe(s,e)                                         \
 ( ( ( ( ( ( ((e) - (s)) >= 4 ) && ( 0xDD == ((U8*)s)[0] ) ) && ( ( ((U8*)s)[1] & 0xFE ) == 0x64 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) ? 4 : 0 )
 
-/*
-	UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes
-
-	0xA0 - 0x3FFF
-*/
-/*** GENERATED CODE ***/
-#define is_UTF8_CHAR_utf8_no_length_checks(s)                               \
-( ( 0x78 == ((U8*)s)[0] || 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAF ) || ( 0xB1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\
-    ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) ? 2 : 0 )\
-: ( ( ( ((U8*)s)[0] == 0xB7 || ( ( ((U8*)s)[0] & 0xFE ) == 0xB8 ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xBC ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) ? 3 : 0 )
-
 /*
 	QUOTEMETA: Meta-characters that \Q should quote
 
@@ -1898,6 +1876,6 @@
  * 5c7eb94310e2aaa15702fd6bed24ff0e7ab5448f9a8231d8c49ca96c9e941089 lib/unicore/mktables
  * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
- * 76075c280f7e89bb2b15672be32b6cf0375bc68ef8e557c17b2e8e9d97fa86b2 regen/regcharclass.pl
+ * 1876ece914e2c14ed38c8a589adaa3d8193532c3a5bbe9ea5c3279bc9d29b279 regen/regcharclass.pl
  * 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl
  * ex: set ro: */
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
@@ -1637,35 +1637,29 @@ sub make_macro {
 => UTF8 :safe
 \p{_Perl_Surrogate}
 
-# This program was run with this enabled, and the results copied to utf8.h;
-# then this was commented out because it takes so long to figure out these 2
-# million code points.  The results would not change unless utf8.h decides it
-# wants a maximum other than 4 bytes, or this program creates better
+# This program was run with this enabled, and the results copied to utf8.h and
+# utfebcdic.h; then this was commented out because it takes so long to figure
+# out these 2 million code points.  The results would not change unless utf8.h
+# decides it wants a different maximum, or this program creates better
 # optimizations.  Trying with 5 bytes used too much memory to calculate.
 #
 # We don't generate code for invariants here because the EBCDIC form is too
 # complicated and would slow things down; instead the user should test for
 # invariants first.
 #
-# NOTE: The number of bytes generated here must match the value in
-# IS_UTF8_CHAR_FAST in utf8.h
+# 0x1FFFFF was chosen because for both UTF-8 and UTF-EBCDIC, its start byte
+# is the same as 0x10FFFF, and it includes all the above-Unicode code points
+# that have that start byte.  In other words, it is the natural stopping place
+# that includes all Unicode code points.
 #
-#UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes
+#UTF8_CHAR: Matches legal UTF-8 variant code points up through the 0x1FFFFFF
 #=> UTF8 :no_length_checks only_ascii_platform
 #0x80 - 0x1FFFFF
 
-# This hasn't been commented out, but the number of bytes it works on has been
-# cut down to 3, so it doesn't cover the full legal Unicode range.  Making it
-# 5 bytes would cover beyond the full range, but takes quite a bit of time and
-# memory to calculate.  The generated table varies depending on the EBCDIC
-# code page.
+#UTF8_CHAR: Matches legal UTF-EBCDIC variant code points up through 0x1FFFFFF
+#=> UTF8 :no_length_checks only_ebcdic_platform
+#0xA0 - 0x1FFFFF
 
-# NOTE: The number of bytes generated here must match the value in
-# IS_UTF8_CHAR_FAST in utf8.h
-#
-UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes
-=> UTF8 :no_length_checks only_ebcdic_platform
-0xA0 - 0x3FFF
 
 QUOTEMETA: Meta-characters that \Q should quote
 => high :fast

diff --git a/utf8.h b/utf8.h
@@ -303,6 +303,33 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
  * encounter */
 #define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
 
+/* A helper macro for isUTF8_CHAR, so use that one instead of this.  This was
+ * generated by regen/regcharclass.pl, and then moved here.  Then it was
+ * hand-edited to add some LIKELY() calls, presuming that malformations are
+ * unlikely.  The lines that generated it were then commented out.  This was
+ * done because it takes on the order of 10 minutes to generate, and is never
+ * going to change, unless the generated code is improved, and figuring out
+ * the LIKELYs there would be hard.
+ *
+        UTF8_CHAR: Matches legal UTF-8 variant code points up through 0x1FFFFFF
+
+	0x80 - 0x1FFFFF
+*/
+/*** GENERATED CODE ***/
+#define is_UTF8_CHAR_utf8_no_length_checks(s)                               \
+( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                          \
+    ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                    \
+: ( 0xE0 == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                          \
+    ( LIKELY( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
+
+/* The above macro handles UTF-8 that has this start byte as the maximum */
+#define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF7
+
 #endif /* EBCDIC vs ASCII */
 
 /* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -857,48 +884,6 @@ point's representation.
 /* If you want to exclude surrogates, and beyond legal Unicode, see the blame
  * log for earlier versions which gave details for these */
 
-/* A helper macro for isUTF8_CHAR, so use that one, and not this one.  This is
- * retained solely for backwards compatibility and may be deprecated and
- * removed in a future Perl version.
- *
- * regen/regcharclass.pl generates is_UTF8_CHAR_utf8() macros for up to these
- * number of bytes.  So this has to be coordinated with that file */
-#ifdef EBCDIC
-#   define IS_UTF8_CHAR_FAST(n) ((n) <= 3)
-#else
-#   define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
-#endif
-
-#ifndef EBCDIC
-/* A helper macro for isUTF8_CHAR, so use that one instead of this.  This was
- * generated by regen/regcharclass.pl, and then moved here.  Then it was
- * hand-edited to add some LIKELY() calls, presuming that malformations are
- * unlikely.  The lines that generated it were then commented out.  This was
- * done because it takes on the order of 10 minutes to generate, and is never
- * going to change, unless the generated code is improved, and figuring out
- * there the LIKELYs would be hard.
- *
- * The EBCDIC versions have been cut to not cover all of legal Unicode,
- * otherwise they take too long to generate; besides there is a separate one
- * for each code page, so they are in regcharclass.h instead of here */
-/*
-	UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes
-
-	0x80 - 0x1FFFFF
-*/
-/*** GENERATED CODE ***/
-#define is_UTF8_CHAR_utf8_no_length_checks(s)                               \
-( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                          \
-    ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                    \
-: ( 0xE0 == ((U8*)s)[0] ) ?                                                 \
-    ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                          \
-    ( LIKELY( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
-    ( LIKELY( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-: ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
-#endif
-
 /*
 
 =for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
@@ -925,15 +910,16 @@ is a valid UTF-8 character.
 =cut
 */
 
-#define isUTF8_CHAR(s, e)   (UNLIKELY((e) <= (s))                           \
-                             ? 0                                            \
-                             : (UTF8_IS_INVARIANT(*s))                      \
-                               ? 1                                          \
-                               : UNLIKELY(((e) - (s)) < UTF8SKIP(s))        \
-                                 ? 0                                        \
-                                 : LIKELY(IS_UTF8_CHAR_FAST(UTF8SKIP(s)))   \
-                                   ? is_UTF8_CHAR_utf8_no_length_checks(s)  \
-                                   : _is_utf8_char_slow(s, UTF8SKIP(s)))
+#define isUTF8_CHAR(s, e)                                                   \
+    (UNLIKELY((e) <= (s))                                                   \
+    ? 0                                                                     \
+    : (UTF8_IS_INVARIANT(*s))                                               \
+    ? 1                                                                     \
+    : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                   \
+      ? 0                                                                   \
+      : LIKELY(NATIVE_UTF8_TO_I8(*s) <= _IS_UTF8_CHAR_HIGHEST_START_BYTE)   \
+      ? is_UTF8_CHAR_utf8_no_length_checks(s)                               \
+      : _is_utf8_char_slow(s, UTF8SKIP(s)))
 
 #define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)