Improve utf16_to_utf8_reversed()

Instead of destroying the input by first swapping the bytes, this calls a base function with the order to use. The non-reverse function is changed to call the base function with the non-reversed order.
Perl · Aug 14, 2021 · 5fd2667 · 5fd2667
1 parent 2709957
commit 5fd2667
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 41 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -2542,8 +2542,10 @@ p	|void	|utilize	|int aver|I32 floor|NULLOK OP* version|NN OP* idop|NULLOK OP* a
 Cp	|void	|_force_out_malformed_utf8_message			    \
 		|NN const U8 *const p|NN const U8 * const e|const U32 flags \
 		|const bool die_here
-EXp	|U8*	|utf16_to_utf8	|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
-EXp	|U8*	|utf16_to_utf8_reversed|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
+EXp	|U8*	|utf16_to_utf8_base|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen	\
+				|const bool high|const bool low
+EMXp	|U8*	|utf16_to_utf8	|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
+EMXp	|U8*	|utf16_to_utf8_reversed|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
 AdpR	|STRLEN	|utf8_length	|NN const U8* s|NN const U8 *e
 AipdR	|IV	|utf8_distance	|NN const U8 *a|NN const U8 *b
 AipdRT	|U8*	|utf8_hop	|NN const U8 *s|SSize_t off

diff --git a/embed.h b/embed.h
@@ -975,8 +975,7 @@
 #define skipspace_flags(a,b)	Perl_skipspace_flags(aTHX_ a,b)
 #define sv_magicext_mglob(a)	Perl_sv_magicext_mglob(aTHX_ a)
 #define sv_only_taint_gmagic	Perl_sv_only_taint_gmagic
-#define utf16_to_utf8(a,b,c,d)	Perl_utf16_to_utf8(aTHX_ a,b,c,d)
-#define utf16_to_utf8_reversed(a,b,c,d)	Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d)
+#define utf16_to_utf8_base(a,b,c,d,e,f)	Perl_utf16_to_utf8_base(aTHX_ a,b,c,d,e,f)
 #define validate_proto(a,b,c,d)	Perl_validate_proto(aTHX_ a,b,c,d)
 #define vivify_defelem(a)	Perl_vivify_defelem(aTHX_ a)
 #define yylex()			Perl_yylex(aTHX)

diff --git a/proto.h b/proto.h
@@ -3999,6 +3999,9 @@ PERL_CALLCONV SV*	Perl_upg_version(pTHX_ SV *ver, bool qv);
 PERL_CALLCONV U8*	Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, Size_t bytelen, Size_t *newlen);
 #define PERL_ARGS_ASSERT_UTF16_TO_UTF8	\
 	assert(p); assert(d); assert(newlen)
+PERL_CALLCONV U8*	Perl_utf16_to_utf8_base(pTHX_ U8* p, U8 *d, Size_t bytelen, Size_t *newlen, const bool high, const bool low);
+#define PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE	\
+	assert(p); assert(d); assert(newlen)
 PERL_CALLCONV U8*	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, Size_t bytelen, Size_t *newlen);
 #define PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED	\
 	assert(p); assert(d); assert(newlen)

diff --git a/utf8.c b/utf8.c
@@ -2578,37 +2578,52 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
 }
 
 /*
- * Convert native (big-endian) UTF-16 to UTF-8.  For reversed (little-endian),
- * use utf16_to_utf8_reversed().
+ * Convert native UTF-16 to UTF-8. Called via the more public functions
+ * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
+ * little-endian,
  *
- * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
- * UTF-8 requires 1-3 bytes for every code point below 0x1000; otherwise 4 bytes.
- * UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes.
+ * 'p' is the UTF-16 input string, passed as a pointer to U8.
+ * 'bytelen' is its length (must be even)
+ * 'd' is the pointer to the destination buffer.  The caller must ensure that
+ *     the space is large enough.  The maximum expansion factor is 2 times
+ *     'bytelen'.  1.5 if never going to run on an EBCDIC box.
+ * '*newlen' will contain the number of bytes this function filled of 'd'.
+ * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
+ * 'low_byte' is 1  if UTF-16BE; 0 if UTF-16LE
  *
- * These functions don't check for overflow.  The worst case is every code
- * point in the input is 2 bytes, and requires 4 bytes on output.  (If the code
- * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.)  Therefore the
- * destination must be pre-extended to 2 times the source length.
+ * The expansion factor is because UTF-16 requires 2 bytes for every code point
+ * below 0x10000; otherwise 4 bytes.  UTF-8 requires 1-3 bytes for every code
+ * point below 0x1000; otherwise 4 bytes.  UTF-EBCDIC requires 1-4 bytes for
+ * every code point below 0x1000; otherwise 4-5 bytes.
  *
- * Do not use in-place.  We optimize for native, for obvious reasons. */
+ * The worst case is where every code point is below U+10000, hence requiring 2
+ * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
+ * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
+ *
+ * Do not use in-place. */
 
 U8*
-Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
+Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
+                              const bool high_byte, /* Which of next two bytes is
+                                                  high order */
+                              const bool low_byte)
 {
     U8* pend;
     U8* dstart = d;
 
-    PERL_ARGS_ASSERT_UTF16_TO_UTF8;
+    PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
 
     if (bytelen & 1)
-        Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
-                                                               (UV)bytelen);
+        Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
+                ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
     pend = p + bytelen;
 
     while (p < pend) {
 
-        /* Next 16 bits is what we want, assumes UTF-16BE */
-        UV uv = (p[0] << 8) + p[1];
+        /* Next 16 bits is what we want.  (The bool is cast to U8 because on
+         * platforms where a bool is implemented as a signed char, a compiler
+         * warning may be generated) */
+        U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
         p += 2;
 
         /* If it's a surrogate, we find the uv that the surrogate pair encodes.
@@ -2625,47 +2640,46 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
             }
             else {
-                UV low = (p[0] << 8) + p[1];
-                if (UNLIKELY(! inRANGE(low, FIRST_LOW_SURROGATE,
-                                            LAST_LOW_SURROGATE)))
+                U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
+                if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
+                                                       LAST_LOW_SURROGATE)))
                 {
                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
                 }
+
                 p += 2;
-                uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
-                                + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
+
+                /* Here uv is the high surrogate.  Combine with low surrogate
+                 * just computed to form the actual U32 code point.
+                 *
+                 * From https://unicode.org/faq/utf_bom.html#utf16-4 */
+                uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
+                                     + low_surrogate - FIRST_LOW_SURROGATE;
             }
         }
 
-        /* Here, 'uv' is the real uv we want to find the UTF-8 of */
+        /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
         d = uvoffuni_to_utf8_flags(d, uv, 0);
     }
 
     *newlen = d - dstart;
     return d;
 }
 
-/* Note: this one is slightly destructive of the source. */
+U8*
+Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
+{
+    PERL_ARGS_ASSERT_UTF16_TO_UTF8;
+
+    return utf16_to_utf8(p, d, bytelen, newlen);
+}
 
 U8*
 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
 {
-    U8* s = (U8*)p;
-    U8* const send = s + bytelen;
-
     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
 
-    if (bytelen & 1)
-        Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
-                   (UV)bytelen);
-
-    while (s < send) {
-        const U8 tmp = s[0];
-        s[0] = s[1];
-        s[1] = tmp;
-        s += 2;
-    }
-    return utf16_to_utf8(p, d, bytelen, newlen);
+    return utf16_to_utf8_reversed(p, d, bytelen, newlen);
 }
 
 bool

diff --git a/utf8.h b/utf8.h
@@ -82,6 +82,11 @@ the string is invariant.
 #define utf8n_to_uvchr_error(s, len, lenp, flags, errors)                      \
                         utf8n_to_uvchr_msgs(s, len, lenp, flags, errors, 0)
 
+#define utf16_to_utf8(p, d, bytelen, newlen)                                \
+                            utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1)
+#define utf16_to_utf8_reversed(p, d, bytelen, newlen)                       \
+                            utf16_to_utf8_base(p, d, bytelen, newlen, 1, 0)
+
 #define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
 
 #define foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \