Add utf8_to_utf16

Perl · Aug 7, 2021 · 0e86b23 · 0e86b23
1 parent ca152df
commit 0e86b23
Show file tree

Hide file tree

Showing 7 changed files with 134 additions and 4 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -2543,6 +2543,8 @@ EXp	|U8*	|utf16_to_utf8_base|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
 				|const bool high|const bool low
 EMXp	|U8*	|utf16_to_utf8	|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
 EMXp	|U8*	|utf16_to_utf8_reversed|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
+EXpx	|U8*	|utf8_to_utf16_base|NN U8* s|NN U8 *d|Size_t bytelen|NN Size_t *newlen	\
+				|const bool high|const bool low
 AdpR	|STRLEN	|utf8_length	|NN const U8* s|NN const U8 *e
 AipdR	|IV	|utf8_distance	|NN const U8 *a|NN const U8 *b
 AipdRT	|U8*	|utf8_hop	|NN const U8 *s|SSize_t off

diff --git a/embed.h b/embed.h
@@ -975,6 +975,7 @@
 #define sv_magicext_mglob(a)	Perl_sv_magicext_mglob(aTHX_ a)
 #define sv_only_taint_gmagic	Perl_sv_only_taint_gmagic
 #define utf16_to_utf8_base(a,b,c,d,e,f)	Perl_utf16_to_utf8_base(aTHX_ a,b,c,d,e,f)
+#define utf8_to_utf16_base(a,b,c,d,e,f)	Perl_utf8_to_utf16_base(aTHX_ a,b,c,d,e,f)
 #define validate_proto(a,b,c,d)	Perl_validate_proto(aTHX_ a,b,c,d)
 #define vivify_defelem(a)	Perl_vivify_defelem(aTHX_ a)
 #define yylex()			Perl_yylex(aTHX)

diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
@@ -3092,6 +3092,36 @@ utf16_to_utf8 (sv, ...)
         ST(0) = dest;
         XSRETURN(1);
 
+void
+utf8_to_utf16 (sv, ...)
+    SV* sv
+        ALIAS:
+            utf8_to_utf16_reversed = 1
+    PREINIT:
+        STRLEN len;
+        U8 *source;
+        SV *dest;
+        Size_t got;
+    CODE:
+        if (ix) (void)SvPV_force_nolen(sv);
+        source = (U8 *)SvPV(sv, len);
+        /* Optionally only convert part of the buffer.  */
+        if (items > 1) {
+            len = SvUV(ST(1));
+        }
+        /* Mortalise this right now, as we'll be testing croak()s  */
+        dest = sv_2mortal(newSV(len * 2 + 1));
+        if (ix) {
+            utf8_to_utf16_reversed(source, (U8 *)SvPVX(dest), len, &got);
+        } else {
+            utf8_to_utf16(source, (U8 *)SvPVX(dest), len, &got);
+        }
+        SvCUR_set(dest, got);
+        SvPVX(dest)[got] = '\0';
+        SvPOK_on(dest);
+        ST(0) = dest;
+        XSRETURN(1);
+
 void
 my_exit(int exitcode)
         PPCODE:

diff --git a/ext/XS-APItest/t/utf16_to_utf8.t b/ext/XS-APItest/t/utf16_to_utf8.t
@@ -6,7 +6,9 @@ use Encode;
 
 plan skip_all => 'Unclear how EBCIDC should behave' if ord "A" != 65;
 
-use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed);
+# Bug in Encode, non chars are rejected
+use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed
+                   utf8_to_utf16 utf8_to_utf16_reversed);
 
 for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD,
 	     0x10000, 0x10FC00, 0x103FF, 0x10FFFD) {
@@ -15,13 +17,17 @@ for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD,
 	for my $suffix ('', "\0", "Moo!") {
 	    my $string = $prefix . $chr . $suffix;
 	    my $name = sprintf "for chr $ord prefix %d, suffix %d",
-		length $prefix, length $suffix;
+		               length $prefix, length $suffix;
 	    my $as_utf8 = $string;
 	    utf8::encode($as_utf8);
-	    is(utf16_to_utf8(encode('UTF-16BE', $string)), $as_utf8,
-	       "utf16_to_utf8 $name");
+            my $be_16 = encode('UTF-16BE', $string);
+            my $le_16 = encode('UTF-16LE', $string);
+	    is(utf16_to_utf8($be_16), $as_utf8, "utf16_to_utf8 $name");
+	    is(utf8_to_utf16($as_utf8), $be_16, "utf8_to_utf16 $name");
 	    is(utf16_to_utf8_reversed(encode('UTF-16LE', $string)), $as_utf8,
 	       "utf16_to_utf8_reversed $name");
+	    is(utf8_to_utf16_reversed($as_utf8), $le_16,
+               "utf8_to_utf16_reversed $name");
 	}
     }
 }
@@ -66,4 +72,19 @@ like($@, qr/^Malformed UTF-16 surrogate at/, 'Lone surrogate croaks');
 (ok(!defined $got, 'hence eval returns undef')) or
     diag(join ', ', map {ord $_} split //, $got);
 
+{   # This example is published by Unicode, so verifies we aren't just
+    # internally consistent; we conform to the Standard
+    my $utf16_of_U10302 = utf8_to_utf16(chr 0x10302);
+    is(substr($utf16_of_U10302, 0, 1), chr 0xD8);
+    is(substr($utf16_of_U10302, 1, 1), chr 0x00);
+    is(substr($utf16_of_U10302, 2, 1), chr 0xDF);
+    is(substr($utf16_of_U10302, 3, 1), chr 0x02);
+
+    $utf16_of_U10302 = utf8_to_utf16_reversed(chr 0x10302);
+    is(substr($utf16_of_U10302, 0, 1), chr 0x00);
+    is(substr($utf16_of_U10302, 1, 1), chr 0xD8);
+    is(substr($utf16_of_U10302, 2, 1), chr 0x02);
+    is(substr($utf16_of_U10302, 3, 1), chr 0xDF);
+}
+
 done_testing;
diff --git a/proto.h b/proto.h
@@ -4041,6 +4041,9 @@ PERL_CALLCONV STRLEN	Perl_utf8_length(pTHX_ const U8* s, const U8 *e)
 PERL_CALLCONV U8*	Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp);
 #define PERL_ARGS_ASSERT_UTF8_TO_BYTES	\
 	assert(s); assert(lenp)
+PERL_CALLCONV U8*	Perl_utf8_to_utf16_base(pTHX_ U8* s, U8 *d, Size_t bytelen, Size_t *newlen, const bool high, const bool low);
+#define PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE	\
+	assert(s); assert(d); assert(newlen)
 #ifndef NO_MATHOMS
 PERL_CALLCONV UV	Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 			__attribute__deprecated__;

diff --git a/utf8.c b/utf8.c
@@ -2726,6 +2726,75 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
     return utf16_to_utf8_reversed(p, d, bytelen, newlen);
 }
 
+/*
+ * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
+ * big-endian and utf8_to_utf16_reversed() for little-endian,
+ *
+ * 's' is the UTF-8 input string, passed as a pointer to U8.
+ * 'bytelen' is its length
+ * 'd' is the pointer to the destination buffer, currently passed as U8 *.  The
+ *     caller must ensure that the space is large enough.  The maximum
+ *     expansion factor is 2 times 'bytelen'.  This happens when the input is
+ *     entirely single-byte ASCII, expanding to two-byte UTF-16.
+ * '*newlen' will contain the number of bytes this function filled of 'd'.
+ * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
+ * 'low_byte'  is 1 if UTF-16BE; 0 if UTF-16LE
+ *
+ * Do not use in-place. */
+U8*
+Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
+                              const bool high_byte, /* Which of next two bytes
+                                                       is high order */
+                              const bool low_byte)
+{
+    U8* send;
+    U8* dstart = d;
+
+    PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
+
+    send = s + bytelen;
+
+    while (s < send) {
+        STRLEN retlen;
+        UV uv = NATIVE_TO_UNI(utf8n_to_uvchr(s, send - s, &retlen,
+                               /* No surrogates nor above-Unicode */
+                               UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE));
+
+        /* The modern method is to keep going with malformed input,
+         * substituting the REPLACEMENT CHARACTER */
+        if (UNLIKELY(uv == 0 && *s != '\0')) {
+            uv = UNICODE_REPLACEMENT;
+        }
+
+        if (uv >= FIRST_IN_PLANE1) {    /* Requires a surrogate pair */
+
+            /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
+            U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
+                               + FIRST_HIGH_SURROGATE;
+
+            d[high_byte] = high_surrogate >> 8;
+            d[low_byte]  = high_surrogate & nBIT_MASK(8);
+            d += 2;
+
+            /* The low surrogate is the lower 10 bits plus the offset */
+            uv &= nBIT_MASK(10);
+            uv += FIRST_LOW_SURROGATE;
+
+            /* Drop down to output the low surrogate like it were a
+             * non-surrogate */
+        }
+
+        d[high_byte] = uv >> 8;
+        d[low_byte] = uv & nBIT_MASK(8);
+        d += 2;
+
+        s += retlen;
+    }
+
+    *newlen = d - dstart;
+    return d;
+}
+
 bool
 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
 {

diff --git a/utf8.h b/utf8.h
@@ -86,6 +86,10 @@ the string is invariant.
                             utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1)
 #define utf16_to_utf8_reversed(p, d, bytelen, newlen)                       \
                             utf16_to_utf8_base(p, d, bytelen, newlen, 1, 0)
+#define utf8_to_utf16(p, d, bytelen, newlen)                                \
+                            utf8_to_utf16_base(p, d, bytelen, newlen, 0, 1)
+#define utf8_to_utf16_reversed(p, d, bytelen, newlen)                       \
+                            utf8_to_utf16_base(p, d, bytelen, newlen, 1, 0)
 
 #define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)