Skip to content

Commit

Permalink
Add utf8_to_utf16
Browse files Browse the repository at this point in the history
  • Loading branch information
khwilliamson committed Aug 7, 2021
1 parent ca152df commit 0e86b23
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 4 deletions.
2 changes: 2 additions & 0 deletions embed.fnc
Expand Up @@ -2543,6 +2543,8 @@ EXp |U8* |utf16_to_utf8_base|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
|const bool high|const bool low
EMXp |U8* |utf16_to_utf8 |NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
EMXp |U8* |utf16_to_utf8_reversed|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
EXpx |U8* |utf8_to_utf16_base|NN U8* s|NN U8 *d|Size_t bytelen|NN Size_t *newlen \
|const bool high|const bool low
AdpR |STRLEN |utf8_length |NN const U8* s|NN const U8 *e
AipdR |IV |utf8_distance |NN const U8 *a|NN const U8 *b
AipdRT |U8* |utf8_hop |NN const U8 *s|SSize_t off
Expand Down
1 change: 1 addition & 0 deletions embed.h
Expand Up @@ -975,6 +975,7 @@
#define sv_magicext_mglob(a) Perl_sv_magicext_mglob(aTHX_ a)
#define sv_only_taint_gmagic Perl_sv_only_taint_gmagic
#define utf16_to_utf8_base(a,b,c,d,e,f) Perl_utf16_to_utf8_base(aTHX_ a,b,c,d,e,f)
#define utf8_to_utf16_base(a,b,c,d,e,f) Perl_utf8_to_utf16_base(aTHX_ a,b,c,d,e,f)
#define validate_proto(a,b,c,d) Perl_validate_proto(aTHX_ a,b,c,d)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
#define yylex() Perl_yylex(aTHX)
Expand Down
30 changes: 30 additions & 0 deletions ext/XS-APItest/APItest.xs
Expand Up @@ -3092,6 +3092,36 @@ utf16_to_utf8 (sv, ...)
ST(0) = dest;
XSRETURN(1);

void
utf8_to_utf16 (sv, ...)
SV* sv
ALIAS:
utf8_to_utf16_reversed = 1
PREINIT:
STRLEN len;
U8 *source;
SV *dest;
Size_t got;
CODE:
if (ix) (void)SvPV_force_nolen(sv);
source = (U8 *)SvPV(sv, len);
/* Optionally only convert part of the buffer. */
if (items > 1) {
len = SvUV(ST(1));
}
/* Mortalise this right now, as we'll be testing croak()s */
dest = sv_2mortal(newSV(len * 2 + 1));
if (ix) {
utf8_to_utf16_reversed(source, (U8 *)SvPVX(dest), len, &got);
} else {
utf8_to_utf16(source, (U8 *)SvPVX(dest), len, &got);
}
SvCUR_set(dest, got);
SvPVX(dest)[got] = '\0';
SvPOK_on(dest);
ST(0) = dest;
XSRETURN(1);

void
my_exit(int exitcode)
PPCODE:
Expand Down
29 changes: 25 additions & 4 deletions ext/XS-APItest/t/utf16_to_utf8.t
Expand Up @@ -6,7 +6,9 @@ use Encode;

plan skip_all => 'Unclear how EBCIDC should behave' if ord "A" != 65;

use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed);
# Bug in Encode, non chars are rejected
use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed
utf8_to_utf16 utf8_to_utf16_reversed);

for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD,
0x10000, 0x10FC00, 0x103FF, 0x10FFFD) {
Expand All @@ -15,13 +17,17 @@ for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD,
for my $suffix ('', "\0", "Moo!") {
my $string = $prefix . $chr . $suffix;
my $name = sprintf "for chr $ord prefix %d, suffix %d",
length $prefix, length $suffix;
length $prefix, length $suffix;
my $as_utf8 = $string;
utf8::encode($as_utf8);
is(utf16_to_utf8(encode('UTF-16BE', $string)), $as_utf8,
"utf16_to_utf8 $name");
my $be_16 = encode('UTF-16BE', $string);
my $le_16 = encode('UTF-16LE', $string);
is(utf16_to_utf8($be_16), $as_utf8, "utf16_to_utf8 $name");
is(utf8_to_utf16($as_utf8), $be_16, "utf8_to_utf16 $name");
is(utf16_to_utf8_reversed(encode('UTF-16LE', $string)), $as_utf8,
"utf16_to_utf8_reversed $name");
is(utf8_to_utf16_reversed($as_utf8), $le_16,
"utf8_to_utf16_reversed $name");
}
}
}
Expand Down Expand Up @@ -66,4 +72,19 @@ like($@, qr/^Malformed UTF-16 surrogate at/, 'Lone surrogate croaks');
(ok(!defined $got, 'hence eval returns undef')) or
diag(join ', ', map {ord $_} split //, $got);

{ # This example is published by Unicode, so verifies we aren't just
# internally consistent; we conform to the Standard
my $utf16_of_U10302 = utf8_to_utf16(chr 0x10302);
is(substr($utf16_of_U10302, 0, 1), chr 0xD8);
is(substr($utf16_of_U10302, 1, 1), chr 0x00);
is(substr($utf16_of_U10302, 2, 1), chr 0xDF);
is(substr($utf16_of_U10302, 3, 1), chr 0x02);

$utf16_of_U10302 = utf8_to_utf16_reversed(chr 0x10302);
is(substr($utf16_of_U10302, 0, 1), chr 0x00);
is(substr($utf16_of_U10302, 1, 1), chr 0xD8);
is(substr($utf16_of_U10302, 2, 1), chr 0x02);
is(substr($utf16_of_U10302, 3, 1), chr 0xDF);
}

done_testing;
3 changes: 3 additions & 0 deletions proto.h
Expand Up @@ -4041,6 +4041,9 @@ PERL_CALLCONV STRLEN Perl_utf8_length(pTHX_ const U8* s, const U8 *e)
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp);
#define PERL_ARGS_ASSERT_UTF8_TO_BYTES \
assert(s); assert(lenp)
PERL_CALLCONV U8* Perl_utf8_to_utf16_base(pTHX_ U8* s, U8 *d, Size_t bytelen, Size_t *newlen, const bool high, const bool low);
#define PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE \
assert(s); assert(d); assert(newlen)
#ifndef NO_MATHOMS
PERL_CALLCONV UV Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
__attribute__deprecated__;
Expand Down
69 changes: 69 additions & 0 deletions utf8.c
Expand Up @@ -2726,6 +2726,75 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
return utf16_to_utf8_reversed(p, d, bytelen, newlen);
}

/*
* Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
* big-endian and utf8_to_utf16_reversed() for little-endian,
*
* 's' is the UTF-8 input string, passed as a pointer to U8.
* 'bytelen' is its length
* 'd' is the pointer to the destination buffer, currently passed as U8 *. The
* caller must ensure that the space is large enough. The maximum
* expansion factor is 2 times 'bytelen'. This happens when the input is
* entirely single-byte ASCII, expanding to two-byte UTF-16.
* '*newlen' will contain the number of bytes this function filled of 'd'.
* 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
* 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
*
* Do not use in-place. */
U8*
Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
const bool high_byte, /* Which of next two bytes
is high order */
const bool low_byte)
{
U8* send;
U8* dstart = d;

PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;

send = s + bytelen;

while (s < send) {
STRLEN retlen;
UV uv = NATIVE_TO_UNI(utf8n_to_uvchr(s, send - s, &retlen,
/* No surrogates nor above-Unicode */
UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE));

/* The modern method is to keep going with malformed input,
* substituting the REPLACEMENT CHARACTER */
if (UNLIKELY(uv == 0 && *s != '\0')) {
uv = UNICODE_REPLACEMENT;
}

if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */

/* From https://unicode.org/faq/utf_bom.html#utf16-4 */
U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
+ FIRST_HIGH_SURROGATE;

d[high_byte] = high_surrogate >> 8;
d[low_byte] = high_surrogate & nBIT_MASK(8);
d += 2;

/* The low surrogate is the lower 10 bits plus the offset */
uv &= nBIT_MASK(10);
uv += FIRST_LOW_SURROGATE;

/* Drop down to output the low surrogate like it were a
* non-surrogate */
}

d[high_byte] = uv >> 8;
d[low_byte] = uv & nBIT_MASK(8);
d += 2;

s += retlen;
}

*newlen = d - dstart;
return d;
}

bool
Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
{
Expand Down
4 changes: 4 additions & 0 deletions utf8.h
Expand Up @@ -86,6 +86,10 @@ the string is invariant.
utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1)
#define utf16_to_utf8_reversed(p, d, bytelen, newlen) \
utf16_to_utf8_base(p, d, bytelen, newlen, 1, 0)
#define utf8_to_utf16(p, d, bytelen, newlen) \
utf8_to_utf16_base(p, d, bytelen, newlen, 0, 1)
#define utf8_to_utf16_reversed(p, d, bytelen, newlen) \
utf8_to_utf16_base(p, d, bytelen, newlen, 1, 0)

#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)

Expand Down

0 comments on commit 0e86b23

Please sign in to comment.