Skip to content

Commit

Permalink
Improve utf16_to_utf8_reversed()
Browse files Browse the repository at this point in the history
Instead of destroying the input by first swapping the bytes, this calls
a base function with the order to use.  The non-reverse function is
changed to call the base function with the non-reversed order.
  • Loading branch information
khwilliamson committed Aug 14, 2021
1 parent 2709957 commit 5fd2667
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 41 deletions.
6 changes: 4 additions & 2 deletions embed.fnc
Expand Up @@ -2542,8 +2542,10 @@ p |void |utilize |int aver|I32 floor|NULLOK OP* version|NN OP* idop|NULLOK OP* a
Cp |void |_force_out_malformed_utf8_message \
|NN const U8 *const p|NN const U8 * const e|const U32 flags \
|const bool die_here
EXp |U8* |utf16_to_utf8 |NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
EXp |U8* |utf16_to_utf8_reversed|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
EXp |U8* |utf16_to_utf8_base|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen \
|const bool high|const bool low
EMXp |U8* |utf16_to_utf8 |NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
EMXp |U8* |utf16_to_utf8_reversed|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
AdpR |STRLEN |utf8_length |NN const U8* s|NN const U8 *e
AipdR |IV |utf8_distance |NN const U8 *a|NN const U8 *b
AipdRT |U8* |utf8_hop |NN const U8 *s|SSize_t off
Expand Down
3 changes: 1 addition & 2 deletions embed.h
Expand Up @@ -975,8 +975,7 @@
#define skipspace_flags(a,b) Perl_skipspace_flags(aTHX_ a,b)
#define sv_magicext_mglob(a) Perl_sv_magicext_mglob(aTHX_ a)
#define sv_only_taint_gmagic Perl_sv_only_taint_gmagic
#define utf16_to_utf8(a,b,c,d) Perl_utf16_to_utf8(aTHX_ a,b,c,d)
#define utf16_to_utf8_reversed(a,b,c,d) Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d)
#define utf16_to_utf8_base(a,b,c,d,e,f) Perl_utf16_to_utf8_base(aTHX_ a,b,c,d,e,f)
#define validate_proto(a,b,c,d) Perl_validate_proto(aTHX_ a,b,c,d)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
#define yylex() Perl_yylex(aTHX)
Expand Down
3 changes: 3 additions & 0 deletions proto.h
Expand Up @@ -3999,6 +3999,9 @@ PERL_CALLCONV SV* Perl_upg_version(pTHX_ SV *ver, bool qv);
PERL_CALLCONV U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, Size_t bytelen, Size_t *newlen);
#define PERL_ARGS_ASSERT_UTF16_TO_UTF8 \
assert(p); assert(d); assert(newlen)
PERL_CALLCONV U8* Perl_utf16_to_utf8_base(pTHX_ U8* p, U8 *d, Size_t bytelen, Size_t *newlen, const bool high, const bool low);
#define PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE \
assert(p); assert(d); assert(newlen)
PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, Size_t bytelen, Size_t *newlen);
#define PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED \
assert(p); assert(d); assert(newlen)
Expand Down
88 changes: 51 additions & 37 deletions utf8.c
Expand Up @@ -2578,37 +2578,52 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
}

/*
* Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian),
* use utf16_to_utf8_reversed().
* Convert native UTF-16 to UTF-8. Called via the more public functions
* utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
* little-endian,
*
* UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
* UTF-8 requires 1-3 bytes for every code point below 0x1000; otherwise 4 bytes.
* UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes.
* 'p' is the UTF-16 input string, passed as a pointer to U8.
* 'bytelen' is its length (must be even)
* 'd' is the pointer to the destination buffer. The caller must ensure that
* the space is large enough. The maximum expansion factor is 2 times
* 'bytelen'. 1.5 if never going to run on an EBCDIC box.
* '*newlen' will contain the number of bytes this function filled of 'd'.
* 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
* 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
*
* These functions don't check for overflow. The worst case is every code
* point in the input is 2 bytes, and requires 4 bytes on output. (If the code
* is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the
* destination must be pre-extended to 2 times the source length.
* The expansion factor is because UTF-16 requires 2 bytes for every code point
* below 0x10000; otherwise 4 bytes. UTF-8 requires 1-3 bytes for every code
* point below 0x1000; otherwise 4 bytes. UTF-EBCDIC requires 1-4 bytes for
* every code point below 0x1000; otherwise 4-5 bytes.
*
* Do not use in-place. We optimize for native, for obvious reasons. */
* The worst case is where every code point is below U+10000, hence requiring 2
* UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
* bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
*
* Do not use in-place. */

U8*
Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
const bool high_byte, /* Which of next two bytes is
high order */
const bool low_byte)
{
U8* pend;
U8* dstart = d;

PERL_ARGS_ASSERT_UTF16_TO_UTF8;
PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;

if (bytelen & 1)
Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
(UV)bytelen);
Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
pend = p + bytelen;

while (p < pend) {

/* Next 16 bits is what we want, assumes UTF-16BE */
UV uv = (p[0] << 8) + p[1];
/* Next 16 bits is what we want. (The bool is cast to U8 because on
* platforms where a bool is implemented as a signed char, a compiler
* warning may be generated) */
U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
p += 2;

/* If it's a surrogate, we find the uv that the surrogate pair encodes.
Expand All @@ -2625,47 +2640,46 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
}
else {
UV low = (p[0] << 8) + p[1];
if (UNLIKELY(! inRANGE(low, FIRST_LOW_SURROGATE,
LAST_LOW_SURROGATE)))
U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
LAST_LOW_SURROGATE)))
{
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
}

p += 2;
uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
+ (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;

/* Here uv is the high surrogate. Combine with low surrogate
* just computed to form the actual U32 code point.
*
* From https://unicode.org/faq/utf_bom.html#utf16-4 */
uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
+ low_surrogate - FIRST_LOW_SURROGATE;
}
}

/* Here, 'uv' is the real uv we want to find the UTF-8 of */
/* Here, 'uv' is the real U32 we want to find the UTF-8 of */
d = uvoffuni_to_utf8_flags(d, uv, 0);
}

*newlen = d - dstart;
return d;
}

/* Note: this one is slightly destructive of the source. */
U8*
Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
{
PERL_ARGS_ASSERT_UTF16_TO_UTF8;

return utf16_to_utf8(p, d, bytelen, newlen);
}

U8*
Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
{
U8* s = (U8*)p;
U8* const send = s + bytelen;

PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;

if (bytelen & 1)
Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
(UV)bytelen);

while (s < send) {
const U8 tmp = s[0];
s[0] = s[1];
s[1] = tmp;
s += 2;
}
return utf16_to_utf8(p, d, bytelen, newlen);
return utf16_to_utf8_reversed(p, d, bytelen, newlen);
}

bool
Expand Down
5 changes: 5 additions & 0 deletions utf8.h
Expand Up @@ -82,6 +82,11 @@ the string is invariant.
#define utf8n_to_uvchr_error(s, len, lenp, flags, errors) \
utf8n_to_uvchr_msgs(s, len, lenp, flags, errors, 0)

#define utf16_to_utf8(p, d, bytelen, newlen) \
utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1)
#define utf16_to_utf8_reversed(p, d, bytelen, newlen) \
utf16_to_utf8_base(p, d, bytelen, newlen, 1, 0)

#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)

#define foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
Expand Down

0 comments on commit 5fd2667

Please sign in to comment.