From b7f8f73f819616fa4105cb1b8d1297755f28589d Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 7 Aug 2021 07:38:54 -0600 Subject: [PATCH] Simplify utf16_to_utf8() A previous commit has simplified uvoffuni_to_utf8_flags() so that it is hardly more than the code in this function. So strip out the code and replace it by a call to uvoffuni_to_utf8_flags(). --- utf8.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/utf8.c b/utf8.c index 344957824c10..ca9fac56a4fa 100644 --- a/utf8.c +++ b/utf8.c @@ -2649,21 +2649,17 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) if (bytelen & 1) Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, (UV)bytelen); - pend = p + bytelen; while (p < pend) { - UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ + + /* Next 16 bits is what we want, assumes UTF-16BE */ + UV uv = (p[0] << 8) + p[1]; p += 2; - if (OFFUNI_IS_INVARIANT(uv)) { - *d++ = LATIN1_TO_NATIVE((U8) uv); - continue; - } - if (uv <= MAX_UTF8_TWO_BYTE) { - *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv)); - *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv)); - continue; - } + + /* If it's a surrogate, we find the uv that the surrogate pair encodes. + * */ + if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST #define LAST_HIGH_SURROGATE 0xDBFF @@ -2671,11 +2667,6 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST #define FIRST_IN_PLANE1 0x10000 - /* This assumes that most uses will be in the first Unicode plane, not - * needing surrogates */ - if (UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST, - UNICODE_SURROGATE_LAST))) - { if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) { Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); } @@ -2691,24 +2682,11 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1; } } -#ifdef EBCDIC + + /* Here, 'uv' is the real uv we want to find the UTF-8 of */ d = uvoffuni_to_utf8_flags(d, uv, 0); -#else - if (uv < FIRST_IN_PLANE1) { - *d++ = (U8)(( uv >> 12) | 0xe0); - *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); - *d++ = (U8)(( uv & 0x3f) | 0x80); - continue; - } - else { - *d++ = (U8)(( uv >> 18) | 0xf0); - *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); - *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); - *d++ = (U8)(( uv & 0x3f) | 0x80); - continue; - } -#endif } + *newlen = d - dstart; return d; }