Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 101 additions & 3 deletions locale.c
Original file line number Diff line number Diff line change
Expand Up @@ -9962,13 +9962,111 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
}

len = d - sans_highs;
*d++ = '\0';
*d = '\0';

s = sans_highs;
}
}
/* else // Here both the locale and string are UTF-8 */
/* XXX convert above Unicode to 10FFFF? */
else { /* Here both the locale and string are UTF-8 */

/* In a UTF-8 locale, we can reasonably expect strxfrm() to properly
* handle any legal Unicode code point, including the non-character
* code points that are affirmed legal in Corrigendum #9. Less certain
* is its handling of the surrogate characters, and those code points
* above the Unicode maximum of U+10FFFF. It definitely won't know
* about Perl's invented UTF-8 extension for very large code points.
* Since surrogates and above-Unicode code points were formerly legal
* UTF-8, it very well may be that strxfrm() handles them, rather than
* going to the likely extra trouble of detecting and excluding them.
* This is especially true of surrogates where the code points the
* UTF-8 represents are listed in the Unicode Standard as being in a
* subset of the General Category "Other". Indeed, glibc looks like it
* returns the identical collation sequence for all "Other" code points
* that have the same number of bytes in their representation. That
* is, all such code points collate to the same spot. glibc does the
* same for the above-Unicode code points, but it gets a little weird,
* as might be expected, when presented with Perl's invented UTF-8
* extension, but still serviceable. But it is really undefined
* behavior, and we therefore should not present strxfrm with such
* input. The code below does that. And it is just about as easy to
* exclude all above-Unicode code points, as that is really undefined
* behavior as well, so the code below does that too. These all are
* effectively permanently unassigned by Unicode, so the code below
* maps them all to the highest legal permanently unassigned code
* point, U+10FFFF. XXX Could use find_next_masked() instead of
* strpbrk() on ASCII platforms to do per-word scanning */

# ifdef EBCDIC /* Native; known valid only for IBM-1047, 037 */
# define SUPER_START_BYTES "\xEE\xEF\xFA\xFB\xFC\xFD\xFE"
# else
# define SUPER_START_BYTES \
"\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF"
# endif

const char * const e = s + len;

/* Scan the input to find something that could be the start byte for an
* above-Unicode code point. If none found, we are done. */
char * candidate = s;
while ((candidate = strpbrk(candidate, SUPER_START_BYTES))) {
char * next_char_start = candidate + UTF8SKIP(candidate);
assert(next_char_start <= e);

/* It may require more than the single start byte to determine if a
* sequence is for an above-Unicode code point. Look to determine
* for sure. If the sequence isn't for an above-Unicode code
* point, continue scanning for the next possible one. */
if (! UTF8_IS_SUPER_NO_CHECK_(candidate)) {
candidate = next_char_start;
continue;
}

/* Here, is above-Unicode. Need to make a copy to translate this
* code point (and any others that follow) to be within the
* Unicode range */
Newx(sans_highs, len + 1, char); /* May shrink; will never grow */
Size_t initial_length = candidate - s;

/* Copy as-is any initial portion that is Unicode */
Copy(s, sans_highs, initial_length, U8);

/* Replace this first above-Unicode character */
char * d = sans_highs + initial_length;
Copy(MAX_UNICODE_UTF8, d, STRLENs(MAX_UNICODE_UTF8), U8);
d += STRLENs(MAX_UNICODE_UTF8);

/* Then go through the rest of the string */
s = next_char_start;
while (s < e) {
if (UTF8_IS_INVARIANT(*s)) {
*d++ = *s++;
continue;
}

const Size_t this_len = UTF8SKIP(s);
next_char_start = s + this_len;
assert(next_char_start <= e);

if (UTF8_IS_SUPER_NO_CHECK_(s)) {
Copy(MAX_UNICODE_UTF8, d, STRLENs(MAX_UNICODE_UTF8), U8);
d += STRLENs(MAX_UNICODE_UTF8);
}
else {
Copy(s, d, this_len, U8);
d += this_len;
}

s = next_char_start;
}

len = d - sans_highs;
*d = '\0';

/* The rest of the routine will look at this modified copy */
s = sans_highs;
break;
}
}

length_in_chars = (utf8)
? utf8_length((U8 *) s, (U8 *) s + len)
Expand Down
7 changes: 7 additions & 0 deletions pod/perldelta.pod
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,13 @@ object's stash weren't always NULL or not-NULL, confusing sv_dump()
(and hence Devel::Peek's Dump()) into crashing on an object with no
defined fields in some cases. [github #22959]

=item *

When comparing strings under a UTF-8 locale, the behavior was
previously undefined if either or both contained an above-Unicode code
point, such as 0x110000. Now all such code points collate the same
as the highest Unicode code point, U+10FFFF.

=back

=head1 Known Problems
Expand Down
54 changes: 49 additions & 5 deletions t/run/locale.t
Original file line number Diff line number Diff line change
Expand Up @@ -584,17 +584,17 @@ else {
EOF
}

# Gather the locales usable for collation tests once, up front, so the
# SKIP blocks below can share them.  Split them by UTF-8-ness, and drop
# the C/POSIX locales from the non-UTF-8 list, as their collation is
# trivial and not interesting for these tests.
@locales = find_locales( [ qw(LC_CTYPE LC_COLLATE) ] );
my ($utf8_ref, $non_utf8_ref) = classify_locales_wrt_utf8ness(\@locales);
my @non_utf8_locales = grep { $_ !~ / \b C \b | POSIX /x } $non_utf8_ref->@*;

SKIP:
{
my @locales = find_locales( [ qw(LC_CTYPE LC_COLLATE) ] );
my (undef, $non_utf8_ref) = classify_locales_wrt_utf8ness(\@locales);
my @non_utf8_locales = grep { $_ !~ / \b C \b | POSIX /x }
$non_utf8_ref->@*;
skip "didn't find a suitable non-UTF-8 locale", 1 unless
@non_utf8_locales;
my $locale = $non_utf8_locales[0];

fresh_perl_is(<<"EOF", "ok\n", {}, "Handles above Latin1 and NUL in non-UTF8 locale");
fresh_perl_is(<<"EOF", "ok\n", {}, "cmp() handles above Latin1 and NUL in non-UTF8 locale");
use locale;
use POSIX qw(setlocale LC_COLLATE);
if (setlocale(LC_COLLATE, '$locale')) {
Expand All @@ -610,6 +610,50 @@ else {
EOF
}

# Verify that cmp() under a UTF-8 locale gives a well-defined result for
# strings containing above-Unicode code points (e.g. 0x110000).  Such code
# points are mapped to U+10FFFF before being handed to strxfrm(), so a
# string with 0x110000 must collate >= the same string with U+10FFFF in
# its place.  The test passes trivially ("ok") if setlocale() fails.
SKIP:
{
skip "didn't find a suitable UTF-8 locale", 1 unless $utf8_ref;
my $locale = $utf8_ref->[0];

fresh_perl_is(<<"EOF", "ok\n", {}, "Handles above Unicode in a UTF8 locale");
use locale;
use POSIX qw(setlocale LC_COLLATE);
if (setlocale(LC_COLLATE, '$locale')) {
my \$x = "a\\x{10FFFF}\\x{110000}a\\x{10FFFF}a\\x{110000}";
my \$y = "a\\x{10FFFF}\\x{10FFFF}a\\x{10FFFF}a\\x{10FFFF}";
my \$cmp = \$x cmp \$y;
print \$cmp >= 0 ? "ok\n" : "not ok\n";
}
else {
print "ok\n";
}
EOF
}

# Verify that cmp() under a UTF-8 locale copes with Perl's invented
# extended-UTF-8 encoding for very large code points (here 0x100000000,
# which needs a 64-bit UV on ASCII platforms).  Like plain above-Unicode
# code points, these are mapped to U+10FFFF before strxfrm(), so they
# must compare as equal-or-greater relative ordering with U+10FFFF
# itself; hence the <= 0 check.  Passes trivially if setlocale() fails.
SKIP:
{
skip "didn't find a suitable UTF-8 locale", 1 unless $utf8_ref;
# ~0 is the native UV with all bits set; more than 8 hex digits means
# the platform has 64-bit UVs and can represent 0x100000000.
my $is64bit = length sprintf("%x", ~0) > 8;
skip "32-bit ASCII platforms can't physically have extended UTF-8", 1
if $::IS_ASCII && ! $is64bit;
my $locale = $utf8_ref->[0];

fresh_perl_is(<<"EOF", "ok\n", {}, "cmp() handles Perl extended UTF-8");
use locale;
use POSIX qw(setlocale LC_COLLATE);
if (setlocale(LC_COLLATE, '$locale')) {
no warnings qw(non_unicode portable);
my \$x = "\\x{10FFFF}";
my \$y = "\\x{100000000}";
my \$cmp = \$x cmp \$y;
print \$cmp <= 0 ? "ok\n" : "not ok\n";
}
else {
print "ok\n";
}
EOF
}

SKIP: { # GH #20085
my @utf8_locales = find_utf8_ctype_locales();
skip "didn't find a UTF-8 locale", 1 unless @utf8_locales;
Expand Down
15 changes: 10 additions & 5 deletions utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -1065,13 +1065,18 @@ this macro matches
#define UTF_START_BYTE_110000_ UTF_START_BYTE(PERL_UNICODE_MAX + 1, 21)
#define UTF_FIRST_CONT_BYTE_110000_ \
UTF_FIRST_CONT_BYTE(PERL_UNICODE_MAX + 1, 21)

/* Internal macro: true iff the UTF-8 sequence beginning at 's' encodes a
 * code point above the Unicode maximum U+10FFFF.  It does NOT check that
 * the sequence is well-formed, and it may read s[1] (when s[0] is exactly
 * the boundary start byte), so the caller must guarantee at least two
 * bytes are available to read at 's'. */
#define UTF8_IS_SUPER_NO_CHECK_(s) \
( NATIVE_UTF8_TO_I8(s[0]) >= UTF_START_BYTE_110000_ \
&& ( NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_ \
|| NATIVE_UTF8_TO_I8(s[1]) >= UTF_FIRST_CONT_BYTE_110000_))

#define UTF8_IS_SUPER(s, e) \
( ((e) - (s)) >= UNISKIP_BY_MSB_(20) \
&& ( NATIVE_UTF8_TO_I8(s[0]) >= UTF_START_BYTE_110000_ \
&& ( NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_ \
|| NATIVE_UTF8_TO_I8(s[1]) >= UTF_FIRST_CONT_BYTE_110000_))) \
((((e) - (s)) >= UNISKIP_BY_MSB_(20) && UTF8_IS_SUPER_NO_CHECK_(s)) \
? isUTF8_CHAR(s, e) \
: 0
: 0)

/*
=for apidoc Am|bool|UNICODE_IS_NONCHAR|const UV uv
Expand Down