Skip to content

Commit da89cf9

Browse files
trflynn89 authored and linusg committed
LibUnicode: Canonicalize calendar subtags
Calendar subtags are a bit of an odd-man-out in that we must match the variants "ethiopic-amete-alem" in that order, without any other variant in the locale. So a separate method is needed for this, and we now defer sorting the variant list until after other canonicalization is done.
1 parent 8458f47 commit da89cf9

File tree

2 files changed

+35
-6
lines changed

2 files changed

+35
-6
lines changed

Tests/LibUnicode/TestUnicodeLocale.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,14 @@ TEST_CASE(canonicalize_unicode_locale_id)
320320
test("EN-U-TZ-HONGKONG"sv, "en-u-tz-hkhkg"sv);
321321
test("en-u-ta-hongkong"sv, "en-u-ta-hongkong"sv);
322322
test("EN-U-TA-HONGKONG"sv, "en-u-ta-hongkong"sv);
323+
test("en-u-ca-ethiopic-amete-alem"sv, "en-u-ca-ethioaa"sv);
324+
test("EN-U-CA-ETHIOPIC-AMETE-ALEM"sv, "en-u-ca-ethioaa"sv);
325+
test("en-u-ca-alem-ethiopic-amete"sv, "en-u-ca-alem-ethiopic-amete"sv);
326+
test("EN-U-CA-ALEM-ETHIOPIC-AMETE"sv, "en-u-ca-alem-ethiopic-amete"sv);
327+
test("en-u-ca-ethiopic-amete-xxx-alem"sv, "en-u-ca-ethiopic-amete-xxx-alem"sv);
328+
test("EN-U-CA-ETHIOPIC-AMETE-XXX-ALEM"sv, "en-u-ca-ethiopic-amete-xxx-alem"sv);
329+
test("en-u-cb-ethiopic-amete-alem"sv, "en-u-cb-ethiopic-amete-alem"sv);
330+
test("EN-U-CB-ETHIOPIC-AMETE-ALEM"sv, "en-u-cb-ethiopic-amete-alem"sv);
323331

324332
test("en-t-en"sv, "en-t-en"sv);
325333
test("EN-T-EN"sv, "en-t-en"sv);
@@ -345,6 +353,8 @@ TEST_CASE(canonicalize_unicode_locale_id)
345353
test("EN-T-K1-IMPERIAL"sv, "en-t-k1-imperial"sv);
346354
test("en-t-k1-hongkong"sv, "en-t-k1-hongkong"sv);
347355
test("EN-T-K1-HONGKONG"sv, "en-t-k1-hongkong"sv);
356+
test("en-t-k1-ethiopic-amete-alem"sv, "en-t-k1-ethiopic-amete-alem"sv);
357+
test("EN-T-K1-ETHIOPIC-AMETE-ALEM"sv, "en-t-k1-ethiopic-amete-alem"sv);
348358

349359
test("en-0-aaa"sv, "en-0-aaa"sv);
350360
test("EN-0-AAA"sv, "en-0-aaa"sv);

Userland/Libraries/LibUnicode/Locale.cpp

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -483,14 +483,17 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
483483
static void perform_hard_coded_key_value_substitutions(String& key, String& value)
484484
{
485485
// FIXME: In the XML export of CLDR, there are some aliases defined in the following files:
486+
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml
486487
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/collation.xml
487488
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/measure.xml
488489
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/timezone.xml
489490
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/transform.xml
490491
//
491492
// There doesn't seem to be a counterpart in the JSON export. Since there aren't many such
492493
// aliases, until an XML parser is implemented, those aliases are implemented here.
493-
if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) {
494+
if ((key == "ca"sv) && (value == "islamicc"sv)) {
495+
value = "islamic-civil"sv;
496+
} else if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) {
494497
value = "true"sv;
495498
} else if (key == "ks"sv) {
496499
if (value == "primary"sv)
@@ -540,6 +543,20 @@ static void perform_hard_coded_key_value_substitutions(String& key, String& valu
540543
}
541544
}
542545

546+
// Rewrites keyword value lists whose alias only applies when several subtags appear together.
//
// perform_hard_coded_key_value_substitutions handles aliases one value at a time; this
// function handles the case where the whole value list must match. Currently the only such
// case is the calendar key "ca" with exactly the three values "ethiopic", "amete", "alem"
// (in that order), which is collapsed to the single value "ethioaa". Any other key, or any
// other value list, is left untouched.
// https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml
static void perform_hard_coded_key_multi_value_substitutions(String const& key, Vector<String>& values)
{
    bool const is_three_part_calendar_value = (key == "ca"sv) && (values.size() == 3);

    if (is_three_part_calendar_value) {
        // The full list is compared element-wise, so both the order and the exact set matter.
        static Vector<String> ethiopic_amete_alem { "ethiopic"sv, "amete"sv, "alem"sv };

        if (values == ethiopic_amete_alem)
            values = { "ethioaa"sv };
    }
}
559+
543560
static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
544561
{
545562
auto canonicalize_language = [](LanguageID& language_id, bool force_lowercase) {
@@ -626,6 +643,8 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
626643

627644
values.append(move(value));
628645
}
646+
647+
perform_hard_coded_key_multi_value_substitutions(key, values);
629648
};
630649

631650
canonicalize_language(locale_id.language_id, false);
@@ -644,22 +663,22 @@ static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id)
644663
for (auto& extension : locale_id.extensions) {
645664
extension.visit(
646665
[&](LocaleExtension& ext) {
647-
quick_sort(ext.attributes);
648-
quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; });
649-
650666
for (auto& attribute : ext.attributes)
651667
attribute = attribute.to_lowercase();
652668
for (auto& keyword : ext.keywords)
653669
canonicalize_key_value_list(keyword.key, keyword.types, true);
670+
671+
quick_sort(ext.attributes);
672+
quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; });
654673
},
655674
[&](TransformedExtension& ext) {
656675
if (ext.language.has_value())
657676
canonicalize_language(*ext.language, true);
658677

659-
quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; });
660-
661678
for (auto& field : ext.fields)
662679
canonicalize_key_value_list(field.key, field.values, false);
680+
681+
quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; });
663682
},
664683
[&](OtherExtension& ext) {
665684
ext.key = static_cast<char>(to_ascii_lowercase(ext.key));

0 commit comments

Comments
 (0)