Skip to content

Commit eda92d1

Browse files
trflynn89 authored and linusg committed
LibUnicode: Parse locale extensions of the Unicode locale extension form
1 parent dd89901 commit eda92d1

File tree

3 files changed

+192
-4
lines changed

3 files changed

+192
-4
lines changed

Tests/LibUnicode/TestUnicodeLocale.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,52 @@ TEST_CASE(parse_unicode_locale_id)
100100
pass("aaa-bbbb-cc-1234-5678"sv, "aaa"sv, "bbbb"sv, "cc"sv, { "1234"sv, "5678"sv });
101101
}
102102

103+
TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension)
{
    // Asserts that the given locale string is rejected by the parser.
    auto fail = [](StringView locale) {
        auto locale_id = Unicode::parse_unicode_locale_id(locale);
        EXPECT(!locale_id.has_value());
    };

    // Asserts that the given locale string parses into exactly one extension, and that this
    // extension holds the same attributes and keywords (in the same order) as expected_extension.
    auto pass = [](StringView locale, Unicode::LocaleExtension const& expected_extension) {
        auto locale_id = Unicode::parse_unicode_locale_id(locale);
        VERIFY(locale_id.has_value());

        // A failed EXPECT_EQ is reported but does not stop the test (LibTest semantics), so
        // return before indexing into a list whose size is not what we expect.
        EXPECT_EQ(locale_id->extensions.size(), 1u);
        if (locale_id->extensions.size() != 1u)
            return;

        auto const& actual_extension = locale_id->extensions[0].get<Unicode::LocaleExtension>();
        EXPECT_EQ(actual_extension.attributes, expected_extension.attributes);

        EXPECT_EQ(actual_extension.keywords.size(), expected_extension.keywords.size());
        if (actual_extension.keywords.size() != expected_extension.keywords.size())
            return;

        for (size_t i = 0; i < actual_extension.keywords.size(); ++i) {
            auto const& actual_keyword = actual_extension.keywords[i];
            auto const& expected_keyword = expected_extension.keywords[i];

            EXPECT_EQ(actual_keyword.key, expected_keyword.key);
            EXPECT_EQ(actual_keyword.types, expected_keyword.types);
        }
    };

    // Missing, empty, too short or too long subtags after the "u" singleton.
    fail("en-u"sv);
    fail("en-u-"sv);
    fail("en-u-x"sv);
    fail("en-u-xx-"sv);
    fail("en-u--xx"sv);
    fail("en-u-xx-xxxxx-"sv);
    fail("en-u-xx--xxxxx"sv);
    fail("en-u-xx-xxxxxxxxx"sv);
    fail("en-u-xxxxx-"sv);
    fail("en-u-xxxxxxxxx"sv);

    // Keywords only.
    pass("en-u-xx"sv, { {}, { { "xx"sv, {} } } });
    pass("en-u-xx-yyyy"sv, { {}, { { "xx"sv, { "yyyy"sv } } } });
    pass("en-u-xx-yyyy-zzzz"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } } } });
    pass("en-u-xx-yyyy-zzzz-aa"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } }, { "aa"sv, {} } } });

    // Attributes only, then attributes followed by keywords.
    pass("en-u-xxx"sv, { { "xxx"sv }, {} });
    pass("en-u-fff-gggg"sv, { { "fff"sv, "gggg"sv }, {} });
    pass("en-u-fff-xx"sv, { { "fff"sv }, { { "xx"sv, {} } } });
    pass("en-u-fff-xx-yyyy"sv, { { "fff"sv }, { { "xx"sv, { "yyyy"sv } } } });
    pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, { "yyyy"sv } } } });
}
148+
103149
TEST_CASE(canonicalize_unicode_locale_id)
104150
{
105151
auto test = [](StringView locale, StringView expected_canonical_locale) {

Userland/Libraries/LibUnicode/Locale.cpp

Lines changed: 132 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,32 @@ bool is_unicode_variant_subtag(StringView subtag)
5353
return false;
5454
}
5555

56-
static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator)
56+
static bool is_key(StringView key)
{
    // key = alphanum alpha
    //
    // A key is exactly two characters: an ASCII alphanumeric followed by an ASCII letter.
    if (key.length() == 2)
        return is_ascii_alphanumeric(key[0]) && is_ascii_alpha(key[1]);
    return false;
}
63+
64+
static bool is_single_type(StringView type)
{
    // type = alphanum{3,8} (sep alphanum{3,8})*
    //
    // Only one "alphanum{3,8}" subtag is validated here; callers are responsible for
    // collecting runs of consecutive type subtags.
    auto const length = type.length();
    bool const has_valid_length = (length >= 3) && (length <= 8);

    return has_valid_length && all_of(type, is_ascii_alphanumeric);
}
72+
73+
static bool is_attribute(StringView type)
{
    // attribute = alphanum{3,8}
    constexpr size_t minimum_length = 3;
    constexpr size_t maximum_length = 8;

    if (type.length() < minimum_length)
        return false;
    if (type.length() > maximum_length)
        return false;

    return all_of(type, is_ascii_alphanumeric);
}
80+
81+
static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator = true)
5782
{
5883
constexpr auto is_separator = is_any_of("-_"sv);
5984

@@ -153,6 +178,101 @@ static Optional<LanguageID> parse_unicode_language_id(GenericLexer& lexer)
153178
return language_id;
154179
}
155180

181+
static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& lexer)
{
    // https://unicode.org/reports/tr35/#unicode_locale_extensions
    //
    // unicode_locale_extensions = sep [uU] ((sep keyword)+ | (sep attribute)+ (sep keyword)*)
    //
    // Parsing starts at the first "sep keyword" / "sep attribute"; nothing in this function
    // looks at the "sep [uU]" header. Returns an empty Optional if a segment cannot be
    // consumed, or if neither an attribute nor a keyword was found.
    enum class ParseState {
        ParsingAttributeOrKeyword,
        ParsingAttribute,
        ParsingKeyword,
        Done,
    };

    LocaleExtension result {};
    auto state = ParseState::ParsingAttributeOrKeyword;

    while ((state != ParseState::Done) && !lexer.is_eof()) {
        auto subtag = consume_next_segment(lexer);
        if (!subtag.has_value())
            return {};

        // The very first subtag decides which alternative of the grammar applies.
        if (state == ParseState::ParsingAttributeOrKeyword) {
            if (is_key(*subtag))
                state = ParseState::ParsingKeyword;
            else
                state = ParseState::ParsingAttribute;
        }

        switch (state) {
        case ParseState::ParsingAttribute:
            if (is_attribute(*subtag)) {
                result.attributes.append(*subtag);
                break;
            }

            // The run of attributes has ended; the same subtag is now tried as a keyword.
            state = ParseState::ParsingKeyword;
            [[fallthrough]];

        case ParseState::ParsingKeyword: {
            // keyword = key (sep type)?
            if (!is_key(*subtag)) {
                // This subtag is not part of the extension. Step back over it (the extra 1 is
                // presumably the separator consumed along with it) and stop parsing.
                lexer.retreat(subtag->length() + 1);
                state = ParseState::Done;
                break;
            }

            Keyword keyword { .key = *subtag };

            // Collect every type subtag that directly follows the key.
            for (;;) {
                auto type = consume_next_segment(lexer);
                if (!type.has_value())
                    break;

                if (!is_single_type(*type)) {
                    // Not a type: give it back so the outer loop can inspect it.
                    lexer.retreat(type->length() + 1);
                    break;
                }

                keyword.types.append(*type);
            }

            result.keywords.append(move(keyword));
            break;
        }

        default:
            VERIFY_NOT_REACHED();
        }
    }

    if (result.attributes.is_empty() && result.keywords.is_empty())
        return {};
    return result;
}
250+
251+
static Optional<Extension> parse_extension(GenericLexer& lexer)
{
    // https://unicode.org/reports/tr35/#extensions
    //
    // extensions = unicode_locale_extensions | transformed_extensions | other_extensions
    //
    // On failure the lexer is rewound to where it was on entry and an empty Optional is returned.
    auto const start = lexer.tell();

    auto singleton = consume_next_segment(lexer);

    if (singleton.has_value() && (singleton->length() == 1)) {
        char const kind = (*singleton)[0];

        if ((kind == 'u') || (kind == 'U')) {
            auto locale_extension = parse_unicode_locale_extension(lexer);
            if (locale_extension.has_value())
                return Extension { locale_extension.release_value() };
        }

        // FIXME: Handle transformed_extensions / other_extensions
    }

    lexer.retreat(lexer.tell() - start);
    return {};
}
275+
156276
Optional<LanguageID> parse_unicode_language_id(StringView language)
157277
{
158278
GenericLexer lexer { language };
@@ -167,7 +287,6 @@ Optional<LanguageID> parse_unicode_language_id(StringView language)
167287
Optional<LocaleID> parse_unicode_locale_id(StringView locale)
168288
{
169289
GenericLexer lexer { locale };
170-
LocaleID locale_id {};
171290

172291
// https://unicode.org/reports/tr35/#Unicode_locale_identifier
173292
//
@@ -178,12 +297,21 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
178297
if (!language_id.has_value())
179298
return {};
180299

181-
// FIXME: Handle extensions and pu_extensions.
300+
LocaleID locale_id { language_id.release_value() };
301+
302+
while (true) {
303+
auto extension = parse_extension(lexer);
304+
if (!extension.has_value())
305+
break;
306+
locale_id.extensions.append(extension.release_value());
307+
}
308+
309+
// FIXME: Handle pu_extensions.
182310

183311
if (!lexer.is_eof())
184312
return {};
185313

186-
return LocaleID { language_id.release_value() };
314+
return locale_id;
187315
}
188316

189317
Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)

Userland/Libraries/LibUnicode/Locale.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <AK/Optional.h>
1010
#include <AK/String.h>
1111
#include <AK/StringView.h>
12+
#include <AK/Variant.h>
1213
#include <AK/Vector.h>
1314
#include <LibUnicode/Forward.h>
1415

@@ -22,8 +23,21 @@ struct LanguageID {
2223
Vector<StringView> variants {};
2324
};
2425

26+
// One keyword of a Unicode locale extension: a key and the type subtags that followed it
// (keyword = key (sep type)?). The list of types may be empty.
// NOTE(review): both members are StringViews, presumably into the string handed to the
// parser - confirm that callers keep that string alive.
struct Keyword {
27+
StringView key {};
28+
Vector<StringView> types {};
29+
};
30+
31+
// The parsed contents of a Unicode locale extension (the "-u-..." part of a locale):
// its attributes followed by its keywords, each kept in the order they were parsed.
struct LocaleExtension {
32+
Vector<StringView> attributes {};
33+
Vector<Keyword> keywords {};
34+
};
35+
36+
// Any extension attached to a locale. LocaleExtension is currently the only alternative.
using Extension = Variant<LocaleExtension>;
37+
2538
// A parsed Unicode locale identifier: the language ID plus the list of extensions.
struct LocaleID {
2639
LanguageID language_id {};
40+
Vector<Extension> extensions {};
2741
};
2842

2943
// Note: These methods only verify that the provided strings match the EBNF grammar of the

0 commit comments

Comments
 (0)