@@ -53,7 +53,32 @@ bool is_unicode_variant_subtag(StringView subtag)
53
53
return false ;
54
54
}
55
55
56
- static Optional<StringView> consume_next_segment (GenericLexer& lexer, bool with_separator)
56
+ static bool is_key (StringView key)
57
+ {
58
+ // key = alphanum alpha
59
+ if (key.length () != 2 )
60
+ return false ;
61
+ return is_ascii_alphanumeric (key[0 ]) && is_ascii_alpha (key[1 ]);
62
+ }
63
+
64
+ static bool is_single_type (StringView type)
65
+ {
66
+ // type = alphanum{3,8} (sep alphanum{3,8})*
67
+ // Note: Consecutive types are not handled here, that is left to the caller.
68
+ if ((type.length () < 3 ) || (type.length () > 8 ))
69
+ return false ;
70
+ return all_of (type, is_ascii_alphanumeric);
71
+ }
72
+
73
+ static bool is_attribute (StringView type)
74
+ {
75
+ // attribute = alphanum{3,8}
76
+ if ((type.length () < 3 ) || (type.length () > 8 ))
77
+ return false ;
78
+ return all_of (type, is_ascii_alphanumeric);
79
+ }
80
+
81
+ static Optional<StringView> consume_next_segment (GenericLexer& lexer, bool with_separator = true )
57
82
{
58
83
constexpr auto is_separator = is_any_of (" -_" sv);
59
84
@@ -153,6 +178,101 @@ static Optional<LanguageID> parse_unicode_language_id(GenericLexer& lexer)
153
178
return language_id;
154
179
}
155
180
181
+ static Optional<LocaleExtension> parse_unicode_locale_extension (GenericLexer& lexer)
182
+ {
183
+ // https://unicode.org/reports/tr35/#unicode_locale_extensions
184
+ //
185
+ // unicode_locale_extensions = sep [uU] ((sep keyword)+ | (sep attribute)+ (sep keyword)*)
186
+ LocaleExtension locale_extension {};
187
+
188
+ enum class ParseState {
189
+ ParsingAttributeOrKeyword,
190
+ ParsingAttribute,
191
+ ParsingKeyword,
192
+ Done,
193
+ };
194
+
195
+ auto state = ParseState::ParsingAttributeOrKeyword;
196
+
197
+ while (!lexer.is_eof () && (state != ParseState::Done)) {
198
+ auto segment = consume_next_segment (lexer);
199
+ if (!segment.has_value ())
200
+ return {};
201
+
202
+ if (state == ParseState::ParsingAttributeOrKeyword)
203
+ state = is_key (*segment) ? ParseState::ParsingKeyword : ParseState::ParsingAttribute;
204
+
205
+ switch (state) {
206
+ case ParseState::ParsingAttribute:
207
+ if (is_attribute (*segment)) {
208
+ locale_extension.attributes .append (*segment);
209
+ break ;
210
+ }
211
+
212
+ state = ParseState::ParsingKeyword;
213
+ [[fallthrough]];
214
+
215
+ case ParseState::ParsingKeyword: {
216
+ // keyword = key (sep type)?
217
+ Keyword keyword { .key = *segment };
218
+
219
+ if (!is_key (*segment)) {
220
+ lexer.retreat (segment->length () + 1 );
221
+ state = ParseState::Done;
222
+ break ;
223
+ }
224
+
225
+ while (true ) {
226
+ auto type = consume_next_segment (lexer);
227
+
228
+ if (!type.has_value () || !is_single_type (*type)) {
229
+ if (type.has_value ())
230
+ lexer.retreat (type->length () + 1 );
231
+ break ;
232
+ }
233
+
234
+ keyword.types .append (*type);
235
+ }
236
+
237
+ locale_extension.keywords .append (move (keyword));
238
+ break ;
239
+ }
240
+
241
+ default :
242
+ VERIFY_NOT_REACHED ();
243
+ }
244
+ }
245
+
246
+ if (locale_extension.attributes .is_empty () && locale_extension.keywords .is_empty ())
247
+ return {};
248
+ return locale_extension;
249
+ }
250
+
251
+ static Optional<Extension> parse_extension (GenericLexer& lexer)
252
+ {
253
+ // https://unicode.org/reports/tr35/#extensions
254
+ //
255
+ // extensions = unicode_locale_extensions | transformed_extensions | other_extensions
256
+ size_t starting_position = lexer.tell ();
257
+
258
+ if (auto header = consume_next_segment (lexer); header.has_value () && (header->length () == 1 )) {
259
+ switch ((*header)[0 ]) {
260
+ case ' u' :
261
+ case ' U' :
262
+ if (auto extension = parse_unicode_locale_extension (lexer); extension.has_value ())
263
+ return Extension { extension.release_value () };
264
+ break ;
265
+
266
+ default :
267
+ // FIXME: Handle transformed_extensions / other_extensions
268
+ break ;
269
+ }
270
+ }
271
+
272
+ lexer.retreat (lexer.tell () - starting_position);
273
+ return {};
274
+ }
275
+
156
276
Optional<LanguageID> parse_unicode_language_id (StringView language)
157
277
{
158
278
GenericLexer lexer { language };
@@ -167,7 +287,6 @@ Optional<LanguageID> parse_unicode_language_id(StringView language)
167
287
Optional<LocaleID> parse_unicode_locale_id (StringView locale)
168
288
{
169
289
GenericLexer lexer { locale };
170
- LocaleID locale_id {};
171
290
172
291
// https://unicode.org/reports/tr35/#Unicode_locale_identifier
173
292
//
@@ -178,12 +297,21 @@ Optional<LocaleID> parse_unicode_locale_id(StringView locale)
178
297
if (!language_id.has_value ())
179
298
return {};
180
299
181
- // FIXME: Handle extensions and pu_extensions.
300
+ LocaleID locale_id { language_id.release_value () };
301
+
302
+ while (true ) {
303
+ auto extension = parse_extension (lexer);
304
+ if (!extension.has_value ())
305
+ break ;
306
+ locale_id.extensions .append (extension.release_value ());
307
+ }
308
+
309
+ // FIXME: Handle pu_extensions.
182
310
183
311
if (!lexer.is_eof ())
184
312
return {};
185
313
186
- return LocaleID { language_id. release_value () } ;
314
+ return locale_id ;
187
315
}
188
316
189
317
Optional<String> canonicalize_unicode_locale_id (LocaleID& locale_id)
0 commit comments