@@ -50,6 +50,22 @@ struct Alias {
50
50
String alias;
51
51
};
52
52
53
+ // Normalization source: https://www.unicode.org/Public/13.0.0/ucd/DerivedNormalizationProps.txt
54
+ // Normalization descriptions: https://www.unicode.org/reports/tr44/#DerivedNormalizationProps.txt
55
// Quick-check state recorded for a normalization property entry.
// parse_normalization_props() maps a third field of "N" to No and "M" to Maybe;
// every other entry keeps the default of Yes.
+ enum class QuickCheck {
56
+ Yes, // Default when the data line carries no "N"/"M" marker.
57
+ No, // Set when the third field of the data line is "N".
58
+ Maybe, // Set when the third field of the data line is "M".
59
+ };
60
+
61
// One parsed data line of DerivedNormalizationProps.txt, as filled in by
// parse_normalization_props().
+ struct Normalization {
61
+ CodePointRange code_point_range; // Range parsed from the first ';'-separated field.
62
+ Vector<u32 > value; // Code points parsed from the third field when it is not "N"/"M"; empty otherwise.
63
+ QuickCheck quick_check { QuickCheck::Yes }; // Stays Yes unless the third field is "N" or "M".
64
+ };
66
+
67
// Maps a property name (second field of a data line) to every Normalization entry parsed for it.
+ using NormalizationProps = HashMap<String, Vector<Normalization>>;
68
+
53
69
// UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
54
70
// Field descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#UnicodeData.txt
55
71
// https://www.unicode.org/reports/tr44/#General_Category_Values
@@ -99,6 +115,9 @@ struct UnicodeData {
99
115
};
100
116
Vector<Alias> script_aliases;
101
117
PropList script_extensions;
118
+
119
+ // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
120
+ NormalizationProps normalization_props;
102
121
};
103
122
104
123
static constexpr auto s_desired_fields = Array {
@@ -118,18 +137,38 @@ static void write_to_file_if_different(Core::File& file, StringView contents)
118
137
VERIFY (file.write (contents));
119
138
}
120
139
121
- static void parse_special_casing (Core::File& file, UnicodeData& unicode_data )
140
+ static Vector< u32 > parse_code_point_list (StringView const & list )
122
141
{
123
- auto parse_code_point_list = [&](auto const & line) {
124
- Vector<u32 > code_points;
142
+ Vector<u32 > code_points;
125
143
126
- auto segments = line. split (' ' );
127
- for (auto const & code_point : segments)
128
- code_points.append (AK::StringUtils::convert_to_uint_from_hex<u32 >(code_point).value ());
144
+ auto segments = list. split_view (' ' );
145
+ for (auto const & code_point : segments)
146
+ code_points.append (AK::StringUtils::convert_to_uint_from_hex<u32 >(code_point).value ());
129
147
130
- return code_points;
131
- };
148
+ return code_points;
149
+ }
150
+
151
+ static CodePointRange parse_code_point_range (StringView const & list)
152
+ {
153
+ CodePointRange code_point_range {};
132
154
155
+ if (list.contains (" .." sv)) {
156
+ auto segments = list.split_view (" .." sv);
157
+ VERIFY (segments.size () == 2 );
158
+
159
+ auto begin = AK::StringUtils::convert_to_uint_from_hex<u32 >(segments[0 ]).value ();
160
+ auto end = AK::StringUtils::convert_to_uint_from_hex<u32 >(segments[1 ]).value ();
161
+ code_point_range = { begin, end };
162
+ } else {
163
+ auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32 >(list).value ();
164
+ code_point_range = { code_point, code_point };
165
+ }
166
+
167
+ return code_point_range;
168
+ }
169
+
170
+ static void parse_special_casing (Core::File& file, UnicodeData& unicode_data)
171
+ {
133
172
while (file.can_read_line ()) {
134
173
auto line = file.read_line ();
135
174
if (line.is_empty () || line.starts_with (' #' ))
@@ -191,7 +230,7 @@ static void parse_prop_list(Core::File& file, PropList& prop_list, bool multi_va
191
230
auto segments = line.split_view (' ;' , true );
192
231
VERIFY (segments.size () == 2 );
193
232
194
- auto code_point_range = segments[0 ].trim_whitespace ();
233
+ auto code_point_range = parse_code_point_range ( segments[0 ].trim_whitespace () );
195
234
Vector<StringView> properties;
196
235
197
236
if (multi_value_property)
@@ -201,18 +240,7 @@ static void parse_prop_list(Core::File& file, PropList& prop_list, bool multi_va
201
240
202
241
for (auto const & property : properties) {
203
242
auto & code_points = prop_list.ensure (property.trim_whitespace ());
204
-
205
- if (code_point_range.contains (" .." sv)) {
206
- segments = code_point_range.split_view (" .." sv);
207
- VERIFY (segments.size () == 2 );
208
-
209
- auto begin = AK::StringUtils::convert_to_uint_from_hex<u32 >(segments[0 ]).value ();
210
- auto end = AK::StringUtils::convert_to_uint_from_hex<u32 >(segments[1 ]).value ();
211
- code_points.append ({ begin, end });
212
- } else {
213
- auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32 >(code_point_range).value ();
214
- code_points.append ({ code_point, code_point });
215
- }
243
+ code_points.append (code_point_range);
216
244
}
217
245
}
218
246
}
@@ -301,6 +329,44 @@ static void parse_value_alias_list(Core::File& file, StringView desired_category
301
329
}
302
330
}
303
331
332
+ static void parse_normalization_props (Core::File& file, UnicodeData& unicode_data)
333
+ {
334
+ while (file.can_read_line ()) {
335
+ auto line = file.read_line ();
336
+ if (line.is_empty () || line.starts_with (' #' ))
337
+ continue ;
338
+
339
+ if (auto index = line.find (' #' ); index.has_value ())
340
+ line = line.substring (0 , *index);
341
+
342
+ auto segments = line.split_view (' ;' , true );
343
+ VERIFY ((segments.size () == 2 ) || (segments.size () == 3 ));
344
+
345
+ auto code_point_range = parse_code_point_range (segments[0 ].trim_whitespace ());
346
+ auto property = segments[1 ].trim_whitespace ().to_string ();
347
+
348
+ Vector<u32 > value;
349
+ QuickCheck quick_check = QuickCheck::Yes;
350
+
351
+ if (segments.size () == 3 ) {
352
+ auto value_or_quick_check = segments[2 ].trim_whitespace ();
353
+
354
+ if ((value_or_quick_check == " N" sv))
355
+ quick_check = QuickCheck::No;
356
+ else if ((value_or_quick_check == " M" sv))
357
+ quick_check = QuickCheck::Maybe;
358
+ else
359
+ value = parse_code_point_list (value_or_quick_check);
360
+ }
361
+
362
+ auto & normalizations = unicode_data.normalization_props .ensure (property);
363
+ normalizations.append ({ code_point_range, move (value), quick_check });
364
+
365
+ auto & prop_list = unicode_data.prop_list .ensure (property);
366
+ prop_list.append (move (code_point_range));
367
+ }
368
+ }
369
+
304
370
static void parse_unicode_data (Core::File& file, UnicodeData& unicode_data)
305
371
{
306
372
Optional<u32 > code_point_range_start;
@@ -927,6 +993,7 @@ int main(int argc, char** argv)
927
993
char const * scripts_path = nullptr ;
928
994
char const * script_extensions_path = nullptr ;
929
995
char const * emoji_data_path = nullptr ;
996
+ char const * normalization_path = nullptr ;
930
997
931
998
Core::ArgsParser args_parser;
932
999
args_parser.add_option (generated_header_path, " Path to the Unicode Data header file to generate" , " generated-header-path" , ' h' , " generated-header-path" );
@@ -942,6 +1009,7 @@ int main(int argc, char** argv)
942
1009
args_parser.add_option (scripts_path, " Path to Scripts.txt file" , " scripts-path" , ' r' , " scripts-path" );
943
1010
args_parser.add_option (script_extensions_path, " Path to ScriptExtensions.txt file" , " script-extensions-path" , ' x' , " script-extensions-path" );
944
1011
args_parser.add_option (emoji_data_path, " Path to emoji-data.txt file" , " emoji-data-path" , ' e' , " emoji-data-path" );
1012
+ args_parser.add_option (normalization_path, " Path to DerivedNormalizationProps.txt file" , " normalization-path" , ' n' , " normalization-path" );
945
1013
args_parser.parse (argc, argv);
946
1014
947
1015
auto open_file = [&](StringView path, StringView flags, Core::OpenMode mode = Core::OpenMode::ReadOnly) {
@@ -973,6 +1041,7 @@ int main(int argc, char** argv)
973
1041
auto scripts_file = open_file (scripts_path, " -r/--scripts-path" );
974
1042
auto script_extensions_file = open_file (script_extensions_path, " -x/--script-extensions-path" );
975
1043
auto emoji_data_file = open_file (emoji_data_path, " -e/--emoji-data-path" );
1044
+ auto normalization_file = open_file (normalization_path, " -n/--normalization-path" );
976
1045
977
1046
UnicodeData unicode_data {};
978
1047
parse_special_casing (special_casing_file, unicode_data);
@@ -981,6 +1050,7 @@ int main(int argc, char** argv)
981
1050
parse_prop_list (derived_core_prop_file, unicode_data.prop_list );
982
1051
parse_prop_list (derived_binary_prop_file, unicode_data.prop_list );
983
1052
parse_prop_list (emoji_data_file, unicode_data.prop_list );
1053
+ parse_normalization_props (normalization_file, unicode_data);
984
1054
parse_alias_list (prop_alias_file, unicode_data.prop_list , unicode_data.prop_aliases );
985
1055
parse_prop_list (scripts_file, unicode_data.script_list );
986
1056
parse_prop_list (script_extensions_file, unicode_data.script_extensions , true );
0 commit comments