7
7
8
8
#include < AK/CharacterTypes.h>
9
9
#include < AK/Function.h>
10
- #include < AK/Utf8View .h>
10
+ #include < AK/Utf16View .h>
11
11
#include < LibJS/Runtime/AbstractOperations.h>
12
12
#include < LibJS/Runtime/Array.h>
13
13
#include < LibJS/Runtime/Error.h>
16
16
#include < LibJS/Runtime/RegExpObject.h>
17
17
#include < LibJS/Runtime/RegExpPrototype.h>
18
18
#include < LibJS/Runtime/RegExpStringIterator.h>
19
+ #include < LibJS/Runtime/StringPrototype.h>
19
20
#include < LibJS/Token.h>
20
21
21
22
namespace JS {
@@ -90,30 +91,25 @@ static String escape_regexp_pattern(const RegExpObject& regexp_object)
90
91
}
91
92
92
93
// 22.2.5.2.3 AdvanceStringIndex ( S, index, unicode ), https://tc39.es/ecma262/#sec-advancestringindex
93
- size_t advance_string_index (String const & string, size_t index, bool unicode)
94
+ size_t advance_string_index (Utf16View const & string, size_t index, bool unicode)
94
95
{
95
96
if (!unicode)
96
97
return index + 1 ;
97
98
98
- Utf8View view (string);
99
-
100
- if (index + 1 >= view.length ())
99
+ if (index + 1 >= string.length_in_code_units ())
101
100
return index + 1 ;
102
101
103
- auto it = view.begin ();
104
- for (size_t i = 0 ; i < index; ++i)
105
- ++it;
106
-
107
- // See https://tc39.es/ecma262/#sec-codepointat for details on [[CodeUnitCount]].
108
- auto code_unit_count = 1 ;
109
- if (is_unicode_surrogate (*it)) {
110
- ++it;
102
+ auto code_point = code_point_at (string, index);
103
+ return index + code_point.code_unit_count ;
104
+ }
111
105
112
- if ((it != view.end ()) && is_unicode_surrogate (*it))
113
- code_unit_count = 2 ;
114
- }
106
+ // 22.2.5.2.3 AdvanceStringIndex ( S, index, unicode ), https://tc39.es/ecma262/#sec-advancestringindex
107
+ size_t advance_string_index (String const & string, size_t index, bool unicode)
108
+ {
109
+ auto utf16_string = AK::utf8_to_utf16 (string);
110
+ Utf16View utf16_string_view { utf16_string };
115
111
116
- return index + code_unit_count ;
112
+ return advance_string_index (utf16_string_view, index, unicode) ;
117
113
}
118
114
119
115
static void increment_last_index (GlobalObject& global_object, Object& regexp_object, String const & string, bool unicode)
@@ -143,17 +139,17 @@ struct Match {
143
139
};
144
140
145
141
// 1.1.4.1.4 GetMatchIndicesArray ( S, match ), https://tc39.es/proposal-regexp-match-indices/#sec-getmatchindicesarray
146
- static Value get_match_indices_array (GlobalObject& global_object, String const & string, Match const & match)
142
+ static Value get_match_indices_array (GlobalObject& global_object, Utf16View const & string, Match const & match)
147
143
{
148
- VERIFY (match.start_index <= string.length ());
144
+ VERIFY (match.start_index <= string.length_in_code_units ());
149
145
VERIFY (match.end_index >= match.start_index );
150
- VERIFY (match.end_index <= string.length ());
146
+ VERIFY (match.end_index <= string.length_in_code_units ());
151
147
152
148
return Array::create_from (global_object, { Value (match.start_index ), Value (match.end_index ) });
153
149
}
154
150
155
151
// 1.1.4.1.5 MakeIndicesArray ( S , indices, groupNames, hasGroups ), https://tc39.es/proposal-regexp-match-indices/#sec-makeindicesarray
156
- static Value make_indices_array (GlobalObject& global_object, String const & string, Vector<Optional<Match>> const & indices, HashMap<String, Match> const & group_names, bool has_groups)
152
+ static Value make_indices_array (GlobalObject& global_object, Utf16View const & string, Vector<Optional<Match>> const & indices, HashMap<String, Match> const & group_names, bool has_groups)
157
153
{
158
154
// Note: This implementation differs from the spec, but has the same behavior.
159
155
//
@@ -205,7 +201,7 @@ static Value make_indices_array(GlobalObject& global_object, String const& strin
205
201
}
206
202
207
203
// 22.2.5.2.2 RegExpBuiltinExec ( R, S ), https://tc39.es/ecma262/#sec-regexpbuiltinexec
208
- static Value regexp_builtin_exec (GlobalObject& global_object, RegExpObject& regexp_object, String const & string)
204
+ static Value regexp_builtin_exec (GlobalObject& global_object, RegExpObject& regexp_object, Utf16View const & string)
209
205
{
210
206
// FIXME: This should try using internal slots [[RegExpMatcher]], [[OriginalFlags]], etc.
211
207
auto & vm = global_object.vm ();
@@ -229,7 +225,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
229
225
RegexResult result;
230
226
231
227
while (true ) {
232
- if (last_index > string.length ()) {
228
+ if (last_index > string.length_in_code_units ()) {
233
229
if (global || sticky) {
234
230
regexp_object.set (vm.names .lastIndex , Value (0 ), Object::ShouldThrowExceptions::Yes);
235
231
if (vm.exception ())
@@ -239,13 +235,8 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
239
235
return js_null ();
240
236
}
241
237
242
- regex.start_offset = last_index;
243
- // FIXME: JavaScript strings are UTF-16, update this if the backing storage
244
- // encoding changes for spec compliance reasons.
245
- if (unicode)
246
- result = regex.match (Utf8View { string });
247
- else
248
- result = regex.match (string);
238
+ regex.start_offset = unicode ? string.code_point_offset_of (last_index) : last_index;
239
+ result = regex.match (string);
249
240
250
241
if (result.success )
251
242
break ;
@@ -262,12 +253,16 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
262
253
}
263
254
264
255
auto & match = result.matches [0 ];
256
+ auto match_index = match.global_offset ;
265
257
266
258
// https://tc39.es/ecma262/#sec-notation:
267
259
// The endIndex is one plus the index of the last input character matched so far by the pattern.
268
- auto end_index = match. global_offset + match.view .length ();
260
+ auto end_index = match_index + match.view .length ();
269
261
270
- // FIXME: Do code point index correction if the Unicode flag is set.
262
+ if (unicode) {
263
+ match_index = string.code_unit_offset_of (match.global_offset );
264
+ end_index = string.code_unit_offset_of (end_index);
265
+ }
271
266
272
267
if (global || sticky) {
273
268
regexp_object.set (vm.names .lastIndex , Value (end_index), Object::ShouldThrowExceptions::Yes);
@@ -279,9 +274,9 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
279
274
if (vm.exception ())
280
275
return {};
281
276
282
- array->create_data_property_or_throw (vm.names .index , Value (match. global_offset ));
277
+ array->create_data_property_or_throw (vm.names .index , Value (match_index ));
283
278
array->create_data_property_or_throw (vm.names .input , js_string (vm, string));
284
- array->create_data_property_or_throw (0 , js_string (vm, match.view .to_string ()));
279
+ array->create_data_property_or_throw (0 , js_string (vm, match.view .u16_view ()));
285
280
286
281
Vector<Optional<Match>> indices { Match::create (match) };
287
282
HashMap<String, Match> group_names;
@@ -292,7 +287,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
292
287
if (capture.view .is_null ()) {
293
288
indices.append ({});
294
289
} else {
295
- capture_value = js_string (vm, capture.view .to_string ());
290
+ capture_value = js_string (vm, capture.view .u16_view ());
296
291
indices.append (Match::create (capture));
297
292
}
298
293
array->create_data_property_or_throw (i + 1 , capture_value);
@@ -305,7 +300,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
305
300
auto groups_object = Object::create (global_object, nullptr );
306
301
307
302
for (auto & entry : result.named_capture_group_matches [0 ]) {
308
- groups_object->create_data_property_or_throw (entry.key , js_string (vm, entry.value .view .to_string ()));
303
+ groups_object->create_data_property_or_throw (entry.key , js_string (vm, entry.value .view .u16_view ()));
309
304
group_names.set (entry.key , Match::create (entry.value ));
310
305
}
311
306
@@ -325,7 +320,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
325
320
}
326
321
327
322
// 22.2.5.2.1 RegExpExec ( R, S ), https://tc39.es/ecma262/#sec-regexpexec
328
- Value regexp_exec (GlobalObject& global_object, Object& regexp_object, String const & string)
323
+ Value regexp_exec (GlobalObject& global_object, Object& regexp_object, Utf16View const & string)
329
324
{
330
325
auto & vm = global_object.vm ();
331
326
@@ -352,6 +347,15 @@ Value regexp_exec(GlobalObject& global_object, Object& regexp_object, String con
352
347
return regexp_builtin_exec (global_object, static_cast <RegExpObject&>(regexp_object), string);
353
348
}
354
349
350
+ // 22.2.5.2.1 RegExpExec ( R, S ), https://tc39.es/ecma262/#sec-regexpexec
351
+ Value regexp_exec (GlobalObject& global_object, Object& regexp_object, String const & string)
352
+ {
353
+ auto utf16_string = AK::utf8_to_utf16 (string);
354
+ Utf16View utf16_string_view { utf16_string };
355
+
356
+ return regexp_exec (global_object, regexp_object, utf16_string_view);
357
+ }
358
+
355
359
// 1.1.4.3 get RegExp.prototype.hasIndices, https://tc39.es/proposal-regexp-match-indices/#sec-get-regexp.prototype.hasIndices
356
360
// 22.2.5.3 get RegExp.prototype.dotAll, https://tc39.es/ecma262/#sec-get-regexp.prototype.dotAll
357
361
// 22.2.5.5 get RegExp.prototype.global, https://tc39.es/ecma262/#sec-get-regexp.prototype.global
@@ -424,11 +428,12 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::exec)
424
428
if (!regexp_object)
425
429
return {};
426
430
427
- auto string = vm.argument (0 ).to_string (global_object);
431
+ auto string = vm.argument (0 ).to_utf16_string (global_object);
428
432
if (vm.exception ())
429
433
return {};
434
+ Utf16View string_view { string };
430
435
431
- return regexp_builtin_exec (global_object, *regexp_object, string );
436
+ return regexp_builtin_exec (global_object, *regexp_object, string_view );
432
437
}
433
438
434
439
// 22.2.5.15 RegExp.prototype.test ( S ), https://tc39.es/ecma262/#sec-regexp.prototype.test
@@ -438,11 +443,12 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::test)
438
443
if (!regexp_object)
439
444
return {};
440
445
441
- auto str = vm.argument (0 ).to_string (global_object);
446
+ auto string = vm.argument (0 ).to_utf16_string (global_object);
442
447
if (vm.exception ())
443
448
return {};
449
+ Utf16View string_view { string };
444
450
445
- auto match = regexp_exec (global_object, *regexp_object, str );
451
+ auto match = regexp_exec (global_object, *regexp_object, string_view );
446
452
if (vm.exception ())
447
453
return {};
448
454
0 commit comments