Skip to content

Commit b1ea9c2

Browse files
trflynn89 authored and linusg committed
LibJS: Implement RegExp.prototype.{exec,test} with UTF-16 code units
This converts RegExpExec to perform matching with UTF-16 strings. As a very temporary stop-gap, this adds overloads to RegExpExec and friends for both UTF-8 and UTF-16 strings. This is only needed until the rest of RegExp.prototype is UTF-16 capable. This also addresses a FIXME regarding code point index correction in RegExpExec when the Unicode flag is set.
1 parent a0c19de commit b1ea9c2

File tree

2 files changed

+49
-41
lines changed

2 files changed

+49
-41
lines changed

Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp

Lines changed: 47 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77

88
#include <AK/CharacterTypes.h>
99
#include <AK/Function.h>
10-
#include <AK/Utf8View.h>
10+
#include <AK/Utf16View.h>
1111
#include <LibJS/Runtime/AbstractOperations.h>
1212
#include <LibJS/Runtime/Array.h>
1313
#include <LibJS/Runtime/Error.h>
@@ -16,6 +16,7 @@
1616
#include <LibJS/Runtime/RegExpObject.h>
1717
#include <LibJS/Runtime/RegExpPrototype.h>
1818
#include <LibJS/Runtime/RegExpStringIterator.h>
19+
#include <LibJS/Runtime/StringPrototype.h>
1920
#include <LibJS/Token.h>
2021

2122
namespace JS {
@@ -90,30 +91,25 @@ static String escape_regexp_pattern(const RegExpObject& regexp_object)
9091
}
9192

9293
// 22.2.5.2.3 AdvanceStringIndex ( S, index, unicode ), https://tc39.es/ecma262/#sec-advancestringindex
93-
size_t advance_string_index(String const& string, size_t index, bool unicode)
94+
size_t advance_string_index(Utf16View const& string, size_t index, bool unicode)
9495
{
9596
if (!unicode)
9697
return index + 1;
9798

98-
Utf8View view(string);
99-
100-
if (index + 1 >= view.length())
99+
if (index + 1 >= string.length_in_code_units())
101100
return index + 1;
102101

103-
auto it = view.begin();
104-
for (size_t i = 0; i < index; ++i)
105-
++it;
106-
107-
// See https://tc39.es/ecma262/#sec-codepointat for details on [[CodeUnitCount]].
108-
auto code_unit_count = 1;
109-
if (is_unicode_surrogate(*it)) {
110-
++it;
102+
auto code_point = code_point_at(string, index);
103+
return index + code_point.code_unit_count;
104+
}
111105

112-
if ((it != view.end()) && is_unicode_surrogate(*it))
113-
code_unit_count = 2;
114-
}
106+
// 22.2.5.2.3 AdvanceStringIndex ( S, index, unicode ), https://tc39.es/ecma262/#sec-advancestringindex
107+
size_t advance_string_index(String const& string, size_t index, bool unicode)
108+
{
109+
auto utf16_string = AK::utf8_to_utf16(string);
110+
Utf16View utf16_string_view { utf16_string };
115111

116-
return index + code_unit_count;
112+
return advance_string_index(utf16_string_view, index, unicode);
117113
}
118114

119115
static void increment_last_index(GlobalObject& global_object, Object& regexp_object, String const& string, bool unicode)
@@ -143,17 +139,17 @@ struct Match {
143139
};
144140

145141
// 1.1.4.1.4 GetMatchIndicesArray ( S, match ), https://tc39.es/proposal-regexp-match-indices/#sec-getmatchindicesarray
146-
static Value get_match_indices_array(GlobalObject& global_object, String const& string, Match const& match)
142+
static Value get_match_indices_array(GlobalObject& global_object, Utf16View const& string, Match const& match)
147143
{
148-
VERIFY(match.start_index <= string.length());
144+
VERIFY(match.start_index <= string.length_in_code_units());
149145
VERIFY(match.end_index >= match.start_index);
150-
VERIFY(match.end_index <= string.length());
146+
VERIFY(match.end_index <= string.length_in_code_units());
151147

152148
return Array::create_from(global_object, { Value(match.start_index), Value(match.end_index) });
153149
}
154150

155151
// 1.1.4.1.5 MakeIndicesArray ( S , indices, groupNames, hasGroups ), https://tc39.es/proposal-regexp-match-indices/#sec-makeindicesarray
156-
static Value make_indices_array(GlobalObject& global_object, String const& string, Vector<Optional<Match>> const& indices, HashMap<String, Match> const& group_names, bool has_groups)
152+
static Value make_indices_array(GlobalObject& global_object, Utf16View const& string, Vector<Optional<Match>> const& indices, HashMap<String, Match> const& group_names, bool has_groups)
157153
{
158154
// Note: This implementation differs from the spec, but has the same behavior.
159155
//
@@ -205,7 +201,7 @@ static Value make_indices_array(GlobalObject& global_object, String const& strin
205201
}
206202

207203
// 22.2.5.2.2 RegExpBuiltinExec ( R, S ), https://tc39.es/ecma262/#sec-regexpbuiltinexec
208-
static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& regexp_object, String const& string)
204+
static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& regexp_object, Utf16View const& string)
209205
{
210206
// FIXME: This should try using internal slots [[RegExpMatcher]], [[OriginalFlags]], etc.
211207
auto& vm = global_object.vm();
@@ -229,7 +225,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
229225
RegexResult result;
230226

231227
while (true) {
232-
if (last_index > string.length()) {
228+
if (last_index > string.length_in_code_units()) {
233229
if (global || sticky) {
234230
regexp_object.set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes);
235231
if (vm.exception())
@@ -239,13 +235,8 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
239235
return js_null();
240236
}
241237

242-
regex.start_offset = last_index;
243-
// FIXME: JavaScript strings are UTF-16, update this if the backing storage
244-
// encoding changes for spec compliance reasons.
245-
if (unicode)
246-
result = regex.match(Utf8View { string });
247-
else
248-
result = regex.match(string);
238+
regex.start_offset = unicode ? string.code_point_offset_of(last_index) : last_index;
239+
result = regex.match(string);
249240

250241
if (result.success)
251242
break;
@@ -262,12 +253,16 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
262253
}
263254

264255
auto& match = result.matches[0];
256+
auto match_index = match.global_offset;
265257

266258
// https://tc39.es/ecma262/#sec-notation:
267259
// The endIndex is one plus the index of the last input character matched so far by the pattern.
268-
auto end_index = match.global_offset + match.view.length();
260+
auto end_index = match_index + match.view.length();
269261

270-
// FIXME: Do code point index correction if the Unicode flag is set.
262+
if (unicode) {
263+
match_index = string.code_unit_offset_of(match.global_offset);
264+
end_index = string.code_unit_offset_of(end_index);
265+
}
271266

272267
if (global || sticky) {
273268
regexp_object.set(vm.names.lastIndex, Value(end_index), Object::ShouldThrowExceptions::Yes);
@@ -279,9 +274,9 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
279274
if (vm.exception())
280275
return {};
281276

282-
array->create_data_property_or_throw(vm.names.index, Value(match.global_offset));
277+
array->create_data_property_or_throw(vm.names.index, Value(match_index));
283278
array->create_data_property_or_throw(vm.names.input, js_string(vm, string));
284-
array->create_data_property_or_throw(0, js_string(vm, match.view.to_string()));
279+
array->create_data_property_or_throw(0, js_string(vm, match.view.u16_view()));
285280

286281
Vector<Optional<Match>> indices { Match::create(match) };
287282
HashMap<String, Match> group_names;
@@ -292,7 +287,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
292287
if (capture.view.is_null()) {
293288
indices.append({});
294289
} else {
295-
capture_value = js_string(vm, capture.view.to_string());
290+
capture_value = js_string(vm, capture.view.u16_view());
296291
indices.append(Match::create(capture));
297292
}
298293
array->create_data_property_or_throw(i + 1, capture_value);
@@ -305,7 +300,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
305300
auto groups_object = Object::create(global_object, nullptr);
306301

307302
for (auto& entry : result.named_capture_group_matches[0]) {
308-
groups_object->create_data_property_or_throw(entry.key, js_string(vm, entry.value.view.to_string()));
303+
groups_object->create_data_property_or_throw(entry.key, js_string(vm, entry.value.view.u16_view()));
309304
group_names.set(entry.key, Match::create(entry.value));
310305
}
311306

@@ -325,7 +320,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
325320
}
326321

327322
// 22.2.5.2.1 RegExpExec ( R, S ), https://tc39.es/ecma262/#sec-regexpexec
328-
Value regexp_exec(GlobalObject& global_object, Object& regexp_object, String const& string)
323+
Value regexp_exec(GlobalObject& global_object, Object& regexp_object, Utf16View const& string)
329324
{
330325
auto& vm = global_object.vm();
331326

@@ -352,6 +347,15 @@ Value regexp_exec(GlobalObject& global_object, Object& regexp_object, String con
352347
return regexp_builtin_exec(global_object, static_cast<RegExpObject&>(regexp_object), string);
353348
}
354349

350+
// 22.2.5.2.1 RegExpExec ( R, S ), https://tc39.es/ecma262/#sec-regexpexec
351+
Value regexp_exec(GlobalObject& global_object, Object& regexp_object, String const& string)
352+
{
353+
auto utf16_string = AK::utf8_to_utf16(string);
354+
Utf16View utf16_string_view { utf16_string };
355+
356+
return regexp_exec(global_object, regexp_object, utf16_string_view);
357+
}
358+
355359
// 1.1.4.3 get RegExp.prototype.hasIndices, https://tc39.es/proposal-regexp-match-indices/#sec-get-regexp.prototype.hasIndices
356360
// 22.2.5.3 get RegExp.prototype.dotAll, https://tc39.es/ecma262/#sec-get-regexp.prototype.dotAll
357361
// 22.2.5.5 get RegExp.prototype.global, https://tc39.es/ecma262/#sec-get-regexp.prototype.global
@@ -424,11 +428,12 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::exec)
424428
if (!regexp_object)
425429
return {};
426430

427-
auto string = vm.argument(0).to_string(global_object);
431+
auto string = vm.argument(0).to_utf16_string(global_object);
428432
if (vm.exception())
429433
return {};
434+
Utf16View string_view { string };
430435

431-
return regexp_builtin_exec(global_object, *regexp_object, string);
436+
return regexp_builtin_exec(global_object, *regexp_object, string_view);
432437
}
433438

434439
// 22.2.5.15 RegExp.prototype.test ( S ), https://tc39.es/ecma262/#sec-regexp.prototype.test
@@ -438,11 +443,12 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::test)
438443
if (!regexp_object)
439444
return {};
440445

441-
auto str = vm.argument(0).to_string(global_object);
446+
auto string = vm.argument(0).to_utf16_string(global_object);
442447
if (vm.exception())
443448
return {};
449+
Utf16View string_view { string };
444450

445-
auto match = regexp_exec(global_object, *regexp_object, str);
451+
auto match = regexp_exec(global_object, *regexp_object, string_view);
446452
if (vm.exception())
447453
return {};
448454

Userland/Libraries/LibJS/Runtime/RegExpPrototype.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,9 @@
1111
namespace JS {
1212

1313
Value regexp_exec(GlobalObject& global_object, Object& regexp_object, String const& string);
14+
Value regexp_exec(GlobalObject& global_object, Object& regexp_object, Utf16View const& string);
1415
size_t advance_string_index(String const& string, size_t index, bool unicode);
16+
size_t advance_string_index(Utf16View const& string, size_t index, bool unicode);
1517

1618
class RegExpPrototype final : public Object {
1719
JS_OBJECT(RegExpPrototype, Object);

0 commit comments

Comments
 (0)