Skip to content

Commit e265d81

Browse files
alimpfard authored and awesomekling committed
LibRegex: Correct And/Or and inversion interplay semantics
This commit also fixes an incorrect test case from very early on; our behaviour now matches the ECMA262 spec in this case. Fixes #21786.
1 parent 8931578 commit e265d81

File tree

3 files changed

+29
-8
lines changed

3 files changed

+29
-8
lines changed

Tests/LibRegex/Regex.cpp

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -648,7 +648,7 @@ TEST_CASE(ECMA262_match)
648648
{ "^[\\0-\\x1f]$"sv, "\n"sv },
649649
{ .pattern = "\\bhello\\B"sv, .subject = "hello1"sv, .options = ECMAScriptFlags::Global },
650650
{ "\\b.*\\b"sv, "hello1"sv },
651-
{ "[^\\D\\S]{2}"sv, "1 "sv },
651+
{ "[^\\D\\S]{2}"sv, "1 "sv, false },
652652
{ "bar(?=f.)foo"sv, "barfoo"sv },
653653
{ "bar(?=foo)bar"sv, "barbar"sv, false },
654654
{ "bar(?!foo)bar"sv, "barbar"sv, true },
@@ -1174,6 +1174,14 @@ TEST_CASE(inversion_state_in_char_class)
11741174
EXPECT_EQ(result.capture_group_matches.first()[0].view.to_byte_string(), "slideNumbers"sv);
11751175
EXPECT_EQ(result.capture_group_matches.first()[1].view.to_byte_string(), "}"sv);
11761176
}
1177+
{
1178+
// #21786, /[^\S\n]/.exec("\n") should be null, not [ "\n" ].
1179+
// This was a general confusion between the inversion state and the negation state (temp inverse).
1180+
Regex<ECMA262> re("[^\\S\\n]", ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch);
1181+
1182+
auto result = re.match("\n"sv);
1183+
EXPECT_EQ(result.success, false);
1184+
}
11771185
}
11781186

11791187
TEST_CASE(mismatching_brackets)

Userland/Libraries/LibRegex/RegexByteCode.cpp

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -426,6 +426,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
426426
bool active { false };
427427
bool is_conjunction { false };
428428
bool fail { false };
429+
bool inverse_matched { false };
429430
size_t initial_position;
430431
size_t initial_code_unit_position;
431432
Optional<size_t> last_accepted_position {};
@@ -623,17 +624,19 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
623624
case CharacterCompareType::And:
624625
disjunction_states.append({
625626
.active = true,
626-
.is_conjunction = false,
627-
.fail = false,
627+
.is_conjunction = current_inversion_state(),
628+
.fail = current_inversion_state(),
629+
.inverse_matched = current_inversion_state(),
628630
.initial_position = state.string_position,
629631
.initial_code_unit_position = state.string_position_in_code_units,
630632
});
631633
continue;
632634
case CharacterCompareType::Or:
633635
disjunction_states.append({
634636
.active = true,
635-
.is_conjunction = true,
636-
.fail = true,
637+
.is_conjunction = !current_inversion_state(),
638+
.fail = !current_inversion_state(),
639+
.inverse_matched = !current_inversion_state(),
637640
.initial_position = state.string_position,
638641
.initial_code_unit_position = state.string_position_in_code_units,
639642
});
@@ -644,6 +647,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
644647
state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position);
645648
state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position);
646649
}
650+
inverse_matched = disjunction_state.inverse_matched || disjunction_state.fail;
647651
break;
648652
}
649653
default:
@@ -664,6 +668,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
664668
if (!failed) {
665669
new_disjunction_state.last_accepted_position = state.string_position;
666670
new_disjunction_state.last_accepted_code_unit_position = state.string_position_in_code_units;
671+
new_disjunction_state.inverse_matched |= inverse_matched;
667672
}
668673

669674
if (new_disjunction_state.is_conjunction)
@@ -673,6 +678,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
673678

674679
state.string_position = new_disjunction_state.initial_position;
675680
state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position;
681+
inverse_matched = false;
676682
}
677683
}
678684

Userland/Libraries/LibRegex/RegexParser.cpp

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1777,10 +1777,12 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_
17771777

17781778
Vector<CompareTypeAndValuePair> compares;
17791779

1780+
auto uses_explicit_or_semantics = false;
17801781
if (match(TokenType::Circumflex)) {
17811782
// Negated charclass
17821783
consume();
17831784
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
1785+
uses_explicit_or_semantics = true;
17841786
}
17851787

17861788
// ClassContents :: [empty]
@@ -1800,6 +1802,11 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_
18001802
if (flags.unicode_sets && !parse_class_set_expression(compares))
18011803
return false;
18021804

1805+
if (uses_explicit_or_semantics && compares.size() > 2) {
1806+
compares.insert(1, CompareTypeAndValuePair { CharacterCompareType::Or, 0 });
1807+
compares.empend(CompareTypeAndValuePair { CharacterCompareType::EndAndOr, 0 });
1808+
}
1809+
18031810
match_length_minimum += 1;
18041811
stack.insert_bytecode_compare_values(move(compares));
18051812
return true;
@@ -2466,9 +2473,9 @@ DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starti
24662473
{
24672474
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
24682475
static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
2469-
static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD;
2470-
constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C };
2471-
constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
2476+
static constexpr u32 const REPLACEMENT_CHARACTER = 0xFFFD;
2477+
constexpr u32 const ZERO_WIDTH_NON_JOINER { 0x200C };
2478+
constexpr u32 const ZERO_WIDTH_JOINER { 0x200D };
24722479

24732480
if (take_starting_angle_bracket && !consume("<"))
24742481
return {};

0 commit comments

Comments
 (0)