Skip to content

Commit

Permalink
REGRESSION(257823@main): named-groups/lookbehind.js Test262-test is f…
Browse files Browse the repository at this point in the history
…ailing

https://bugs.webkit.org/show_bug.cgi?id=249330
rdar://103367993

Reviewed by Yusuke Suzuki.

Fixed the case where a nested capturing group within a lookahead / lookbehind doesn't get its capture cleared if
the lookaround fails.  This has been a long standing issue for lookaheads.

The fix is slightly different in the Yarr interpreter than in the Yarr JIT.  For the interpreter, the Parenthetical
Assertion processing will clear the nested captures within the assertion for the case where an inverted assertion
succeeds or a normal assertion fails.  For the JIT, the clearing is done at the end of a failed top level alternative
for all the contained captures for that alternative.

* JSTests/stress/regexp-lookaround-captures.js: Added.
(arrayToString):
(dumpValue):
(compareArray):
(testRegExp):
(testRegExp.c):
* Source/JavaScriptCore/yarr/YarrInterpreter.cpp:
(JSC::Yarr::Interpreter::ParenthesesDisjunctionContext::ParenthesesDisjunctionContext):
(JSC::Yarr::Interpreter::matchBackReference):
(JSC::Yarr::Interpreter::backtrackBackReference):
(JSC::Yarr::Interpreter::recordParenthesesMatch):
(JSC::Yarr::Interpreter::resetMatches):
(JSC::Yarr::Interpreter::matchParenthesesOnceBegin):
(JSC::Yarr::Interpreter::matchParenthesesOnceEnd):
(JSC::Yarr::Interpreter::backtrackParenthesesOnceBegin):
(JSC::Yarr::Interpreter::backtrackParenthesesOnceEnd):
(JSC::Yarr::Interpreter::matchParentheticalAssertionEnd):
(JSC::Yarr::Interpreter::backtrackParentheticalAssertionEnd):
(JSC::Yarr::Interpreter::matchDisjunction):
(JSC::Yarr::ByteCompiler::atomParentheticalAssertionEnd):
(JSC::Yarr::ByteCompiler::atomParenthesesSubpatternEnd):
(JSC::Yarr::ByteCompiler::atomParenthesesOnceEnd):
(JSC::Yarr::ByteCompiler::atomParenthesesTerminalEnd):
(JSC::Yarr::ByteCompiler::emitDisjunction):
(JSC::Yarr::ByteTermDumper::dumpTerm):
* Source/JavaScriptCore/yarr/YarrInterpreter.h:
(JSC::Yarr::ByteTerm::ByteTerm):
(JSC::Yarr::ByteTerm::containsAnyCaptures):
(JSC::Yarr::ByteTerm::subpatternId):
(JSC::Yarr::ByteTerm::lastSubpatternId):
* Source/JavaScriptCore/yarr/YarrJIT.cpp:
* Source/JavaScriptCore/yarr/YarrParser.h:
(JSC::Yarr::Parser::parseTokens):
* Source/JavaScriptCore/yarr/YarrPattern.cpp:
(JSC::Yarr::YarrPatternConstructor::atomParenthesesSubpatternBegin):
(JSC::Yarr::YarrPatternConstructor::atomParentheticalAssertionBegin):
(JSC::Yarr::YarrPatternConstructor::copyDisjunction):
(JSC::Yarr::YarrPatternConstructor::disjunction):
* Source/JavaScriptCore/yarr/YarrPattern.h:
(JSC::Yarr::PatternTerm::containsAnyCaptures):
(JSC::Yarr::PatternAlternative::PatternAlternative):
(JSC::Yarr::PatternAlternative::cleanupCaptures const):
(JSC::Yarr::PatternDisjunction::addNewAlternative):
* Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp:
(JSC::Yarr::SyntaxChecker::disjunction):
* Source/WebCore/contentextensions/URLFilterParser.cpp:
(WebCore::ContentExtensions::PatternParser::disjunction):

Canonical link: https://commits.webkit.org/258195@main
  • Loading branch information
msaboff committed Dec 21, 2022
1 parent fec9dea commit 6204c47
Show file tree
Hide file tree
Showing 9 changed files with 206 additions and 45 deletions.
107 changes: 107 additions & 0 deletions JSTests/stress/regexp-lookaround-captures.js
@@ -0,0 +1,107 @@
// With verbose set to false, this test is successful if there is no output. Set verbose to true to see expected matches.
let verbose = false;

function arrayToString(arr)
{
let str = '';
arr.forEach(function(v, index) {
if (typeof v == "string")
str += "\"" + v + "\"";
else
str += v;

if (index != (arr.length - 1)) {
str += ',';
};
});
return "[" + str + "]";
}

function dumpValue(v)
{
if (v === null)
return "<null>";

if (v === undefined)
return "<undefined>";

if (typeof v == "string")
return "\"" + v + "\"";

if (v.length)
return arrayToString(v);

return v;
}

function compareArray(a, b)
{
if (a === null && b === null)
return true;

if (a === null) {
print("### a is null, b is not null");
return false;
}

if (b === null) {
print("### a is not null, b is null");
return false;
}

if (a.length !== b.length) {
print("### a.length: " + a.length + ", b.length: " + b.length);
return false;
}

for (var i = 0; i < a.length; i++) {
if (a[i] !== b[i]) {
print("### a[" + i + "]: \"" + a[i] + "\" !== b[" + i + "]: \"" + b[i] + "\"");
return false;
}
}

return true;
}

let testNumber = 0;

function testRegExp(re, str, exp)
{
testNumber++;

let actual = str.match(re);

if (compareArray(exp, actual)) {
if (verbose)
print(dumpValue(str) +".match(" + re.toString() + "), passed ", dumpValue(exp));
} else
print(dumpValue(str) +".match(" + re.toString() + "), FAILED test #" + testNumber + ", Expected ", dumpValue(exp), " got ", dumpValue(actual));
}

// Test 1
testRegExp(/c(?!(\D))|c/u, "abcdef", ["c", undefined]);
testRegExp(/c(?!(\D){3})|c/u, "abcdef", ["c", undefined]);
testRegExp(/c(?=(de)x)|c/u, "abcdef", ["c", undefined]);
testRegExp(/c(?=(def))x|c/u, "abcdef", ["c", undefined]);
testRegExp(/c(?=(def))x|c(?!(def))|c/, "abcdef", ["c", undefined, undefined]);

// Test 6
testRegExp(/(?<!(\D{3}))f|f/u, "abcdef", ["f", undefined]);
testRegExp(/(?<!(\D{3}))f/, "abcdef", null);
testRegExp(/(?<!(\D))f/u, "abcdef", null);
testRegExp(/(?<!(\D){3})f/u, "abcdef", null);
testRegExp(/(?<!(\D){3})f|f/u, "abcdef", ["f", undefined]);

// Test 11
testRegExp(/(?<=(\w){6})f/, "abcdef", null);
testRegExp(/f(?=(\w{6})})/, "abcdef", null);
testRegExp(/((?<!\D{3}))f|f/u, "abcdef", ["f", undefined]);
testRegExp(/(?<!(\D){3})f/, "abcdef", null);
testRegExp(/(?<!(\d){3})f/, "abcdef", ["f", undefined]);

// Test 16
testRegExp(/(?<!(\D){3})f/, "abcdef", null);
testRegExp(/(?<!(\D){3})f|f/, "abcdef", ["f", undefined]);
testRegExp(/((?<!\D{3}))f|f/, "abcdef", ["f", undefined]);
testRegExp(/(?<!(\w{3}))f(?=(\w{3}))|(?<=(\w+?))c(?=(\w{2}))|(?<=(\w{4}))c(?=(\w{3})$)/, "abcdef", ["c",undefined,undefined,"b","de",undefined,undefined]);
57 changes: 34 additions & 23 deletions Source/JavaScriptCore/yarr/YarrInterpreter.cpp
Expand Up @@ -134,7 +134,7 @@ class Interpreter {
{
ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
{
unsigned firstSubpatternId = term.atom.subpatternId;
unsigned firstSubpatternId = term.subpatternId();
unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;

for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
Expand Down Expand Up @@ -893,8 +893,8 @@ class Interpreter {
ASSERT(term.type == ByteTerm::Type::BackReference);
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);

unsigned matchBegin = output[(term.atom.subpatternId << 1)];
unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
unsigned matchBegin = output[(term.subpatternId() << 1)];
unsigned matchEnd = output[(term.subpatternId() << 1) + 1];

// If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
// In this case the result of match is empty string like when it references to a parentheses with zero-width match.
Expand Down Expand Up @@ -945,8 +945,8 @@ class Interpreter {
ASSERT(term.type == ByteTerm::Type::BackReference);
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);

unsigned matchBegin = output[(term.atom.subpatternId << 1)];
unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
unsigned matchBegin = output[(term.subpatternId() << 1)];
unsigned matchEnd = output[(term.subpatternId() << 1) + 1];

if (matchBegin == offsetNoMatch)
return false;
Expand Down Expand Up @@ -985,15 +985,15 @@ class Interpreter {
void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
{
if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
unsigned subpatternId = term.subpatternId();
// For Backward matches, the captured indexes where recorded end then start.
output[(subpatternId << 1) + term.matchDirection()] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition;
output[(subpatternId << 1) + 1 - term.matchDirection()] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition;
}
}
void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
{
unsigned firstSubpatternId = term.atom.subpatternId;
unsigned firstSubpatternId = term.subpatternId();
unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
context->restoreOutput(output, firstSubpatternId, count);
}
Expand Down Expand Up @@ -1040,7 +1040,7 @@ class Interpreter {
}

if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
unsigned subpatternId = term.subpatternId();
// For Backward matches, the captured indexes where recorded end then start.
output[(subpatternId << 1) + term.matchDirection()] = input.getPos() - term.inputPosition;
}
Expand All @@ -1054,7 +1054,7 @@ class Interpreter {
ASSERT(term.atom.quantityMaxCount == 1);

if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
unsigned subpatternId = term.subpatternId();
// For Backward matches, the captured indexes where recorded end then start.
output[(subpatternId << 1) + 1 - term.matchDirection()] = input.getPos() - term.inputPosition;
}
Expand All @@ -1074,7 +1074,7 @@ class Interpreter {
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);

if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
unsigned subpatternId = term.subpatternId();
output[(subpatternId << 1)] = offsetNoMatch;
output[(subpatternId << 1) + 1] = offsetNoMatch;
}
Expand Down Expand Up @@ -1119,7 +1119,7 @@ class Interpreter {
// the same anyway! (We don't pre-check for greedy or non-greedy matches.)
ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::Type::ParenthesesSubpatternOnceBegin);
ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
unsigned subpatternId = term.atom.subpatternId;
unsigned subpatternId = term.subpatternId();
// For Backward matches, the captured indexes where recorded end then start.
output[(subpatternId << 1) + term.matchDirection()] = input.getPos() - term.inputPosition;
}
Expand Down Expand Up @@ -1204,6 +1204,10 @@ class Interpreter {

// We've reached the end of the parens; if they are inverted, this is failure.
if (term.invert()) {
if (term.containsAnyCaptures()) {
for (unsigned subpattern = term.subpatternId(); subpattern <= term.lastSubpatternId(); subpattern++)
output[(subpattern << 1)] = offsetNoMatch;
}
context->term -= term.atom.parenthesesWidth;
return false;
}
Expand Down Expand Up @@ -1239,6 +1243,11 @@ class Interpreter {

input.setPos(backTrack->begin);

if (term.containsAnyCaptures()) {
for (unsigned subpattern = term.subpatternId(); subpattern <= term.lastSubpatternId(); subpattern++)
output[(subpattern << 1)] = offsetNoMatch;
}

context->term -= term.atom.parenthesesWidth;
return false;
}
Expand Down Expand Up @@ -1581,10 +1590,10 @@ class Interpreter {

switch (currentTerm().type) {
case ByteTerm::Type::SubpatternBegin:
DUMP_EXTRA_IF(currentTerm().capture(), "id:", currentTerm().atom.subpatternId);
DUMP_EXTRA_IF(currentTerm().capture(), "id:", currentTerm().subpatternId());
MATCH_NEXT();
case ByteTerm::Type::SubpatternEnd:
DUMP_EXTRA_IF(currentTerm().capture(), "id:", currentTerm().atom.subpatternId, " - Return Match\n");
DUMP_EXTRA_IF(currentTerm().capture(), "id:", currentTerm().subpatternId(), " - Return Match\n");
context->matchEnd = input.getPos();
return JSRegExpResult::Match;

Expand Down Expand Up @@ -1884,7 +1893,7 @@ class Interpreter {

switch (currentTerm().type) {
case ByteTerm::Type::SubpatternBegin:
DUMP_EXTRA("id:", currentTerm().atom.subpatternId, " - Return NoMatch\n");
DUMP_EXTRA("id:", currentTerm().subpatternId(), " - Return NoMatch\n");
return JSRegExpResult::NoMatch;
case ByteTerm::Type::SubpatternEnd:
RELEASE_ASSERT_NOT_REACHED();
Expand Down Expand Up @@ -2256,7 +2265,7 @@ class ByteCompiler {
m_currentAlternativeIndex = beginTerm + 1;
}

void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned lastSubpatternId, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
{
unsigned beginTerm = popParenthesesStack();
closeAlternative(beginTerm + 1);
Expand All @@ -2266,11 +2275,13 @@ class ByteCompiler {

bool invert = m_bodyDisjunction->terms[beginTerm].invert();
MatchDirection matchDirection = m_bodyDisjunction->terms[beginTerm].matchDirection();
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].subpatternId();

m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParentheticalAssertionEnd, subpatternId, false, invert, matchDirection, inputPosition));
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[beginTerm].atom.ids.lastSubpatternId = lastSubpatternId;
m_bodyDisjunction->terms[endTerm].atom.ids.lastSubpatternId = lastSubpatternId;
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;

m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount;
Expand Down Expand Up @@ -2356,7 +2367,7 @@ class ByteCompiler {

auto parenthesesMatchDirection = parenthesesBegin.matchDirection();
bool capture = parenthesesBegin.capture();
unsigned subpatternId = parenthesesBegin.atom.subpatternId;
unsigned subpatternId = parenthesesBegin.subpatternId();

unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
auto parenthesesDisjunction = makeUnique<ByteDisjunction>(numSubpatterns, callFrameSize);
Expand Down Expand Up @@ -2390,7 +2401,7 @@ class ByteCompiler {
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::ParenthesesSubpatternOnceBegin);

bool capture = m_bodyDisjunction->terms[beginTerm].capture();
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].subpatternId();

m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
if (m_bodyDisjunction->terms[beginTerm].matchDirection() == Backward) {
Expand Down Expand Up @@ -2422,7 +2433,7 @@ class ByteCompiler {
if (m_bodyDisjunction->terms[beginTerm].matchDirection() == Backward)
inputPosition = 0;
bool capture = m_bodyDisjunction->terms[beginTerm].capture();
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].subpatternId();

m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
Expand Down Expand Up @@ -2592,7 +2603,7 @@ class ByteCompiler {
atomParentheticalAssertionBegin(term.parentheses.subpatternId, 0, term.invert(), term.matchDirection(), term.frameLocation, alternativeFrameLocation);
if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount, term.matchDirection()))
return error;
atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType);
atomParentheticalAssertionEnd(0, term.parentheses.lastSubpatternId, term.frameLocation, term.quantityMaxCount, term.quantityType);
if (uncheckAmount) {
checkInput(uncheckAmount);
currentCountAlreadyChecked += uncheckAmount;
Expand Down Expand Up @@ -2620,7 +2631,7 @@ class ByteCompiler {

if (auto error = emitDisjunction(term.parentheses.disjunction, checkedCountForLookbehind, positiveInputOffset + minimumSize, term.matchDirection()))
return error;
atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType);
atomParentheticalAssertionEnd(0, term.parentheses.lastSubpatternId, term.frameLocation, term.quantityMaxCount, term.quantityType);

if (uncheckAmount) {
checkInput(uncheckAmount);
Expand Down Expand Up @@ -2692,7 +2703,7 @@ void ByteTermDumper::dumpTerm(size_t idx, ByteTerm term)

auto dumpCaptured = [&](ByteTerm& term) {
if (term.capture())
out.print(" captured (#", term.atom.subpatternId, ")");
out.print(" captured (#", term.subpatternId(), ")");
};

auto dumpInverted = [&](ByteTerm& term) {
Expand Down Expand Up @@ -2854,7 +2865,7 @@ void ByteTermDumper::dumpTerm(size_t idx, ByteTerm term)
break;
case ByteTerm::Type::BackReference:
outputTermIndexAndNest(idx, m_nesting);
out.print("BackReference #", term.atom.subpatternId);
out.print("BackReference #", term.subpatternId());
dumpInputPosition(term);
dumpQuantity(term);
break;
Expand Down
27 changes: 23 additions & 4 deletions Source/JavaScriptCore/yarr/YarrInterpreter.h
Expand Up @@ -49,7 +49,10 @@ struct ByteTerm {
UChar32 hi;
} casedCharacter;
CharacterClass* characterClass;
unsigned subpatternId;
struct {
unsigned subpatternId;
unsigned lastSubpatternId;
} ids;
};
union {
ByteDisjunction* parenthesesDisjunction;
Expand Down Expand Up @@ -188,7 +191,7 @@ struct ByteTerm {
, m_matchDirection(Forward)
, inputPosition(inputPos)
{
atom.subpatternId = subpatternId;
atom.ids.subpatternId = subpatternId;
atom.parenthesesDisjunction = parenthesesInfo;
atom.quantityType = QuantifierType::FixedCount;
atom.quantityMinCount = 1;
Expand All @@ -213,7 +216,7 @@ struct ByteTerm {
, m_matchDirection(Forward)
, inputPosition(inputPos)
{
atom.subpatternId = subpatternId;
atom.ids.subpatternId = subpatternId;
atom.quantityType = QuantifierType::FixedCount;
atom.quantityMinCount = 1;
atom.quantityMaxCount = 1;
Expand All @@ -226,7 +229,7 @@ struct ByteTerm {
, m_matchDirection(matchDirection)
, inputPosition(inputPos)
{
atom.subpatternId = subpatternId;
atom.ids.subpatternId = subpatternId;
atom.quantityType = QuantifierType::FixedCount;
atom.quantityMinCount = 1;
atom.quantityMaxCount = 1;
Expand Down Expand Up @@ -367,6 +370,22 @@ struct ByteTerm {
return type == Type::CharacterClass;
}

bool containsAnyCaptures()
{
ASSERT(this->type == Type::ParentheticalAssertionBegin
|| this->type == Type::ParentheticalAssertionEnd);
return lastSubpatternId() >= subpatternId();
}

unsigned subpatternId()
{
return atom.ids.subpatternId;
}

unsigned lastSubpatternId()
{
return atom.ids.lastSubpatternId;
}
bool invert()
{
return m_invert;
Expand Down

0 comments on commit 6204c47

Please sign in to comment.