Skip to content

Commit a3c622c

Browse files
committed
regexec.c - incredibly inefficient solution to backref problem
Backrefs to unclosed parens inside of a quantified group were not being properly handled, which revealed we are not unrolling the paren state properly on failure and backtracking. Much of the code assumes that when we execute a "conditional" operation (where more than one thing could match) that we need not concern ourself with the paren state unless the conditional operation itself represents a paren, and that generally opcodes only needed to concern themselves with parens to their right. When you exclude backrefs from the equation this is broadly reasonable (i think), as on failure we typically dont care about the state of the paren buffers. They either get reset as we find a new different accepting pathway, or their state is irrelevant if the overal match is rejected (eg it fails). However backreferences are different. Consider the following pattern from the tests "xa=xaaa" =~ /^(xa|=?\1a){2}\z/ in the first iteration through this the first branch matches, and in fact because the \1 is in the second branch it can't match on the first iteration at all. After this $1 = "xa". We then perform the second iteration. "xa" does not match "=xaaa" so we fall to the second branch. The '=?' matches, but sets up a backtracking action to not match if the rest of the pattern does not match. \1 matches 'xa', and then the 'a' matches, leaving an unmatched 'a' in the string, we exit the quantifier loop with $1 = "=xaa" and match \z against the remaining "a" in the pattern, and fail. Here is where things go wrong in the old code, we unwind to the outer loop, but we do not unwind the paren state. We then unwind further into the 2nds iteration of the loop, to the '=?' where we then try to match the tail with the quantifier matching the empty string. We then match the old $1 (which was not unwound) as "=xaa", and then the "a" matches, and we are the end of the string and we have incorrectly accpeted this string as matching the pattern. What should have happend was when the \1 was resolved the second time it should have returned the same string as it did when the =? matched '=', which then would have resulted in the tail matching again, and etc, eventually unwinding the entire pattern when the second iteration failed entirely. This patch is very crude. It simple pushes the state of the parens and creates and unwind point for every case where we do a transition to a B or _next operation, and we make the corresponding _next_fail do the appropriate unwinding. The objective was to achieve correctness and then work towards making it more efficient. We almost certainly overstore items on the stack. In a future patch we can perhaps keep track of the unclosed parens before the relevant operators and make sure that they are properly pushed and unwound at the correct times.
1 parent a164ff8 commit a3c622c

File tree

3 files changed

+60
-2
lines changed

3 files changed

+60
-2
lines changed

regexec.c

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,13 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
290290
SSPUSHINT(rex->lastcloseparen);
291291
SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
292292

293+
294+
DEBUG_BUFFERS_r({
295+
Perl_re_exec_indentf(aTHX_
296+
"finished regcppush returning %" IVdf " cur: %" IVdf "\n",
297+
depth, retval, PL_savestack_ix);
298+
});
299+
293300
return retval;
294301
}
295302

@@ -421,6 +428,13 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
421428

422429
PERL_ARGS_ASSERT_REGCPPOP;
423430

431+
432+
DEBUG_BUFFERS_r({
433+
Perl_re_exec_indentf(aTHX_
434+
"starting regcppop at %" IVdf "\n",
435+
depth, PL_savestack_ix);
436+
});
437+
424438
/* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
425439
i = SSPOPUV;
426440
assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
@@ -496,6 +510,11 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
496510
));
497511
}
498512
#endif
513+
DEBUG_BUFFERS_r({
514+
Perl_re_exec_indentf(aTHX_
515+
"finished regcppop at %" IVdf "\n",
516+
depth, PL_savestack_ix);
517+
});
499518
}
500519

501520
/* restore the parens and associated vars at savestack position ix,
@@ -6865,6 +6884,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
68656884
case TRIE_next_fail: /* we failed - try next alternative */
68666885
{
68676886
U8 *uc;
6887+
REGCP_UNWIND(ST.lastcp);
6888+
regcppop(rex,&maxopenparen);
68686889
if ( ST.jump ) {
68696890
/* undo any captures done in the tail part of a branch,
68706891
* e.g.
@@ -6987,6 +7008,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
69877008
});
69887009

69897010
if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
7011+
(void)regcppush(rex, 0, maxopenparen);
7012+
REGCP_SET(ST.lastcp);
69907013
PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
69917014
script_run_begin);
69927015
NOT_REACHED; /* NOTREACHED */
@@ -9068,6 +9091,8 @@ NULL
90689091
ST.lastcloseparen = rex->lastcloseparen;
90699092
ST.next_branch = next;
90709093
REGCP_SET(ST.cp);
9094+
regcppush(rex, 0, maxopenparen);
9095+
REGCP_SET(ST.lastcp);
90719096

90729097
/* Now go into the branch */
90739098
if (has_cutgroup) {
@@ -9104,6 +9129,8 @@ NULL
91049129
do_cutgroup = 0;
91059130
no_final = 0;
91069131
}
9132+
REGCP_UNWIND(ST.lastcp);
9133+
regcppop(rex,&maxopenparen);
91079134
REGCP_UNWIND(ST.cp);
91089135
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
91099136
CAPTURE_CLEAR(ST.before_paren+1,ST.after_paren,"BRANCH_next_fail");
@@ -9468,7 +9495,8 @@ NULL
94689495

94699496
case CURLY_B_min_fail:
94709497
/* failed to find B in a non-greedy match. */
9471-
9498+
REGCP_UNWIND(ST.lastcp);
9499+
regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
94729500
REGCP_UNWIND(ST.cp);
94739501
if (ST.paren) {
94749502
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
@@ -9581,6 +9609,8 @@ NULL
95819609
}
95829610

95839611
curly_try_B_min:
9612+
(void)regcppush(rex, 0, maxopenparen);
9613+
REGCP_SET(ST.lastcp);
95849614
CURLY_SETPAREN(ST.paren, ST.count);
95859615
PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
95869616
script_run_begin);
@@ -9594,16 +9624,22 @@ NULL
95949624
&& locinput + ST.Binfo.min_length <= loceol
95959625
&& S_test_EXACTISH_ST(locinput, ST.Binfo)))
95969626
{
9627+
(void)regcppush(rex, 0, maxopenparen);
9628+
REGCP_SET(ST.lastcp);
95979629
CURLY_SETPAREN(ST.paren, ST.count);
95989630
PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
95999631
script_run_begin);
96009632
NOT_REACHED; /* NOTREACHED */
96019633
}
9602-
/* FALLTHROUGH */
9634+
goto CURLY_B_all_failed;
9635+
NOT_REACHED; /* NOTREACHED */
96039636

96049637
case CURLY_B_max_fail:
96059638
/* failed to find B in a greedy match */
96069639

9640+
REGCP_UNWIND(ST.lastcp);
9641+
regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
9642+
CURLY_B_all_failed:
96079643
REGCP_UNWIND(ST.cp);
96089644
if (ST.paren) {
96099645
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);

regexp.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,7 @@ typedef struct regmatch_state {
840840
U32 lastparen;
841841
U32 lastcloseparen;
842842
CHECKPOINT cp;
843+
CHECKPOINT lastcp;
843844
U16 before_paren;
844845
U16 after_paren;
845846

@@ -851,6 +852,7 @@ typedef struct regmatch_state {
851852
U32 lastparen;
852853
U32 lastcloseparen;
853854
CHECKPOINT cp;
855+
CHECKPOINT lastcp;
854856
U16 before_paren;
855857
U16 after_paren;
856858

@@ -863,6 +865,7 @@ typedef struct regmatch_state {
863865
U32 lastparen;
864866
U32 lastcloseparen;
865867
CHECKPOINT cp;
868+
CHECKPOINT lastcp;
866869
U16 before_paren;
867870
U16 after_paren;
868871

@@ -926,6 +929,7 @@ typedef struct regmatch_state {
926929
regnode *me; /* the CURLYX node */
927930
regnode *B; /* the B node in /A*B/ */
928931
CHECKPOINT cp; /* remember current savestack index */
932+
CHECKPOINT lastcp; /* remember current savestack index */
929933
bool minmod;
930934
int parenfloor;/* how far back to strip paren data */
931935

@@ -949,6 +953,7 @@ typedef struct regmatch_state {
949953
/* this first element must match u.yes */
950954
struct regmatch_state *prev_yes_state;
951955
CHECKPOINT cp;
956+
CHECKPOINT lastcp; /* remember current savestack index */
952957
U32 lastparen;
953958
U32 lastcloseparen;
954959
I32 alen; /* length of first-matched A string */
@@ -962,6 +967,7 @@ typedef struct regmatch_state {
962967
struct {
963968
U32 paren;
964969
CHECKPOINT cp;
970+
CHECKPOINT lastcp; /* remember current savestack index */
965971
U32 lastparen;
966972
U32 lastcloseparen;
967973
char *maxpos; /* highest possible point in string to match */

t/re/re_tests

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2132,6 +2132,22 @@ AB\s+\x{100} AB \x{100}X y - -
21322132
/(([ab]+)|([cd]+)|([ef]+))+/ acebd y $1-$2-$3-$4=$& d--d-=acebd
21332133
/(([ab]+)|([cd]+)|([ef]+))+/ acebdf y $1-$2-$3-$4=$& f---f=acebdf
21342134
/((a)(b)(c)|(a)(b)|(a))+/ abcaba y $1+$2-$3-$4+$5-$6+$7=$& a+--+-+a=abcaba
2135+
2136+
/^(xa|=?\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2137+
/^(xa|(?:=|zzzz|)\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2138+
/^(xa|(?:=|zzzz)?\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2139+
/^(xa|[Z=]?\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2140+
/^(xa|([Z=])?\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2141+
/^(xa|([Z=]|zzzz)?\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2142+
/^(xa|(=|)\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2143+
/^(xa|(?:[Z=])?\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2144+
/^(xa|(?:[Z=]|zzzz)?\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2145+
/^(xa|(?:=|)\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2146+
/^(xa|=*\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2147+
/^(xa|[Z=]*\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2148+
/^(xa|(?:[Z=])*\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2149+
/^(xa|(?:[Z=]|zzzz)*\1a){2}$/ xa=xaaa n - - # GH 10073 - RT72020
2150+
21352151
# Keep these lines at the end of the file
21362152
# pat string y/n/etc expr expected-expr skip-reason comment
21372153
# vim: softtabstop=0 noexpandtab

0 commit comments

Comments
 (0)