Skip to content

Commit

Permalink
regexec.c - teach BRANCH and BRANCHJ nodes to reset capture buffers
Browse files Browse the repository at this point in the history
In /((a)(b)|(a))+/ we should not end up with $2 and $4 being set at
the same time. When a branch fails, it should reset any capture buffers
that it might have set.

We change BRANCH and BRANCHJ to store the number of parens seen before
the branch, and the number of parens seen once the branch has been
compiled. When a BRANCH operation fails, we clear the capture buffers it
contains before we continue on to the next alternative.

It is a bit more complex than it should be because we have BRANCHJ
and BRANCH. (One of these days we should merge them together.)

This is also made somewhat more complex because TRIE nodes are actually
branches, and may also need to track capture buffers, at two levels:
for the overall TRIE op, and, for jump tries especially, for each word,
where we emulate the behavior of branches. So we have to apply the same
clearing logic when a trie branch fails as well.
  • Loading branch information
demerphq committed Jan 19, 2023
1 parent 2b97a35 commit a5166c5
Show file tree
Hide file tree
Showing 10 changed files with 156 additions and 21 deletions.
4 changes: 2 additions & 2 deletions pod/perldebguts.pod
Expand Up @@ -668,7 +668,7 @@ will be lost.
# pointer of each individual branch points; each branch
# starts with the operand node of a BRANCH node.
#
BRANCH node Match this alternative, or the next...
BRANCH node 1 Match this alternative, or the next...

# Literals

Expand Down Expand Up @@ -796,7 +796,7 @@ will be lost.

# Support for long RE
LONGJMP off 1 1 Jump far away.
BRANCHJ off 1 1 BRANCH with long offset.
BRANCHJ off 2L 1 BRANCH with long offset.

# Special Case Regops
IFMATCH off 1 1 Succeeds if the following matches; non-zero
Expand Down
46 changes: 42 additions & 4 deletions regcomp.c
Expand Up @@ -4093,6 +4093,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
parse_rest:
/* Pick up the branches, linking them together. */
segment_parse_start = RExC_parse;
I32 npar_before_regbranch = RExC_npar - 1;
br = regbranch(pRExC_state, &flags, 1, depth+1);

/* branch_len = (paren != 0); */
Expand All @@ -4104,9 +4105,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
if (*RExC_parse == '|') {
if (RExC_use_BRANCHJ) {
reginsert(pRExC_state, BRANCHJ, br, depth+1);
ARG2La_SET(REGNODE_p(br), npar_before_regbranch);
ARG2Lb_SET(REGNODE_p(br), (U16)RExC_npar - 1);
}
else {
reginsert(pRExC_state, BRANCH, br, depth+1);
ARGa_SET(REGNODE_p(br), (U16)npar_before_regbranch);
ARGb_SET(REGNODE_p(br), (U16)RExC_npar - 1);
}
have_branch = 1;
}
Expand Down Expand Up @@ -4149,6 +4154,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
REQUIRE_BRANCHJ(flagp, 0);
}
assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ);
assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ);
if (OP(REGNODE_p(br)) == BRANCH) {
if (OP(REGNODE_p(lastbr)) == BRANCH)
ARGb_SET(REGNODE_p(lastbr),ARGa(REGNODE_p(br)));
else
ARG2Lb_SET(REGNODE_p(lastbr),ARGa(REGNODE_p(br)));
}
else
if (OP(REGNODE_p(br)) == BRANCHJ) {
if (OP(REGNODE_p(lastbr)) == BRANCH)
ARGb_SET(REGNODE_p(lastbr),ARG2La(REGNODE_p(br)));
else
ARG2Lb_SET(REGNODE_p(lastbr),ARG2La(REGNODE_p(br)));
}

lastbr = br;
*flagp |= flags & (HASWIDTH | POSTPONED);
}
Expand Down Expand Up @@ -4222,6 +4243,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
(IV)(ender - lastbr)
);
});
if (OP(REGNODE_p(lastbr)) == BRANCH) {
ARGb_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
}
else
if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
ARG2Lb_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
}

if (! REGTAIL(pRExC_state, lastbr, ender)) {
REQUIRE_BRANCHJ(flagp, 0);
}
Expand Down Expand Up @@ -4365,6 +4394,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
regnode_offset ret;
regnode_offset chain = 0;
regnode_offset latest;
regnode *branch_node = NULL;
I32 flags = 0, c = 0;
DECLARE_AND_GET_RE_DEBUG_FLAGS;

Expand All @@ -4375,10 +4405,14 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
if (first)
ret = 0;
else {
if (RExC_use_BRANCHJ)
ret = reganode(pRExC_state, BRANCHJ, 0);
else {
ret = reg_node(pRExC_state, BRANCH);
if (RExC_use_BRANCHJ) {
ret = reg2Lanode(pRExC_state, BRANCHJ, 0, 0);
branch_node = REGNODE_p(ret);
ARG2La_SET(branch_node, (U16)RExC_npar-1);
} else {
ret = reganode(pRExC_state, BRANCH, 0);
branch_node = REGNODE_p(ret);
ARGa_SET(branch_node, (U16)RExC_npar-1);
}
}

Expand Down Expand Up @@ -13360,6 +13394,10 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
PerlMemShared_free(trie->bitmap);
if (trie->jump)
PerlMemShared_free(trie->jump);
if (trie->j_before_paren)
PerlMemShared_free(trie->j_before_paren);
if (trie->j_after_paren)
PerlMemShared_free(trie->j_after_paren);
PerlMemShared_free(trie->wordinfo);
/* do this last!!!! */
PerlMemShared_free(ri->data->data[n]);
Expand Down
36 changes: 33 additions & 3 deletions regcomp.h
Expand Up @@ -183,7 +183,13 @@ struct regnode_1 {
U8 flags;
U8 type;
U16 next_off;
U32 arg1;
union {
U32 arg1;
struct {
U16 arg1a;
U16 arg1b;
};
};
};

/* Node whose argument is 'SV *'. This needs to be used very carefully in
Expand Down Expand Up @@ -214,7 +220,13 @@ struct regnode_2L {
U8 type;
U16 next_off;
U32 arg1;
I32 arg2;
union {
I32 arg2;
struct {
U16 arg2a;
U16 arg2b;
};
};
};

/* 'Two field' -- Two 32 bit signed args.
Expand Down Expand Up @@ -361,18 +373,26 @@ struct regnode_ssc {

#define ARG(p) ARG_VALUE(ARG_LOC(p))
#define ARGp(p) ARGp_VALUE_inline(p)
#define ARGa(p) ARG_VALUE(ARGa_LOC(p))
#define ARGb(p) ARG_VALUE(ARGb_LOC(p))
#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
#define ARG3(p) ARG_VALUE(ARG3_LOC(p))
#define ARG4(p) ARG_VALUE(ARG4_LOC(p))
#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p))
#define ARG2La(p) ARG_VALUE(ARG2La_LOC(p))
#define ARG2Lb(p) ARG_VALUE(ARG2Lb_LOC(p))

#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
#define ARGa_SET(p, val) ARG__SET(ARGa_LOC(p), (val))
#define ARGb_SET(p, val) ARG__SET(ARGb_LOC(p), (val))
#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
#define ARG3_SET(p, val) ARG__SET(ARG3_LOC(p), (val))
#define ARG4_SET(p, val) ARG__SET(ARG4_LOC(p), (val))
#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val))
#define ARG2La_SET(p, val) ARG__SET(ARG2La_LOC(p), (val))
#define ARG2Lb_SET(p, val) ARG__SET(ARG2Lb_LOC(p), (val))
#define ARGp_SET(p, val) ARGp_SET_inline((p),(val))

#undef NEXT_OFF
Expand Down Expand Up @@ -454,13 +474,16 @@ struct regnode_ssc {

#define NODE_ALIGN(node)
#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
#define ARGa_LOC(p) (((struct regnode_1 *)p)->arg1a)
#define ARGb_LOC(p) (((struct regnode_1 *)p)->arg1b)
#define ARGp_BYTES_LOC(p) (((struct regnode_p *)p)->arg1_sv_ptr_bytes)
#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
#define ARG3_LOC(p) (((struct regnode_4 *)p)->arg3)
#define ARG4_LOC(p) (((struct regnode_4 *)p)->arg4)
#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2)

#define ARG2La_LOC(p) (((struct regnode_2L *)p)->arg2a)
#define ARG2Lb_LOC(p) (((struct regnode_2L *)p)->arg2b)

/* These should no longer be used directly in most cases. Please use
* the REGNODE_AFTER() macros instead. */
Expand Down Expand Up @@ -1148,6 +1171,11 @@ struct _reg_trie_data {
char *bitmap; /* stclass bitmap */
U16 *jump; /* optional 1 indexed array of offsets before tail
for the node following a given word. */
U16 *j_before_paren; /* optional 1 indexed array of parno reset data
for the given jump. */
U16 *j_after_paren; /* optional 1 indexed array of parno reset data
for the given jump. */

reg_trie_wordinfo *wordinfo; /* array of info per word */
U16 uniquecharcount; /* unique chars in trie (width of trans table) */
U32 startstate; /* initial state - used for common prefix optimisation */
Expand All @@ -1157,6 +1185,8 @@ struct _reg_trie_data {
U32 statecount; /* Build only - number of states in the states array
(including the unused zero state) */
U32 wordcount; /* Build only */
U16 before_paren;
U16 after_paren;
#ifdef DEBUGGING
STRLEN charcount; /* Build only */
#endif
Expand Down
4 changes: 2 additions & 2 deletions regcomp.sym
Expand Up @@ -140,7 +140,7 @@ CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence
#* pointer of each individual branch points; each branch
#* starts with the operand node of a BRANCH node.
#*
BRANCH BRANCH, node 0 V ; Match this alternative, or the next...
BRANCH BRANCH, node 1 V ; Match this alternative, or the next...

#*Literals
# NOTE: the relative ordering of these types is important do not change it
Expand Down Expand Up @@ -252,7 +252,7 @@ REFFAN REF, num 1 V ; Match already matched string, using /aai rul

#*Support for long RE
LONGJMP LONGJMP, off 1 . 1 ; Jump far away.
BRANCHJ BRANCHJ, off 1 V 1 ; BRANCH with long offset.
BRANCHJ BRANCHJ, off 2L V 1 ; BRANCH with long offset.

#*Special Case Regops
IFMATCH BRANCHJ, off 1 . 1 ; Succeeds if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current
Expand Down
12 changes: 10 additions & 2 deletions regcomp_debug.c
Expand Up @@ -408,8 +408,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */

k = REGNODE_TYPE(op);

if (k == EXACT) {
if (op == BRANCH) {
Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARGa(o),(IV)ARGb(o));
}
else if (op == BRANCHJ) {
Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2La(o),(IV)ARG2Lb(o));
}
else if (k == EXACT) {
sv_catpvs(sv, " ");
/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
* is a crude hack but it may be the best for now since
Expand Down Expand Up @@ -462,6 +467,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
);
sv_catpvs(sv, "]");
}
if (trie->before_paren || trie->after_paren)
Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")",
(IV)trie->before_paren,(IV)trie->after_paren);
} else if (k == CURLY) {
U32 lo = ARG1(o), hi = ARG2(o);
if (ARG3(o) || ARG4(o))
Expand Down
28 changes: 27 additions & 1 deletion regcomp_trie.c
Expand Up @@ -469,10 +469,26 @@ is the recommended Unicode-aware way of saying
trie->wordinfo[curword].accept = state; \
\
if ( noper_next < tail ) { \
if (!trie->jump) \
if (!trie->jump) { \
trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
sizeof(U16) ); \
trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
sizeof(U16) ); \
trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
sizeof(U16) ); \
} \
trie->jump[curword] = (U16)(noper_next - convert); \
U16 set_before_paren; \
U16 set_after_paren; \
if (OP(cur) == BRANCH) { \
set_before_paren = ARGa(cur); \
set_after_paren = ARGb(cur); \
} else { \
set_before_paren = ARG2La(cur); \
set_after_paren = ARG2Lb(cur); \
} \
trie->j_before_paren[curword] = set_before_paren; \
trie->j_after_paren[curword] = set_after_paren; \
if (!jumper) \
jumper = noper_next; \
if (!nextbranch) \
Expand Down Expand Up @@ -533,6 +549,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
U32 next_alloc = 0;
regnode *jumper = NULL;
regnode *nextbranch = NULL;
regnode *lastbranch = NULL;
regnode *convert = NULL;
U32 *prev_states; /* temp array mapping each state to previous one */
/* we just use folder as a flag in utf8 */
Expand Down Expand Up @@ -569,6 +586,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) );
}

/* create the trie struct, all zeroed */
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
trie->refcount = 1;
trie->startstate = 1;
Expand Down Expand Up @@ -639,6 +657,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
STRLEN maxchars = 0;
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
bitmap?*/
lastbranch = cur;

if (OP(noper) == NOTHING) {
/* skip past a NOTHING at the start of an alternation
Expand Down Expand Up @@ -797,6 +816,13 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
trie->maxlen = maxchars;
}
} /* end first pass */
trie->before_paren = OP(first) == BRANCH
? ARGa(first)
: ARG2La(first); /* BRANCHJ */

trie->after_paren = OP(lastbranch) == BRANCH
? ARGb(lastbranch)
: ARG2Lb(lastbranch); /* BRANCHJ */
DEBUG_TRIE_COMPILE_r(
Perl_re_indentf( aTHX_
"TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
Expand Down
32 changes: 28 additions & 4 deletions regexec.c
Expand Up @@ -6706,6 +6706,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
/* what trie are we using right now */
reg_trie_data * const trie
= (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
ST.before_paren = trie->before_paren;
ST.after_paren = trie->after_paren;
assert(ST.before_paren<=rex->nparens);
assert(ST.after_paren<=rex->nparens);

HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
U32 state = trie->startstate;

Expand Down Expand Up @@ -6755,6 +6760,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
U32 accepted = 0; /* have we seen any accepting states? */

ST.jump = trie->jump;
ST.j_before_paren = trie->j_before_paren;
ST.j_after_paren= trie->j_after_paren;
ST.me = scan;
ST.firstpos = NULL;
ST.longfold = FALSE; /* char longer if folded => it's harder */
Expand Down Expand Up @@ -6866,6 +6873,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
* rest of the branch */
REGCP_UNWIND(ST.cp);
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
if (ST.after_paren) {
assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens);
CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren,"TRIE_next_fail");
}
}
if (!--ST.accepted) {
DEBUG_EXECUTE_r({
Expand Down Expand Up @@ -6955,10 +6966,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
uc += chars;
}
}
if (ST.jump && ST.jump[ST.nextword]) {
scan = ST.me + ST.jump[ST.nextword];
ST.before_paren = ST.j_before_paren[ST.nextword];
assert(ST.before_paren <= rex->nparens);
ST.after_paren = ST.j_after_paren[ST.nextword];
assert(ST.after_paren <= rex->nparens);
} else {
scan = ST.me + NEXT_OFF(ST.me);
}

scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
? ST.jump[ST.nextword]
: NEXT_OFF(ST.me));

DEBUG_EXECUTE_r({
Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
Expand Down Expand Up @@ -9036,9 +9053,15 @@ NULL
next = scan + ARG(scan);
if (next == scan)
next = NULL;
/* FALLTHROUGH */
ST.before_paren = ARG2La(scan);
ST.after_paren = ARG2Lb(scan);
goto branch_logic;
NOT_REACHED; /* NOTREACHED */

case BRANCH: /* /(...|A|...)/ */
ST.before_paren = ARGa(scan);
ST.after_paren = ARGb(scan);
branch_logic:
scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */
assert(scan);
ST.lastparen = rex->lastparen;
Expand Down Expand Up @@ -9083,6 +9106,7 @@ NULL
}
REGCP_UNWIND(ST.cp);
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
CAPTURE_CLEAR(ST.before_paren+1,ST.after_paren,"BRANCH_next_fail");
scan = ST.next_branch;
/* no more branches? */
if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
Expand Down

0 comments on commit a5166c5

Please sign in to comment.