diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index 42db53944ff6..6db2cd1571d3 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -668,7 +668,7 @@ will be lost. # pointer of each individual branch points; each branch # starts with the operand node of a BRANCH node. # - BRANCH node Match this alternative, or the next... + BRANCH node 1 Match this alternative, or the next... # Literals @@ -796,7 +796,7 @@ will be lost. # Support for long RE LONGJMP off 1 1 Jump far away. - BRANCHJ off 1 1 BRANCH with long offset. + BRANCHJ off 2L 1 BRANCH with long offset. # Special Case Regops IFMATCH off 1 1 Succeeds if the following matches; non-zero diff --git a/regcomp.c b/regcomp.c index 1170cc55c075..7b96c0d7bc69 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4093,6 +4093,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) parse_rest: /* Pick up the branches, linking them together. */ segment_parse_start = RExC_parse; + I32 npar_before_regbranch = RExC_npar - 1; br = regbranch(pRExC_state, &flags, 1, depth+1); /* branch_len = (paren != 0); */ @@ -4104,9 +4105,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (*RExC_parse == '|') { if (RExC_use_BRANCHJ) { reginsert(pRExC_state, BRANCHJ, br, depth+1); + ARG2La_SET(REGNODE_p(br), npar_before_regbranch); + ARG2Lb_SET(REGNODE_p(br), (U16)RExC_npar - 1); } else { reginsert(pRExC_state, BRANCH, br, depth+1); + ARGa_SET(REGNODE_p(br), (U16)npar_before_regbranch); + ARGb_SET(REGNODE_p(br), (U16)RExC_npar - 1); } have_branch = 1; } @@ -4149,6 +4154,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */ REQUIRE_BRANCHJ(flagp, 0); } + assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ); + assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ); + if (OP(REGNODE_p(br)) == BRANCH) { + if (OP(REGNODE_p(lastbr)) == BRANCH) + ARGb_SET(REGNODE_p(lastbr),ARGa(REGNODE_p(br))); + else + ARG2Lb_SET(REGNODE_p(lastbr),ARGa(REGNODE_p(br))); + } + else + if (OP(REGNODE_p(br)) == BRANCHJ) { + if (OP(REGNODE_p(lastbr)) == BRANCH) + ARGb_SET(REGNODE_p(lastbr),ARG2La(REGNODE_p(br))); + else + ARG2Lb_SET(REGNODE_p(lastbr),ARG2La(REGNODE_p(br))); + } + lastbr = br; *flagp |= flags & (HASWIDTH | POSTPONED); } @@ -4222,6 +4243,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) (IV)(ender - lastbr) ); }); + if (OP(REGNODE_p(lastbr)) == BRANCH) { + ARGb_SET(REGNODE_p(lastbr),(U16)RExC_npar-1); + } + else + if (OP(REGNODE_p(lastbr)) == BRANCHJ) { + ARG2Lb_SET(REGNODE_p(lastbr),(U16)RExC_npar-1); + } + if (! REGTAIL(pRExC_state, lastbr, ender)) { REQUIRE_BRANCHJ(flagp, 0); } @@ -4365,6 +4394,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) regnode_offset ret; regnode_offset chain = 0; regnode_offset latest; + regnode *branch_node = NULL; I32 flags = 0, c = 0; DECLARE_AND_GET_RE_DEBUG_FLAGS; @@ -4375,10 +4405,14 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) if (first) ret = 0; else { - if (RExC_use_BRANCHJ) - ret = reganode(pRExC_state, BRANCHJ, 0); - else { - ret = reg_node(pRExC_state, BRANCH); + if (RExC_use_BRANCHJ) { + ret = reg2Lanode(pRExC_state, BRANCHJ, 0, 0); + branch_node = REGNODE_p(ret); + ARG2La_SET(branch_node, (U16)RExC_npar-1); + } else { + ret = reganode(pRExC_state, BRANCH, 0); + branch_node = REGNODE_p(ret); + ARGa_SET(branch_node, (U16)RExC_npar-1); } } @@ -13360,6 +13394,10 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) PerlMemShared_free(trie->bitmap); if (trie->jump) PerlMemShared_free(trie->jump); + if (trie->j_before_paren) + PerlMemShared_free(trie->j_before_paren); + if (trie->j_after_paren) + PerlMemShared_free(trie->j_after_paren); PerlMemShared_free(trie->wordinfo); /* do this last!!!! */ PerlMemShared_free(ri->data->data[n]); diff --git a/regcomp.h b/regcomp.h index 9a1b00dece5a..4a4ed52219bf 100644 --- a/regcomp.h +++ b/regcomp.h @@ -183,7 +183,13 @@ struct regnode_1 { U8 flags; U8 type; U16 next_off; - U32 arg1; + union { + U32 arg1; + struct { + U16 arg1a; + U16 arg1b; + }; + }; }; /* Node whose argument is 'SV *'. This needs to be used very carefully in @@ -214,7 +220,13 @@ struct regnode_2L { U8 type; U16 next_off; U32 arg1; - I32 arg2; + union { + I32 arg2; + struct { + U16 arg2a; + U16 arg2b; + }; + }; }; /* 'Two field' -- Two 32 bit signed args. @@ -361,18 +373,26 @@ struct regnode_ssc { #define ARG(p) ARG_VALUE(ARG_LOC(p)) #define ARGp(p) ARGp_VALUE_inline(p) +#define ARGa(p) ARG_VALUE(ARGa_LOC(p)) +#define ARGb(p) ARG_VALUE(ARGb_LOC(p)) #define ARG1(p) ARG_VALUE(ARG1_LOC(p)) #define ARG2(p) ARG_VALUE(ARG2_LOC(p)) #define ARG3(p) ARG_VALUE(ARG3_LOC(p)) #define ARG4(p) ARG_VALUE(ARG4_LOC(p)) #define ARG2L(p) ARG_VALUE(ARG2L_LOC(p)) +#define ARG2La(p) ARG_VALUE(ARG2La_LOC(p)) +#define ARG2Lb(p) ARG_VALUE(ARG2Lb_LOC(p)) #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val)) +#define ARGa_SET(p, val) ARG__SET(ARGa_LOC(p), (val)) +#define ARGb_SET(p, val) ARG__SET(ARGb_LOC(p), (val)) #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) #define ARG3_SET(p, val) ARG__SET(ARG3_LOC(p), (val)) #define ARG4_SET(p, val) ARG__SET(ARG4_LOC(p), (val)) #define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val)) +#define ARG2La_SET(p, val) ARG__SET(ARG2La_LOC(p), (val)) +#define ARG2Lb_SET(p, val) ARG__SET(ARG2Lb_LOC(p), (val)) #define ARGp_SET(p, val) ARGp_SET_inline((p),(val)) #undef NEXT_OFF @@ -454,13 +474,16 @@ struct regnode_ssc { #define NODE_ALIGN(node) #define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) +#define ARGa_LOC(p) (((struct regnode_1 *)p)->arg1a) +#define ARGb_LOC(p) (((struct regnode_1 *)p)->arg1b) #define ARGp_BYTES_LOC(p) (((struct regnode_p *)p)->arg1_sv_ptr_bytes) #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) #define ARG3_LOC(p) (((struct regnode_4 *)p)->arg3) #define ARG4_LOC(p) (((struct regnode_4 *)p)->arg4) #define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2) - +#define ARG2La_LOC(p) (((struct regnode_2L *)p)->arg2a) +#define ARG2Lb_LOC(p) (((struct regnode_2L *)p)->arg2b) /* These should no longer be used directly in most cases. Please use * the REGNODE_AFTER() macros instead. */ @@ -1148,6 +1171,11 @@ struct _reg_trie_data { char *bitmap; /* stclass bitmap */ U16 *jump; /* optional 1 indexed array of offsets before tail for the node following a given word. */ + U16 *j_before_paren; /* optional 1 indexed array of parno reset data + for the given jump. */ + U16 *j_after_paren; /* optional 1 indexed array of parno reset data + for the given jump. */ + reg_trie_wordinfo *wordinfo; /* array of info per word */ U16 uniquecharcount; /* unique chars in trie (width of trans table) */ U32 startstate; /* initial state - used for common prefix optimisation */ @@ -1157,6 +1185,8 @@ struct _reg_trie_data { U32 statecount; /* Build only - number of states in the states array (including the unused zero state) */ U32 wordcount; /* Build only */ + U16 before_paren; + U16 after_paren; #ifdef DEBUGGING STRLEN charcount; /* Build only */ #endif diff --git a/regcomp.sym b/regcomp.sym index c0735aada955..e01844f9b075 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -140,7 +140,7 @@ CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence #* pointer of each individual branch points; each branch #* starts with the operand node of a BRANCH node. #* -BRANCH BRANCH, node 0 V ; Match this alternative, or the next... +BRANCH BRANCH, node 1 V ; Match this alternative, or the next... #*Literals # NOTE: the relative ordering of these types is important do not change it @@ -252,7 +252,7 @@ REFFAN REF, num 1 V ; Match already matched string, using /aai rul #*Support for long RE LONGJMP LONGJMP, off 1 . 1 ; Jump far away. -BRANCHJ BRANCHJ, off 1 V 1 ; BRANCH with long offset. +BRANCHJ BRANCHJ, off 2L V 1 ; BRANCH with long offset. #*Special Case Regops IFMATCH BRANCHJ, off 1 . 1 ; Succeeds if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current diff --git a/regcomp_debug.c b/regcomp_debug.c index 6410f5e2da09..bfa5370662e1 100644 --- a/regcomp_debug.c +++ b/regcomp_debug.c @@ -408,8 +408,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */ k = REGNODE_TYPE(op); - - if (k == EXACT) { + if (op == BRANCH) { + Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARGa(o),(IV)ARGb(o)); + } + else if (op == BRANCHJ) { + Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2La(o),(IV)ARG2Lb(o)); + } + else if (k == EXACT) { sv_catpvs(sv, " "); /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) * is a crude hack but it may be the best for now since @@ -462,6 +467,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ ); sv_catpvs(sv, "]"); } + if (trie->before_paren || trie->after_paren) + Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", + (IV)trie->before_paren,(IV)trie->after_paren); } else if (k == CURLY) { U32 lo = ARG1(o), hi = ARG2(o); if (ARG3(o) || ARG4(o)) diff --git a/regcomp_trie.c b/regcomp_trie.c index cf692d082173..8a6b4b639aa5 100644 --- a/regcomp_trie.c +++ b/regcomp_trie.c @@ -469,10 +469,26 @@ is the recommended Unicode-aware way of saying trie->wordinfo[curword].accept = state; \ \ if ( noper_next < tail ) { \ - if (!trie->jump) \ + if (!trie->jump) { \ trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ sizeof(U16) ); \ + trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ + trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ + } \ trie->jump[curword] = (U16)(noper_next - convert); \ + U16 set_before_paren; \ + U16 set_after_paren; \ + if (OP(cur) == BRANCH) { \ + set_before_paren = ARGa(cur); \ + set_after_paren = ARGb(cur); \ + } else { \ + set_before_paren = ARG2La(cur); \ + set_after_paren = ARG2Lb(cur); \ + } \ + trie->j_before_paren[curword] = set_before_paren; \ + trie->j_after_paren[curword] = set_after_paren; \ if (!jumper) \ jumper = noper_next; \ if (!nextbranch) \ @@ -533,6 +549,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, U32 next_alloc = 0; regnode *jumper = NULL; regnode *nextbranch = NULL; + regnode *lastbranch = NULL; regnode *convert = NULL; U32 *prev_states; /* temp array mapping each state to previous one */ /* we just use folder as a flag in utf8 */ @@ -569,6 +586,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) ); } + /* create the trie struct, all zeroed */ trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); trie->refcount = 1; trie->startstate = 1; @@ -639,6 +657,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, STRLEN maxchars = 0; bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ + lastbranch = cur; if (OP(noper) == NOTHING) { /* skip past a NOTHING at the start of an alternation @@ -797,6 +816,13 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, trie->maxlen = maxchars; } } /* end first pass */ + trie->before_paren = OP(first) == BRANCH + ? ARGa(first) + : ARG2La(first); /* BRANCHJ */ + + trie->after_paren = OP(lastbranch) == BRANCH + ? ARGb(lastbranch) + : ARG2Lb(lastbranch); /* BRANCHJ */ DEBUG_TRIE_COMPILE_r( Perl_re_indentf( aTHX_ "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", diff --git a/regexec.c b/regexec.c index 0a7b53161ea8..8f864092ea2f 100644 --- a/regexec.c +++ b/regexec.c @@ -6706,6 +6706,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* what trie are we using right now */ reg_trie_data * const trie = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; + ST.before_paren = trie->before_paren; + ST.after_paren = trie->after_paren; + assert(ST.before_paren<=rex->nparens); + assert(ST.after_paren<=rex->nparens); + HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); U32 state = trie->startstate; @@ -6755,6 +6760,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U32 accepted = 0; /* have we seen any accepting states? */ ST.jump = trie->jump; + ST.j_before_paren = trie->j_before_paren; + ST.j_after_paren= trie->j_after_paren; ST.me = scan; ST.firstpos = NULL; ST.longfold = FALSE; /* char longer if folded => it's harder */ @@ -6866,6 +6873,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * rest of the branch */ REGCP_UNWIND(ST.cp); UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + if (ST.after_paren) { + assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens); + CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren,"TRIE_next_fail"); + } } if (!--ST.accepted) { DEBUG_EXECUTE_r({ @@ -6955,10 +6966,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) uc += chars; } } + if (ST.jump && ST.jump[ST.nextword]) { + scan = ST.me + ST.jump[ST.nextword]; + ST.before_paren = ST.j_before_paren[ST.nextword]; + assert(ST.before_paren <= rex->nparens); + ST.after_paren = ST.j_after_paren[ST.nextword]; + assert(ST.after_paren <= rex->nparens); + } else { + scan = ST.me + NEXT_OFF(ST.me); + } - scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) - ? ST.jump[ST.nextword] - : NEXT_OFF(ST.me)); DEBUG_EXECUTE_r({ Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n", @@ -9036,9 +9053,15 @@ NULL next = scan + ARG(scan); if (next == scan) next = NULL; - /* FALLTHROUGH */ + ST.before_paren = ARG2La(scan); + ST.after_paren = ARG2Lb(scan); + goto branch_logic; + NOT_REACHED; /* NOTREACHED */ case BRANCH: /* /(...|A|...)/ */ + ST.before_paren = ARGa(scan); + ST.after_paren = ARGb(scan); + branch_logic: scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */ assert(scan); ST.lastparen = rex->lastparen; @@ -9083,6 +9106,7 @@ NULL } REGCP_UNWIND(ST.cp); UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + CAPTURE_CLEAR(ST.before_paren+1,ST.after_paren,"BRANCH_next_fail"); scan = ST.next_branch; /* no more branches? */ if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { diff --git a/regexp.h b/regexp.h index 0b5127357259..fadfe74f16a6 100644 --- a/regexp.h +++ b/regexp.h @@ -840,6 +840,8 @@ typedef struct regmatch_state { U32 lastparen; U32 lastcloseparen; CHECKPOINT cp; + U16 before_paren; + U16 after_paren; } branchlike; @@ -849,6 +851,8 @@ typedef struct regmatch_state { U32 lastparen; U32 lastcloseparen; CHECKPOINT cp; + U16 before_paren; + U16 after_paren; regnode *next_branch; /* next branch node */ } branch; @@ -859,10 +863,14 @@ typedef struct regmatch_state { U32 lastparen; U32 lastcloseparen; CHECKPOINT cp; + U16 before_paren; + U16 after_paren; U32 accepted; /* how many accepting states left */ bool longfold;/* saw a fold with a 1->n char mapping */ U16 *jump; /* positive offsets from me */ + U16 *j_before_paren; + U16 *j_after_paren; regnode *me; /* Which node am I - needed for jump tries*/ U8 *firstpos;/* pos in string of first trie match */ U32 firstchars;/* len in chars of firstpos from start */ diff --git a/regnodes.h b/regnodes.h index 04701f214f66..a6559db54d31 100644 --- a/regnodes.h +++ b/regnodes.h @@ -29,8 +29,8 @@ typedef struct regnode tregnode_BOUND; typedef struct regnode tregnode_BOUNDA; typedef struct regnode tregnode_BOUNDL; typedef struct regnode tregnode_BOUNDU; -typedef struct regnode tregnode_BRANCH; -typedef struct regnode_1 tregnode_BRANCHJ; +typedef struct regnode_1 tregnode_BRANCH; +typedef struct regnode_2L tregnode_BRANCHJ; typedef struct regnode_1 tregnode_CLOSE; typedef struct regnode tregnode_CLUMP; typedef struct regnode_1 tregnode_COMMIT; @@ -2026,7 +2026,7 @@ EXTCONST struct regnode_meta PL_regnode_info[] = { { /* #40 op BRANCH */ .type = BRANCH, - .arg_len = 0, + .arg_len = EXTRA_SIZE(tregnode_BRANCH), .arg_len_varies = 0, .off_by_arg = 0 }, diff --git a/t/re/re_tests b/t/re/re_tests index da11f0aa396e..89f9fec8c65c 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -2131,6 +2131,7 @@ AB\s+\x{100} AB \x{100}X y - - /(([ab]+)|([cd]+)|([ef]+))+/ aceb y $1-$2-$3-$4=$& b-b--=aceb /(([ab]+)|([cd]+)|([ef]+))+/ acebd y $1-$2-$3-$4=$& d--d-=acebd /(([ab]+)|([cd]+)|([ef]+))+/ acebdf y $1-$2-$3-$4=$& f---f=acebdf +/((a)(b)(c)|(a)(b)|(a))+/ abcaba y $1+$2-$3-$4+$5-$6+$7=$& a+--+-+a=abcaba # Keep these lines at the end of the file # pat string y/n/etc expr expected-expr skip-reason comment # vim: softtabstop=0 noexpandtab