Skip to content

Commit

Permalink
tries: don't allocate memory at runtime
Browse files Browse the repository at this point in the history
This is an indirect fix for
    [perl #74484] Regex causing exponential runtime+mem usage

The trie runtime code was doing more SAVETMPS than FREETMPS and was thus
growing a large tmps stack on heavy backtracking. Rather than fixing this
directly, I rewrote part of the trie code so that it no longer needs to
allocate memory in S_regmatch (it still does in find_byclass()).

The basic issue is that multiple branches in the trie may trigger an
accept state; for example:

    "abcd" =~ /xyz|abcd.*X|ab.*Y/

here, words (branches) 2 and 3 are accept states. The original approach
was, at run time, to create a list of accepted word numbers and the
character positions of the end of each of those words. Then run the rest
of the pattern for each word in the list in turn (in word index order).
This requires memory for the list to be allocated and freed.

The new approach involves creating extra info at compile time; in
particular, for each word, a pointer to the previous accepted word (if
any) in the state tree. For example for the above pattern, part of the
state tree may be

      a    b    c    d
    1 -> 2 -> 3 -> 4 -> 5
            (#3)       (#2)

(e.g. at state 1, if the next char is 'a', we transition to state 2).
Here, state 3 is an accept state with word #3, and 5 is an accept state
with word #2. So we build a table indexed by word number, which has
wordinfo[2].prev = 3, wordinfo[3].prev = 0, thus building the word chain
2->3->0.

At run time we run the trie to completion, and remember the word
associated with the longest accept state (word #2 above). Then by following
back the chain of .prev fields, we can produce a list of all accepting
words. We then iteratively find the smallest-numbered (i.e. left-most) word
in the chain, and run with it. On failure and backtrack, we find the
next-smallest and so on.

Since we are no longer recording the end-position of each word in the
string, we have to recalculate this for each backtrack. We initially
record the end-position of the shortest accepting word, and given that we
know the length of each word, we can calculate the new position each time
as an offset from that first word. Depending on unicode and folding, that
calculation can be cheap or expensive.

This algorithm is optimised for the typical case where there is only a
small number (<= 2) of accepting states.

This patch creates a new compile-time array, trie->wordinfo[], indexed by
word number, which contains relevant info about each word. This also
supersedes the old trie->nextword[] array, whose function of recording
"overspills" of multiple words per accept state is now handled as part of
the wordinfo[].prev chain.
  • Loading branch information
iabyn committed May 3, 2010
1 parent a23e6e2 commit 2e64971
Show file tree
Hide file tree
Showing 6 changed files with 302 additions and 222 deletions.
18 changes: 10 additions & 8 deletions ext/re/t/regop.t
Expand Up @@ -41,7 +41,9 @@ foreach my $testout ( @tests ) {
s/\s+$//;
ok( $testout=~/\Q$_\E/, "$_: /$pattern/" )
or do {
!$diaged++ and diag("$_: /$pattern/\n$testout");
!$diaged++ and diag("PATTERN: /$pattern/\n\n"
. "EXPECTED:\n$_\n\n"
. "WITHIN GOT:\n$testout");
};
}
}
Expand Down Expand Up @@ -152,16 +154,17 @@ minlen 3
# # 8| W 4 @ 0
# # 9| W 5 @ 0
# # A| W 6 @ 0
# word_info N:(prev,char)= 1:(0,1) 2:(0,1) 3:(0,1) 4:(0,1) 5:(0,1) 6:(0,1)
# Final program:
# 1: EXACT <ABC>(3)
# 3: TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP](20)
# 1: EXACT <ABC> (3)
# 3: TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP] (20)
# <P>
# <G>
# <E>
# <B>
# <A>
# <D>
# 20: END(0)
# 20: END (0)
# anchored "ABC" at 0 (checking anchored) minlen 4
# Offsets: [20]
# 1:4[3] 3:4[15] 19:32[0] 20:34[0]
Expand All @@ -172,18 +175,17 @@ minlen 3
# 0 <> <ABCD> | 1:EXACT <ABC>(3)
# 3 <ABC> <D> | 3:TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP](20)
# 3 <ABC> <D> | State: 4 Accepted: 0 Charid: 7 CP: 44 After State: a
# 4 <ABCD> <> | State: a Accepted: 1 Charid: 6 CP: 0 After State: 0
# 4 <ABCD> <> | State: a Accepted: 1 Charid: 7 CP: 0 After State: 0
# got 1 possible matches
# only one match left: #6 <D>
# 4 <ABCD> <> | 20:END(0)
# TRIE matched word #6, continuing
# 4 <ABCD> <> | 20: END(0)
# Match successful!
# %MATCHED%
# Freeing REx: "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)"
%MATCHED%
EXACT <ABC>
TRIEC-EXACT
[A-EGP]
only one match left: #6 <D>
S:4/10
W:6
L:1/1
Expand Down
85 changes: 65 additions & 20 deletions regcomp.c
Expand Up @@ -878,6 +878,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
U32 state;
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
U16 word;
GET_RE_DEBUG_FLAGS_DECL;

PERL_ARGS_ASSERT_DUMP_TRIE;
Expand Down Expand Up @@ -947,6 +948,13 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
}
PerlIO_printf( Perl_debug_log, "\n" );
}
PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
for (word=1; word <= trie->wordcount; word++) {
PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
(int)word, (int)(trie->wordinfo[word].prev),
(int)(trie->wordinfo[word].len));
}
PerlIO_printf(Perl_debug_log, "\n" );
}
/*
Dumps a fully constructed but uncompressed trie in list form.
Expand Down Expand Up @@ -1077,6 +1085,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,

#endif


/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
startbranch: the first branch in the whole branch sequence
first : start branch of sequence of branch-exact nodes.
Expand Down Expand Up @@ -1257,8 +1266,6 @@ is the recommended Unicode-aware way of saying
U16 dupe= trie->states[ state ].wordnum; \
regnode * const noper_next = regnext( noper ); \
\
if (trie->wordlen) \
trie->wordlen[ curword ] = wordlen; \
DEBUG_r({ \
/* store the word for dumping */ \
SV* tmp; \
Expand All @@ -1270,6 +1277,9 @@ is the recommended Unicode-aware way of saying
}); \
\
curword++; \
trie->wordinfo[curword].prev = 0; \
trie->wordinfo[curword].len = wordlen; \
trie->wordinfo[curword].accept = state; \
\
if ( noper_next < tail ) { \
if (!trie->jump) \
Expand All @@ -1282,16 +1292,11 @@ is the recommended Unicode-aware way of saying
} \
\
if ( dupe ) { \
/* So it's a dupe. This means we need to maintain a */\
/* linked-list from the first to the next. */\
/* we only allocate the nextword buffer when there */\
/* a dupe, so first time we have to do the allocation */\
if (!trie->nextword) \
trie->nextword = (U16 *) \
PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
while ( trie->nextword[dupe] ) \
dupe= trie->nextword[dupe]; \
trie->nextword[dupe]= curword; \
/* It's a dupe. Pre-insert into the wordinfo[].prev */\
/* chain, so that when the bits of chain are later */\
/* linked together, the dups appear in the chain */\
trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
trie->wordinfo[dupe].prev = curword; \
} else { \
/* we haven't inserted this word yet. */ \
trie->states[ state ].wordnum = curword; \
Expand Down Expand Up @@ -1329,6 +1334,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
regnode *jumper = NULL;
regnode *nextbranch = NULL;
regnode *convert = NULL;
U32 *prev_states; /* temp array mapping each state to previous one */
/* we just use folder as a flag in utf8 */
const U8 * const folder = ( flags == EXACTF
? PL_fold
Expand Down Expand Up @@ -1364,6 +1370,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
if (!(UTF && folder))
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));

DEBUG_r({
trie_words = newAV();
});
Expand Down Expand Up @@ -1496,7 +1505,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
(int)trie->minlen, (int)trie->maxlen )
);
trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );

/*
We now know what we are dealing with in terms of unique chars and
Expand All @@ -1520,6 +1528,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
*/


Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
prev_states[1] = 0;

if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
/*
Second Pass -- Array Of Lists Representation
Expand Down Expand Up @@ -1590,6 +1601,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
}
if ( ! newstate ) {
newstate = next_alloc++;
prev_states[newstate] = state;
TRIE_LIST_PUSH( state, charid, newstate );
transcount++;
}
Expand Down Expand Up @@ -1773,6 +1785,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
if ( !trie->trans[ state + charid ].next ) {
trie->trans[ state + charid ].next = next_alloc;
trie->trans[ state ].check++;
prev_states[TRIE_NODENUM(next_alloc)]
= TRIE_NODENUM(state);
next_alloc += trie->uniquecharcount;
}
state = trie->trans[ state + charid ].next;
Expand Down Expand Up @@ -1920,9 +1934,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
PerlMemShared_realloc( trie->trans, trie->lasttrans
* sizeof(reg_trie_trans) );

/* and now dump out the compressed format */
DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));

{ /* Modify the program and insert the new TRIE node*/
U8 nodetype =(U8)(flags & 0xFF);
char *str=NULL;
Expand Down Expand Up @@ -2052,6 +2063,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
break;
}
}
trie->prefixlen = (state-1);
if (str) {
regnode *n = convert+NODE_SZ_STR(convert);
NEXT_OFF(convert) = NODE_SZ_STR(convert);
Expand Down Expand Up @@ -2147,6 +2159,42 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
});
} /* end node insert */

/* Finish populating the prev field of the wordinfo array. Walk back
* from each accept state until we find another accept state, and if
* we find one, point the first word's .prev field at the second word.
* If the second already has a .prev field set, stop now. This will be
* the case either if we've already processed that word's accept state,
* or if that state had multiple words and the overspill words
* were already linked up earlier.
*/
{
U16 word;
U32 state;
U16 prev;

for (word=1; word <= trie->wordcount; word++) {
prev = 0;
if (trie->wordinfo[word].prev)
continue;
state = trie->wordinfo[word].accept;
while (state) {
state = prev_states[state];
if (!state)
break;
prev = trie->states[state].wordnum;
if (prev)
break;
}
trie->wordinfo[word].prev = prev;
}
Safefree(prev_states);
}


/* and now dump out the compressed format */
DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));

RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
#ifdef DEBUGGING
RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
Expand Down Expand Up @@ -9571,12 +9619,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
PerlMemShared_free(trie->trans);
if (trie->bitmap)
PerlMemShared_free(trie->bitmap);
if (trie->wordlen)
PerlMemShared_free(trie->wordlen);
if (trie->jump)
PerlMemShared_free(trie->jump);
if (trie->nextword)
PerlMemShared_free(trie->nextword);
PerlMemShared_free(trie->wordinfo);
/* do this last!!!! */
PerlMemShared_free(ri->data->data[n]);
}
Expand Down
14 changes: 11 additions & 3 deletions regcomp.h
Expand Up @@ -586,6 +586,15 @@ struct _reg_trie_state {
} trans;
};

/* Per-word information for a trie. The wordinfo array is indexed by word
 * number; word numbers start at 1, so a value of 0 in .prev means "no
 * previous word" and terminates the acceptance chain. */
typedef struct {
U16 prev; /* previous word in acceptance chain; eg in
* /zzz|abc|ab/ after matching the chars abc, the
* accepted word is #2, and the previous accepted
* word is #3; 0 if there is no previous word */
U32 len; /* how many chars long is this word? */
U32 accept; /* accept state for this word */
} reg_trie_wordinfo;


typedef struct _reg_trie_state reg_trie_state;
Expand All @@ -603,15 +612,14 @@ struct _reg_trie_data {
reg_trie_state *states; /* state data */
reg_trie_trans *trans; /* array of transition elements */
char *bitmap; /* stclass bitmap */
U32 *wordlen; /* array of lengths of words */
U16 *jump; /* optional 1 indexed array of offsets before tail
for the node following a given word. */
U16 *nextword; /* optional 1 indexed array to support linked list
of duplicate wordnums */
reg_trie_wordinfo *wordinfo; /* array of info per word */
U16 uniquecharcount; /* unique chars in trie (width of trans table) */
U32 startstate; /* initial state - used for common prefix optimisation */
STRLEN minlen; /* minimum length of words in trie - build/opt only? */
STRLEN maxlen; /* maximum length of words in trie - build/opt only? */
U32 prefixlen; /* #chars in common prefix */
U32 statecount; /* Build only - number of states in the states array
(including the unused zero state) */
U32 wordcount; /* Build only */
Expand Down

0 comments on commit 2e64971

Please sign in to comment.