Skip to content

Commit

Permalink
unify regmatch_info data
Browse files Browse the repository at this point in the history
Previously the regmatch_info struct was allocated as a local var on the C
stack, while some extra state (only needed for regexes having (?{})) was
malloced (as a regmatch_eval_state struct) as needed - and a destructor set
up to clean it up afterwards. This being because the stuff being cleaned
up couldn't be allocated on the C stack as it needed to hang around after
a croak().

Reorganise this so that:

* regmatch_info is on the C stack as before.

* a new struct, regmatch_info_aux is allocated within the first slot of the
  regmatch_state stack, for fields which must always exist but which need
  cleanup afterwards. This is currently unused, but will be shortly.

* a new struct, regmatch_info_aux_eval (which is just a renamed
  regmatch_eval_state struct), is optionally allocated in the second
  slot of regmatch_state. This is logically part of regmatch_info_aux,
  except that splitting it in two stops it being too large to fit in a
  regmatch_state slot (we can fit it in two instead).

(The second and third structs aren't allocated when we're intuit()
rather than regexec()).

Doing it like this simplifies allocation and cleanup: there's no need for
a malloc(), and we are already going to allocate a slab's worth of
regmatch_state slots, so using an extra one of two of them is effectively
free; and the cleanup just requires calling a single overall destructor.

In the next few commits, more of the regexec() state setup and tear-down
will be integrated into this new regime. And in particular, the new
regmatch_info_aux struct will give us somewhere to hang things like
PL_reg_poscache once it stops being global (it being local state that
needs cleanup).
  • Loading branch information
iabyn committed Jun 2, 2013
1 parent d3aa529 commit bf2039a
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 68 deletions.
123 changes: 65 additions & 58 deletions regexec.c
Expand Up @@ -250,8 +250,9 @@ static const char* const non_utf8_target_but_utf8_required
#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])

static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
static void S_restore_eval_state(pTHX_ void *arg);
static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
static void S_clear_backtrack_stack(pTHX_ void *p);
static regmatch_state * S_push_slab(pTHX);

#define REGCP_PAREN_ELEMS 3
#define REGCP_OTHER_ELEMS 3
Expand Down Expand Up @@ -652,7 +653,7 @@ Perl_re_intuit_start(pTHX_
goto fail;
}

reginfo->eval_state = NULL;
reginfo->info_aux = NULL;
reginfo->strbeg = strbeg;
reginfo->strend = strend;
reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
Expand Down Expand Up @@ -2097,7 +2098,6 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,

multiline = prog->extflags & RXf_PMf_MULTILINE;

reginfo->eval_state = NULL;
reginfo->prog = rx; /* Yes, sorry that this is confusing. */
reginfo->intuit = 0;
reginfo->is_utf8_target = cBOOL(utf8_target);
Expand All @@ -2113,10 +2113,6 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
/* note current PL_regmatch_state position; at end of match we'll
* pop back to there and free any higher slabs */
oldsave = PL_savestack_ix;
SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
SAVEVPTR(PL_regmatch_slab);
SAVEVPTR(PL_regmatch_state);


DEBUG_EXECUTE_r(
debug_start_match(rx, utf8_target, startpos, strend,
Expand Down Expand Up @@ -2151,8 +2147,35 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
/* see how far we have to get to not match where we matched before */
reginfo->till = startpos+minend;

if ((prog->extflags & RXf_EVAL_SEEN))
SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
SAVEVPTR(PL_regmatch_slab);
SAVEVPTR(PL_regmatch_state);

/* grab next slot in regmatch_state stack to store regmatch_info_aux
* struct */

reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
PL_regmatch_state = S_push_slab(aTHX);

SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);


if ((prog->extflags & RXf_EVAL_SEEN)) {
/* grab next slot in regmatch_state stack to store
* regmatch_info_aux_eval struct */

reginfo->info_aux_eval =
reginfo->info_aux->info_aux_eval =
&(PL_regmatch_state->u.info_aux_eval);

if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
PL_regmatch_state = S_push_slab(aTHX);

S_setup_eval_state(aTHX_ reginfo);
}
else
reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;


/* If there is a "must appear" string, look for it. */
Expand Down Expand Up @@ -2628,12 +2651,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
);
Safefree(swap);

if (reginfo->eval_state) {
reginfo->eval_state->direct = TRUE;
S_restore_eval_state(aTHX_ reginfo->eval_state);
}
/* clean up; this will trigger destructors that will free all slabs
* above the current one, and cleanup the regmatch_info_aux
* and regmatch_info_aux_eval sructs */

/* clean up; in particular, free all slabs above current one */
LEAVE_SCOPE(oldsave);

if (RXp_PAREN_NAMES(prog))
Expand Down Expand Up @@ -2766,12 +2787,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
PL_colors[4], PL_colors[5]));

if (reginfo->eval_state) {
reginfo->eval_state->direct = TRUE;
S_restore_eval_state(aTHX_ reginfo->eval_state);
}
/* clean up; this will trigger destructors that will free all slabs
* above the current one, and cleanup the regmatch_info_aux
* and regmatch_info_aux_eval sructs */

/* clean up; in particular, free all slabs above current one */
LEAVE_SCOPE(oldsave);

if (swap) {
Expand All @@ -2792,7 +2811,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
/* Set which rex is pointed to by PL_reg_state, handling ref counting.
* Do inc before dec, in case old and new rex are the same */
#define SET_reg_curpm(Re2) \
if (reginfo->eval_state) { \
if (reginfo->info_aux_eval) { \
(void)ReREFCNT_inc(Re2); \
ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
PM_SETRE((PL_reg_curpm), (Re2)); \
Expand Down Expand Up @@ -4939,8 +4958,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
" re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );

rex->offs[0].end = locinput - reginfo->strbeg;
if (reginfo->eval_state->pos_magic)
reginfo->eval_state->pos_magic->mg_len
if (reginfo->info_aux_eval->pos_magic)
reginfo->info_aux_eval->pos_magic->mg_len
= locinput - reginfo->strbeg;

if (sv_yes_mark) {
Expand Down Expand Up @@ -6524,7 +6543,7 @@ NULL
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
PL_colors[4], PL_colors[5]));

if (reginfo->eval_state) {
if (reginfo->info_aux_eval) {
/* each successfully executed (?{...}) block does the equivalent of
* local $^R = do {...}
* When popping the save stack, all these locals would be undone;
Expand Down Expand Up @@ -7492,24 +7511,15 @@ S_reghopmaybe3(U8* s, I32 off, const U8* lim)
* save the old values of subbeg etc of the current regex, and set then
to the current string (again, this is normally only done at the end
of execution)
It also sets up a destructor so that all this will be cleared up if
we die.
*/

static void
S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
{
MAGIC *mg;
regexp *const rex = ReANY(reginfo->prog);
regmatch_eval_state *eval_state;

Newx(eval_state, 1, regmatch_eval_state);
assert(!reginfo->eval_state);
reginfo->eval_state = eval_state;
regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;

eval_state->restored = FALSE;
eval_state->direct = FALSE;
eval_state->rex = rex;

if (reginfo->sv) {
Expand Down Expand Up @@ -7571,41 +7581,38 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
rex->suboffset = 0;
rex->subcoffset = 0;
rex->sublen = reginfo->strend - reginfo->strbeg;
SAVEDESTRUCTOR_X(S_restore_eval_state, eval_state);
}

/* undo the effects of S_setup_eval_state() - can either be called
* directly, or via a destructor. If we get called directly, we'll still
* get called again later from the destructor */

/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */

static void
S_restore_eval_state(pTHX_ void *arg)
S_cleanup_regmatch_info_aux(pTHX_ void *arg)
{
dVAR;
regmatch_eval_state * const eval_state = (regmatch_eval_state *)arg;
regexp * const rex = eval_state->rex;

if (!eval_state->restored) {
if (eval_state->subbeg) {
rex->subbeg = eval_state->subbeg;
rex->sublen = eval_state->sublen;
rex->suboffset = eval_state->suboffset;
rex->subcoffset = eval_state->subcoffset;
regmatch_info_aux *aux = (regmatch_info_aux *) arg;
regmatch_info_aux_eval *eval_state = aux->info_aux_eval;

if (!eval_state)
return;

/* undo the effects of S_setup_eval_state() */

if (eval_state->subbeg) {
regexp * const rex = eval_state->rex;
rex->subbeg = eval_state->subbeg;
rex->sublen = eval_state->sublen;
rex->suboffset = eval_state->suboffset;
rex->subcoffset = eval_state->subcoffset;
#ifdef PERL_ANY_COW
rex->saved_copy = eval_state->saved_copy;
rex->saved_copy = eval_state->saved_copy;
#endif
RXp_MATCH_COPIED_on(rex);
}
if (eval_state->pos_magic)
eval_state->pos_magic->mg_len = eval_state->pos;
PL_curpm = eval_state->curpm;
eval_state->restored = TRUE;
RXp_MATCH_COPIED_on(rex);
}
if (eval_state->direct)
eval_state->direct = FALSE;
else
/* we're being called from a destructor rather than directly */
Safefree(eval_state);
if (eval_state->pos_magic)
eval_state->pos_magic->mg_len = eval_state->pos;

PL_curpm = eval_state->curpm;
}


Expand Down
54 changes: 44 additions & 10 deletions regexp.h
Expand Up @@ -577,11 +577,9 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp)
#define FBMrf_MULTILINE 1


/* saved state when executing a regex that contains code blocks.
* These need restoring at the end of the match, or on croak().
* These fields can't be stored in regmatch_info since the latter is
* allocated directly on the stack in regexec_flags(), and they need to
* exist during a croak() after the stack has been unwound */
/* like regmatch_info_aux, but contains extra fields only needed if the
* pattern contains (?{}). If used, is snuck into the second slot in the
* regmatch_state stack at the start of execution */

typedef struct {
regexp *rex;
Expand All @@ -595,12 +593,24 @@ typedef struct {
STRLEN subcoffset; /* saved subcoffset field from rex */
MAGIC *pos_magic; /* pos() magic attached to $_ */
I32 pos; /* the original value of pos() in pos_magic */
bool restored; /* we have already undone the save */
bool direct; /* we are calling the destructor directly */
} regmatch_eval_state;
} regmatch_info_aux_eval;


/* fields that logically live in regmatch_info, but which need cleaning
* up on croak(), and so are instead are snuck into the first slot in
* the regmatch_state stack at the start of execution */

typedef struct {
regmatch_info_aux_eval *info_aux_eval;
} regmatch_info_aux;


/* some basic information about the current match that is created by
* Perl_regexec_flags and then passed to regtry(), regmatch() etc */
* Perl_regexec_flags and then passed to regtry(), regmatch() etc.
* It is allocated as a local var on the stack, so nothing should be
* stored in it that needs preserving or clearing up on croak().
* For that, see the aux_info and aux_info_eval members of the
* regmatch_state union. */

typedef struct {
REGEXP *prog;
Expand All @@ -610,7 +620,8 @@ typedef struct {
SV *sv;
char *ganch;
char *cutpoint;
regmatch_eval_state *eval_state; /* extra saved state for (?{}) */
regmatch_info_aux *info_aux; /* extra fields that need cleanup */
regmatch_info_aux_eval *info_aux_eval; /* extra saved state for (?{}) */
I32 poscache_maxiter; /* how many whilems todo before S-L cache kicks in */
I32 poscache_iter; /* current countdown from _maxiter to zero */
bool intuit; /* re_intuit_start() is the top-level caller */
Expand All @@ -634,6 +645,29 @@ typedef struct regmatch_state {

union {

/* the 'info_aux' and 'info_aux_eval' union members are cuckoos in
* the nest. They aren't saved backtrack state; rather they
* represent one or two extra chunks of data that need allocating
* at the start of a match. These fields would logically live in
* the regmatch_info struct, except that is allocated on the
* C stack, and these fields are all things that require cleanup
* after a croak(), when the stack is lost.
* As a convenience, we just use the first 1 or 2 regmatch_state
* slots to store this info, as we will be allocating a slab of
* these anyway. Otherwise we'd have to malloc and then free them,
* or allocate them on the save stack (where they will get
* realloced if the save stack grows).
* info_aux contains the extra fields that are always needed;
* info_aux_eval contains extra fields that only needed if
* the pattern contains code blocks
* We split them into two separate structs to avoid increasing
* the size of the union.
*/

regmatch_info_aux info_aux;

regmatch_info_aux_eval info_aux_eval;

/* this is a fake union member that matches the first element
* of each member that needs to store positive backtrack
* information */
Expand Down

0 comments on commit bf2039a

Please sign in to comment.