pat codeblocks: add size to struct reg_code_blocks

Split the 'count' field of reg_code_blocks structures into separate 'count' and 'size' fields to make the code less fragile; and as an intended side-effect, fix GH #16627. Background: When a pattern includes embedded perl code, such as /(?{ CODE })/, then at compile-time the op trees associated with each of those code blocks are stored within the compiled regex, in a reg_code_blocks structure. This structure contains some basic info, plus a pointer to an array of reg_code_block structures, each of which contains a pointer to the optree for that code block, plus string offsets to where the (?{..}) or similar expression starts and ends within the pattern string. For a runtime pattern, perl tries to reuse any original compiled code blocks rather than recompiling them, to maintain correct closure behaviour. So for example, in the following: my $x = 1; { my $x = 2; $r = qr/(??{$x})/ } my $y = 3; my $s = '(??{$y})'; my $pat = qr/A (??{$x}) B $r C $s/x; at perl compile time, the two '$x' code blocks are compiled, and their optrees stored. At runtime, when the $pat pattern is compiled, the third code block, '$y', is compiled, and the two earlier optrees are retrieved. A new three-element 'struct reg_code_blocks' array is malloc()ed, and the pointers to the two old, and one new, optrees are stored in it. Overall, $pat has the same effect as qr/A1B2C3/. The assembly of this reg_code_blocks array is mostly performed by S_concat_pat() and S_compile_runtime_code(). It is done incrementally, since the total number of code blocks isn't known in advance. Prior to this commit, the array was often realloced() and grown one element at at a time, as each new run-time code block was discovered, with a corresponding pRExC_state->code_blocks->count++. This count field served twin purposes: it indicated both how many code blocks had been found and stored so far, and the malloc()ed size of the array. But some parts of the regex compiler allocate more than one slot at a time, and so the two meanings of the 'count' field temporarily diverge. This became noticeable when S_concat_pat() recursed to interpolate the contents of an array, such as qr/$a$b@c/, where interpolating $a, $b was done iteratively at the top level, then it recursed to process each element of @c. S_concat_pat() had a local var, 'int n', which counted how many code blocks had been found so far, and this value sometimes represented the difference between the two meanings of the 'count' field. However when it recursed, n started from zero again and things got out of whack, which led to GH #16627. The bug in that ticket can be reduced to: my @x = ( qr/(?{A})/ ); qr/(?{B})@x/; Here the B code block is stored in pRExC_state->code_blocks->cb[0], but then S_concat_pat recurses, n is reset to 0, and the A code block is also stored into slot 0. Then things would start to crash. The quick and dirty fix would be to share n between recursive calls to S_concat_pat(), by passing a pointer to it. Instead, this commit takes the approach of adding a 'size' field to pRExC_state->code_blocks, so that ->count now only indicates the current number of code blocks stored (replacing the local var n) while ->size indicates the current number of slots malloc()ed. This makes the code more conventional and simpler to understand, and allows the realloc() to pre-allocate rather than incrementing the array size by 1 each time. By removing the fragile double meaning of the 'count' field, it should make any future bugs easier to diagnose, at the cost of this initial commit being more complex.
Perl · May 8, 2024 · b74d82c · b74d82c
1 parent 1e43922
commit b74d82c
Show file tree

Hide file tree

Showing 3 changed files with 177 additions and 41 deletions.
diff --git a/regcomp.c b/regcomp.c
@@ -502,19 +502,42 @@ S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
     Safefree(cbs);
 }
 
+/* Ensure that there are at least 'required' spare code block slots
+ * available, using a simple doubling */
+
+static void
+S_grow_code_blocks(pTHX_ struct reg_code_blocks *cbs, int required)
+{
+    required += cbs->count;
+    if (required < 1)
+        return;
+
+    if (required < cbs->size)
+        return;
+
+    int new_size  = cbs->size;
+    if (new_size < 1)
+        new_size = 1;
+
+    while (new_size < required)
+        new_size *= 2;
+
+    Renew(cbs->cb, new_size, struct reg_code_block);
+    cbs->size = new_size;
+}
+
 
 static struct reg_code_blocks *
 S_alloc_code_blocks(pTHX_  int ncode)
 {
      struct reg_code_blocks *cbs;
     Newx(cbs, 1, struct reg_code_blocks);
-    cbs->count = ncode;
+    cbs->size = 0;
+    cbs->count = 0;
+    cbs->cb = NULL;
     cbs->refcnt = 1;
     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
-    if (ncode)
-        Newx(cbs->cb, ncode, struct reg_code_block);
-    else
-        cbs->cb = NULL;
+    S_grow_code_blocks(aTHX_ cbs, ncode);
     return cbs;
 }
 
@@ -528,7 +551,7 @@ S_alloc_code_blocks(pTHX_  int ncode)
 
 static void
 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
-                    char **pat_p, STRLEN *plen_p, int num_code_blocks)
+                    char **pat_p, STRLEN *plen_p)
 {
     U8 *const src = (U8*)*pat_p;
     U8 *dst, *d;
@@ -540,14 +563,19 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
     DEBUG_PARSE_r(Perl_re_printf( aTHX_
         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
 
+    int nblocks = 0;
+    if (pRExC_state->code_blocks)
+        nblocks = pRExC_state->code_blocks->count;
+
+
     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
     Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
     d = dst;
 
     while (s < *plen_p) {
         append_utf8_from_native_byte(src[s], &d);
 
-        if (n < num_code_blocks) {
+        if (n < nblocks) {
             assert(pRExC_state->code_blocks);
             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
@@ -591,7 +619,6 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
                 OP *oplist, bool *recompile_p, SV *delim)
 {
     SV **svp;
-    int n = 0;
     bool use_delim = FALSE;
     bool alloced = FALSE;
 
@@ -680,11 +707,15 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
             if (oplist->op_type == OP_NULL
                 && (oplist->op_flags & OPf_SPECIAL))
             {
-                assert(n < pRExC_state->code_blocks->count);
-                pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
-                pRExC_state->code_blocks->cb[n].block = oplist;
-                pRExC_state->code_blocks->cb[n].src_regex = NULL;
-                n++;
+                /* process next literal code block */
+                struct reg_code_blocks *cbs =  pRExC_state->code_blocks;
+                S_grow_code_blocks(aTHX_ cbs, 1);
+                int n = cbs->count;
+
+                cbs->cb[n].start = pat ? SvCUR(pat) : 0;
+                cbs->cb[n].block = oplist;
+                cbs->cb[n].src_regex = NULL;
+                cbs->count++;
                 code = 1;
                 oplist = OpSIBLING(oplist); /* skip CONST */
                 assert(oplist);
@@ -711,11 +742,24 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
         {
             sv_setsv(pat, sv);
-            /* overloading involved: all bets are off over literal
-             * code. Pretend we haven't seen it */
-            if (n)
-                pRExC_state->code_blocks->count -= n;
-            n = 0;
+            /* pat now represents the return value of overloaded
+             * concatenation of of two values:
+             *   1) all the components previously concatenated;
+             *   2) the current pattern element.
+             * Since the return value can be anything, any previously
+             * found code-blocks (even literal ones) should be discarded.
+             * For example, in:
+             *      qr/(?{A})$obj/
+             * the overloaded concatenation of '(?{A})' and $obj
+             * could return anything, and not necessarily the literal
+             * code block. So throw away any previously found code blocks,
+             * and so any code-block bits in the returned string will be
+             * treated as run-time.
+             */
+            struct reg_code_blocks *cbs =  pRExC_state->code_blocks;
+            if (cbs) {
+                cbs->count = 0;
+            }
         }
         else {
             /* ... or failing that, try "" overload */
@@ -741,7 +785,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
                 char *dst = SvPV_force_nomg(pat, dlen);
                 orig_patlen = dlen;
                 if (SvUTF8(msv) && !SvUTF8(pat)) {
-                    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
+                    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen);
                     sv_setpvn(pat, dst, dlen);
                     SvUTF8_on(pat);
                 }
@@ -763,8 +807,11 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
                 }
             }
 
-            if (code)
-                pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
+            /* was this pattern element a literal code block? */
+            if (code) {
+                struct reg_code_blocks *cbs =  pRExC_state->code_blocks;
+                cbs->cb[cbs->count - 1].end = SvCUR(pat) - 1;
+            }
         }
 
         /* extract any code blocks within any embedded qr//'s */
@@ -780,13 +827,10 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
                  * qr// may not have changed, but it may be a
                  * different closure than last time */
                 *recompile_p = 1;
-                if (pRExC_state->code_blocks) {
-                    int new_count = pRExC_state->code_blocks->count
-                            + ri->code_blocks->count;
-                    Renew(pRExC_state->code_blocks->cb,
-                            new_count, struct reg_code_block);
-                    pRExC_state->code_blocks->count = new_count;
-                }
+
+                if (pRExC_state->code_blocks)
+                    S_grow_code_blocks(aTHX_ pRExC_state->code_blocks,
+                                        ri->code_blocks->count);
                 else
                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
                                                     ri->code_blocks->count);
@@ -795,21 +839,22 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
                     struct reg_code_block *src, *dst;
                     STRLEN offset =  orig_patlen
                         + ReANY((REGEXP *)rx)->pre_prefix;
-                    assert(n < pRExC_state->code_blocks->count);
                     src = &ri->code_blocks->cb[i];
-                    dst = &pRExC_state->code_blocks->cb[n];
+                    dst = &pRExC_state->code_blocks->cb[
+                                pRExC_state->code_blocks->count++];
                     dst->start	    = src->start + offset;
                     dst->end	    = src->end   + offset;
                     dst->block	    = src->block;
                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
                                             src->src_regex
                                                 ? src->src_regex
                                                 : (REGEXP*)rx);
-                    n++;
                 }
             }
         }
-    }
+
+    } /* for (patternp) */
+
     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
     if (alloced)
         SvSETMAGIC(pat);
@@ -1059,7 +1104,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
                                     : src->src_regex;
             dst++;
         }
-        r1->code_blocks->count += r2c;
+        r1->code_blocks->count  = r1->code_blocks->size = r1c + r2c;
         Safefree(r1->code_blocks->cb);
         r1->code_blocks->cb = new_block;
     }
@@ -1478,7 +1523,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
     /* set expr to the first arg op */
 
-    if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
+    /* ->size > 0 if we alloced above with ncode > 0 */
+    if (pRExC_state->code_blocks && pRExC_state->code_blocks->size
          && expr->op_type != OP_CONST)
     {
             expr = cLISTOPx(expr)->op_first;
@@ -1602,8 +1648,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
             /* whoops, we have a non-utf8 pattern, whilst run-time code
              * got compiled as utf8. Try again with a utf8 pattern */
-            S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
-                pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
+            S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen);
             goto redo_parse;
         }
     }
@@ -1750,8 +1795,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                             variant_under_utf8_count((U8 *) exp, (U8 *) exp
                                                 + RExC_latest_warn_offset);
             }
-            S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
-            pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
+            S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen);
             DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
         }
         else {
@@ -13669,14 +13713,17 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
     if (ri->code_blocks) {
         int n;
         Newx(reti->code_blocks, 1, struct reg_code_blocks);
+
         Newx(reti->code_blocks->cb, ri->code_blocks->count,
                     struct reg_code_block);
+        reti->code_blocks->size = ri->code_blocks->count;
+
         Copy(ri->code_blocks->cb, reti->code_blocks->cb,
              ri->code_blocks->count, struct reg_code_block);
         for (n = 0; n < ri->code_blocks->count; n++)
              reti->code_blocks->cb[n].src_regex = (REGEXP*)
                     sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
-        reti->code_blocks->count = ri->code_blocks->count;
+        reti->code_blocks->count  = ri->code_blocks->count;
         reti->code_blocks->refcnt = 1;
     }
     else

diff --git a/regexp.h b/regexp.h
@@ -113,7 +113,8 @@ struct reg_code_block {
 
 struct reg_code_blocks {
     int refcnt; /* we may be pointed to from a regex and from the savestack */
-    int  count;    /* how many code blocks */
+    int  count; /* how many code block slots currently in use */
+    int  size;  /* how many slots allocated in code_block[]  */
     struct reg_code_block *cb; /* array of reg_code_block's */
 };
 

diff --git a/t/re/pat.t b/t/re/pat.t
@@ -28,7 +28,7 @@ skip_all_without_unicode_tables();
 my $has_locales = locales_enabled('LC_CTYPE');
 my $utf8_locale = find_utf8_ctype_locale();
 
-plan tests => 1265;  # Update this when adding/deleting tests.
+plan tests => 1281;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -2512,6 +2512,94 @@ SKIP:
         ok($str =~ s/$copy/PQR/, 'replaced $copy with PQR');
         is($str, "PQR", 'final string should be PQR');
     }
+
+
+    # Various tests for regexes with code blocks interpolated from an
+    # array, related to fixing GH #16627.
+    #
+    # Prior to the fix, some of these tests would wrongly need 'use re
+    # "eval"', or would assert fail, or crash, or produce unpredictable
+    # results.
+
+    {
+        local $" = '-'; # separator when interpolating arrays
+
+        my $A = 'A';
+        my $B = 'B';
+        my $C = 'C';
+        my $D = 'D';
+        my $E = 'E';
+
+        my $a = 'aa';
+        my $b = 'bb';
+        my $c = 'cc';
+        my $d = 'dd';
+        my $e = 'ee';
+
+        my @r = (qr/(??{$B})/);
+
+        # array with single element, usually following a literal code block
+
+        like "B",   qr/^@r$/,                   "code in array 1";
+        like "AB" , qr/^(??{$A})@r$/,           "code in array 2";
+        like "XAB", qr/^X(??{$A})@r$/,          "code in array 3";
+        like "XABC", qr/^X(??{$A})@r(??{$C})$/, "code in array 4";
+
+        {
+            my $B = 'Q';
+            push  @r, qr/(??{$B})/;
+        }
+
+        # array with two elements, usually following a literal code block
+        #
+        like "B-Q",    qr/^@r$/,                  "code in array 5";
+        like "AB-Q",   qr/^(??{$A})@r$/,          "code in array 6";
+        like "XAB-Q",  qr/^X(??{$A})@r$/,         "code in array 7";
+        like "XAB-QC", qr/^X(??{$A})@r(??{$C})$/, "code in array 8";
+
+        # Simple overload package which returns a lower-cased version
+        # of a concatenated string, with a '=' used to join
+
+        package LcConcat {
+            use overload
+                '""' => sub { ${$_[0]} },
+                '.' =>  sub {
+                                my ($x, $y) = @_[ $_[2] ? (1,0) : (0,1) ];
+                                my ($xx, $yy) = ("$x", "$y");
+                                lc("$xx=$yy");
+                            }
+                ;
+        }
+
+        my $r = qr/(??{$E})/;
+        bless $r, 'LcConcat';
+
+        # Overloading concatenation converts literal compile-time code
+        # blocks into run-time recompiled affairs, so need to enable eval
+        use re 'eval';
+
+        # First, use an overloaded scalar to establish baseline behaviour.
+        # Note that the overloaded concatenation converts everything in
+        # the pattern to its left to lowercase, so (??{$B}) becomes
+        # (??{$b}) etc.
+
+        like "=ee",     qr/^$r$/,                  "code in array 9";
+        like "aa=ee",   qr/^(??{$A})$r$/,          "code in array 10";
+        like "xaa=ee",  qr/^X(??{$A})$r$/,         "code in array 11";
+        like "xaa=eeC", qr/^X(??{$A})$r(??{$C})$/, "code in array 12";
+
+        # Then add an overloaded scalar to an array to see if it's
+        # still handled ok by the array interpolation code
+
+        push @r, $r;
+
+        like "bb-bb-=ee",     qr/^@r$/,                  "code in array 13";
+        like "aabb-bb-=ee",   qr/^(??{$A})@r$/,          "code in array 14";
+        like "xaabb-bb-=ee",  qr/^X(??{$A})@r$/,         "code in array 15";
+        like "xaabb-bb-=eeC", qr/^X(??{$A})@r(??{$C})$/, "code in array 16";
+
+    }
+
 } # End of sub run_tests
 
 1;