Skip to content

Commit

Permalink
regex engine - replace many attribute arrays with one
Browse files Browse the repository at this point in the history
This replaces PL_regnode_arg_len, PL_regnode_arg_len_varies,
PL_regnode_off_by_arg and PL_regnode_kind with a single PL_regnode_info
array, which is an array of struct regnode_meta, which contains the same
data but as a struct. Since PL_regnode_name is only used in debugging
builds of the regex engine we keep it separate. If we add more debug
properties it might be good to create a PL_regnode_debug_info[] to hold
that data instead.

This means that when we add new properties we do not need to modify any
secondary sources; only the struct definition and regen/regcomp.pl need
to change.
  • Loading branch information
demerphq committed Aug 6, 2022
1 parent cbf5c5b commit 12d173c
Show file tree
Hide file tree
Showing 5 changed files with 1,125 additions and 634 deletions.
5 changes: 1 addition & 4 deletions globvar.sym
Expand Up @@ -57,11 +57,8 @@ PL_phase_names
PL_ppaddr
PL_reg_extflags_name
PL_reg_intflags_name
PL_regnode_arg_len
PL_regnode_arg_len_varies
PL_regnode_kind
PL_regnode_info
PL_regnode_name
PL_regnode_off_by_arg
PL_revision
PL_runops_dbg
PL_runops_std
Expand Down
11 changes: 6 additions & 5 deletions regcomp.h
Expand Up @@ -10,6 +10,7 @@

#if ! defined(PERL_REGCOMP_H_) && ( defined(PERL_CORE) \
|| defined(PERL_EXT_RE_BUILD))

#define PERL_REGCOMP_H_

#include "regcharclass.h"
Expand Down Expand Up @@ -1409,11 +1410,11 @@ typedef enum {
# define GET_REGCLASS_AUX_DATA(a,b,c,d,e,f) get_re_gclass_aux_data(a,b,c,d,e,f)
#endif

#define REGNODE_TYPE(arg) PL_regnode_kind[(arg)]
#define REGNODE_OFF_BY_ARG(node) PL_regnode_of_by_arg[(node)]
#define REGNODE_ARG_LEN(node) PL_regnode_arg_len[(node)]
#define REGNODE_ARG_LEN_VARIES(node) PL_regnode_arg_len_varies[(node)]
#define REGNODE_NAME(node) PL_regnode_name[(node)]
#define REGNODE_TYPE(node) (PL_regnode_info[(node)].type)
#define REGNODE_OFF_BY_ARG(node) (PL_regnode_info[(node)].off_by_arg)
#define REGNODE_ARG_LEN(node) (PL_regnode_info[(node)].arg_len)
#define REGNODE_ARG_LEN_VARIES(node) (PL_regnode_info[(node)].arg_len_varies)
#define REGNODE_NAME(node) (PL_regnode_name[(node)])

#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
#include "reginline.h"
Expand Down
132 changes: 37 additions & 95 deletions regen/regcomp.pl
Expand Up @@ -51,8 +51,8 @@ BEGIN
# id Both integer value for this opcode/state
# optype Both Either 'op' or 'state'
# line_num Both line_num number of the input file for this item.
# type Op Type of node (aka regkind)
# code Op Apparently not used
# type Op Type of node (aka regnode_kind)
# code Op Meta about the node, used to detect variable length nodes
# suffix Op which regnode struct this uses, so if this is '1', it
# uses 'struct regnode_1'
# flags Op S for simple; V for varies
Expand Down Expand Up @@ -272,10 +272,10 @@ sub print_process_EXACTish {
print $out <<EOP,
/* Is 'op', known to be of type EXACT, folding? */
#define isEXACTFish(op) (__ASSERT_(PL_regnode_kind[op] == EXACT) (PL_EXACTFish_bitmask & (1U << (op - EXACT))))
#define isEXACTFish(op) (__ASSERT_(REGNODE_TYPE(op) == EXACT) (PL_EXACTFish_bitmask & (1U << (op - EXACT))))
/* Do only UTF-8 target strings match 'op', known to be of type EXACT? */
#define isEXACT_REQ8(op) (__ASSERT_(PL_regnode_kind[op] == EXACT) (PL_EXACT_REQ8_bitmask & (1U << (op - EXACT))))
#define isEXACT_REQ8(op) (__ASSERT_(REGNODE_TYPE(op) == EXACT) (PL_EXACT_REQ8_bitmask & (1U << (op - EXACT))))
#ifndef DOINIT
EXTCONST U32 PL_EXACTFish_bitmask;
Expand Down Expand Up @@ -456,32 +456,6 @@ sub print_state_def_line
print $fh "\n"; # Blank line separates groups for clarity
}

# Write the PL_regnode_kind[] table to the filehandle $out.
#
# The generated C text is an extern declaration when DOINIT is not defined,
# and otherwise the initialised array holding one "type," entry (followed by
# a comment carrying the entry's name) for every element of @all.
sub print_regkind {
    my ($out)= @_;
    use Data::Dumper;

    # Preamble: everything up to the opening brace of the initialiser.
    print {$out} <<EOP;
/* PL_regnode_kind[] What type of regop or state is this. */
#ifndef DOINIT
EXTCONST U8 PL_regnode_kind[];
#else
EXTCONST U8 PL_regnode_kind[] = {
EOP

    for my $entry (@all) {
        # Debugging aid: dump any entry that lacks a type or a name.
        unless (defined $entry->{type} and defined $entry->{name}) {
            print Dumper($entry);
        }

        my $kind_cell= "$entry->{type},";
        printf {$out} "\t%*s\t/* %*s */\n",
            -1 - $twidth, $kind_cell, -$base_name_width, $entry->{name};

        # Emit a separator once the entry whose id equals the last index of
        # @ops has been written, unless that is also the last index of @all.
        if ($entry->{id} == $#ops and $entry->{id} != $#all) {
            print {$out} "\t/* ------------ States ------------- */\n";
        }
    }

    # Close the initialiser and the DOINIT conditional.
    print {$out} <<EOP;
};
#endif
EOP
}

sub print_typedefs {
my ($out)= @_;
print $out <<EOP;
Expand Down Expand Up @@ -515,51 +489,44 @@ sub print_typedefs {

}

# Write the PL_regnode_arg_len[] table to the filehandle $out.
#
# One row is produced per element of @ops. A row holds
# "EXTRA_SIZE(<typedef>)" when the op has a true 'suffix' field and 0
# otherwise, followed by a comment carrying the op's name.
sub print_regarglen {
    my ($out)= @_;

    # Preamble: everything up to the opening brace of the initialiser.
    print {$out} <<EOP;
/* PL_regnode_arg_len[] - How large is the argument part of the node (in regnodes) */
#ifndef DOINIT
EXTCONST U8 PL_regnode_arg_len[];
#else
EXTCONST U8 PL_regnode_arg_len[] = {
EOP

    for my $op (@ops) {
        my $arg_len= $op->{suffix} ? "EXTRA_SIZE($op->{typedef})" : 0;
        my $cell= "$arg_len,";

        printf {$out} "\t%*s\t/* %*s */\n", -37, $cell, -$rwidth, $op->{name};
    }

    # Close the initialiser and the DOINIT conditional; the trailing blank
    # line is part of the generated output.
    print {$out} <<EOP;
};
#endif /* DOINIT */

EOP
}

sub print_regargvaries {
sub print_regnode_info {
my ($out)= @_;
print $out <<EOP;
/* PL_regnode_arg_len_varies[] - Is the size of the node determined by STR_SZ() macros?
Currently this is a boolean, but in the future it might turn into something
that uses more bits of the value to indicate that a different macro would be
used. */
/* PL_regnode_info[] - Opcode/state names in string form, for debugging */
#ifndef DOINIT
EXTCONST U8 PL_regnode_arg_len_varies[];
EXTCONST struct regnode_meta PL_regnode_info[];
#else
EXTCONST U8 PL_regnode_arg_len_varies[] = {
EXTCONST struct regnode_meta const PL_regnode_info[] = {
EOP
foreach my $node (@ops) {
my $varies= 0;
$varies= 1 if $node->{code}=~"str";
my @fields= qw(type arg_len arg_len_varies off_by_arg);
foreach my $node_idx (0..$#all) {
my $node= $all[$node_idx];
{
my $size= 0;
$size= "EXTRA_SIZE($node->{typedef})" if $node->{suffix};
$node->{arg_len}= $size;

printf $out "\t%*s\t/* %*s */\n", -37, "$varies,", -$rwidth, $node->{name};
}
{
my $varies= 0;
$varies= 1 if $node->{code} and $node->{code}=~"str";
$node->{arg_len_varies}= $varies;
}
$node->{off_by_arg}= $node->{longj} || 0;
print $out " {\n";
print $out " /* #$node_idx $node->{optype} $node->{name} */\n";
foreach my $f_idx (0..$#fields) {
my $field= $fields[$f_idx];
printf $out " .%s = %s", $field, $node->{$field} // 0;
printf $out $f_idx == $#fields ? "\n" : ",\n";
}
print $out " }";
print $out $node_idx==$#all ? "\n" : ",\n";
}

print $out <<EOP;
Expand All @@ -569,31 +536,8 @@ sub print_regargvaries {
EOP
}

# Write the PL_regnode_off_by_arg[] table to the filehandle $out.
#
# One row is produced per element of @ops: the op's 'longj' value (or 0
# when that field is false or missing), followed by a comment carrying the
# op's name.
sub print_reg_off_by_arg {
    my ($out)= @_;

    # Preamble: everything up to the opening brace of the initialiser.
    print {$out} <<EOP;
/* PL_regnode_off_by_arg[] - Which argument holds the offset to the next node */
#ifndef DOINIT
EXTCONST U8 PL_regnode_off_by_arg[];
#else
EXTCONST U8 PL_regnode_off_by_arg[] = {
EOP

    for my $op (@ops) {
        my $off_by_arg= $op->{longj} ? $op->{longj} : 0;
        my @row= ($off_by_arg, -$rwidth, $op->{name});

        printf {$out} "\t%d,\t/* %*s */\n", @row;
    }

    # Close the initialiser and the DOINIT conditional.
    print {$out} <<EOP;
};
#endif
EOP
}

sub print_reg_name {
sub print_regnode_name {
my ($out)= @_;
print $out <<EOP;
Expand All @@ -608,8 +552,6 @@ sub print_reg_name {
my $ofs= 0;
my $sym= "";
foreach my $node (@all) {
my $size= $node->{longj} || 0;

printf $out "\t%*s\t/* $sym%#04x */\n",
-3 - $base_name_width, qq("$node->{name}",), $node->{id} - $ofs;
if ( $node->{id} == $#ops and @ops != @all ) {
Expand Down Expand Up @@ -867,11 +809,11 @@ sub do_perldebguts {
print $out "#if $confine_to_core\n\n";
print_typedefs($out);
print_state_defs($out);
print_regkind($out);
print_regarglen($out);
print_regargvaries($out);
print_reg_off_by_arg($out);
print_reg_name($out);

print_regnode_name($out);
print_regnode_info($out);


print_reg_extflags_name($out);
print_reg_intflags_name($out);
print_process_flags($out);
Expand Down
7 changes: 7 additions & 0 deletions regexp.h
Expand Up @@ -22,6 +22,13 @@

typedef SSize_t regnode_offset;

/* Per-regnode metadata. PL_regnode_info[] is an array of these, and the
 * REGNODE_TYPE(), REGNODE_ARG_LEN(), REGNODE_ARG_LEN_VARIES() and
 * REGNODE_OFF_BY_ARG() macros read the corresponding member for a given
 * node. The table itself is generated by regen/regcomp.pl. */
struct regnode_meta {
U8 type;            /* what type of regop or state this is */
U8 arg_len;         /* size of the argument part of the node, in regnodes */
U8 arg_len_varies;  /* is the node's size determined by the STR_SZ() macros?
                     * (currently used as a boolean) */
U8 off_by_arg;      /* which argument holds the offset to the next node */
};

struct regnode {
U8 flags;
U8 type;
Expand Down

0 comments on commit 12d173c

Please sign in to comment.