Skip to content

Commit

Permalink
Re: [PATCH] Better version of the Aho-Corasick patch and lots of benc…
Browse files Browse the repository at this point in the history
…hmarks.

Message-ID: <9b18b3110606081556t779de698r82f361d82a05fbc8@mail.gmail.com>

(with tweaks)

p4raw-id: //depot/perl@28373
  • Loading branch information
demerphq authored and rgs committed Jun 9, 2006
1 parent b23a565 commit 07be1b8
Show file tree
Hide file tree
Showing 10 changed files with 1,017 additions and 395 deletions.
8 changes: 7 additions & 1 deletion embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -1295,7 +1295,7 @@ Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op
Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp|U32 depth
Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd
Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth
Es |U8 |regtail_study |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth
Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth
EsRn |char* |regwhite |NN char *p|NN const char *e
Es |char* |nextchar |NN struct RExC_state_t *state
Es |void |scan_commit |NN const struct RExC_state_t* state|NN struct scan_data_t *data
Expand All @@ -1317,6 +1317,8 @@ Es |void |checkposixcc |NN struct RExC_state_t* state
Es |I32 |make_trie |NN struct RExC_state_t* state|NN regnode *startbranch \
|NN regnode *first|NN regnode *last|NN regnode *tail \
|U32 flags|U32 depth
Es |void |make_trie_failtable |NN struct RExC_state_t* state \
|NN regnode *source|NN regnode *node|U32 depth
# ifdef DEBUGGING
Es |const regnode*|dumpuntil|NN const regexp *r|NN const regnode *start \
|NN const regnode *node \
Expand All @@ -1325,6 +1327,7 @@ Es |void |put_byte |NN SV* sv|int c
Es |void |dump_trie |NN const struct _reg_trie_data *trie|U32 depth
Es |void |dump_trie_interim_list|NN const struct _reg_trie_data *trie|U32 next_alloc|U32 depth
Es |void |dump_trie_interim_table|NN const struct _reg_trie_data *trie|U32 next_alloc|U32 depth
Es |U8 |regtail_study |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth
# endif
#endif

Expand All @@ -1341,6 +1344,9 @@ ERsn |U8* |reghopmaybe3 |NN U8 *pos|I32 off|NN const U8 *lim
ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN const char *strend|NULLOK const regmatch_info *reginfo
Es |void |to_utf8_substr |NN regexp * prog
Es |void |to_byte_substr |NN regexp * prog
# ifdef DEBUGGING
Es |void |dump_exec_pos |NN const char *locinput|NN const regnode *scan|const bool do_utf8
# endif
#endif

#if defined(PERL_IN_DUMP_C) || defined(PERL_DECL_PROT)
Expand Down
18 changes: 16 additions & 2 deletions embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -1304,7 +1304,7 @@
#define regpiece S_regpiece
#define reginsert S_reginsert
#define regtail S_regtail
#define regtail_study S_regtail_study
#define join_exact S_join_exact
#define regwhite S_regwhite
#define nextchar S_nextchar
#define scan_commit S_scan_commit
Expand All @@ -1324,6 +1324,7 @@
#define regpposixcc S_regpposixcc
#define checkposixcc S_checkposixcc
#define make_trie S_make_trie
#define make_trie_failtable S_make_trie_failtable
#endif
# ifdef DEBUGGING
#if defined(PERL_CORE) || defined(PERL_EXT)
Expand All @@ -1332,6 +1333,7 @@
#define dump_trie S_dump_trie
#define dump_trie_interim_list S_dump_trie_interim_list
#define dump_trie_interim_table S_dump_trie_interim_table
#define regtail_study S_regtail_study
#endif
# endif
#endif
Expand All @@ -1349,6 +1351,11 @@
#define to_utf8_substr S_to_utf8_substr
#define to_byte_substr S_to_byte_substr
#endif
# ifdef DEBUGGING
#if defined(PERL_CORE) || defined(PERL_EXT)
#define dump_exec_pos S_dump_exec_pos
#endif
# endif
#endif
#if defined(PERL_IN_DUMP_C) || defined(PERL_DECL_PROT)
#ifdef PERL_CORE
Expand Down Expand Up @@ -3472,7 +3479,7 @@
#define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c)
#define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c)
#define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d)
#define regtail_study(a,b,c,d) S_regtail_study(aTHX_ a,b,c,d)
#define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f)
#define regwhite S_regwhite
#define nextchar(a) S_nextchar(aTHX_ a)
#define scan_commit(a,b) S_scan_commit(aTHX_ a,b)
Expand All @@ -3491,6 +3498,7 @@
#define regpposixcc(a,b) S_regpposixcc(aTHX_ a,b)
#define checkposixcc(a) S_checkposixcc(aTHX_ a)
#define make_trie(a,b,c,d,e,f,g) S_make_trie(aTHX_ a,b,c,d,e,f,g)
#define make_trie_failtable(a,b,c,d) S_make_trie_failtable(aTHX_ a,b,c,d)
#endif
# ifdef DEBUGGING
#if defined(PERL_CORE) || defined(PERL_EXT)
Expand All @@ -3499,6 +3507,7 @@
#define dump_trie(a,b) S_dump_trie(aTHX_ a,b)
#define dump_trie_interim_list(a,b,c) S_dump_trie_interim_list(aTHX_ a,b,c)
#define dump_trie_interim_table(a,b,c) S_dump_trie_interim_table(aTHX_ a,b,c)
#define regtail_study(a,b,c,d) S_regtail_study(aTHX_ a,b,c,d)
#endif
# endif
#endif
Expand All @@ -3516,6 +3525,11 @@
#define to_utf8_substr(a) S_to_utf8_substr(aTHX_ a)
#define to_byte_substr(a) S_to_byte_substr(aTHX_ a)
#endif
# ifdef DEBUGGING
#if defined(PERL_CORE) || defined(PERL_EXT)
#define dump_exec_pos(a,b,c) S_dump_exec_pos(aTHX_ a,b,c)
#endif
# endif
#endif
#if defined(PERL_IN_DUMP_C) || defined(PERL_DECL_PROT)
#ifdef PERL_CORE
Expand Down
9 changes: 8 additions & 1 deletion ext/re/t/re.t
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ BEGIN {

use strict;

use Test::More tests => 13;
use Test::More tests => 14;
require_ok( 're' );

# setcolor
Expand Down Expand Up @@ -58,6 +58,13 @@ re->unimport('taint');
ok( !( $^H & 0x00100000 ), 'unimport should clear bits in $^H when requested' );
re->unimport('eval');
ok( !( $^H & 0x00200000 ), '... and again' );
my $reg=qr/(foo|bar|baz|blah)/;
close STDERR;
eval"use re Debug=>'ALL'";
my $ok='foo'=~/$reg/;
eval"no re Debug=>'ALL'";
ok( $ok, 'No segv!' );


package Term::Cap;

Expand Down
8 changes: 6 additions & 2 deletions ext/re/t/regop.pl
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
use re Debug=>qw(COMPILE EXECUTE);
use re Debug=>qw(COMPILE EXECUTE OFFSETS);
my @tests=(
XY => 'X(A|[B]Q||C|D)Y' ,
foobar => '[f][o][o][b][a][r]',
x => '.[XY].',
'ABCD' => '(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)',
'D:\\dev/perl/ver/28321_/perl.exe'=>
'/(\\.COM|\\.EXE|\\.BAT|\\.CMD|\\.VBS|\\.VBE|\\.JS|\\.JSE|\\.WSF|\\.WSH|\\.pyo|\\.pyc|\\.pyw|\\.py)$/i',
'q'=>'[q]',
);
while (@tests) {
my ($str,$pat)=splice @tests,0,2;
warn "\n";
$pat="/$pat/" if substr($pat,0,1) ne '/';
# string eval to get the free regex message in the right place.
eval qq[
warn "$str"=~/$pat/ ? "%MATCHED%" : "%FAILED%","\n";
warn "$str"=~$pat ? "%MATCHED%" : "%FAILED%","\n";
];
die $@ if $@;
}
113 changes: 97 additions & 16 deletions ext/re/t/regop.t
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,45 @@ BEGIN {

use strict;
require "./test.pl";
our $NUM_SECTS;
chomp(my @strs= grep { !/^\s*\#/ } <DATA>);
my $out = runperl(progfile => "../ext/re/t/regop.pl", stderr => 1 );
my @tests = grep { /\S/ } split /(?=Compiling REx)/, $out;
# on debug builds we get an EXECUTING... message in there at the top
shift @tests
if $tests[0] =~ /EXECUTING.../;

chomp(my @strs=grep { !/^\s*\#/ } <DATA>);
my $out = runperl(progfile => "../ext/re/t/regop.pl", stderr => 1);
my @tests = grep { /\S/ && !/EXECUTING/ } split /(?=Compiling REx)/,$out;
plan( @tests + 2 + ( @strs - grep { !$_ or /^---/ } @strs ));

plan(2 + (@strs - grep { !$_ or /^---/ } @strs) + @tests);
is( scalar @tests, $NUM_SECTS,
"Expecting output for $NUM_SECTS patterns" );
ok( defined $out, 'regop.pl returned something defined' );

my $numtests=4;
is(scalar @tests, $numtests, "Expecting output for $numtests patterns");
ok(defined $out,'regop.pl');
$out||="";
my $test=1;
foreach my $testout (@tests) {
my ($pattern)=$testout=~/Compiling REx "([^"]+)"/;
ok($pattern, "Pattern found for test ".($test++));
$out ||= "";
my $test= 1;
foreach my $testout ( @tests ) {
my ( $pattern )= $testout=~/Compiling REx "([^"]+)"/;
ok( $pattern, "Pattern for test " . ($test++) );
while (@strs) {
my $str=shift @strs;
last if !$str or $str=~/^---/;
next if $str=~/^\s*#/;
ok($testout=~/\Q$str\E/,"$str: /$pattern/");
local $_= shift @strs;
last if !$_
or /^---/;
next if /^\s*#/;
s/^\s+//;
s/\s+$//;
ok( $testout=~/\Q$_\E/, "$_: /$pattern/" );
}
}

# The format below is simple. Each line is an exact
# string that must be found in the output.
# Lines starting the # are comments.
# Lines starting with --- are seperators indicating
# that the tests for this result set are finished.
# If you add a test make sure you update $NUM_SECTS
# the commented output is just for legacy/debugging purposes
BEGIN{ $NUM_SECTS= 6 }

__END__
#Compiling REx "X(A|[B]Q||C|D)Y"
#size 34
Expand Down Expand Up @@ -146,3 +162,68 @@ Start-Class:A-EGP
only one match : #6 <ABCD>
Start:4
minlen 4
---
#Compiling REx "(\.COM|\.EXE|\.BAT|\.CMD|\.VBS|\.VBE|\.JS|\.JSE|\.WSF|\.WSH|\.pyo|\.pyc|\.pyw|\.py)$"
#size 48 nodes first at 3
#first at 3
#rarest char
# at 0
# 1: OPEN1(3)
# 3: EXACTF <.>(5)
# 5: TRIE-EXACTF(45)
# [Start:2 Words:14 Chars:54 Unique:18 States:29 Minlen:2 Maxlen:3 Start-Class:BCEJPVWbcejpvw]
# <.COM>
# ... yada yada ... (dmq)
# <.py>
# 45: CLOSE1(47)
# 47: EOL(48)
# 48: END(0)
#floating ""$ at 3..4 (checking floating) stclass "EXACTF <.>" minlen 3
#Offsets: [48]
# 1:1[1] 3:2[1] 5:2[81] 45:83[1] 47:84[1] 48:85[0]
#Guessing start of match, REx "(\.COM|\.EXE|\.BAT|\.CMD|\.VBS|\.VBE|\.JS|\.JSE|\.WSF|\.WSH|..." against "D:dev/perl/ver/28321_/perl.exe"...
#Found floating substr ""$ at offset 30...
#Starting position does not contradict /^/m...
#Does not contradict STCLASS...
#Guessed: match at offset 26
#Matching REx "(\.COM|\.EXE|\.BAT|\.CMD|\.VBS|\.VBE|\.JS|\.JSE|\.WSF|\.WSH|\.pyo|\.pyc|\.pyw|\.py)$..." against ".exe"
#Matching stclass "EXACTF <.>" against ".exe"
# Setting an EVAL scope, savestack=140
# 26 <21_/perl> <.exe> | 1: OPEN1
# 26 <21_/perl> <.exe> | 3: EXACTF <.>
# 27 <21_/perl.> <exe> | 5: TRIE-EXACTF
# only one match : #2 <.EXE>
# 30 <21_/perl.exe> <> | 45: CLOSE1
# 30 <21_/perl.exe> <> | 47: EOL
# 30 <21_/perl.exe> <> | 48: END
#Match successful!
#POP STATE(1)
#%MATCHED%
#Freeing REx: "(\\.COM|\\.EXE|\\.BAT|\\.CMD|\\.VBS|\\.VBE|\\.JS|\\.JSE|\\."......
%MATCHED%
floating ""$ at 3..4 (checking floating)
1:1[1] 3:2[1] 5:2[81] 45:83[1] 47:84[1] 48:85[0]
stclass "EXACTF <.>" minlen 3
Found floating substr ""$ at offset 30...
Does not contradict STCLASS...
Guessed: match at offset 26
Matching stclass "EXACTF <.>" against ".exe"
---
#Compiling REx "[q]"
#size 12 nodes Got 100 bytes for offset annotations.
#first at 1
#Final program:
# 1: EXACT <q>(3)
# 3: END(0)
#anchored "q" at 0 (checking anchored isall) minlen 1
#Offsets: [12]
# 1:1[3] 3:4[0]
#Guessing start of match, REx "[q]" against "q"...
#Found anchored substr "q" at offset 0...
#Guessed: match at offset 0
#%MATCHED%
#Freeing REx: "[q]"
Got 100 bytes for offset annotations.
Offsets: [12]
1:1[3] 3:4[0]
%MATCHED%
18 changes: 17 additions & 1 deletion proto.h
Original file line number Diff line number Diff line change
Expand Up @@ -3549,7 +3549,7 @@ STATIC void S_regtail(pTHX_ struct RExC_state_t *state, regnode *p, const regnod
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3);

STATIC U8 S_regtail_study(pTHX_ struct RExC_state_t *state, regnode *p, const regnode *val, U32 depth)
STATIC U32 S_join_exact(pTHX_ struct RExC_state_t *state, regnode *scan, I32 *min, U32 flags, regnode *val, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3);
Expand Down Expand Up @@ -3620,6 +3620,11 @@ STATIC I32 S_make_trie(pTHX_ struct RExC_state_t* state, regnode *startbranch, r
__attribute__nonnull__(pTHX_4)
__attribute__nonnull__(pTHX_5);

STATIC void S_make_trie_failtable(pTHX_ struct RExC_state_t* state, regnode *source, regnode *node, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3);

# ifdef DEBUGGING
STATIC const regnode* S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const regnode *last, SV* sv, I32 l)
__attribute__nonnull__(pTHX_1)
Expand All @@ -3639,6 +3644,11 @@ STATIC void S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U3
STATIC void S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
__attribute__nonnull__(pTHX_1);

STATIC U8 S_regtail_study(pTHX_ struct RExC_state_t *state, regnode *p, const regnode *val, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3);

# endif
#endif

Expand Down Expand Up @@ -3690,6 +3700,12 @@ STATIC void S_to_utf8_substr(pTHX_ regexp * prog)
STATIC void S_to_byte_substr(pTHX_ regexp * prog)
__attribute__nonnull__(pTHX_1);

# ifdef DEBUGGING
STATIC void S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2);

# endif
#endif

#if defined(PERL_IN_DUMP_C) || defined(PERL_DECL_PROT)
Expand Down
Loading

0 comments on commit 07be1b8

Please sign in to comment.