From e9061b90b1c798fcda9b19e709d2836e210da898 Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Wed, 2 Jun 2021 19:09:24 +0000 Subject: [PATCH 01/11] Tests for regexs that are could be used to implement rtrim(). We might want to optimise the implementation of these - if so, test that nothing changes. Brute force enumeration of a lot of variant regular expressions, flags, matches and substitutions. --- MANIFEST | 1 + t/re/rtrim.t | 411 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 412 insertions(+) create mode 100644 t/re/rtrim.t diff --git a/MANIFEST b/MANIFEST index e15db8ea8a32..a55ae8a2683c 100644 --- a/MANIFEST +++ b/MANIFEST @@ -6098,6 +6098,7 @@ t/re/regexp_trielist.t See if regular expressions work with trie optimisation t/re/regexp_unicode_prop.t See if unicode properties work in regular expressions as expected t/re/regexp_unicode_prop_thr.t See if unicode properties work in regular expressions as expected under threads t/re/rt122747.t Test rt122747 assert faile (requires DEBUGGING) +t/re/rtrim.t Tests for potential rtrim optimisations t/re/rxcode.t See if /(?{ code })/ works t/re/script_run.t See if script runs works t/re/speed.t See if optimisations are keeping things fast diff --git a/t/re/rtrim.t b/t/re/rtrim.t new file mode 100644 index 000000000000..7c676283a885 --- /dev/null +++ b/t/re/rtrim.t @@ -0,0 +1,411 @@ +#!./perl + +use strict; +use warnings; + +# This tests that regexs used for trimming whitespace from end of string +# continue to work consistently when we optimise the regex engine. + +BEGIN { + chdir 't' if -d 't'; + require './test.pl'; + set_up_inc('../lib'); +} + +my $nbs_u = "\xA0\x{100}"; +chop $nbs_u; +my $nl_u = "\n\x{100}"; +chop $nl_u; +my $empty_u = "\x{100}"; +chop $empty_u; + +my @tests = ( + ['Hello world'], + [' Hello world'], + [' Hello world ', ' Hello world'], + ['Hello world ', 'Hello world'], + + ["Hello world\n", 'Hello world'], + [" Hello world\n", ' Hello world'], + ["Hello world \n", 'Hello world'], + [" Hello world \n", ' Hello world'], + + ["Yarrrr\r", 'Yarrrr'], + ["NBS8\xA0", 'NBS8'], + ["NBSU$nbs_u", 'NBSU'], + ["\n", ""], + ["\r\n\t\f ", ""], + ["!\t", "!"], + + ["EN\x{2002}Space\x{2002}", "EN\x{2002}Space"], + ["\x{2002}\x{2003}Spaces\x{2004}\x{2005}", "\x{2002}\x{2003}Spaces"], + ["\x{1680}", ""], + [$nl_u, ""], + [$empty_u], +); + +plan(80 * @tests); + +# Yes this is recursive copy-paste-edit, but I'm not confident that trying to +# generate the code then string eval it is much better. Particularly given the +# corner cases. (+ sometimes doesn't match, * always will, and without //u) +for (@tests) { + my ($input, $want) = @$_; + my $pretty = $input; + my $pretty_want = $want; + for ($pretty, $pretty_want) { + next + unless defined $_; + s/\n/\\n/g; + s/\f/\\f/g; + s/\r/\\r/g; + s/\t/\\t/g; + # Normally such complexity would have no place *in* a test for the regex + # engine, but as this test is testing optimisations it seems acceptable. + s/([^[:ascii:]])/sprintf "\\x{%X}", ord $1/ge; + } + + # m// s/// s///r + # \s or [[:space:]] + # + or * + # \z or $ + # // or //u + + if (defined $want) { + { + ok($input =~ /\s+\z/u, "qq<$pretty> =~ /\\s+\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//u, 1, "qq<$pretty> =~ s/\\s+\\z//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//ur, $want, "qq<$pretty> =~ s/\\s+\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s+\\z//ur unchanged"); + } + + { + ok($input =~ /[[:space:]]+\z/u, "qq<$pretty> =~ /[[:space:]]+\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//u, 1, "qq<$pretty> =~ s/[[:space:]]+\\z//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//ur, $want, "qq<$pretty> =~ s/[[:space:]]+\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//ur unchanged"); + } + + { + ok($input =~ /\s*\z/u, "qq<$pretty> =~ /\\s*\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//u, 1, "qq<$pretty> =~ s/\\s*\\z//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//ur, $want, "qq<$pretty> =~ s/\\s*\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s*\\z//ur unchanged"); + } + + { + ok($input =~ /[[:space:]]*\z/u, "qq<$pretty> =~ /[[:space:]]*\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//u, 1, "qq<$pretty> =~ s/[[:space:]]*\\z//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//ur, $want, "qq<$pretty> =~ s/[[:space:]]*\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//ur unchanged"); + } + + { + ok($input =~ /\s+$/u, "qq<$pretty> =~ /\\s+\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//u, 1, "qq<$pretty> =~ s/\\s+\$//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//ur, $want, "qq<$pretty> =~ s/\\s+\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s+\$//ur unchanged"); + } + + { + ok($input =~ /[[:space:]]+$/u, "qq<$pretty> =~ /[[:space:]]+\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//u, 1, "qq<$pretty> =~ s/[[:space:]]+\$//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//ur, $want, "qq<$pretty> =~ s/[[:space:]]+\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\$//ur unchanged"); + } + + { + ok($input =~ /\s*$/u, "qq<$pretty> =~ /\\s*\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//u, 1, "qq<$pretty> =~ s/\\s*\$//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//ur, $want, "qq<$pretty> =~ s/\\s*\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s*\$//ur unchanged"); + } + + { + ok($input =~ /[[:space:]]*$/u, "qq<$pretty> =~ /[[:space:]]*\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//u, 1, "qq<$pretty> =~ s/[[:space:]]*\$//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//ur, $want, "qq<$pretty> =~ s/[[:space:]]*\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\$//ur unchanged"); + } + } + else { + { + ok($input !~ /\s+\z/u, "qq<$pretty> !~ /\\s+\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//u, "", "qq<$pretty> =~ s/\\s+\\z//u"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//ur, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//ur"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + ok($input !~ /[[:space:]]+\z/u, "qq<$pretty> !~ /[[:space:]]+\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]+\z//u, "", "qq<$pretty> =~ s/[[:space:]]+\\z//u retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//u unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//ur, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//ur unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /\s*\z/u, "qq<$pretty> =~ /\\s*\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//u, 1, "qq<$pretty> =~ s/\\s*\\z//u"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//ur, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//ur"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /[[:space:]]*\z/u, "qq<$pretty> =~ /[[:space:]]*\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]*\z//u, 1, "qq<$pretty> =~ s/[[:space:]]*\\z//u retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//u unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//ur, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//ur unchanged"); + } + + { + ok($input !~ /\s+$/u, "qq<$pretty> !~ /\\s+\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//u, "", "qq<$pretty> =~ s/\\s+\$//u"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//ur, $input, "qq<$pretty> =~ s/[[:space:]]+\$//ur"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + ok($input !~ /[[:space:]]+$/u, "qq<$pretty> !~ /[[:space:]]+\$/u"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]+$//u, "", "qq<$pretty> =~ s/[[:space:]]+\$//u retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]+\$//u unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//ur, $input, "qq<$pretty> =~ s/[[:space:]]+\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\$//ur unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /\s*$/u, "qq<$pretty> =~ /\\s*\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//u, 1, "qq<$pretty> =~ s/\\s*\$//u"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//ur, $input, "qq<$pretty> =~ s/[[:space:]]*\$//ur"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /[[:space:]]*$/u, "qq<$pretty> =~ /[[:space:]]*\$/u"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]*$//u, 1, "qq<$pretty> =~ s/[[:space:]]*\$//u retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]*\$//u unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//ur, $input, "qq<$pretty> =~ s/[[:space:]]*\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\$//ur unchanged"); + } + } + + # And without //u + undef $want + if $input =~ /^NBS8/; + + if (defined $want) { + { + ok($input =~ /\s+\z/, "qq<$pretty> =~ /\\s+\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//, 1, "qq<$pretty> =~ s/\\s+\\z//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//r, $want, "qq<$pretty> =~ s/\\s+\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s+\\z//r unchanged"); + } + + { + ok($input =~ /[[:space:]]+\z/, "qq<$pretty> =~ /[[:space:]]+\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//, 1, "qq<$pretty> =~ s/[[:space:]]+\\z//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//r, $want, "qq<$pretty> =~ s/[[:space:]]+\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//r unchanged"); + } + + { + ok($input =~ /\s*\z/, "qq<$pretty> =~ /\\s*\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//, 1, "qq<$pretty> =~ s/\\s*\\z//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//r, $want, "qq<$pretty> =~ s/\\s*\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s*\\z//r unchanged"); + } + + { + ok($input =~ /[[:space:]]*\z/, "qq<$pretty> =~ /[[:space:]]*\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//, 1, "qq<$pretty> =~ s/[[:space:]]*\\z//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//r, $want, "qq<$pretty> =~ s/[[:space:]]*\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//r unchanged"); + } + + { + ok($input =~ /\s+$/, "qq<$pretty> =~ /\\s+\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//, 1, "qq<$pretty> =~ s/\\s+\$//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//r, $want, "qq<$pretty> =~ s/\\s+\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s+\$//r unchanged"); + } + + { + ok($input =~ /[[:space:]]+$/, "qq<$pretty> =~ /[[:space:]]+\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//, 1, "qq<$pretty> =~ s/[[:space:]]+\$//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//r, $want, "qq<$pretty> =~ s/[[:space:]]+\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\$//r unchanged"); + } + + { + ok($input =~ /\s*$/, "qq<$pretty> =~ /\\s*\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//, 1, "qq<$pretty> =~ s/\\s*\$//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//r, $want, "qq<$pretty> =~ s/\\s*\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s*\$//r unchanged"); + } + + { + ok($input =~ /[[:space:]]*$/, "qq<$pretty> =~ /[[:space:]]*\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//, 1, "qq<$pretty> =~ s/[[:space:]]*\$//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//r, $want, "qq<$pretty> =~ s/[[:space:]]*\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\$//r unchanged"); + } + } + else { + { + ok($input !~ /\s+\z/, "qq<$pretty> !~ /\\s+\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//, "", "qq<$pretty> =~ s/\\s+\\z//"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//r, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//r"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + ok($input !~ /[[:space:]]+\z/, "qq<$pretty> !~ /[[:space:]]+\\z/"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]+\z//, "", "qq<$pretty> =~ s/[[:space:]]+\\z// retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]+\\z// unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//r, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//r unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /\s*\z/, "qq<$pretty> =~ /\\s*\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//, 1, "qq<$pretty> =~ s/\\s*\\z//"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//r, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//r"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /[[:space:]]*\z/, "qq<$pretty> =~ /[[:space:]]*\\z/"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]*\z//, 1, "qq<$pretty> =~ s/[[:space:]]*\\z// retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]*\\z// unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//r, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//r unchanged"); + } + + { + ok($input !~ /\s+$/, "qq<$pretty> !~ /\\s+\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//, "", "qq<$pretty> =~ s/\\s+\$//"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//r, $input, "qq<$pretty> =~ s/[[:space:]]+\$//r"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + ok($input !~ /[[:space:]]+$/, "qq<$pretty> !~ /[[:space:]]+\$/"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]+$//, "", "qq<$pretty> =~ s/[[:space:]]+\$// retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]+\$// unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//r, $input, "qq<$pretty> =~ s/[[:space:]]+\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\$//r unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /\s*$/, "qq<$pretty> =~ /\\s*\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//, 1, "qq<$pretty> =~ s/\\s*\$//"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//r, $input, "qq<$pretty> =~ s/[[:space:]]*\$//r"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /[[:space:]]*$/, "qq<$pretty> =~ /[[:space:]]*\$/"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]*$//, 1, "qq<$pretty> =~ s/[[:space:]]*\$// retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]*\$// unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//r, $input, "qq<$pretty> =~ s/[[:space:]]*\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\$//r unchanged"); + } + } +} From f6f1a031f14fdfc6fbac0ebfc094d9448f0ee360 Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Wed, 2 Jun 2021 07:36:44 +0000 Subject: [PATCH 02/11] Reserve another bit in the regex extflags. This breaks binary compatibility, but that's fine in blead. --- ext/Devel-Peek/t/Peek.t | 15 +++++++++++---- regexp.h | 4 ++-- regnodes.h | 36 ++++++++++++++++++------------------ 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/ext/Devel-Peek/t/Peek.t b/ext/Devel-Peek/t/Peek.t index a8c68c80e60d..30b654ff9408 100644 --- a/ext/Devel-Peek/t/Peek.t +++ b/ext/Devel-Peek/t/Peek.t @@ -355,9 +355,10 @@ do_test('reference to named subroutine without prototype', if ($] >= 5.011) { # note the conditionals on ENGINE and INTFLAGS were introduced in 5.19.9 -do_test('reference to regexp', - qr(tic), -'SV = $RV\\($ADDR\\) at $ADDR + # We are taking great care to curate this test as if the module is dual life + # (or we actively want to cherry-pick entire chunks of it back to maint) + # Is this a good idea? + my $raw = 'SV = $RV\\($ADDR\\) at $ADDR REFCNT = 1 FLAGS = \\(ROK\\) RV = $ADDR @@ -423,7 +424,13 @@ do_test('reference to regexp', OFFS = $ADDR QR_ANONCV = 0x0(?: SAVED_COPY = 0x0)?' -)); +); + + $raw =~ s/ EXTFLAGS = 0x680000 / EXTFLAGS = 0x340000 /g + if $] >= 5.035; + do_test('reference to regexp', + qr(tic), + $raw); } else { do_test('reference to regexp', qr(tic), diff --git a/regexp.h b/regexp.h index e40420613353..b14351a71515 100644 --- a/regexp.h +++ b/regexp.h @@ -414,7 +414,7 @@ and check for NULL. * For the regexp bits, PL_reg_extflags_name[] in regnodes.h has a comment * giving which bits are used/unused */ -# define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT + 2) +# define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT + 1) /* What we have seen */ # define RXf_NO_INPLACE_SUBST (1U<<(RXf_BASE_SHIFT+2)) @@ -455,7 +455,7 @@ and check for NULL. * highest bit position should be used, so that if RXf_BASE_SHIFT gets * increased, the #error below will be triggered so that you will be reminded * to adjust things at the other end to keep the bit positions unchanged */ -# if RXf_BASE_SHIFT+17 > 31 +# if RXf_BASE_SHIFT+18 > 31 # error Too many RXf_PMf bits used. See comments at beginning of these for what to do # endif diff --git a/regnodes.h b/regnodes.h index 94b71102eb3b..205342f77312 100644 --- a/regnodes.h +++ b/regnodes.h @@ -1654,7 +1654,7 @@ EXTCONST char * const PL_reg_name[] = { EXTCONST char * PL_reg_extflags_name[]; #else EXTCONST char * const PL_reg_extflags_name[] = { - /* Bits in extflags defined: 11111111111111110000111111111111 */ + /* Bits in extflags defined: 01111111111111111000111111111111 */ "MULTILINE", /* 0x00000001 */ "SINGLELINE", /* 0x00000002 */ "FOLD", /* 0x00000004 */ @@ -1670,23 +1670,23 @@ EXTCONST char * const PL_reg_extflags_name[] = { "UNUSED_BIT_12", /* 0x00001000 */ "UNUSED_BIT_13", /* 0x00002000 */ "UNUSED_BIT_14", /* 0x00004000 */ - "UNUSED_BIT_15", /* 0x00008000 */ - "NO_INPLACE_SUBST", /* 0x00010000 */ - "EVAL_SEEN", /* 0x00020000 */ - "UNBOUNDED_QUANTIFIER_SEEN",/* 0x00040000 */ - "CHECK_ALL", /* 0x00080000 */ - "MATCH_UTF8", /* 0x00100000 */ - "USE_INTUIT_NOML", /* 0x00200000 */ - "USE_INTUIT_ML", /* 0x00400000 */ - "INTUIT_TAIL", /* 0x00800000 */ - "IS_ANCHORED", /* 0x01000000 */ - "COPY_DONE", /* 0x02000000 */ - "TAINTED_SEEN", /* 0x04000000 */ - "TAINTED", /* 0x08000000 */ - "START_ONLY", /* 0x10000000 */ - "SKIPWHITE", /* 0x20000000 */ - "WHITE", /* 0x40000000 */ - "NULL", /* 0x80000000 */ + "NO_INPLACE_SUBST", /* 0x00008000 */ + "EVAL_SEEN", /* 0x00010000 */ + "UNBOUNDED_QUANTIFIER_SEEN",/* 0x00020000 */ + "CHECK_ALL", /* 0x00040000 */ + "MATCH_UTF8", /* 0x00080000 */ + "USE_INTUIT_NOML", /* 0x00100000 */ + "USE_INTUIT_ML", /* 0x00200000 */ + "INTUIT_TAIL", /* 0x00400000 */ + "IS_ANCHORED", /* 0x00800000 */ + "COPY_DONE", /* 0x01000000 */ + "TAINTED_SEEN", /* 0x02000000 */ + "TAINTED", /* 0x04000000 */ + "START_ONLY", /* 0x08000000 */ + "SKIPWHITE", /* 0x10000000 */ + "WHITE", /* 0x20000000 */ + "NULL", /* 0x40000000 */ + "UNUSED_BIT_31", /* 0x80000000 */ }; #endif /* DOINIT */ From 5f9070efd809443004aed73626881254cd99832d Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Wed, 2 Jun 2021 08:11:30 +0000 Subject: [PATCH 03/11] Add a new regex flag, RXf_RTRIM. --- dump.c | 3 +++ regexp.h | 1 + regnodes.h | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dump.c b/dump.c index a72accddb3bf..d3f85ed2de75 100644 --- a/dump.c +++ b/dump.c @@ -918,6 +918,8 @@ S_pm_description(pTHX_ const PMOP *pm) sv_catpvs(desc, ",WHITE"); if (RX_EXTFLAGS(regex) & RXf_NULL) sv_catpvs(desc, ",NULL"); + if (RX_EXTFLAGS(regex) & RXf_RTRIM) + sv_catpvs(desc, ",RTRIM"); } append_flags(desc, pmflags, pmflags_flags_names); @@ -1708,6 +1710,7 @@ const struct flag_to_name regexp_extflags_names[] = { {RXf_SKIPWHITE, "SKIPWHITE,"}, {RXf_WHITE, "WHITE,"}, {RXf_NULL, "NULL,"}, + {RXf_RTRIM, "RTRIM,"}, }; /* NOTE: this structure is mostly duplicative of one generated by diff --git a/regexp.h b/regexp.h index b14351a71515..b783f9906f39 100644 --- a/regexp.h +++ b/regexp.h @@ -450,6 +450,7 @@ and check for NULL. /* split " " */ # define RXf_WHITE (1U<<(RXf_BASE_SHIFT+16)) /* Pattern is /\s+/ */ # define RXf_NULL (1U<<(RXf_BASE_SHIFT+17)) /* Pattern is // */ +# define RXf_RTRIM (1U<<(RXf_BASE_SHIFT+18)) /* Pattern is /[:space:]\z/u */ /* See comments at the beginning of these defines about adding bits. The * highest bit position should be used, so that if RXf_BASE_SHIFT gets diff --git a/regnodes.h b/regnodes.h index 205342f77312..106cc10b9b55 100644 --- a/regnodes.h +++ b/regnodes.h @@ -1654,7 +1654,7 @@ EXTCONST char * const PL_reg_name[] = { EXTCONST char * PL_reg_extflags_name[]; #else EXTCONST char * const PL_reg_extflags_name[] = { - /* Bits in extflags defined: 01111111111111111000111111111111 */ + /* Bits in extflags defined: 11111111111111111000111111111111 */ "MULTILINE", /* 0x00000001 */ "SINGLELINE", /* 0x00000002 */ "FOLD", /* 0x00000004 */ @@ -1686,7 +1686,7 @@ EXTCONST char * const PL_reg_extflags_name[] = { "SKIPWHITE", /* 0x10000000 */ "WHITE", /* 0x20000000 */ "NULL", /* 0x40000000 */ - "UNUSED_BIT_31", /* 0x80000000 */ + "RTRIM", /* 0x80000000 */ }; #endif /* DOINIT */ From fcaa368f98859d8fdf3f04ff5134086a0bfdb85f Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Wed, 2 Jun 2021 09:15:01 +0000 Subject: [PATCH 04/11] Set RXf_RTRIM if the pattern *is* /[[:space:]]+\z/u We can be more flexible in our selection of patterns soon, but for now, start with a minimal testcase. We don't actually *do* anything yet, so we must "fail" in re_intuit_start. --- regcomp.c | 9 +++++++++ regexec.c | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/regcomp.c b/regcomp.c index 512e6a165b3a..5d0c8176b623 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8488,6 +8488,15 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if (fop == PLUS + && nop == POSIXU && FLAGS(next) == _CC_SPACE) { + regnode *second = regnext(first); + regnode *third = OP(second) == EOS ? regnext(second) : NULL; + if (third && OP(third) == END) { + /* /[[:space:]]+\z/u */ + RExC_rx->extflags |= RXf_RTRIM; + } + } } diff --git a/regexec.c b/regexec.c index 0e9e5c432ed7..591f2c905d40 100644 --- a/regexec.c +++ b/regexec.c @@ -921,6 +921,12 @@ Perl_re_intuit_start(pTHX_ /* not actually used within intuit, but zero for safety anyway */ reginfo->poscache_maxiter = 0; + if(prog->extflags & RXf_RTRIM) { + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + " rtrim intuit not yet implemented...\n")); + return rx_origin; + } + if (utf8_target) { if ((!prog->anchored_utf8 && prog->anchored_substr) || (!prog->float_utf8 && prog->float_substr)) From ddce7ad76dae277841d4bc12724b4fef53c10ca5 Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Wed, 2 Jun 2021 20:38:45 +0000 Subject: [PATCH 05/11] re_intuit_start now handles RXf_RTRIM. If the compiler set this bit, then the pattern is /[[:space:]]\z/u, and hence we know how to walk backwards from the end of the string to find the match start. --- regexec.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/regexec.c b/regexec.c index 591f2c905d40..2b702beca8ee 100644 --- a/regexec.c +++ b/regexec.c @@ -922,9 +922,51 @@ Perl_re_intuit_start(pTHX_ reginfo->poscache_maxiter = 0; if(prog->extflags & RXf_RTRIM) { - DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ - " rtrim intuit not yet implemented...\n")); - return rx_origin; + const char *s = strend; + if (strpos >= strend) { + /* This should be unreachable: + * String shorter than min possible regex match (0 < 1) + * but in the future we might want to also handle *, ? and {0,...} + */ + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + " rtrim intuit on empty string ...\n")); + goto fail; + } + if (utf8_target) { + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + " rtrim intuit UTF-8 ...\n")); + while (1) { + const char *was_s = s; + if (s == strpos) + break; + s = (const char *) Perl_utf8_hop_back((const U8 *)s, -1, (const U8 *)strpos); + if (s < strpos || !isSPACE_utf8_safe(s, strend)) { + s = was_s; + break; + } + } + } + else { + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + " rtrim intuit Latin1 ...\n")); + while (1) { + const char *was_s = s; + if (s == strpos) + break; + --s; + if (s < strpos || !isSPACE_L1(*s)) { + s = was_s; + break; + } + } + } + if (s < strend) { + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(s - strbeg)) ); + return s; + } + return strpos; } if (utf8_target) { From 6e855fb36b33f9a79e33e3d54839c182c3c21393 Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Wed, 2 Jun 2021 21:14:29 +0000 Subject: [PATCH 06/11] Set RXf_CHECK_ALL with RXf_RTRIM - re_intuit_start can do the entire match! --- regcomp.c | 2 +- regexec.c | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/regcomp.c b/regcomp.c index 5d0c8176b623..853cf27d49c5 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8494,7 +8494,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, regnode *third = OP(second) == EOS ? regnext(second) : NULL; if (third && OP(third) == END) { /* /[[:space:]]+\z/u */ - RExC_rx->extflags |= RXf_RTRIM; + RExC_rx->extflags |= RXf_RTRIM | RXf_CHECK_ALL; } } diff --git a/regexec.c b/regexec.c index 2b702beca8ee..4a785786c351 100644 --- a/regexec.c +++ b/regexec.c @@ -966,7 +966,7 @@ Perl_re_intuit_start(pTHX_ PL_colors[4], PL_colors[5], (long)(s - strbeg)) ); return s; } - return strpos; + goto fail; } if (utf8_target) { @@ -3699,9 +3699,19 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, prog->lastparen = prog->lastcloseparen = 0; RXp_MATCH_UTF8_set(prog, utf8_target); prog->offs[0].start = s - strbeg; - prog->offs[0].end = utf8_target - ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg - : s - strbeg + prog->minlenret; + if (prog->extflags & RXf_RTRIM) { + /* Oh my, seems that until RTRIM, match via INTUIT was always + * a fixed length, given by minlenret. + * RTRIM breaks that assumption. + * For now, we just hack our known (other) match length - the + * entire string: */ + prog->offs[0].end = strend - strbeg; + } + else { + prog->offs[0].end = utf8_target + ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg + : s - strbeg + prog->minlenret; + } if ( !(flags & REXEC_NOT_FIRST) ) S_reg_set_capture_string(aTHX_ rx, strbeg, strend, From f47501b0ac5721519d9fca44073744b92eacd4e0 Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Thu, 3 Jun 2021 08:45:18 +0000 Subject: [PATCH 07/11] Also set RXf_RTRIM if the pattern is /[[:space:]]+$/u The current code can handle $ just as easily as \z because the optional newline can also be matched by [[:space:]]. This wouldn't be guaranteed by a more general "match backwards from the end" approach. --- regcomp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/regcomp.c b/regcomp.c index 853cf27d49c5..1f1198b91102 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8491,9 +8491,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, else if (fop == PLUS && nop == POSIXU && FLAGS(next) == _CC_SPACE) { regnode *second = regnext(first); - regnode *third = OP(second) == EOS ? regnext(second) : NULL; + regnode *third = (OP(second) == EOS || OP(second) == SEOL) + ? regnext(second) : NULL; if (third && OP(third) == END) { - /* /[[:space:]]+\z/u */ + /* /[[:space:]]+\z/u + * /[[:space:]]+$/u */ RExC_rx->extflags |= RXf_RTRIM | RXf_CHECK_ALL; } } From 9da0bd4ded2deb8199fe79dc27e389539225b782 Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Thu, 3 Jun 2021 12:52:25 +0000 Subject: [PATCH 08/11] Also set RXf_RTRIM if the pattern is /\s*$/u It takes a bit more care to handle 0-or-more alternations, but this covers more regex variantions used in the wild. --- regcomp.c | 8 +++++--- regexec.c | 11 +++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/regcomp.c b/regcomp.c index 1f1198b91102..bea9a6959b9c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8466,7 +8466,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* It's safe to read through *next only if OP(first) is a regop of * the right type (not EXACT, for example). */ - U8 nop = (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) + U8 nop = (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS || fop == STAR) ? OP(next) : 0; if (PL_regkind[fop] == NOTHING && nop == END) @@ -8488,14 +8488,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE); - else if (fop == PLUS + else if ((fop == PLUS || fop == STAR) && nop == POSIXU && FLAGS(next) == _CC_SPACE) { regnode *second = regnext(first); regnode *third = (OP(second) == EOS || OP(second) == SEOL) ? regnext(second) : NULL; if (third && OP(third) == END) { /* /[[:space:]]+\z/u - * /[[:space:]]+$/u */ + * /[[:space:]]+$/u + * /[[:space:]]*$/u + * etc */ RExC_rx->extflags |= RXf_RTRIM | RXf_CHECK_ALL; } } diff --git a/regexec.c b/regexec.c index 4a785786c351..0bc815265a4b 100644 --- a/regexec.c +++ b/regexec.c @@ -923,10 +923,17 @@ Perl_re_intuit_start(pTHX_ if(prog->extflags & RXf_RTRIM) { const char *s = strend; + if (strpos == strend && prog->minlen == 0) { + /* \s* and we are asked to match an empty string */ + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(s - strbeg)) ); + return s; + } if (strpos >= strend) { /* This should be unreachable: * String shorter than min possible regex match (0 < 1) - * but in the future we might want to also handle *, ? and {0,...} + * but in the future we might want to also handle ? and {0,...} */ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " rtrim intuit on empty string ...\n")); @@ -960,7 +967,7 @@ Perl_re_intuit_start(pTHX_ } } } - if (s < strend) { + if (s < strend || s == strend && prog->minlen == 0) { DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", PL_colors[4], PL_colors[5], (long)(s - strbeg)) ); From 173727279a210eb24d388fe6faf146b999a171bc Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Thu, 3 Jun 2021 14:35:11 +0000 Subject: [PATCH 09/11] Also set RXf_RTRIM if the pattern is /\s*$/ (without the //u flag). Without the //u flag, \s and [[:space:]] are compiled to POSIXD ops. The POSIXD op behaves differently depending on whether the target string has SVf_UTF8 set. One could in theory continue and handle the case of the //a flag (POSIXA ops), but this doesn't seem worth it for an optimisation, as it is unlikely to be common on regexs that are intended to remove "generic" whitespace from the end of a string. --- regcomp.c | 4 +++- regexec.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/regcomp.c b/regcomp.c index bea9a6959b9c..a9604a34a853 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8489,7 +8489,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && OP(regnext(first)) == END ) RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE); else if ((fop == PLUS || fop == STAR) - && nop == POSIXU && FLAGS(next) == _CC_SPACE) { + && (nop == POSIXU || nop == POSIXD) + && FLAGS(next) == _CC_SPACE) { regnode *second = regnext(first); regnode *third = (OP(second) == EOS || OP(second) == SEOL) ? regnext(second) : NULL; @@ -8497,6 +8498,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* /[[:space:]]+\z/u * /[[:space:]]+$/u * /[[:space:]]*$/u + * /\s*$/ * etc */ RExC_rx->extflags |= RXf_RTRIM | RXf_CHECK_ALL; } diff --git a/regexec.c b/regexec.c index 0bc815265a4b..2197279f19fe 100644 --- a/regexec.c +++ b/regexec.c @@ -953,7 +953,23 @@ Perl_re_intuit_start(pTHX_ } } } + else if (OP(NEXTOPER(progi->program + 1)) == POSIXD) { + /* Without //u \x{A0} mustn't match \s when stored as octets. */ + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + " rtrim intuit Legacy ...\n")); + while (1) { + const char *was_s = s; + if (s == strpos) + break; + --s; + if (s < strpos || !isSPACE(*s)) { + s = was_s; + break; + } + } + } else { + /* flag //u present - the op will be POSIXU */ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " rtrim intuit Latin1 ...\n")); while (1) { From 9f0d3ea3984fa60f0a655c38294e4f49d94aa1d5 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 6 Jun 2021 13:45:05 -0600 Subject: [PATCH 10/11] regcharclass.pl: Backwards UTF-8 isSPACE --- regcharclass.h | 89 ++++++++++++++++++++++++++++++++++++++++++- regen/regcharclass.pl | 83 ++++++++++++++++++++++++++++++---------- 2 files changed, 150 insertions(+), 22 deletions(-) diff --git a/regcharclass.h b/regcharclass.h index 08b1358dcd94..06e9d16c65f9 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -155,6 +155,35 @@ ( 0x202F == cp || ( 0x202F < cp && \ ( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) ) +/* + XPERLSPACE: \p{XPerlSpace} + + \p{XPerlSpace} +*/ +/*** GENERATED CODE ***/ +#define is_XPERLSPACE_utf8_safe_backwards(s,e) \ +( ((s) - (e) > 2) ? \ + ( ( inRANGE_helper_(U8, *((const U8*)s - 1), '\t', '\r') || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( 0x80 == *((const U8*)s - 1) ) ? \ + ( ( 0x80 == *((const U8*)s - 2) ) ? \ + ( ( inRANGE_helper_(U8, *((const U8*)s - 3), 0xE2, 0xE3) ) ? 3 : 0 )\ + : ( ( 0x9A == *((const U8*)s - 2) ) && ( 0xE1 == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x81, 0x84) || inRANGE_helper_(U8, *((const U8*)s - 1), 0x86, 0x8A) || inRANGE_helper_(U8, *((const U8*)s - 1), 0xA8, 0xA9) || 0xAF == *((const U8*)s - 1) ) ?\ + ( ( ( 0x80 == *((const U8*)s - 2) ) && ( 0xE2 == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( 0x85 == *((const U8*)s - 1) ) ? \ + ( ( 0x80 == *((const U8*)s - 2) ) ? \ + ( ( 0xE2 == *((const U8*)s - 3) ) ? 3 : 0 ) \ + : ( 0xC2 == *((const U8*)s - 2) ) ? 2 : 0 ) \ + : ( 0x9F == *((const U8*)s - 1) ) ? \ + ( ( ( 0x81 == *((const U8*)s - 2) ) && ( 0xE2 == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( ( 0xA0 == *((const U8*)s - 1) ) && ( 0xC2 == *((const U8*)s - 2) ) ) ? 2 : 0 )\ +: ((s) - (e) > 1) ? \ + ( ( inRANGE_helper_(U8, *((const U8*)s - 1), '\t', '\r') || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( ( 0x85 == *((const U8*)s - 1) || 0xA0 == *((const U8*)s - 1) ) && ( 0xC2 == *((const U8*)s - 2) ) ) ? 2 : 0 )\ +: ((s) - (e) > 0) ? \ + ( inRANGE_helper_(U8, *((const U8*)s - 1), '\t', '\r') || ' ' == *((const U8*)s - 1) )\ +: 0 ) + /* NONCHAR: Non character code points @@ -1338,6 +1367,35 @@ ( 0x202F == cp || ( 0x202F < cp && \ ( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) ) +/* + XPERLSPACE: \p{XPerlSpace} + + \p{XPerlSpace} +*/ +/*** GENERATED CODE ***/ +#define is_XPERLSPACE_utf8_safe_backwards(s,e) \ +( ((s) - (e) > 2) ? \ + ( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || '\n' == *((const U8*)s - 1) || 0x25 == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( 0x41 == *((const U8*)s - 1) ) ? \ + ( ( 0x41 == *((const U8*)s - 2) ) ? \ + ( ( ( *((const U8*)s - 3) & 0xFB ) == 0xCA ) ? 3 : 0 ) \ + : ( 0x63 == *((const U8*)s - 2) ) ? \ + ( ( 0xBC == *((const U8*)s - 3) ) ? 3 : 0 ) \ + : ( 0x80 == *((const U8*)s - 2) ) ? 2 : 0 ) \ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x42, 0x48) || 0x51 == *((const U8*)s - 1) ) ?\ + ( ( ( 0x41 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x49, 0x4A) ) ? \ + ( ( ( inRANGE_helper_(U8, *((const U8*)s - 2), 0x41, 0x42) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( 0x56 == *((const U8*)s - 1) ) ? \ + ( ( ( 0x42 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( ( ( 0x73 == *((const U8*)s - 1) ) && ( 0x43 == *((const U8*)s - 2) ) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ +: ((s) - (e) > 1) ? \ + ( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || '\n' == *((const U8*)s - 1) || 0x25 == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( ( 0x41 == *((const U8*)s - 1) ) && ( 0x80 == *((const U8*)s - 2) ) ) ? 2 : 0 )\ +: ((s) - (e) > 0) ? \ + ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || '\n' == *((const U8*)s - 1) || 0x25 == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) )\ +: 0 ) + /* NONCHAR: Non character code points @@ -2516,6 +2574,35 @@ ( 0x202F == cp || ( 0x202F < cp && \ ( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) ) +/* + XPERLSPACE: \p{XPerlSpace} + + \p{XPerlSpace} +*/ +/*** GENERATED CODE ***/ +#define is_XPERLSPACE_utf8_safe_backwards(s,e) \ +( ((s) - (e) > 2) ? \ + ( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || 0x15 == *((const U8*)s - 1) || '\n' == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( 0x41 == *((const U8*)s - 1) ) ? \ + ( ( 0x41 == *((const U8*)s - 2) ) ? \ + ( ( ( *((const U8*)s - 3) & 0xFB ) == 0xCA ) ? 3 : 0 ) \ + : ( 0x62 == *((const U8*)s - 2) ) ? \ + ( ( 0xBD == *((const U8*)s - 3) ) ? 3 : 0 ) \ + : ( 0x78 == *((const U8*)s - 2) ) ? 2 : 0 ) \ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x42, 0x48) || 0x51 == *((const U8*)s - 1) ) ?\ + ( ( ( 0x41 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x49, 0x4A) ) ? \ + ( ( ( inRANGE_helper_(U8, *((const U8*)s - 2), 0x41, 0x42) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( 0x56 == *((const U8*)s - 1) ) ? \ + ( ( ( 0x42 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( ( ( 0x72 == *((const U8*)s - 1) ) && ( 0x43 == *((const U8*)s - 2) ) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ +: ((s) - (e) > 1) ? \ + ( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || 0x15 == *((const U8*)s - 1) || '\n' == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( ( 0x41 == *((const U8*)s - 1) ) && ( 0x78 == *((const U8*)s - 2) ) ) ? 2 : 0 )\ +: ((s) - (e) > 0) ? \ + ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || 0x15 == *((const U8*)s - 1) || '\n' == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) )\ +: 0 ) + /* NONCHAR: Non character code points @@ -3617,6 +3704,6 @@ * 696e706fddd3ce8cd48c7ea91caf4c9edf5c296432d320aa7b78631f69aa9eac lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 24120d5e0c9685c442c93bc1dbea9b85ef973bf8e9474baf0e55b160c288226b regen/charset_translations.pl - * 3635c6e564558e965018947bdab45f37d9a4fa82eb05b2694eae1a04bf7e65a3 regen/regcharclass.pl + * 29d7ced5065b4b2476607aefb87083c37a7dc5f9705430a7c0811d4232efca13 regen/regcharclass.pl * b2f896452d2b30da3e04800f478c60c1fd0b03d6b668689b020f1e3cf1f1cdd9 regen/regcharclass_multi_char_folds.pl * ex: set ro: */ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 76b1532047d6..658506a48a83 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -354,8 +354,9 @@ sub val_fmt # # Each string is then stored in the 'strs' subhash as a hash record # made up of the results of __uni_latin1, using the keynames -# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and -# 'UTF8' which hold a merge of 'low' and their lowercase equivalents. +# 'low', 'latin1', 'utf8', as well as the synthesized 'LATIN1', 'high', +# 'UTF8', and 'backwards_UTF8' which hold a merge of 'low' and their lowercase +# equivalents. # # Size data is tracked per type in the 'size' subhash. # @@ -489,7 +490,7 @@ sub new { # sub make_trie { - my ( $self, $type, $maxlen )= @_; + my ( $self, $type, $maxlen, $backwards )= @_; my $strs= $self->{strs}; my %trie; @@ -500,7 +501,8 @@ sub make_trie { next unless $dat; next if $maxlen && @$dat > $maxlen; my $node= \%trie; - foreach my $elem ( @$dat ) { + my @ordered_dat = ($backwards) ? reverse @$dat : @$dat; + foreach my $elem ( @ordered_dat ) { $node->{$elem} ||= {}; $node= $node->{$elem}; } @@ -533,7 +535,7 @@ ($) # sub _optree { - my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_; + my ( $self, $trie, $test_type, $ret_type, $else, $depth, $backwards )= @_; return unless defined $trie; $ret_type ||= 'len'; $else= 0 unless defined $else; @@ -567,7 +569,16 @@ sub _optree { # can return the "else" value. return $else if !@conds; - my $test = $test_type =~ /^cp/ ? "cp" : "((const U8*)s)[$depth]"; + my $test; + if ($test_type =~ /^cp/) { + $test = "cp"; + } + elsif ($backwards) { + $test = "*((const U8*)s - " . ($depth + 1) . ")"; + } + else { + $test = "((const U8*)s)[$depth]"; + } # First we loop over the possible keys/conditions and find out what they # look like; we group conditions with the same optree together. @@ -578,7 +589,7 @@ sub _optree { # get the optree for this child/condition my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, - $else, $depth + 1 ); + $else, $depth + 1, $backwards ); # convert it to a string with Dumper my $res_code= Dumper( $res ); @@ -618,10 +629,11 @@ sub _optree { sub optree { my $self= shift; my %opt= @_; - my $trie= $self->make_trie( $opt{type}, $opt{max_depth} ); + my $trie= $self->make_trie( $opt{type}, $opt{max_depth}, $opt{backwards} ); $opt{ret_type} ||= 'len'; my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth'; - return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 ); + return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0, + $opt{backwards} ); } # my $optree= generic_optree(%opts); @@ -638,10 +650,10 @@ sub generic_optree { my $test_type= 'depth'; my $else= $opt{else} || 0; - my $latin1= $self->make_trie( 'latin1', $opt{max_depth} ); - my $utf8= $self->make_trie( 'utf8', $opt{max_depth} ); + my $latin1= $self->make_trie( 'latin1', $opt{max_depth}, $opt{backwards} ); + my $utf8= $self->make_trie( 'utf8', $opt{max_depth}, $opt{backwards} ); - $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 ) + $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0, $opt{backwards} ) for $latin1, $utf8; if ( $utf8 ) { @@ -650,9 +662,10 @@ sub generic_optree { $else= __cond_join( "!( is_utf8 )", $latin1, $else ); } if ($opt{type} eq 'generic') { - my $low= $self->make_trie( 'low', $opt{max_depth} ); + my $low= $self->make_trie( 'low', $opt{max_depth}, $opt{backwards} ); if ( $low ) { - $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 ); + $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0, + $opt{backwards} ); } } @@ -710,6 +723,14 @@ sub length_optree { $else= __cond_join( $cond, $optree, $else ); } } + elsif ($opt{backwards}) { + my @size= sort { $a <=> $b } keys %{ $self->{size}{$type} }; + for my $size ( @size ) { + my $optree= $self->$method(%opt, type => $type, max_depth => $size); + my $cond= "((s) - (e) > " . ( $size - 1 ).")"; + $else= __cond_join( $cond, $optree, $else ); + } + } else { my $utf8; @@ -725,11 +746,12 @@ sub length_optree { # If we do want more than the 0-255 range, find those, and if they # exist... if ( $opt{type} !~ /latin1/i - && ($utf8 = $self->make_trie($trie_type, 0))) + && ($utf8 = $self->make_trie($trie_type, 0, $opt{backwards}))) { # ... get them into an optree, and set them up as the 'else' clause - $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0 ); + $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0, + $opt{backwards} ); # We could make this # UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))"; @@ -747,16 +769,18 @@ sub length_optree { # the case where the input isn't UTF-8. my $latin1; if ($method eq 'generic_optree') { - $latin1 = $self->make_trie( 'latin1', 1); - $latin1= $self->_optree($latin1, 'depth', $opt{ret_type}, 0, 0); + $latin1 = $self->make_trie( 'latin1', 1, $opt{backwards}); + $latin1= $self->_optree($latin1, 'depth', $opt{ret_type}, 0, 0, + $opt{backwards}); } # If we want the UTF-8 invariants, get those. my $low; if ($opt{type} !~ /non_low|high/ - && ($low= $self->make_trie( 'low', 1))) + && ($low= $self->make_trie( 'low', 1, 0))) { - $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0 ); + $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0, + $opt{backwards} ); # Expand out the UTF-8 invariants as a string so that we # can use them as the conditional @@ -1303,7 +1327,8 @@ sub render { # make a macro of a given type. # calls into make_trie and (generic_|length_)optree as needed # Opts are: -# type : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8' +# type : 'cp', 'cp_high', 'generic', 'high', 'low', 'latin1', +# 'utf8', 'LATIN1', 'UTF8' 'backwards_UTF8' # ret_type : 'cp' or 'len' # safe : don't assume is well-formed UTF-8, so don't skip any range # checks, and add length guards to macro @@ -1357,6 +1382,7 @@ sub make_macro { $ext .= '_non_low' if $type eq 'generic_non_low'; $ext .= "_safe" if $opts{safe}; $ext .= "_no_length_checks" if $opts{no_length_checks}; + $ext .= "_backwards" if $opts{backwards}; my $argstr= join ",", @args; my $def_fmt="$pfx$self->{op}$ext%s($argstr)"; my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type ); @@ -1418,6 +1444,13 @@ sub make_macro { foreach my $type_spec ( @types ) { my ( $type, $ret )= split /-/, $type_spec; $ret ||= 'len'; + + my $backwards = 0; + if ($type eq 'backwards_UTF8') { + $type = 'UTF8'; + $backwards = 1; + } + foreach my $mod ( @mods ) { # 'safe' is irrelevant with code point macros, so skip if @@ -1435,6 +1468,7 @@ sub make_macro { charset => $charset, no_length_checks => $mod eq 'no_length_checks' && $type !~ /^cp/, + backwards => $backwards, ); print $out_fh $macro, "\n"; } @@ -1562,6 +1596,9 @@ sub make_macro { # class that can include any code point, adding the 'low' ones # to what 'utf8' works on. It is designed to take only an input # UTF-8 parameter. +# backwards_UTF8 like 'UTF8', but designed to match backwards, so that the +# second parameter to the function is earlier in the string than +# the first. # generic generate a macro whose name is 'is_BASE". It has a 2nd, # boolean, parameter which indicates if the first one points to # a UTF-8 string or not. Thus it works in all circumstances. @@ -1648,6 +1685,10 @@ sub make_macro { => high cp_high : fast \p{XPerlSpace} +XPERLSPACE: \p{XPerlSpace} +=> backwards_UTF8 : safe +\p{XPerlSpace} + NONCHAR: Non character code points => UTF8 :safe \p{_Perl_Nchar} From b062b74fe2022c7ca9d9c68e7f126359df600a47 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 6 Jun 2021 14:30:14 -0600 Subject: [PATCH 11/11] regexec.c: Use backwards isSPACE --- regexec.c | 54 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/regexec.c b/regexec.c index 2197279f19fe..fbe956826b49 100644 --- a/regexec.c +++ b/regexec.c @@ -928,7 +928,7 @@ Perl_re_intuit_start(pTHX_ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", PL_colors[4], PL_colors[5], (long)(s - strbeg)) ); - return s; + return (char *) s; } if (strpos >= strend) { /* This should be unreachable: @@ -942,52 +942,54 @@ Perl_re_intuit_start(pTHX_ if (utf8_target) { DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " rtrim intuit UTF-8 ...\n")); - while (1) { - const char *was_s = s; - if (s == strpos) - break; - s = (const char *) Perl_utf8_hop_back((const U8 *)s, -1, (const U8 *)strpos); - if (s < strpos || !isSPACE_utf8_safe(s, strend)) { - s = was_s; - break; + if (s > strpos) { + while (1) { + Size_t space_len = is_XPERLSPACE_utf8_safe_backwards(s, strpos); + if (space_len == 0) { + break; + } + + s -= space_len; } } +#if 0 /* Or, replacing the whole thing above, because the macro handles the + initial condition: */ + Size_t space_len; + while ((space_len = is_XPERLSPACE_utf8_safe_backwards(s, strpos))) + { + s -= space_len; + } +#endif } else if (OP(NEXTOPER(progi->program + 1)) == POSIXD) { /* Without //u \x{A0} mustn't match \s when stored as octets. */ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " rtrim intuit Legacy ...\n")); - while (1) { - const char *was_s = s; - if (s == strpos) - break; - --s; - if (s < strpos || !isSPACE(*s)) { - s = was_s; + while (s > strpos) { + s--; + if (! isSPACE(*s)) { + s++; break; } } } else { - /* flag //u present - the op will be POSIXU */ + /* flag /u present - the op will be POSIXU */ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " rtrim intuit Latin1 ...\n")); - while (1) { - const char *was_s = s; - if (s == strpos) - break; - --s; - if (s < strpos || !isSPACE_L1(*s)) { - s = was_s; + while (s > strpos) { + s--; + if (! isSPACE_L1(*s)) { + s++; break; } } } - if (s < strend || s == strend && prog->minlen == 0) { + if (s < strend || (s == strend && prog->minlen == 0)) { DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", PL_colors[4], PL_colors[5], (long)(s - strbeg)) ); - return s; + return (char *) s; } goto fail; }