diff --git a/MANIFEST b/MANIFEST index bb895e908506..99f26c7561ac 100644 --- a/MANIFEST +++ b/MANIFEST @@ -6098,6 +6098,7 @@ t/re/regexp_trielist.t See if regular expressions work with trie optimisation t/re/regexp_unicode_prop.t See if unicode properties work in regular expressions as expected t/re/regexp_unicode_prop_thr.t See if unicode properties work in regular expressions as expected under threads t/re/rt122747.t Test rt122747 assert faile (requires DEBUGGING) +t/re/rtrim.t Tests for potential rtrim optimisations t/re/rxcode.t See if /(?{ code })/ works t/re/script_run.t See if script runs works t/re/speed.t See if optimisations are keeping things fast diff --git a/dump.c b/dump.c index a72accddb3bf..d3f85ed2de75 100644 --- a/dump.c +++ b/dump.c @@ -918,6 +918,8 @@ S_pm_description(pTHX_ const PMOP *pm) sv_catpvs(desc, ",WHITE"); if (RX_EXTFLAGS(regex) & RXf_NULL) sv_catpvs(desc, ",NULL"); + if (RX_EXTFLAGS(regex) & RXf_RTRIM) + sv_catpvs(desc, ",RTRIM"); } append_flags(desc, pmflags, pmflags_flags_names); @@ -1708,6 +1710,7 @@ const struct flag_to_name regexp_extflags_names[] = { {RXf_SKIPWHITE, "SKIPWHITE,"}, {RXf_WHITE, "WHITE,"}, {RXf_NULL, "NULL,"}, + {RXf_RTRIM, "RTRIM,"}, }; /* NOTE: this structure is mostly duplicative of one generated by diff --git a/ext/Devel-Peek/t/Peek.t b/ext/Devel-Peek/t/Peek.t index a8c68c80e60d..30b654ff9408 100644 --- a/ext/Devel-Peek/t/Peek.t +++ b/ext/Devel-Peek/t/Peek.t @@ -355,9 +355,10 @@ do_test('reference to named subroutine without prototype', if ($] >= 5.011) { # note the conditionals on ENGINE and INTFLAGS were introduced in 5.19.9 -do_test('reference to regexp', - qr(tic), -'SV = $RV\\($ADDR\\) at $ADDR + # We are taking great care to curate this test as if the module is dual life + # (or we actively want to cherry-pick entire chunks of it back to maint) + # Is this a good idea? + my $raw = 'SV = $RV\\($ADDR\\) at $ADDR REFCNT = 1 FLAGS = \\(ROK\\) RV = $ADDR @@ -423,7 +424,13 @@ do_test('reference to regexp', OFFS = $ADDR QR_ANONCV = 0x0(?: SAVED_COPY = 0x0)?' -)); +); + + $raw =~ s/ EXTFLAGS = 0x680000 / EXTFLAGS = 0x340000 /g + if $] >= 5.035; + do_test('reference to regexp', + qr(tic), + $raw); } else { do_test('reference to regexp', qr(tic), diff --git a/regcharclass.h b/regcharclass.h index 52a46aab3395..1057a1229f84 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -155,6 +155,35 @@ ( 0x202F == cp || ( 0x202F < cp && \ ( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) ) +/* + XPERLSPACE: \p{XPerlSpace} + + \p{XPerlSpace} +*/ +/*** GENERATED CODE ***/ +#define is_XPERLSPACE_utf8_safe_backwards(s,e) \ +( ((s) - (e) > 2) ? \ + ( ( inRANGE_helper_(U8, *((const U8*)s - 1), '\t', '\r') || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( 0x80 == *((const U8*)s - 1) ) ? \ + ( ( 0x80 == *((const U8*)s - 2) ) ? \ + ( ( inRANGE_helper_(U8, *((const U8*)s - 3), 0xE2, 0xE3) ) ? 3 : 0 )\ + : ( ( 0x9A == *((const U8*)s - 2) ) && ( 0xE1 == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x81, 0x84) || inRANGE_helper_(U8, *((const U8*)s - 1), 0x86, 0x8A) || inRANGE_helper_(U8, *((const U8*)s - 1), 0xA8, 0xA9) || 0xAF == *((const U8*)s - 1) ) ?\ + ( ( ( 0x80 == *((const U8*)s - 2) ) && ( 0xE2 == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( 0x85 == *((const U8*)s - 1) ) ? \ + ( ( 0x80 == *((const U8*)s - 2) ) ? \ + ( ( 0xE2 == *((const U8*)s - 3) ) ? 3 : 0 ) \ + : ( 0xC2 == *((const U8*)s - 2) ) ? 2 : 0 ) \ + : ( 0x9F == *((const U8*)s - 1) ) ? \ + ( ( ( 0x81 == *((const U8*)s - 2) ) && ( 0xE2 == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( ( 0xA0 == *((const U8*)s - 1) ) && ( 0xC2 == *((const U8*)s - 2) ) ) ? 2 : 0 )\ +: ((s) - (e) > 1) ? \ + ( ( inRANGE_helper_(U8, *((const U8*)s - 1), '\t', '\r') || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( ( 0x85 == *((const U8*)s - 1) || 0xA0 == *((const U8*)s - 1) ) && ( 0xC2 == *((const U8*)s - 2) ) ) ? 2 : 0 )\ +: ((s) - (e) > 0) ? \ + ( inRANGE_helper_(U8, *((const U8*)s - 1), '\t', '\r') || ' ' == *((const U8*)s - 1) )\ +: 0 ) + /* NONCHAR: Non character code points @@ -1338,6 +1367,35 @@ ( 0x202F == cp || ( 0x202F < cp && \ ( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) ) +/* + XPERLSPACE: \p{XPerlSpace} + + \p{XPerlSpace} +*/ +/*** GENERATED CODE ***/ +#define is_XPERLSPACE_utf8_safe_backwards(s,e) \ +( ((s) - (e) > 2) ? \ + ( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || '\n' == *((const U8*)s - 1) || 0x25 == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( 0x41 == *((const U8*)s - 1) ) ? \ + ( ( 0x41 == *((const U8*)s - 2) ) ? \ + ( ( ( *((const U8*)s - 3) & 0xFB ) == 0xCA ) ? 3 : 0 ) \ + : ( 0x63 == *((const U8*)s - 2) ) ? \ + ( ( 0xBC == *((const U8*)s - 3) ) ? 3 : 0 ) \ + : ( 0x80 == *((const U8*)s - 2) ) ? 2 : 0 ) \ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x42, 0x48) || 0x51 == *((const U8*)s - 1) ) ?\ + ( ( ( 0x41 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x49, 0x4A) ) ? \ + ( ( ( inRANGE_helper_(U8, *((const U8*)s - 2), 0x41, 0x42) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( 0x56 == *((const U8*)s - 1) ) ? \ + ( ( ( 0x42 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( ( ( 0x73 == *((const U8*)s - 1) ) && ( 0x43 == *((const U8*)s - 2) ) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ +: ((s) - (e) > 1) ? \ + ( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || '\n' == *((const U8*)s - 1) || 0x25 == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( ( 0x41 == *((const U8*)s - 1) ) && ( 0x80 == *((const U8*)s - 2) ) ) ? 2 : 0 )\ +: ((s) - (e) > 0) ? \ + ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || '\n' == *((const U8*)s - 1) || 0x25 == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) )\ +: 0 ) + /* NONCHAR: Non character code points @@ -2516,6 +2574,35 @@ ( 0x202F == cp || ( 0x202F < cp && \ ( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) ) +/* + XPERLSPACE: \p{XPerlSpace} + + \p{XPerlSpace} +*/ +/*** GENERATED CODE ***/ +#define is_XPERLSPACE_utf8_safe_backwards(s,e) \ +( ((s) - (e) > 2) ? \ + ( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || 0x15 == *((const U8*)s - 1) || '\n' == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( 0x41 == *((const U8*)s - 1) ) ? \ + ( ( 0x41 == *((const U8*)s - 2) ) ? \ + ( ( ( *((const U8*)s - 3) & 0xFB ) == 0xCA ) ? 3 : 0 ) \ + : ( 0x62 == *((const U8*)s - 2) ) ? \ + ( ( 0xBD == *((const U8*)s - 3) ) ? 3 : 0 ) \ + : ( 0x78 == *((const U8*)s - 2) ) ? 2 : 0 ) \ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x42, 0x48) || 0x51 == *((const U8*)s - 1) ) ?\ + ( ( ( 0x41 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x49, 0x4A) ) ? \ + ( ( ( inRANGE_helper_(U8, *((const U8*)s - 2), 0x41, 0x42) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( 0x56 == *((const U8*)s - 1) ) ? \ + ( ( ( 0x42 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ + : ( ( ( 0x72 == *((const U8*)s - 1) ) && ( 0x43 == *((const U8*)s - 2) ) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\ +: ((s) - (e) > 1) ? \ + ( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || 0x15 == *((const U8*)s - 1) || '\n' == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\ + : ( ( 0x41 == *((const U8*)s - 1) ) && ( 0x78 == *((const U8*)s - 2) ) ) ? 2 : 0 )\ +: ((s) - (e) > 0) ? \ + ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || 0x15 == *((const U8*)s - 1) || '\n' == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) )\ +: 0 ) + /* NONCHAR: Non character code points @@ -3617,6 +3704,6 @@ * 696e706fddd3ce8cd48c7ea91caf4c9edf5c296432d320aa7b78631f69aa9eac lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 24120d5e0c9685c442c93bc1dbea9b85ef973bf8e9474baf0e55b160c288226b regen/charset_translations.pl - * 3635c6e564558e965018947bdab45f37d9a4fa82eb05b2694eae1a04bf7e65a3 regen/regcharclass.pl + * 29d7ced5065b4b2476607aefb87083c37a7dc5f9705430a7c0811d4232efca13 regen/regcharclass.pl * b2f896452d2b30da3e04800f478c60c1fd0b03d6b668689b020f1e3cf1f1cdd9 regen/regcharclass_multi_char_folds.pl * ex: set ro: */ diff --git a/regcomp.c b/regcomp.c index c5e54cc69a29..d1660d2d0445 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8466,7 +8466,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* It's safe to read through *next only if OP(first) is a regop of * the right type (not EXACT, for example). */ - U8 nop = (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) + U8 nop = (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS || fop == STAR) ? OP(next) : 0; if (PL_regkind[fop] == NOTHING && nop == END) @@ -8488,6 +8488,21 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if ((fop == PLUS || fop == STAR) + && (nop == POSIXU || nop == POSIXD) + && FLAGS(next) == _CC_SPACE) { + regnode *second = regnext(first); + regnode *third = (OP(second) == EOS || OP(second) == SEOL) + ? regnext(second) : NULL; + if (third && OP(third) == END) { + /* /[[:space:]]+\z/u + * /[[:space:]]+$/u + * /[[:space:]]*$/u + * /\s*$/ + * etc */ + RExC_rx->extflags |= RXf_RTRIM | RXf_CHECK_ALL; + } + } } diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 76b1532047d6..658506a48a83 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -354,8 +354,9 @@ sub val_fmt # # Each string is then stored in the 'strs' subhash as a hash record # made up of the results of __uni_latin1, using the keynames -# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and -# 'UTF8' which hold a merge of 'low' and their lowercase equivalents. +# 'low', 'latin1', 'utf8', as well as the synthesized 'LATIN1', 'high', +# 'UTF8', and 'backwards_UTF8' which hold a merge of 'low' and their lowercase +# equivalents. # # Size data is tracked per type in the 'size' subhash. # @@ -489,7 +490,7 @@ sub new { # sub make_trie { - my ( $self, $type, $maxlen )= @_; + my ( $self, $type, $maxlen, $backwards )= @_; my $strs= $self->{strs}; my %trie; @@ -500,7 +501,8 @@ sub make_trie { next unless $dat; next if $maxlen && @$dat > $maxlen; my $node= \%trie; - foreach my $elem ( @$dat ) { + my @ordered_dat = ($backwards) ? reverse @$dat : @$dat; + foreach my $elem ( @ordered_dat ) { $node->{$elem} ||= {}; $node= $node->{$elem}; } @@ -533,7 +535,7 @@ ($) # sub _optree { - my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_; + my ( $self, $trie, $test_type, $ret_type, $else, $depth, $backwards )= @_; return unless defined $trie; $ret_type ||= 'len'; $else= 0 unless defined $else; @@ -567,7 +569,16 @@ sub _optree { # can return the "else" value. return $else if !@conds; - my $test = $test_type =~ /^cp/ ? "cp" : "((const U8*)s)[$depth]"; + my $test; + if ($test_type =~ /^cp/) { + $test = "cp"; + } + elsif ($backwards) { + $test = "*((const U8*)s - " . ($depth + 1) . ")"; + } + else { + $test = "((const U8*)s)[$depth]"; + } # First we loop over the possible keys/conditions and find out what they # look like; we group conditions with the same optree together. @@ -578,7 +589,7 @@ sub _optree { # get the optree for this child/condition my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, - $else, $depth + 1 ); + $else, $depth + 1, $backwards ); # convert it to a string with Dumper my $res_code= Dumper( $res ); @@ -618,10 +629,11 @@ sub _optree { sub optree { my $self= shift; my %opt= @_; - my $trie= $self->make_trie( $opt{type}, $opt{max_depth} ); + my $trie= $self->make_trie( $opt{type}, $opt{max_depth}, $opt{backwards} ); $opt{ret_type} ||= 'len'; my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth'; - return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 ); + return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0, + $opt{backwards} ); } # my $optree= generic_optree(%opts); @@ -638,10 +650,10 @@ sub generic_optree { my $test_type= 'depth'; my $else= $opt{else} || 0; - my $latin1= $self->make_trie( 'latin1', $opt{max_depth} ); - my $utf8= $self->make_trie( 'utf8', $opt{max_depth} ); + my $latin1= $self->make_trie( 'latin1', $opt{max_depth}, $opt{backwards} ); + my $utf8= $self->make_trie( 'utf8', $opt{max_depth}, $opt{backwards} ); - $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 ) + $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0, $opt{backwards} ) for $latin1, $utf8; if ( $utf8 ) { @@ -650,9 +662,10 @@ sub generic_optree { $else= __cond_join( "!( is_utf8 )", $latin1, $else ); } if ($opt{type} eq 'generic') { - my $low= $self->make_trie( 'low', $opt{max_depth} ); + my $low= $self->make_trie( 'low', $opt{max_depth}, $opt{backwards} ); if ( $low ) { - $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 ); + $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0, + $opt{backwards} ); } } @@ -710,6 +723,14 @@ sub length_optree { $else= __cond_join( $cond, $optree, $else ); } } + elsif ($opt{backwards}) { + my @size= sort { $a <=> $b } keys %{ $self->{size}{$type} }; + for my $size ( @size ) { + my $optree= $self->$method(%opt, type => $type, max_depth => $size); + my $cond= "((s) - (e) > " . ( $size - 1 ).")"; + $else= __cond_join( $cond, $optree, $else ); + } + } else { my $utf8; @@ -725,11 +746,12 @@ sub length_optree { # If we do want more than the 0-255 range, find those, and if they # exist... if ( $opt{type} !~ /latin1/i - && ($utf8 = $self->make_trie($trie_type, 0))) + && ($utf8 = $self->make_trie($trie_type, 0, $opt{backwards}))) { # ... get them into an optree, and set them up as the 'else' clause - $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0 ); + $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0, + $opt{backwards} ); # We could make this # UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))"; @@ -747,16 +769,18 @@ sub length_optree { # the case where the input isn't UTF-8. my $latin1; if ($method eq 'generic_optree') { - $latin1 = $self->make_trie( 'latin1', 1); - $latin1= $self->_optree($latin1, 'depth', $opt{ret_type}, 0, 0); + $latin1 = $self->make_trie( 'latin1', 1, $opt{backwards}); + $latin1= $self->_optree($latin1, 'depth', $opt{ret_type}, 0, 0, + $opt{backwards}); } # If we want the UTF-8 invariants, get those. my $low; if ($opt{type} !~ /non_low|high/ - && ($low= $self->make_trie( 'low', 1))) + && ($low= $self->make_trie( 'low', 1, 0))) { - $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0 ); + $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0, + $opt{backwards} ); # Expand out the UTF-8 invariants as a string so that we # can use them as the conditional @@ -1303,7 +1327,8 @@ sub render { # make a macro of a given type. # calls into make_trie and (generic_|length_)optree as needed # Opts are: -# type : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8' +# type : 'cp', 'cp_high', 'generic', 'high', 'low', 'latin1', +# 'utf8', 'LATIN1', 'UTF8' 'backwards_UTF8' # ret_type : 'cp' or 'len' # safe : don't assume is well-formed UTF-8, so don't skip any range # checks, and add length guards to macro @@ -1357,6 +1382,7 @@ sub make_macro { $ext .= '_non_low' if $type eq 'generic_non_low'; $ext .= "_safe" if $opts{safe}; $ext .= "_no_length_checks" if $opts{no_length_checks}; + $ext .= "_backwards" if $opts{backwards}; my $argstr= join ",", @args; my $def_fmt="$pfx$self->{op}$ext%s($argstr)"; my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type ); @@ -1418,6 +1444,13 @@ sub make_macro { foreach my $type_spec ( @types ) { my ( $type, $ret )= split /-/, $type_spec; $ret ||= 'len'; + + my $backwards = 0; + if ($type eq 'backwards_UTF8') { + $type = 'UTF8'; + $backwards = 1; + } + foreach my $mod ( @mods ) { # 'safe' is irrelevant with code point macros, so skip if @@ -1435,6 +1468,7 @@ sub make_macro { charset => $charset, no_length_checks => $mod eq 'no_length_checks' && $type !~ /^cp/, + backwards => $backwards, ); print $out_fh $macro, "\n"; } @@ -1562,6 +1596,9 @@ sub make_macro { # class that can include any code point, adding the 'low' ones # to what 'utf8' works on. It is designed to take only an input # UTF-8 parameter. +# backwards_UTF8 like 'UTF8', but designed to match backwards, so that the +# second parameter to the function is earlier in the string than +# the first. # generic generate a macro whose name is 'is_BASE". It has a 2nd, # boolean, parameter which indicates if the first one points to # a UTF-8 string or not. Thus it works in all circumstances. @@ -1648,6 +1685,10 @@ sub make_macro { => high cp_high : fast \p{XPerlSpace} +XPERLSPACE: \p{XPerlSpace} +=> backwards_UTF8 : safe +\p{XPerlSpace} + NONCHAR: Non character code points => UTF8 :safe \p{_Perl_Nchar} diff --git a/regexec.c b/regexec.c index b06b6b0ea37a..ef30bd188598 100644 --- a/regexec.c +++ b/regexec.c @@ -921,6 +921,79 @@ Perl_re_intuit_start(pTHX_ /* not actually used within intuit, but zero for safety anyway */ reginfo->poscache_maxiter = 0; + if(prog->extflags & RXf_RTRIM) { + const char *s = strend; + if (strpos == strend && prog->minlen == 0) { + /* \s* and we are asked to match an empty string */ + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(s - strbeg)) ); + return (char *) s; + } + if (strpos >= strend) { + /* This should be unreachable: + * String shorter than min possible regex match (0 < 1) + * but in the future we might want to also handle ? and {0,...} + */ + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + " rtrim intuit on empty string ...\n")); + goto fail; + } + if (utf8_target) { + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + " rtrim intuit UTF-8 ...\n")); + if (s > strpos) { + while (1) { + Size_t space_len = is_XPERLSPACE_utf8_safe_backwards(s, strpos); + if (space_len == 0) { + break; + } + + s -= space_len; + } + } +#if 0 /* Or, replacing the whole thing above, because the macro handles the + initial condition: */ + Size_t space_len; + while ((space_len = is_XPERLSPACE_utf8_safe_backwards(s, strpos))) + { + s -= space_len; + } +#endif + } + else if (OP(NEXTOPER(progi->program + 1)) == POSIXD) { + /* Without //u \x{A0} mustn't match \s when stored as octets. */ + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + " rtrim intuit Legacy ...\n")); + while (s > strpos) { + s--; + if (! isSPACE(*s)) { + s++; + break; + } + } + } + else { + /* flag /u present - the op will be POSIXU */ + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + " rtrim intuit Latin1 ...\n")); + while (s > strpos) { + s--; + if (! isSPACE_L1(*s)) { + s++; + break; + } + } + } + if (s < strend || (s == strend && prog->minlen == 0)) { + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(s - strbeg)) ); + return (char *) s; + } + goto fail; + } + if (utf8_target) { if ((!prog->anchored_utf8 && prog->anchored_substr) || (!prog->float_utf8 && prog->float_substr)) @@ -3651,9 +3724,19 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, prog->lastparen = prog->lastcloseparen = 0; RXp_MATCH_UTF8_set(prog, utf8_target); prog->offs[0].start = s - strbeg; - prog->offs[0].end = utf8_target - ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg - : s - strbeg + prog->minlenret; + if (prog->extflags & RXf_RTRIM) { + /* Oh my, seems that until RTRIM, match via INTUIT was always + * a fixed length, given by minlenret. + * RTRIM breaks that assumption. + * For now, we just hack our known (other) match length - the + * entire string: */ + prog->offs[0].end = strend - strbeg; + } + else { + prog->offs[0].end = utf8_target + ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg + : s - strbeg + prog->minlenret; + } if ( !(flags & REXEC_NOT_FIRST) ) S_reg_set_capture_string(aTHX_ rx, strbeg, strend, diff --git a/regexp.h b/regexp.h index e40420613353..b783f9906f39 100644 --- a/regexp.h +++ b/regexp.h @@ -414,7 +414,7 @@ and check for NULL. * For the regexp bits, PL_reg_extflags_name[] in regnodes.h has a comment * giving which bits are used/unused */ -# define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT + 2) +# define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT + 1) /* What we have seen */ # define RXf_NO_INPLACE_SUBST (1U<<(RXf_BASE_SHIFT+2)) @@ -450,12 +450,13 @@ and check for NULL. /* split " " */ # define RXf_WHITE (1U<<(RXf_BASE_SHIFT+16)) /* Pattern is /\s+/ */ # define RXf_NULL (1U<<(RXf_BASE_SHIFT+17)) /* Pattern is // */ +# define RXf_RTRIM (1U<<(RXf_BASE_SHIFT+18)) /* Pattern is /[:space:]\z/u */ /* See comments at the beginning of these defines about adding bits. The * highest bit position should be used, so that if RXf_BASE_SHIFT gets * increased, the #error below will be triggered so that you will be reminded * to adjust things at the other end to keep the bit positions unchanged */ -# if RXf_BASE_SHIFT+17 > 31 +# if RXf_BASE_SHIFT+18 > 31 # error Too many RXf_PMf bits used. See comments at beginning of these for what to do # endif diff --git a/regnodes.h b/regnodes.h index 94b71102eb3b..106cc10b9b55 100644 --- a/regnodes.h +++ b/regnodes.h @@ -1654,7 +1654,7 @@ EXTCONST char * const PL_reg_name[] = { EXTCONST char * PL_reg_extflags_name[]; #else EXTCONST char * const PL_reg_extflags_name[] = { - /* Bits in extflags defined: 11111111111111110000111111111111 */ + /* Bits in extflags defined: 11111111111111111000111111111111 */ "MULTILINE", /* 0x00000001 */ "SINGLELINE", /* 0x00000002 */ "FOLD", /* 0x00000004 */ @@ -1670,23 +1670,23 @@ EXTCONST char * const PL_reg_extflags_name[] = { "UNUSED_BIT_12", /* 0x00001000 */ "UNUSED_BIT_13", /* 0x00002000 */ "UNUSED_BIT_14", /* 0x00004000 */ - "UNUSED_BIT_15", /* 0x00008000 */ - "NO_INPLACE_SUBST", /* 0x00010000 */ - "EVAL_SEEN", /* 0x00020000 */ - "UNBOUNDED_QUANTIFIER_SEEN",/* 0x00040000 */ - "CHECK_ALL", /* 0x00080000 */ - "MATCH_UTF8", /* 0x00100000 */ - "USE_INTUIT_NOML", /* 0x00200000 */ - "USE_INTUIT_ML", /* 0x00400000 */ - "INTUIT_TAIL", /* 0x00800000 */ - "IS_ANCHORED", /* 0x01000000 */ - "COPY_DONE", /* 0x02000000 */ - "TAINTED_SEEN", /* 0x04000000 */ - "TAINTED", /* 0x08000000 */ - "START_ONLY", /* 0x10000000 */ - "SKIPWHITE", /* 0x20000000 */ - "WHITE", /* 0x40000000 */ - "NULL", /* 0x80000000 */ + "NO_INPLACE_SUBST", /* 0x00008000 */ + "EVAL_SEEN", /* 0x00010000 */ + "UNBOUNDED_QUANTIFIER_SEEN",/* 0x00020000 */ + "CHECK_ALL", /* 0x00040000 */ + "MATCH_UTF8", /* 0x00080000 */ + "USE_INTUIT_NOML", /* 0x00100000 */ + "USE_INTUIT_ML", /* 0x00200000 */ + "INTUIT_TAIL", /* 0x00400000 */ + "IS_ANCHORED", /* 0x00800000 */ + "COPY_DONE", /* 0x01000000 */ + "TAINTED_SEEN", /* 0x02000000 */ + "TAINTED", /* 0x04000000 */ + "START_ONLY", /* 0x08000000 */ + "SKIPWHITE", /* 0x10000000 */ + "WHITE", /* 0x20000000 */ + "NULL", /* 0x40000000 */ + "RTRIM", /* 0x80000000 */ }; #endif /* DOINIT */ diff --git a/t/re/rtrim.t b/t/re/rtrim.t new file mode 100644 index 000000000000..7c676283a885 --- /dev/null +++ b/t/re/rtrim.t @@ -0,0 +1,411 @@ +#!./perl + +use strict; +use warnings; + +# This tests that regexs used for trimming whitespace from end of string +# continue to work consistently when we optimise the regex engine. + +BEGIN { + chdir 't' if -d 't'; + require './test.pl'; + set_up_inc('../lib'); +} + +my $nbs_u = "\xA0\x{100}"; +chop $nbs_u; +my $nl_u = "\n\x{100}"; +chop $nl_u; +my $empty_u = "\x{100}"; +chop $empty_u; + +my @tests = ( + ['Hello world'], + [' Hello world'], + [' Hello world ', ' Hello world'], + ['Hello world ', 'Hello world'], + + ["Hello world\n", 'Hello world'], + [" Hello world\n", ' Hello world'], + ["Hello world \n", 'Hello world'], + [" Hello world \n", ' Hello world'], + + ["Yarrrr\r", 'Yarrrr'], + ["NBS8\xA0", 'NBS8'], + ["NBSU$nbs_u", 'NBSU'], + ["\n", ""], + ["\r\n\t\f ", ""], + ["!\t", "!"], + + ["EN\x{2002}Space\x{2002}", "EN\x{2002}Space"], + ["\x{2002}\x{2003}Spaces\x{2004}\x{2005}", "\x{2002}\x{2003}Spaces"], + ["\x{1680}", ""], + [$nl_u, ""], + [$empty_u], +); + +plan(80 * @tests); + +# Yes this is recursive copy-paste-edit, but I'm not confident that trying to +# generate the code then string eval it is much better. Particularly given the +# corner cases. (+ sometimes doesn't match, * always will, and without //u) +for (@tests) { + my ($input, $want) = @$_; + my $pretty = $input; + my $pretty_want = $want; + for ($pretty, $pretty_want) { + next + unless defined $_; + s/\n/\\n/g; + s/\f/\\f/g; + s/\r/\\r/g; + s/\t/\\t/g; + # Normally such complexity would have no place *in* a test for the regex + # engine, but as this test is testing optimisations it seems acceptable. + s/([^[:ascii:]])/sprintf "\\x{%X}", ord $1/ge; + } + + # m// s/// s///r + # \s or [[:space:]] + # + or * + # \z or $ + # // or //u + + if (defined $want) { + { + ok($input =~ /\s+\z/u, "qq<$pretty> =~ /\\s+\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//u, 1, "qq<$pretty> =~ s/\\s+\\z//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//ur, $want, "qq<$pretty> =~ s/\\s+\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s+\\z//ur unchanged"); + } + + { + ok($input =~ /[[:space:]]+\z/u, "qq<$pretty> =~ /[[:space:]]+\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//u, 1, "qq<$pretty> =~ s/[[:space:]]+\\z//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//ur, $want, "qq<$pretty> =~ s/[[:space:]]+\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//ur unchanged"); + } + + { + ok($input =~ /\s*\z/u, "qq<$pretty> =~ /\\s*\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//u, 1, "qq<$pretty> =~ s/\\s*\\z//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//ur, $want, "qq<$pretty> =~ s/\\s*\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s*\\z//ur unchanged"); + } + + { + ok($input =~ /[[:space:]]*\z/u, "qq<$pretty> =~ /[[:space:]]*\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//u, 1, "qq<$pretty> =~ s/[[:space:]]*\\z//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//ur, $want, "qq<$pretty> =~ s/[[:space:]]*\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//ur unchanged"); + } + + { + ok($input =~ /\s+$/u, "qq<$pretty> =~ /\\s+\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//u, 1, "qq<$pretty> =~ s/\\s+\$//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//ur, $want, "qq<$pretty> =~ s/\\s+\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s+\$//ur unchanged"); + } + + { + ok($input =~ /[[:space:]]+$/u, "qq<$pretty> =~ /[[:space:]]+\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//u, 1, "qq<$pretty> =~ s/[[:space:]]+\$//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//ur, $want, "qq<$pretty> =~ s/[[:space:]]+\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\$//ur unchanged"); + } + + { + ok($input =~ /\s*$/u, "qq<$pretty> =~ /\\s*\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//u, 1, "qq<$pretty> =~ s/\\s*\$//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//ur, $want, "qq<$pretty> =~ s/\\s*\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s*\$//ur unchanged"); + } + + { + ok($input =~ /[[:space:]]*$/u, "qq<$pretty> =~ /[[:space:]]*\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//u, 1, "qq<$pretty> =~ s/[[:space:]]*\$//u"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//ur, $want, "qq<$pretty> =~ s/[[:space:]]*\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\$//ur unchanged"); + } + } + else { + { + ok($input !~ /\s+\z/u, "qq<$pretty> !~ /\\s+\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//u, "", "qq<$pretty> =~ s/\\s+\\z//u"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//ur, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//ur"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + ok($input !~ /[[:space:]]+\z/u, "qq<$pretty> !~ /[[:space:]]+\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]+\z//u, "", "qq<$pretty> =~ s/[[:space:]]+\\z//u retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//u unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//ur, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//ur unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /\s*\z/u, "qq<$pretty> =~ /\\s*\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//u, 1, "qq<$pretty> =~ s/\\s*\\z//u"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//ur, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//ur"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /[[:space:]]*\z/u, "qq<$pretty> =~ /[[:space:]]*\\z/u"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]*\z//u, 1, "qq<$pretty> =~ s/[[:space:]]*\\z//u retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//u unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//ur, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//ur unchanged"); + } + + { + ok($input !~ /\s+$/u, "qq<$pretty> !~ /\\s+\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//u, "", "qq<$pretty> =~ s/\\s+\$//u"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//ur, $input, "qq<$pretty> =~ s/[[:space:]]+\$//ur"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + ok($input !~ /[[:space:]]+$/u, "qq<$pretty> !~ /[[:space:]]+\$/u"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]+$//u, "", "qq<$pretty> =~ s/[[:space:]]+\$//u retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]+\$//u unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//ur, $input, "qq<$pretty> =~ s/[[:space:]]+\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\$//ur unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /\s*$/u, "qq<$pretty> =~ /\\s*\$/u"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//u, 1, "qq<$pretty> =~ s/\\s*\$//u"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//ur, $input, "qq<$pretty> =~ s/[[:space:]]*\$//ur"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /[[:space:]]*$/u, "qq<$pretty> =~ /[[:space:]]*\$/u"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]*$//u, 1, "qq<$pretty> =~ s/[[:space:]]*\$//u retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]*\$//u unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//ur, $input, "qq<$pretty> =~ s/[[:space:]]*\$//ur retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\$//ur unchanged"); + } + } + + # And without //u + undef $want + if $input =~ /^NBS8/; + + if (defined $want) { + { + ok($input =~ /\s+\z/, "qq<$pretty> =~ /\\s+\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//, 1, "qq<$pretty> =~ s/\\s+\\z//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//r, $want, "qq<$pretty> =~ s/\\s+\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s+\\z//r unchanged"); + } + + { + ok($input =~ /[[:space:]]+\z/, "qq<$pretty> =~ /[[:space:]]+\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//, 1, "qq<$pretty> =~ s/[[:space:]]+\\z//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//r, $want, "qq<$pretty> =~ s/[[:space:]]+\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//r unchanged"); + } + + { + ok($input =~ /\s*\z/, "qq<$pretty> =~ /\\s*\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//, 1, "qq<$pretty> =~ s/\\s*\\z//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//r, $want, "qq<$pretty> =~ s/\\s*\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s*\\z//r unchanged"); + } + + { + ok($input =~ /[[:space:]]*\z/, "qq<$pretty> =~ /[[:space:]]*\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//, 1, "qq<$pretty> =~ s/[[:space:]]*\\z//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//r, $want, "qq<$pretty> =~ s/[[:space:]]*\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//r unchanged"); + } + + { + ok($input =~ /\s+$/, "qq<$pretty> =~ /\\s+\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//, 1, "qq<$pretty> =~ s/\\s+\$//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//r, $want, "qq<$pretty> =~ s/\\s+\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s+\$//r unchanged"); + } + + { + ok($input =~ /[[:space:]]+$/, "qq<$pretty> =~ /[[:space:]]+\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//, 1, "qq<$pretty> =~ s/[[:space:]]+\$//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//r, $want, "qq<$pretty> =~ s/[[:space:]]+\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\$//r unchanged"); + } + + { + ok($input =~ /\s*$/, "qq<$pretty> =~ /\\s*\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//, 1, "qq<$pretty> =~ s/\\s*\$//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//r, $want, "qq<$pretty> =~ s/\\s*\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/\\s*\$//r unchanged"); + } + + { + ok($input =~ /[[:space:]]*$/, "qq<$pretty> =~ /[[:space:]]*\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//, 1, "qq<$pretty> =~ s/[[:space:]]*\$//"); + is($copy1, $want, "qq<$pretty> => qq<$pretty_want>"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//r, $want, "qq<$pretty> =~ s/[[:space:]]*\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\$//r unchanged"); + } + } + else { + { + ok($input !~ /\s+\z/, "qq<$pretty> !~ /\\s+\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s+\z//, "", "qq<$pretty> =~ s/\\s+\\z//"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//r, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//r"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + ok($input !~ /[[:space:]]+\z/, "qq<$pretty> !~ /[[:space:]]+\\z/"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]+\z//, "", "qq<$pretty> =~ s/[[:space:]]+\\z// retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]+\\z// unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+\z//r, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\\z//r unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /\s*\z/, "qq<$pretty> =~ /\\s*\\z/"); + my $copy1 = $input; + is($copy1 =~ s/\s*\z//, 1, "qq<$pretty> =~ s/\\s*\\z//"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//r, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//r"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /[[:space:]]*\z/, "qq<$pretty> =~ /[[:space:]]*\\z/"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]*\z//, 1, "qq<$pretty> =~ s/[[:space:]]*\\z// retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]*\\z// unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*\z//r, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\\z//r unchanged"); + } + + { + ok($input !~ /\s+$/, "qq<$pretty> !~ /\\s+\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s+$//, "", "qq<$pretty> =~ s/\\s+\$//"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//r, $input, "qq<$pretty> =~ s/[[:space:]]+\$//r"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + ok($input !~ /[[:space:]]+$/, "qq<$pretty> !~ /[[:space:]]+\$/"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]+$//, "", "qq<$pretty> =~ s/[[:space:]]+\$// retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]+\$// unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s+$//r, $input, "qq<$pretty> =~ s/[[:space:]]+\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]+\$//r unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /\s*$/, "qq<$pretty> =~ /\\s*\$/"); + my $copy1 = $input; + is($copy1 =~ s/\s*$//, 1, "qq<$pretty> =~ s/\\s*\$//"); + is($copy1, $input, "qq<$pretty> unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//r, $input, "qq<$pretty> =~ s/[[:space:]]*\$//r"); + is($copy2, $input, "qq<$pretty> unchanged"); + } + + { + # Unlike +, * matches, but doesn't change anything + ok($input =~ /[[:space:]]*$/, "qq<$pretty> =~ /[[:space:]]*\$/"); + my $copy1 = $input; + is($copy1 =~ s/[[:space:]]*$//, 1, "qq<$pretty> =~ s/[[:space:]]*\$// retval"); + is($copy1, $input, "qq<$pretty> =~ s/[[:space:]]*\$// unchanged"); + my $copy2 = $input; + is($copy2 =~ s/\s*$//r, $input, "qq<$pretty> =~ s/[[:space:]]*\$//r retval"); + is($copy2, $input, "qq<$pretty> =~ s/[[:space:]]*\$//r unchanged"); + } + } +}