Skip to content

Commit

Permalink
regcharclass.pl: Backwards UTF-8 isSPACE
Browse files Browse the repository at this point in the history
  • Loading branch information
khwilliamson committed Jun 6, 2021
1 parent 1737272 commit 9f0d3ea
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 22 deletions.
89 changes: 88 additions & 1 deletion regcharclass.h
Expand Up @@ -155,6 +155,35 @@
( 0x202F == cp || ( 0x202F < cp && \
( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) )

/*
XPERLSPACE: \p{XPerlSpace}
\p{XPerlSpace}
*/
/*** GENERATED CODE ***/
/* Backwards matcher for \p{XPerlSpace}: tests whether the bytes that END
 * immediately before 's' form one XPerlSpace character.
 *
 *   s : pointer just past the last byte to examine; the macro reads s[-1],
 *       then s[-2] and s[-3] as needed.
 *   e : lower bound of the buffer.  Each tier is guarded by (s) - (e), so no
 *       byte below 'e' is ever read: three bytes are examined only when
 *       s - e > 2, two when s - e > 1, one when s - e > 0.
 *
 * Evaluates to the number of bytes matched (1, 2 or 3), or 0 if there is no
 * match or no byte is available.  In the one-byte-available tier the result
 * is the truth value of the single-byte test, i.e. 1 or 0.
 *
 * Sequences accepted, written in forward (memory) order:
 *   1 byte : '\t'..'\r', ' '
 *   2 bytes: C2 85, C2 A0
 *   3 bytes: E1 9A 80, E2 80 80..8A, E2 80 A8..A9, E2 80 AF, E2 81 9F,
 *            E3 80 80
 * Under standard UTF-8 these decode to U+0085, U+00A0, U+1680,
 * U+2000..U+200A, U+2028, U+2029, U+202F, U+205F and U+3000.
 * NOTE(review): presumably this is the ASCII-platform variant; the enclosing
 * preprocessor condition is not visible here -- confirm.
 *
 * inRANGE_helper_(U8, b, lo, hi) is defined elsewhere; presumably it tests
 * lo <= b <= hi -- confirm against its definition.
 *
 * Caveat: 's' appears unparenthesized inside the casts ((const U8*)s - N) and
 * both arguments are evaluated multiple times, so callers should pass simple
 * pointer expressions without side effects.
 */
#define is_XPERLSPACE_utf8_safe_backwards(s,e) \
( ((s) - (e) > 2) ? \
( ( inRANGE_helper_(U8, *((const U8*)s - 1), '\t', '\r') || ' ' == *((const U8*)s - 1) ) ? 1\
: ( 0x80 == *((const U8*)s - 1) ) ? \
( ( 0x80 == *((const U8*)s - 2) ) ? \
( ( inRANGE_helper_(U8, *((const U8*)s - 3), 0xE2, 0xE3) ) ? 3 : 0 )\
: ( ( 0x9A == *((const U8*)s - 2) ) && ( 0xE1 == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x81, 0x84) || inRANGE_helper_(U8, *((const U8*)s - 1), 0x86, 0x8A) || inRANGE_helper_(U8, *((const U8*)s - 1), 0xA8, 0xA9) || 0xAF == *((const U8*)s - 1) ) ?\
( ( ( 0x80 == *((const U8*)s - 2) ) && ( 0xE2 == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ( 0x85 == *((const U8*)s - 1) ) ? \
( ( 0x80 == *((const U8*)s - 2) ) ? \
( ( 0xE2 == *((const U8*)s - 3) ) ? 3 : 0 ) \
: ( 0xC2 == *((const U8*)s - 2) ) ? 2 : 0 ) \
: ( 0x9F == *((const U8*)s - 1) ) ? \
( ( ( 0x81 == *((const U8*)s - 2) ) && ( 0xE2 == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ( ( 0xA0 == *((const U8*)s - 1) ) && ( 0xC2 == *((const U8*)s - 2) ) ) ? 2 : 0 )\
: ((s) - (e) > 1) ? \
( ( inRANGE_helper_(U8, *((const U8*)s - 1), '\t', '\r') || ' ' == *((const U8*)s - 1) ) ? 1\
: ( ( 0x85 == *((const U8*)s - 1) || 0xA0 == *((const U8*)s - 1) ) && ( 0xC2 == *((const U8*)s - 2) ) ) ? 2 : 0 )\
: ((s) - (e) > 0) ? \
( inRANGE_helper_(U8, *((const U8*)s - 1), '\t', '\r') || ' ' == *((const U8*)s - 1) )\
: 0 )

/*
NONCHAR: Non character code points
Expand Down Expand Up @@ -1338,6 +1367,35 @@
( 0x202F == cp || ( 0x202F < cp && \
( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) )

/*
XPERLSPACE: \p{XPerlSpace}
\p{XPerlSpace}
*/
/*** GENERATED CODE ***/
/* Backwards matcher for \p{XPerlSpace}: tests whether the bytes that END
 * immediately before 's' form one XPerlSpace character.
 *
 *   s : pointer just past the last byte to examine; the macro reads s[-1],
 *       then s[-2] and s[-3] as needed.
 *   e : lower bound of the buffer.  Each tier is guarded by (s) - (e), so no
 *       byte below 'e' is ever read: three bytes are examined only when
 *       s - e > 2, two when s - e > 1, one when s - e > 0.
 *
 * Evaluates to the number of bytes matched (1, 2 or 3), or 0 if there is no
 * match or no byte is available.  In the one-byte-available tier the result
 * is the truth value of the single-byte test, i.e. 1 or 0.
 *
 * Single-byte matches here are '\t', '\v'..'\r', '\n', 0x25 and ' '; the only
 * two-byte match is 0x80 followed by 0x41 (forward order); every three-byte
 * match has 0xCA, 0xCE or 0xBC as its first (lowest-address) byte.
 * NOTE(review): these byte values are not standard UTF-8, so this is
 * presumably a variant for a non-ASCII native character set (e.g. an EBCDIC
 * code page) selected by a preprocessor condition not visible here --
 * confirm which one.
 *
 * inRANGE_helper_(U8, b, lo, hi) is defined elsewhere; presumably it tests
 * lo <= b <= hi -- confirm against its definition.
 *
 * Caveat: 's' appears unparenthesized inside the casts ((const U8*)s - N) and
 * both arguments are evaluated multiple times, so callers should pass simple
 * pointer expressions without side effects.
 */
#define is_XPERLSPACE_utf8_safe_backwards(s,e) \
( ((s) - (e) > 2) ? \
( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || '\n' == *((const U8*)s - 1) || 0x25 == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\
: ( 0x41 == *((const U8*)s - 1) ) ? \
( ( 0x41 == *((const U8*)s - 2) ) ? \
( ( ( *((const U8*)s - 3) & 0xFB ) == 0xCA ) ? 3 : 0 ) \
: ( 0x63 == *((const U8*)s - 2) ) ? \
( ( 0xBC == *((const U8*)s - 3) ) ? 3 : 0 ) \
: ( 0x80 == *((const U8*)s - 2) ) ? 2 : 0 ) \
: ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x42, 0x48) || 0x51 == *((const U8*)s - 1) ) ?\
( ( ( 0x41 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x49, 0x4A) ) ? \
( ( ( inRANGE_helper_(U8, *((const U8*)s - 2), 0x41, 0x42) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ( 0x56 == *((const U8*)s - 1) ) ? \
( ( ( 0x42 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ( ( ( 0x73 == *((const U8*)s - 1) ) && ( 0x43 == *((const U8*)s - 2) ) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ((s) - (e) > 1) ? \
( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || '\n' == *((const U8*)s - 1) || 0x25 == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\
: ( ( 0x41 == *((const U8*)s - 1) ) && ( 0x80 == *((const U8*)s - 2) ) ) ? 2 : 0 )\
: ((s) - (e) > 0) ? \
( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || '\n' == *((const U8*)s - 1) || 0x25 == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) )\
: 0 )

/*
NONCHAR: Non character code points
Expand Down Expand Up @@ -2516,6 +2574,35 @@
( 0x202F == cp || ( 0x202F < cp && \
( 0x205F == cp || 0x3000 == cp ) ) ) ) ) ) ) ) )

/*
XPERLSPACE: \p{XPerlSpace}
\p{XPerlSpace}
*/
/*** GENERATED CODE ***/
/* Backwards matcher for \p{XPerlSpace}: tests whether the bytes that END
 * immediately before 's' form one XPerlSpace character.
 *
 *   s : pointer just past the last byte to examine; the macro reads s[-1],
 *       then s[-2] and s[-3] as needed.
 *   e : lower bound of the buffer.  Each tier is guarded by (s) - (e), so no
 *       byte below 'e' is ever read: three bytes are examined only when
 *       s - e > 2, two when s - e > 1, one when s - e > 0.
 *
 * Evaluates to the number of bytes matched (1, 2 or 3), or 0 if there is no
 * match or no byte is available.  In the one-byte-available tier the result
 * is the truth value of the single-byte test, i.e. 1 or 0.
 *
 * Single-byte matches here are '\t', '\v'..'\r', 0x15, '\n' and ' '; the only
 * two-byte match is 0x78 followed by 0x41 (forward order); every three-byte
 * match has 0xCA, 0xCE or 0xBD as its first (lowest-address) byte.
 * NOTE(review): these byte values are not standard UTF-8, so this is
 * presumably a variant for a non-ASCII native character set (e.g. an EBCDIC
 * code page) selected by a preprocessor condition not visible here --
 * confirm which one.
 *
 * inRANGE_helper_(U8, b, lo, hi) is defined elsewhere; presumably it tests
 * lo <= b <= hi -- confirm against its definition.
 *
 * Caveat: 's' appears unparenthesized inside the casts ((const U8*)s - N) and
 * both arguments are evaluated multiple times, so callers should pass simple
 * pointer expressions without side effects.
 */
#define is_XPERLSPACE_utf8_safe_backwards(s,e) \
( ((s) - (e) > 2) ? \
( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || 0x15 == *((const U8*)s - 1) || '\n' == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\
: ( 0x41 == *((const U8*)s - 1) ) ? \
( ( 0x41 == *((const U8*)s - 2) ) ? \
( ( ( *((const U8*)s - 3) & 0xFB ) == 0xCA ) ? 3 : 0 ) \
: ( 0x62 == *((const U8*)s - 2) ) ? \
( ( 0xBD == *((const U8*)s - 3) ) ? 3 : 0 ) \
: ( 0x78 == *((const U8*)s - 2) ) ? 2 : 0 ) \
: ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x42, 0x48) || 0x51 == *((const U8*)s - 1) ) ?\
( ( ( 0x41 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ( inRANGE_helper_(U8, *((const U8*)s - 1), 0x49, 0x4A) ) ? \
( ( ( inRANGE_helper_(U8, *((const U8*)s - 2), 0x41, 0x42) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ( 0x56 == *((const U8*)s - 1) ) ? \
( ( ( 0x42 == *((const U8*)s - 2) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ( ( ( 0x72 == *((const U8*)s - 1) ) && ( 0x43 == *((const U8*)s - 2) ) ) && ( 0xCA == *((const U8*)s - 3) ) ) ? 3 : 0 )\
: ((s) - (e) > 1) ? \
( ( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || 0x15 == *((const U8*)s - 1) || '\n' == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) ) ? 1\
: ( ( 0x41 == *((const U8*)s - 1) ) && ( 0x78 == *((const U8*)s - 2) ) ) ? 2 : 0 )\
: ((s) - (e) > 0) ? \
( '\t' == *((const U8*)s - 1) || inRANGE_helper_(U8, *((const U8*)s - 1), '\v', '\r') || 0x15 == *((const U8*)s - 1) || '\n' == *((const U8*)s - 1) || ' ' == *((const U8*)s - 1) )\
: 0 )

/*
NONCHAR: Non character code points
Expand Down Expand Up @@ -3617,6 +3704,6 @@
* 696e706fddd3ce8cd48c7ea91caf4c9edf5c296432d320aa7b78631f69aa9eac lib/unicore/mktables
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
* 24120d5e0c9685c442c93bc1dbea9b85ef973bf8e9474baf0e55b160c288226b regen/charset_translations.pl
* 3635c6e564558e965018947bdab45f37d9a4fa82eb05b2694eae1a04bf7e65a3 regen/regcharclass.pl
* 29d7ced5065b4b2476607aefb87083c37a7dc5f9705430a7c0811d4232efca13 regen/regcharclass.pl
* b2f896452d2b30da3e04800f478c60c1fd0b03d6b668689b020f1e3cf1f1cdd9 regen/regcharclass_multi_char_folds.pl
* ex: set ro: */
83 changes: 62 additions & 21 deletions regen/regcharclass.pl
Expand Up @@ -354,8 +354,9 @@ sub val_fmt
#
# Each string is then stored in the 'strs' subhash as a hash record
# made up of the results of __uni_latin1, using the keynames
# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
# 'UTF8' which hold a merge of 'low' and their lowercase equivalents.
# 'low', 'latin1', 'utf8', as well as the synthesized 'LATIN1', 'high',
# 'UTF8', and 'backwards_UTF8' which hold a merge of 'low' and their lowercase
# equivalents.
#
# Size data is tracked per type in the 'size' subhash.
#
Expand Down Expand Up @@ -489,7 +490,7 @@ sub new {
#

sub make_trie {
my ( $self, $type, $maxlen )= @_;
my ( $self, $type, $maxlen, $backwards )= @_;

my $strs= $self->{strs};
my %trie;
Expand All @@ -500,7 +501,8 @@ sub make_trie {
next unless $dat;
next if $maxlen && @$dat > $maxlen;
my $node= \%trie;
foreach my $elem ( @$dat ) {
my @ordered_dat = ($backwards) ? reverse @$dat : @$dat;
foreach my $elem ( @ordered_dat ) {
$node->{$elem} ||= {};
$node= $node->{$elem};
}
Expand Down Expand Up @@ -533,7 +535,7 @@ ($)
#

sub _optree {
my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
my ( $self, $trie, $test_type, $ret_type, $else, $depth, $backwards )= @_;
return unless defined $trie;
$ret_type ||= 'len';
$else= 0 unless defined $else;
Expand Down Expand Up @@ -567,7 +569,16 @@ sub _optree {
# can return the "else" value.
return $else if !@conds;

my $test = $test_type =~ /^cp/ ? "cp" : "((const U8*)s)[$depth]";
my $test;
if ($test_type =~ /^cp/) {
$test = "cp";
}
elsif ($backwards) {
$test = "*((const U8*)s - " . ($depth + 1) . ")";
}
else {
$test = "((const U8*)s)[$depth]";
}

# First we loop over the possible keys/conditions and find out what they
# look like; we group conditions with the same optree together.
Expand All @@ -578,7 +589,7 @@ sub _optree {

# get the optree for this child/condition
my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type,
$else, $depth + 1 );
$else, $depth + 1, $backwards );
# convert it to a string with Dumper
my $res_code= Dumper( $res );

Expand Down Expand Up @@ -618,10 +629,11 @@ sub _optree {
sub optree {
my $self= shift;
my %opt= @_;
my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
my $trie= $self->make_trie( $opt{type}, $opt{max_depth}, $opt{backwards} );
$opt{ret_type} ||= 'len';
my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0,
$opt{backwards} );
}

# my $optree= generic_optree(%opts);
Expand All @@ -638,10 +650,10 @@ sub generic_optree {
my $test_type= 'depth';
my $else= $opt{else} || 0;

my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
my $latin1= $self->make_trie( 'latin1', $opt{max_depth}, $opt{backwards} );
my $utf8= $self->make_trie( 'utf8', $opt{max_depth}, $opt{backwards} );

$_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
$_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0, $opt{backwards} )
for $latin1, $utf8;

if ( $utf8 ) {
Expand All @@ -650,9 +662,10 @@ sub generic_optree {
$else= __cond_join( "!( is_utf8 )", $latin1, $else );
}
if ($opt{type} eq 'generic') {
my $low= $self->make_trie( 'low', $opt{max_depth} );
my $low= $self->make_trie( 'low', $opt{max_depth}, $opt{backwards} );
if ( $low ) {
$else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
$else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0,
$opt{backwards} );
}
}

Expand Down Expand Up @@ -710,6 +723,14 @@ sub length_optree {
$else= __cond_join( $cond, $optree, $else );
}
}
elsif ($opt{backwards}) {
my @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
for my $size ( @size ) {
my $optree= $self->$method(%opt, type => $type, max_depth => $size);
my $cond= "((s) - (e) > " . ( $size - 1 ).")";
$else= __cond_join( $cond, $optree, $else );
}
}
else {
my $utf8;

Expand All @@ -725,11 +746,12 @@ sub length_optree {
# If we do want more than the 0-255 range, find those, and if they
# exist...
if ( $opt{type} !~ /latin1/i
&& ($utf8 = $self->make_trie($trie_type, 0)))
&& ($utf8 = $self->make_trie($trie_type, 0, $opt{backwards})))
{

# ... get them into an optree, and set them up as the 'else' clause
$utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0 );
$utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0,
$opt{backwards} );

# We could make this
# UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))";
Expand All @@ -747,16 +769,18 @@ sub length_optree {
# the case where the input isn't UTF-8.
my $latin1;
if ($method eq 'generic_optree') {
$latin1 = $self->make_trie( 'latin1', 1);
$latin1= $self->_optree($latin1, 'depth', $opt{ret_type}, 0, 0);
$latin1 = $self->make_trie( 'latin1', 1, $opt{backwards});
$latin1= $self->_optree($latin1, 'depth', $opt{ret_type}, 0, 0,
$opt{backwards});
}

# If we want the UTF-8 invariants, get those.
my $low;
if ($opt{type} !~ /non_low|high/
&& ($low= $self->make_trie( 'low', 1)))
&& ($low= $self->make_trie( 'low', 1, 0)))
{
$low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0 );
$low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0,
$opt{backwards} );

# Expand out the UTF-8 invariants as a string so that we
# can use them as the conditional
Expand Down Expand Up @@ -1303,7 +1327,8 @@ sub render {
# make a macro of a given type.
# calls into make_trie and (generic_|length_)optree as needed
# Opts are:
# type : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
# type : 'cp', 'cp_high', 'generic', 'high', 'low', 'latin1',
# 'utf8', 'LATIN1', 'UTF8', 'backwards_UTF8'
# ret_type : 'cp' or 'len'
# safe : don't assume is well-formed UTF-8, so don't skip any range
# checks, and add length guards to macro
Expand Down Expand Up @@ -1357,6 +1382,7 @@ sub make_macro {
$ext .= '_non_low' if $type eq 'generic_non_low';
$ext .= "_safe" if $opts{safe};
$ext .= "_no_length_checks" if $opts{no_length_checks};
$ext .= "_backwards" if $opts{backwards};
my $argstr= join ",", @args;
my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
Expand Down Expand Up @@ -1418,6 +1444,13 @@ sub make_macro {
foreach my $type_spec ( @types ) {
my ( $type, $ret )= split /-/, $type_spec;
$ret ||= 'len';

my $backwards = 0;
if ($type eq 'backwards_UTF8') {
$type = 'UTF8';
$backwards = 1;
}

foreach my $mod ( @mods ) {

# 'safe' is irrelevant with code point macros, so skip if
Expand All @@ -1435,6 +1468,7 @@ sub make_macro {
charset => $charset,
no_length_checks => $mod eq 'no_length_checks'
&& $type !~ /^cp/,
backwards => $backwards,
);
print $out_fh $macro, "\n";
}
Expand Down Expand Up @@ -1562,6 +1596,9 @@ sub make_macro {
# class that can include any code point, adding the 'low' ones
# to what 'utf8' works on. It is designed to take only an input
# UTF-8 parameter.
# backwards_UTF8 like 'UTF8', but designed to match backwards, so that the
# second parameter to the function is earlier in the string than
# the first.
# generic generate a macro whose name is 'is_BASE". It has a 2nd,
# boolean, parameter which indicates if the first one points to
# a UTF-8 string or not. Thus it works in all circumstances.
Expand Down Expand Up @@ -1648,6 +1685,10 @@ sub make_macro {
=> high cp_high : fast
\p{XPerlSpace}
XPERLSPACE: \p{XPerlSpace}
=> backwards_UTF8 : safe
\p{XPerlSpace}
NONCHAR: Non character code points
=> UTF8 :safe
\p{_Perl_Nchar}
Expand Down

0 comments on commit 9f0d3ea

Please sign in to comment.