Skip to content

Commit

Permalink
regcharclass.pl: Move some code earlier
Browse files Browse the repository at this point in the history
We can short circuit some work by moving the test earlier.  This does
not change the generated file.
  • Loading branch information
khwilliamson committed Aug 7, 2021
1 parent 741c1a9 commit bc5a92d
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 40 deletions.
2 changes: 1 addition & 1 deletion regcharclass.h
Expand Up @@ -3765,6 +3765,6 @@
* 696e706fddd3ce8cd48c7ea91caf4c9edf5c296432d320aa7b78631f69aa9eac lib/unicore/mktables
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* 1738139306d9ade7fcff636a38c9bf6c1889b98c63c8d7f8928e275ee5944afc regen/regcharclass.pl
* 69854621b341c8eab85aad890763e28eaa6d4df83cecb0566e81f7f6619efff0 regen/regcharclass.pl
* b2f896452d2b30da3e04800f478c60c1fd0b03d6b668689b020f1e3cf1f1cdd9 regen/regcharclass_multi_char_folds.pl
* ex: set ro: */
80 changes: 41 additions & 39 deletions regen/regcharclass.pl
Expand Up @@ -1096,6 +1096,46 @@ sub _cond_as_str {

return 1 if @cond == 256; # If all bytes match, is trivially true

# If this is a single UTF-8 range which includes all possible
# continuation bytes, and we aren't checking for well-formedness, this
# is trivially true.
if ( @ranges == 1
&& ! $opts_ref->{safe}
&& ! $opts_ref->{no_length_checks}
&& $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi
&& $ranges[0]->[1] == 0xBF
&& $ranges[0]->[0] == 0x80)
{
return 1;
}

my $loop_start = 0;
if (ref $ranges[0] && $ranges[0]->[0] == 0) {

# If the first range matches all 256 possible bytes, it is
# trivially true.
return 1 if $ranges[0]->[1] == 0xFF;
# (@ranges must be 1 in this case)
# Here, the first range starts at 0, but doesn't match everything.
# But the condition doesn't have to worry about being < 0
$ranges[0] = "( $test <= "
. $self->val_fmt($ranges[0]->[1]) . " )";
$loop_start++;
}

my $loop_end = @ranges;
if ( @ranges
&& ref $ranges[-1]
&& $ranges[-1]->[1] == 0xFF
&& $ranges[-1]->[0] != 0xFF)
{
# If the final range consists of more than one byte ending with
# the highest possible one, the condition doesn't have to worry
# about being > FF
$ranges[-1] = "( $test >= " . $self->val_fmt($ranges[-1]->[0]) . " )";
$loop_end--;
}

my @masks;
if (@ranges > 1) {

Expand Down Expand Up @@ -1134,53 +1174,15 @@ sub _cond_as_str {
# Here, there was no entire-class optimization that was clearly better
# than doing things by ranges. Look at each range.
my $range_count_extra = 0;
for (my $i = 0; $i < @ranges; $i++) {
for (my $i = $loop_start; $i < $loop_end; $i++) {
if (! ref $ranges[$i]) { # Trivial case: no range
$ranges[$i] = $self->val_fmt($ranges[$i]) . " == $test";
}
elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
$ranges[$i] = # Trivial case: single element range
$self->val_fmt($ranges[$i]->[0]) . " == $test";
}
elsif ($ranges[$i]->[0] == 0) {
# If the range matches all 256 possible bytes, it is trivially
# true.
return 1 if $ranges[0]->[1] == 0xFF; # @ranges must be 1 in
# this case
$ranges[$i] = "( $test <= "
. $self->val_fmt($ranges[$i]->[1]) . " )";
}
elsif ($ranges[$i]->[1] == 255) {

# Similarly the max possible is 255, so can omit an upper bound
# test if the calculated max is the max possible one.
$ranges[$i] = "( $test >= " . $self->val_fmt($ranges[0]->[0]) . " )";
}
else {
# Well-formed UTF-8 continuation bytes on ascii platforms must be
# in the range 0x80 .. 0xBF. If we know that the input is
# well-formed (indicated by not trying to be 'safe'), we can omit
# tests that verify that the input is within either of these
# bounds. (No legal UTF-8 character can begin with anything in
# this range, so we don't have to worry about this being a
# continuation byte or not.)
if ($opts_ref->{charset} =~ /ascii/i
&& (! $opts_ref->{safe} && ! $opts_ref->{no_length_checks})
&& $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
{
# If the range is the entire legal range, it matches any legal
# byte, so we can omit both tests. (This should happen only
# if the number of ranges is 1.)
if ($ranges[$i]->[0] == 0x80 && $ranges[$i]->[1] == 0xBF) {
return 1;
}
}

# Here, it isn't the full range of legal continuation bytes. We
# could just assume that there's nothing outside of the legal
# bounds. But inRANGE() allows us to have a single conditional,
# so the only cost of making sure it's a legal UTF-8 continuation
# byte is an extra subtraction instruction, a trivial expense.
$ranges[$i] = "inRANGE_helper_(U8, $test, "
. $self->val_fmt($ranges[$i]->[0]) .", "
. $self->val_fmt($ranges[$i]->[1]) . ")";
Expand Down

0 comments on commit bc5a92d

Please sign in to comment.