regcharclass.pl: Move some code earlier

We can short-circuit some work by moving the test earlier. This does not change the generated file.
Perl · Aug 7, 2021 · bc5a92d · bc5a92d
1 parent 741c1a9
commit bc5a92d
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 40 deletions.
diff --git a/regcharclass.h b/regcharclass.h
@@ -3765,6 +3765,6 @@
  * 696e706fddd3ce8cd48c7ea91caf4c9edf5c296432d320aa7b78631f69aa9eac lib/unicore/mktables
  * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
  * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
- * 1738139306d9ade7fcff636a38c9bf6c1889b98c63c8d7f8928e275ee5944afc regen/regcharclass.pl
+ * 69854621b341c8eab85aad890763e28eaa6d4df83cecb0566e81f7f6619efff0 regen/regcharclass.pl
  * b2f896452d2b30da3e04800f478c60c1fd0b03d6b668689b020f1e3cf1f1cdd9 regen/regcharclass_multi_char_folds.pl
  * ex: set ro: */
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
@@ -1096,6 +1096,46 @@ sub _cond_as_str {
 
     return 1 if @cond == 256;  # If all bytes match, is trivially true
 
+        # If this is a single UTF-8 range which includes all possible
+        # continuation bytes, and we aren't checking for well-formedness, this
+        # is trivially true.
+        if (     @ranges == 1
+            && ! $opts_ref->{safe}
+            && ! $opts_ref->{no_length_checks}
+            &&   $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi
+            &&   $ranges[0]->[1] == 0xBF
+            &&   $ranges[0]->[0] == 0x80)
+        {
+            return 1;
+        }
+
+        my $loop_start = 0;
+        if (ref $ranges[0] && $ranges[0]->[0] == 0) {
+
+            # If the first range matches all 256 possible bytes, it is
+            # trivially true.
+            return 1 if $ranges[0]->[1] == 0xFF;
+                                                    # @ranges must be 1 here
+            # Here, the first range starts at 0, but doesn't match everything.
+            # But the condition doesn't have to worry about being < 0
+            $ranges[0] = "( $test <= "
+                        . $self->val_fmt($ranges[0]->[1]) . " )";
+            $loop_start++;
+        }
+
+        my $loop_end = @ranges;
+        if (   @ranges
+            && ref $ranges[-1]
+            && $ranges[-1]->[1] == 0xFF
+            && $ranges[-1]->[0] != 0xFF)
+        {
+            # If the final range consists of more than one byte ending with
+            # the highest possible one, the condition doesn't have to worry
+            # about being > FF
+            $ranges[-1] = "( $test >= " . $self->val_fmt($ranges[-1]->[0]) . " )";
+            $loop_end--;
+        }
+
     my @masks;
     if (@ranges > 1) {
 
@@ -1134,53 +1174,15 @@ sub _cond_as_str {
     # Here, there was no entire-class optimization that was clearly better
     # than doing things by ranges.  Look at each range.
     my $range_count_extra = 0;
-    for (my $i = 0; $i < @ranges; $i++) {
+    for (my $i = $loop_start; $i < $loop_end; $i++) {
         if (! ref $ranges[$i]) {    # Trivial case: no range
             $ranges[$i] = $self->val_fmt($ranges[$i]) . " == $test";
         }
         elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
             $ranges[$i] =           # Trivial case: single element range
                     $self->val_fmt($ranges[$i]->[0]) . " == $test";
         }
-        elsif ($ranges[$i]->[0] == 0) {
-            # If the range matches all 256 possible bytes, it is trivially
-            # true.
-            return 1 if $ranges[0]->[1] == 0xFF;    # @ranges must be 1 in
-                                                    # this case
-            $ranges[$i] = "( $test <= "
-                        . $self->val_fmt($ranges[$i]->[1]) . " )";
-        }
-        elsif ($ranges[$i]->[1] == 255) {
-
-            # Similarly the max possible is 255, so can omit an upper bound
-            # test if the calculated max is the max possible one.
-            $ranges[$i] = "( $test >= " . $self->val_fmt($ranges[0]->[0]) . " )";
-        }
         else {
-            # Well-formed UTF-8 continuation bytes on ascii platforms must be
-            # in the range 0x80 .. 0xBF.  If we know that the input is
-            # well-formed (indicated by not trying to be 'safe'), we can omit
-            # tests that verify that the input is within either of these
-            # bounds.  (No legal UTF-8 character can begin with anything in
-            # this range, so we don't have to worry about this being a
-            # continuation byte or not.)
-            if ($opts_ref->{charset} =~ /ascii/i
-                && (! $opts_ref->{safe} && ! $opts_ref->{no_length_checks})
-                && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
-            {
-                # If the range is the entire legal range, it matches any legal
-                # byte, so we can omit both tests.  (This should happen only
-                # if the number of ranges is 1.)
-                if ($ranges[$i]->[0] == 0x80 && $ranges[$i]->[1] == 0xBF) {
-                    return 1;
-                }
-            }
-
-            # Here, it isn't the full range of legal continuation bytes.  We
-            # could just assume that there's nothing outside of the legal
-            # bounds.  But inRANGE() allows us to have a single conditional,
-            # so the only cost of making sure it's a legal UTF-8 continuation
-            # byte is an extra subtraction instruction, a trivial expense.
             $ranges[$i] = "inRANGE_helper_(U8, $test, "
                         . $self->val_fmt($ranges[$i]->[0]) .", "
                         . $self->val_fmt($ranges[$i]->[1]) . ")";