PCRE2Project · PhilipHazel · Oct 17, 2024 · Oct 17, 2024 · Oct 16, 2024
diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
@@ -1,4 +1,4 @@
-.TH PCRE2TEST 1 "04 October 2024" "PCRE 10.45"
+.TH PCRE2TEST 1 "16 October 2024" "PCRE 10.45"
 .SH NAME
 pcre2test - a program for testing Perl-compatible regular expressions.
 .SH SYNOPSIS
@@ -511,7 +511,9 @@ means of encoding non-printing characters in a visible way:
 .sp
 Invoking \eN{U+hh...} or \ex{hh...} doesn't require the use of the \fButf\fP
 modifier on the pattern. It is always recognized. There may be any number of
-hexadecimal digits inside the braces; invalid values provoke error messages.
+hexadecimal digits inside the braces; invalid values provoke error messages,
+but when \eN{U+hh...} is used with certain invalid Unicode code points, they
+are accepted with a warning instead.
 .P
 Note that even in UTF-8 mode, \exhh (and depending of how large, \eddd)
 describe one byte rather than one character; this makes it possible to
@@ -526,7 +528,7 @@ When testing te 16-bit library, not in UTF-16 mode, all 4-digit \ex{hhhh}
 values are accepted. This makes it possible to construct invalid UTF-16
 sequences for testing purposes.
 .P
-When testing the 32-bit library, not In UTF-32 mode, all 4 to 8-digit \ex{...}
+When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \ex{...}
 values are accepted. This makes it possible to construct invalid UTF-32
 sequences for testing purposes.
 .P
@@ -2243,6 +2245,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 04 October 2024
+Last updated: 16 October 2024
 Copyright (c) 1997-2024 University of Cambridge.
 .fi
diff --git a/perltest.sh b/perltest.sh
@@ -85,6 +85,7 @@ fi
 #   aftertext          interpreted as "print $' afterwards"
 #   afteralltext       ignored
 #   dupnames           ignored (Perl always allows)
+#   hex                preprocess pattern with embedded octets
 #   jitstack           ignored
 #   mark               show mark information
 #   no_auto_possess    ignored
@@ -244,9 +245,9 @@ for (;;)
 
   # Split the pattern from the modifiers and adjust them as necessary.
 
-  $pattern =~ /^\s*((.).*\2)(.*)$/s;
-  $pat = $1;
-  $del = $2;
+  $pattern =~ /^\s*(.)(.*)\1(.*)$/s;
+  $del = $1;
+  $pat = $2;
   $mod = "$3,$extra_modifiers";
   $mod =~ s/^,\s*//;
 
@@ -286,6 +287,34 @@ for (;;)
 
   $mod =~ s/no_auto_possess,?//;
 
+  # The "hex" modifier instructs us to preprocess the pattern
+
+  if ($mod =~ s/hex,?//)
+    {
+    my $t = "";
+
+    # Find either 2-digit hex octets (optionally surrounded by spaces), which
+    # are added as code points, or quoted strings, which are copied verbatim.
+
+    while ($pat =~ /\s*(?:(\p{ahex}{2})|(['"])([^\2]+?)\2)\s*/g)
+      {
+      if (defined $1)
+        {
+        no utf8;
+        $t .= chr(hex($1));
+        use if $utf8, "utf8";
+        }
+      else
+        {
+        $t .= $3;
+        }
+      }
+    no utf8;
+    utf8::decode($t) if $utf8;
+    use if $utf8, "utf8";
+    $pat = $t;
+    }
+
   # Use no_start_optimize (disable PCRE2 start-up optimization) to disable Perl
   # optimization by inserting (??{""}) at the start of the pattern. We may
   # also encounter -no_start_optimize from a #pattern setting.
@@ -297,7 +326,8 @@ for (;;)
   # Add back retained modifiers and check that the pattern is valid.
 
   $mod =~ s/,//g;
-  $pattern = "$pat$mod";
+
+  $pattern = "$del$pat$del$mod";
 
   eval "\$_ =~ ${pattern}";
   if ($@)

diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
@@ -1940,7 +1940,7 @@ else
       cc = *ptr++;
       if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
 #if PCRE2_CODE_UNIT_WIDTH == 32
-      if (c >= 0x20000000l) { overflow = TRUE; break; }
+      if (c >= 0x20000000u) { overflow = TRUE; break; }
 #endif
       c = (c << 3) + (cc - CHAR_0);
 #if PCRE2_CODE_UNIT_WIDTH == 8

diff --git a/src/pcre2test.c b/src/pcre2test.c
@@ -713,7 +713,7 @@ static modstruct modlist[] = {
   { "global",                      MOD_PNDP, MOD_CTL, CTL_GLOBAL,                 PO(control) },
   { "heap_limit",                  MOD_CTM,  MOD_INT, 0,                          MO(heap_limit) },
   { "heapframes_size",             MOD_PND,  MOD_CTL, CTL2_HEAPFRAMES_SIZE,       PO(control2) },
-  { "hex",                         MOD_PAT,  MOD_CTL, CTL_HEXPAT,                 PO(control) },
+  { "hex",                         MOD_PATP, MOD_CTL, CTL_HEXPAT,                 PO(control) },
   { "info",                        MOD_PAT,  MOD_CTL, CTL_INFO,                   PO(control) },
   { "jit",                         MOD_PAT,  MOD_IND, 7,                          PO(jit) },
   { "jitfast",                     MOD_PAT,  MOD_CTL, CTL_JITFAST,                PO(control) },
@@ -7264,7 +7264,7 @@ while ((c = *p++) != 0)
       c = 0;
       for (pt++; isdigit(*pt) && *pt < '8'; ++i, pt++)
         {
-        if (c >= 0x20000000l)
+        if (c >= 0x20000000u)
           {
           fprintf(outfile, "** \\o{ escape too large\n");
           return PR_OK;
@@ -7397,20 +7397,38 @@ while ((c = *p++) != 0)
                          "and therefore cannot be encoded as UTF-8\n", c);
         return PR_OK;
         }
+      else if (encoding == FORCE_UTF && c > MAX_UTF_CODE_POINT)
+        fprintf(outfile, "** Warning: character \\N{U+%x} is greater than "
+                         "0x%x and should not be encoded as UTF-8\n",
+                         c, MAX_UTF_CODE_POINT);
       q8 += ord2utf8(c, q8);
       }
     }
 #endif
 #ifdef SUPPORT_PCRE2_16
   if (test_mode == PCRE16_MODE)
     {
-    if (encoding == FORCE_UTF || utf)
+    /* Unlike the 8-bit code, there is no forced-raw case to handle in 16-bit
+    mode, so treat the value as raw unless UTF is forced or enabled. */
+
+    if (!(encoding == FORCE_UTF || utf))
       {
-      if (c > 0x10ffffu)
+      if (c > 0xffffu)
+        {
+        fprintf(outfile, "** Character \\x{%x} is greater than 0xffff "
+          "and UTF-16 mode is not enabled.\n", c);
+        fprintf(outfile, "** Truncation will probably give the wrong "
+          "result.\n");
+        }
+      *q16++ = (uint16_t)c;
+      }
+    else
+      {
+      if (c > MAX_UTF_CODE_POINT)
         {
         fprintf(outfile, "** Failed: character \\N{U+%x} is greater than "
-                         "0x10ffff and therefore cannot be encoded as "
-                         "UTF-16\n", c);
+                         "0x%x and therefore cannot be encoded as UTF-16\n",
+                c, MAX_UTF_CODE_POINT);
         return PR_OK;
         }
       else if (c >= 0x10000u)
@@ -7419,24 +7437,25 @@ while ((c = *p++) != 0)
         *q16++ = 0xD800 | (c >> 10);
         *q16++ = 0xDC00 | (c & 0x3ff);
         }
-      else *q16++ = c;
-      }
-    else
-      {
-      if (c > 0xffffu)
+      else
         {
-        fprintf(outfile, "** Character \\x{%x} is greater than 0xffff "
-          "and UTF-16 mode is not enabled.\n", c);
-        fprintf(outfile, "** Truncation will probably give the wrong "
-          "result.\n");
+        if (encoding == FORCE_UTF && 0xe000u > c && c >= 0xd800u)
+          fprintf(outfile, "** Warning: character \\N{U+%x} is a surrogate "
+                           "and should not be encoded as UTF-16\n", c);
+        *q16++ = c;
         }
-
-      *q16++ = (uint16_t)c;
       }
     }
 #endif
 #ifdef SUPPORT_PCRE2_32
-  if (test_mode == PCRE32_MODE) *q32++ = c;
+  if (test_mode == PCRE32_MODE)
+    {
+    if (encoding == FORCE_UTF && c > MAX_UTF_CODE_POINT)
+      fprintf(outfile, "** Warning: character \\N{U+%x} is greater than "
+                       "0x%x and should not be encoded as UTF-32\n",
+                       c, MAX_UTF_CODE_POINT);
+    *q32++ = c;
+    }
 #endif
   }
 

diff --git a/testdata/testinput1 b/testdata/testinput1
@@ -6709,4 +6709,7 @@ $/x
 \= Expect no match
     .a.b.c.
 
+/65 00 64/hex
+    e\0d
+
 # End of testinput1 
diff --git a/testdata/testinput11 b/testdata/testinput11
@@ -356,12 +356,18 @@
 # We can use pcre2test's utf8_input modifier to create wide pattern characters,
 # even though this test is run when UTF is not supported.
 
+/a\x{d800}b/utf8_input
+    aí b
+    a\x{d800}b
+    a\o{154000}b
+\= Expect warning unless 32bit
+    a\N{U+d800}b
+
 /a\x{ffff}b/utf8_input
     aï¿¿b
     a\x{ffff}b
     a\o{177777}b
-\= Expect no match
-    a\N{U+ffff}z
+    a\N{U+ffff}b
 
 /abý¿¿¿¿¿z/utf8_input
     abý¿¿¿¿¿z

diff --git a/testdata/testinput4 b/testdata/testinput4
@@ -2908,4 +2908,9 @@
 /\p{  ^ L u }/
     AbCd
 
+# hex
+
+/c3 b1/hex,utf
+    \N{U+00F1}
+
 # End of testinput4
diff --git a/testdata/testinput9 b/testdata/testinput9
@@ -12,6 +12,7 @@
     a\443b
 
 /fd bf bf bf bf bf/I,hex
+\= Expect warning
     \N{U+7fffffff}
 \= Expect no match # error message (too big char)
     \x{7fffffff}

diff --git a/testdata/testoutput1 b/testdata/testoutput1
@@ -10580,4 +10580,8 @@ No match
     .a.b.c.
 No match
 
+/65 00 64/hex
+    e\0d
+ 0: e\x00d
+
 # End of testinput1 
diff --git a/testdata/testoutput11-16 b/testdata/testoutput11-16
@@ -646,16 +646,27 @@ Subject length lower bound = 1
 # We can use pcre2test's utf8_input modifier to create wide pattern characters,
 # even though this test is run when UTF is not supported.
 
+/a\x{d800}b/utf8_input
+    aí b
+ 0: a\x{d800}b
+    a\x{d800}b
+ 0: a\x{d800}b
+    a\o{154000}b
+ 0: a\x{d800}b
+\= Expect warning unless 32bit
+    a\N{U+d800}b
+** Warning: character \N{U+d800} is a surrogate and should not be encoded as UTF-16
+ 0: a\x{d800}b
+
 /a\x{ffff}b/utf8_input
     aï¿¿b
  0: a\x{ffff}b
     a\x{ffff}b
  0: a\x{ffff}b
     a\o{177777}b
  0: a\x{ffff}b
-\= Expect no match
-    a\N{U+ffff}z
-No match
+    a\N{U+ffff}b
+ 0: a\x{ffff}b
 
 /abý¿¿¿¿¿z/utf8_input
 ** Failed: character value greater than 0xffff cannot be converted to 16-bit in non-UTF mode

diff --git a/testdata/testoutput11-32 b/testdata/testoutput11-32
@@ -649,16 +649,26 @@ Subject length lower bound = 1
 # We can use pcre2test's utf8_input modifier to create wide pattern characters,
 # even though this test is run when UTF is not supported.
 
+/a\x{d800}b/utf8_input
+    aí b
+ 0: a\x{d800}b
+    a\x{d800}b
+ 0: a\x{d800}b
+    a\o{154000}b
+ 0: a\x{d800}b
+\= Expect warning unless 32bit
+    a\N{U+d800}b
+ 0: a\x{d800}b
+
 /a\x{ffff}b/utf8_input
     aï¿¿b
  0: a\x{ffff}b
     a\x{ffff}b
  0: a\x{ffff}b
     a\o{177777}b
  0: a\x{ffff}b
-\= Expect no match
-    a\N{U+ffff}z
-No match
+    a\N{U+ffff}b
+ 0: a\x{ffff}b
 
 /abý¿¿¿¿¿z/utf8_input
     abý¿¿¿¿¿z
@@ -668,6 +678,7 @@ No match
     ab\o{17777777777}z
  0: ab\x{7fffffff}z
     ab\N{U+7fffffff}z
+** Warning: character \N{U+7fffffff} is greater than 0x10ffff and should not be encoded as UTF-32
  0: ab\x{7fffffff}z
 
 /abÿý¿¿¿¿¿z/utf8_input

diff --git a/testdata/testoutput4 b/testdata/testoutput4
@@ -4656,4 +4656,10 @@ No match
     AbCd
  0: b
 
+# hex
+
+/c3 b1/hex,utf
+    \N{U+00F1}
+ 0: \x{f1}
+
 # End of testinput4
diff --git a/testdata/testoutput9 b/testdata/testoutput9
@@ -26,7 +26,9 @@ Capture group count = 0
 First code unit = \xfd
 Last code unit = \xbf
 Subject length lower bound = 6
+\= Expect warning
     \N{U+7fffffff}
+** Warning: character \N{U+7fffffff} is greater than 0x10ffff and should not be encoded as UTF-8
  0: \xfd\xbf\xbf\xbf\xbf\xbf
 \= Expect no match # error message (too big char)
     \x{7fffffff}