PCRE2Project · PhilipHazel · Oct 16, 2024 · Oct 15, 2024
diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
@@ -76,8 +76,8 @@ possible to include binary zeros.
 .sp
 When testing the 16-bit or 32-bit libraries, there is a need to be able to
 generate character code points greater than 255 in the strings that are passed
-to the library. For subject lines, backslash escapes can be used. In addition,
-when the \fButf\fP modifier (see
+to the library. For subject lines and some patterns, backslash escapes can be
+used. In addition, when the \fButf\fP modifier (see
 .\" HTML <a href="#optionmodifiers">
 .\" </a>
 "Setting compilation options"
@@ -97,9 +97,8 @@ UTF-8 (in its original definition) is not capable of encoding values greater
 than 0x7fffffff, but such values can be handled by the 32-bit library. When
 testing this library in non-UTF mode with \fButf8_input\fP set, if any
 character is preceded by the byte 0xff (which is an invalid byte in UTF-8)
-0x80000000 is added to the character's value. This is the only way of passing
-such code points in a pattern string. For subject strings, using an escape
-sequence is preferable.
+0x80000000 is added to the character's value. For subject strings, using an
+escape sequence is preferable.
 .
 .
 .SH "COMMAND LINE OPTIONS"
@@ -493,36 +492,43 @@ space is removed, and the line is scanned for backslash escapes, unless the
 \fBsubject_literal\fP modifier was set for the pattern. The following provide a
 means of encoding non-printing characters in a visible way:
 .sp
-  \ea         alarm (BEL, \ex07)
-  \eb         backspace (\ex08)
-  \ee         escape (\ex27)
-  \ef         form feed (\ex0c)
-  \en         newline (\ex0a)
-  \er         carriage return (\ex0d)
-  \et         tab (\ex09)
-  \ev         vertical tab (\ex0b)
-  \ennn       octal character (up to 3 octal digits); always
-               a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode
-  \eo{dd...}  octal character (any number of octal digits}
-  \exhh       hexadecimal byte (up to 2 hex digits)
-  \ex{hh...}  hexadecimal character (any number of hex digits)
-.sp
-The use of \ex{hh...} is not dependent on the use of the \fButf\fP modifier on
-the pattern. It is recognized always. There may be any number of hexadecimal
-digits inside the braces; invalid values provoke error messages.
-.P
-Note that \exhh specifies one byte rather than one character in UTF-8 mode;
-this makes it possible to construct invalid UTF-8 sequences for testing
-purposes. On the other hand, \ex{hh} is interpreted as a UTF-8 character in
-UTF-8 mode, generating more than one byte if the value is greater than 127.
-When testing the 8-bit library not in UTF-8 mode, \ex{hh} generates one byte
-for values that could fit on it, and causes an error for greater values.
-.P
-In UTF-16 mode, all 4-digit \ex{hhhh} values are accepted. This makes it
-possible to construct invalid UTF-16 sequences for testing purposes.
-.P
-In UTF-32 mode, all 4- to 8-digit \ex{...} values are accepted. This makes it
-possible to construct invalid UTF-32 sequences for testing purposes.
+  \ea          alarm (BEL, \ex07)
+  \eb          backspace (\ex08)
+  \ee          escape (\ex1b)
+  \ef          form feed (\ex0c)
+  \en          newline (\ex0a)
+  \eN{U+hh...} Unicode character (any number of hex digits)
+  \er          carriage return (\ex0d)
+  \et          tab (\ex09)
+  \ev          vertical tab (\ex0b)
+  \eddd        octal number (up to 3 octal digits); represents a single
+                code point unless larger than 255 with the 8-bit library
+  \eo{dd...}   octal number (any number of octal digits) representing a
+                character in UTF mode or a code point
+  \exhh        hexadecimal byte (up to 2 hex digits)
+  \ex{hh...}   hexadecimal number (up to 8 hex digits) representing a
+                character in UTF mode or a code point
+.sp
+Invoking \eN{U+hh...} or \ex{hh...} doesn't require the use of the \fButf\fP
+modifier on the pattern. It is always recognized. There may be any number of
+hexadecimal digits inside the braces; invalid values provoke error messages.
+.P
+Note that even in UTF-8 mode, \exhh (and depending on how large, \eddd)
+describe one byte rather than one character; this makes it possible to
+construct invalid UTF-8 sequences for testing purposes. On the other hand,
+\ex{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only generating
+more than one byte if the value is greater than 127. To avoid the ambiguity
+it is preferred to use \eN{U+hh...} when describing characters. When testing
+the 8-bit library not in UTF-8 mode, \ex{hh} generates one byte for values
+that could fit in it, and causes an error for greater values.
+.P
+When testing the 16-bit library, not in UTF-16 mode, all 4-digit \ex{hhhh}
+values are accepted. This makes it possible to construct invalid UTF-16
+sequences for testing purposes.
+.P
+When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \ex{...}
+values are accepted. This makes it possible to construct invalid UTF-32
+sequences for testing purposes.
 .P
 There is a special backslash sequence that specifies replication of one or more
 characters:

diff --git a/perltest.sh b/perltest.sh
@@ -32,8 +32,9 @@
 # Handle the shell script arguments.
 
 perl=perl
-perlarg=''
+perlarg=""
 prefix=''
+spc=""
 
 if [ $# -gt 0 -a "$1" = "-perl" ] ; then
   if [ $# -lt 2 ] ; then
@@ -47,11 +48,14 @@ fi
 
 if [ $# -gt 0 -a "$1" = "-w" ] ; then
   perlarg="-w"
+  spc=" "
   shift
 fi
 
 if [ $# -gt 0 -a "$1" = "-utf8" ] ; then
   prefix="use utf8; require Encode;"
+  perlarg="$perlarg$spc-CSD"
+
   shift
 fi
 

diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
@@ -1523,17 +1523,15 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
 
       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
         {
-#ifdef EBCDIC
-        *errorcodeptr = ERR93;
-#else
+#ifndef EBCDIC
         if (utf)
           {
           ptr = p + 2;
           escape = 0;   /* Not a fancy escape after all */
           goto COME_FROM_NU;
           }
-        else *errorcodeptr = ERR93;
 #endif
+        *errorcodeptr = ERR93;
         }
 
       /* Give an error in contexts where quantifiers are not allowed

diff --git a/src/pcre2test.c b/src/pcre2test.c
@@ -963,6 +963,13 @@ static coptstruct coptlist[] = {
 #undef SUPPORT_32
 #undef SUPPORT_EBCDIC
 
+/* Types for the parser, to be used in process_data() */
+
+enum force_encoding {
+  FORCE_NONE,         /* No preference, follow utf modifier */
+  FORCE_RAW,          /* Encode as a code point or error if too wide */
+  FORCE_UTF           /* Encode as a character or error if too wide */
+};
 
 /* ----------------------- Static variables ------------------------ */
 
@@ -7134,8 +7141,9 @@ in 16- and 32-bit modes, it can be forced to UTF-8 by the utf8_input modifier.
 
 while ((c = *p++) != 0)
   {
-  int32_t i = 0;
+  int i = 0;
   size_t replen;
+  enum force_encoding encoding = FORCE_NONE;
 
   /* ] may mark the end of a replicated sequence */
 
@@ -7157,6 +7165,7 @@ while ((c = *p++) != 0)
       fprintf(outfile, "** Repeat count too large\n");
       return PR_OK;
       }
+    i = (int)li;
 
     p = (uint8_t *)endptr;
     if (*p++ != '}')
@@ -7165,7 +7174,6 @@ while ((c = *p++) != 0)
       return PR_OK;
       }
 
-    i = (int32_t)li;
     if (i-- <= 0)
       {
       fprintf(outfile, "** Zero or negative repeat not allowed\n");
@@ -7243,24 +7251,32 @@ while ((c = *p++) != 0)
     case '0': case '1': case '2': case '3':
     case '4': case '5': case '6': case '7':
     c -= '0';
-    while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
+    while (i++ < 2 && isdigit(*p) && *p < '8')
       c = c * 8 + (*p++ - '0');
+
+    encoding = (utf && c > 255)? FORCE_UTF : FORCE_RAW;
     break;
 
     case 'o':
     if (*p == '{')
       {
       uint8_t *pt = p;
       c = 0;
-      for (pt++; isdigit(*pt) && *pt != '8' && *pt != '9'; pt++)
+      for (pt++; isdigit(*pt) && *pt < '8'; ++i, pt++)
         {
-        if (++i == 12)
-          fprintf(outfile, "** Too many octal digits in \\o{...} item; "
-                           "using only the first twelve.\n");
+        if (c >= 0x20000000l)
+          {
+          fprintf(outfile, "** \\o{ escape too large\n");
+          return PR_OK;
+          }
         else c = c * 8 + (*pt - '0');
         }
-      if (*pt == '}') p = pt + 1;
-        else fprintf(outfile, "** Missing } after \\o{ (assumed)\n");
+      if (i == 0 || *pt != '}')
+        {
+        fprintf(outfile, "** Malformed \\o{ escape\n");
+        return PR_OK;
+        }
+      else p = pt + 1;
       }
     break;
 
@@ -7306,15 +7322,31 @@ while ((c = *p++) != 0)
         p++;
         }
 #if defined SUPPORT_PCRE2_8
-      if (utf && (test_mode == PCRE8_MODE))
-        {
-        *q8++ = c;
-        continue;
-        }
+      if (utf && (test_mode == PCRE8_MODE)) encoding = FORCE_RAW;
 #endif
       }
     break;
 
+    case 'N':
+    if (memcmp(p, "{U+", 3) == 0 && isxdigit(p[3]))
+      {
+      char *endptr;
+      unsigned long uli;
+
+      p += 3;
+      errno = 0;
+      uli = strtoul((const char *)p, &endptr, 16);
+      if (errno == 0 && *endptr == '}' && uli <= UINT32_MAX)
+        {
+        c = (uint32_t)uli;
+        p = (uint8_t *)endptr + 1;
+        encoding = FORCE_UTF;
+        break;
+        }
+      }
+    fprintf(outfile, "** Malformed \\N{U+ escape\n");
+    return PR_OK;
+
     case 0:     /* \ followed by EOF allows for an empty line */
     p--;
     continue;
@@ -7340,24 +7372,13 @@ while ((c = *p++) != 0)
     }
 
   /* We now have a character value in c that may be greater than 255.
-  In 8-bit mode we convert to UTF-8 if we are in UTF mode. Values greater
-  than 127 in UTF mode must have come from \x{...} or octal constructs
-  because values from \x.. get this far only in non-UTF mode. */
+  Depending on how we got it, the encoding enum could be set to tell
+  us how to encode it, otherwise follow the utf modifier. */
 
 #ifdef SUPPORT_PCRE2_8
   if (test_mode == PCRE8_MODE)
     {
-    if (utf)
-      {
-      if (c > 0x7fffffff)
-        {
-        fprintf(outfile, "** Character \\x{%x} is greater than 0x7fffffff "
-          "and so cannot be converted to UTF-8\n", c);
-        return PR_OK;
-        }
-      q8 += ord2utf8(c, q8);
-      }
-    else
+    if (encoding == FORCE_RAW || !(utf || encoding == FORCE_UTF))
       {
       if (c > 0xffu)
         {
@@ -7368,27 +7389,37 @@ while ((c = *p++) != 0)
         }
       *q8++ = (uint8_t)c;
       }
+    else
+      {
+      if (c > 0x7fffffff)
+        {
+        fprintf(outfile, "** Character \\N{U+%x} is greater than 0x7fffffff "
+                         "and therefore cannot be encoded as UTF-8\n", c);
+        return PR_OK;
+        }
+      q8 += ord2utf8(c, q8);
+      }
     }
 #endif
 #ifdef SUPPORT_PCRE2_16
   if (test_mode == PCRE16_MODE)
     {
-    if (utf)
+    if (encoding == FORCE_UTF || utf)
       {
       if (c > 0x10ffffu)
         {
-        fprintf(outfile, "** Failed: character \\x{%x} is greater than "
-          "0x10ffff and so cannot be converted to UTF-16\n", c);
+        fprintf(outfile, "** Failed: character \\N{U+%x} is greater than "
+                         "0x10ffff and therefore cannot be encoded as "
+                         "UTF-16\n", c);
         return PR_OK;
         }
       else if (c >= 0x10000u)
         {
-        c-= 0x10000u;
+        c -= 0x10000u;
         *q16++ = 0xD800 | (c >> 10);
         *q16++ = 0xDC00 | (c & 0x3ff);
         }
-      else
-        *q16++ = c;
+      else *q16++ = c;
       }
     else
       {

diff --git a/testdata/testinput11 b/testdata/testinput11
@@ -356,9 +356,18 @@
 # We can use pcre2test's utf8_input modifier to create wide pattern characters,
 # even though this test is run when UTF is not supported.
 
+/a\x{ffff}b/utf8_input
+    aï¿¿b
+    a\x{ffff}b
+    a\o{177777}b
+\= Expect no match
+    a\N{U+ffff}z
+
 /abý¿¿¿¿¿z/utf8_input
     abý¿¿¿¿¿z
     ab\x{7fffffff}z
+    ab\o{17777777777}z
+    ab\N{U+7fffffff}z
 
 /abÿý¿¿¿¿¿z/utf8_input
     abÿý¿¿¿¿¿z
@@ -367,6 +376,15 @@
 /abÿAz/utf8_input
     abÿAz
     ab\x{80000041}z 
+\= Expect no match
+    abAz
+    aAz
+    ab\377Az
+    ab\xff\N{U+0041}z
+    ab\N{U+ff}\N{U+41}z
+
+/ab\x{80000041}z/
+    ab\x{80000041}z
 
 /(?i:A{1,}\6666666666)/
     A\x{1b6}6666666

diff --git a/testdata/testinput4 b/testdata/testinput4
@@ -2335,6 +2335,9 @@
 /[\N{U+1234}]/utf
     \x{1234}
 
+/(\x{1234}) \1/utf
+    \N{U+1234} \o{11064}
+
 # Test the full list of Unicode "Pattern White Space" characters that are to
 # be ignored by /x. The pattern lines below may show up oddly in text editors
 # or when listed to the screen. Note that characters such as U+2002, which are