Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions doc/pcre2test.1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.TH PCRE2TEST 1 "04 October 2024" "PCRE 10.45"
.TH PCRE2TEST 1 "16 October 2024" "PCRE 10.45"
.SH NAME
pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS
Expand Down Expand Up @@ -511,7 +511,9 @@ means of encoding non-printing characters in a visible way:
.sp
Invoking \eN{U+hh...} or \ex{hh...} doesn't require the use of the \fButf\fP
modifier on the pattern. It is always recognized. There may be any number of
hexadecimal digits inside the braces; invalid values provoke error messages.
hexadecimal digits inside the braces; invalid values provoke error messages.
However, when \eN{U+hh...} is used, some invalid Unicode code points are
accepted with a warning instead of being rejected.
.P
Note that even in UTF-8 mode, \exhh (and, depending on how large the value is, \eddd)
describe one byte rather than one character; this makes it possible to
Expand All @@ -526,7 +528,7 @@ When testing the 16-bit library, not in UTF-16 mode, all 4-digit \ex{hhhh}
values are accepted. This makes it possible to construct invalid UTF-16
sequences for testing purposes.
.P
When testing the 32-bit library, not In UTF-32 mode, all 4 to 8-digit \ex{...}
When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \ex{...}
values are accepted. This makes it possible to construct invalid UTF-32
sequences for testing purposes.
.P
Expand Down Expand Up @@ -2243,6 +2245,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 04 October 2024
Last updated: 16 October 2024
Copyright (c) 1997-2024 University of Cambridge.
.fi
38 changes: 34 additions & 4 deletions perltest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ fi
# aftertext interpreted as "print $' afterwards"
# afteralltext ignored
# dupnames ignored (Perl always allows)
# hex preprocess pattern with embedded octets
# jitstack ignored
# mark show mark information
# no_auto_possess ignored
Expand Down Expand Up @@ -244,9 +245,9 @@ for (;;)

# Split the pattern from the modifiers and adjust them as necessary.

$pattern =~ /^\s*((.).*\2)(.*)$/s;
$pat = $1;
$del = $2;
$pattern =~ /^\s*(.)(.*)\1(.*)$/s;
$del = $1;
$pat = $2;
$mod = "$3,$extra_modifiers";
$mod =~ s/^,\s*//;

Expand Down Expand Up @@ -286,6 +287,34 @@ for (;;)

$mod =~ s/no_auto_possess,?//;

# The "hex" modifier instructs us to preprocess the pattern. With it, the
# text between the delimiters is not a regex but a sequence of items, each
# of which is either
#   - two hexadecimal digits, appended as the single character chr(hex(..)), or
#   - a string in single or double quotes, whose contents are copied verbatim
#     (the string may be empty).
# White space around items is skipped. The assembled string replaces $pat;
# when $utf8 is set it is first run through utf8::decode so that the octets
# are treated as UTF-8 encoded characters.

if ($mod =~ s/hex,?//)
  {
  my $t = "";

  # Find either 2-digit hex octets, optionally surrounded by spaces, to add
  # as code points, or quoted strings that will be copied verbatim.
  #
  # The body of a quoted string is matched lazily up to the first repeat of
  # its opening quote (the \2 backreference). A negated class such as [^\2]
  # must not be used for this: inside a bracketed character class \2 is the
  # octal escape for chr(2), not a backreference, and requiring one or more
  # characters would make an empty string ('' or "") swallow the items that
  # follow it. /s lets a quoted string contain newlines.

  while ($pat =~ /\s*(?:(\p{ahex}{2})|(['"])(.*?)\2)\s*/gs)
    {
    if (defined $1)
      {
      no utf8;
      $t .= chr(hex($1));
      use if $utf8, "utf8";
      }
    else
      {
      $t .= $3;
      }
    }
  no utf8;
  utf8::decode($t) if $utf8;
  use if $utf8, "utf8";
  $pat = $t;
  }

# Use no_start_optimize (disable PCRE2 start-up optimization) to disable Perl
# optimization by inserting (??{""}) at the start of the pattern. We may
# also encounter -no_start_optimize from a #pattern setting.
Expand All @@ -297,7 +326,8 @@ for (;;)
# Add back retained modifiers and check that the pattern is valid.

$mod =~ s/,//g;
$pattern = "$pat$mod";

$pattern = "$del$pat$del$mod";

eval "\$_ =~ ${pattern}";
if ($@)
Expand Down
2 changes: 1 addition & 1 deletion src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -1940,7 +1940,7 @@ else
cc = *ptr++;
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
#if PCRE2_CODE_UNIT_WIDTH == 32
if (c >= 0x20000000l) { overflow = TRUE; break; }
if (c >= 0x20000000u) { overflow = TRUE; break; }
#endif
c = (c << 3) + (cc - CHAR_0);
#if PCRE2_CODE_UNIT_WIDTH == 8
Expand Down
55 changes: 37 additions & 18 deletions src/pcre2test.c
Original file line number Diff line number Diff line change
Expand Up @@ -713,7 +713,7 @@ static modstruct modlist[] = {
{ "global", MOD_PNDP, MOD_CTL, CTL_GLOBAL, PO(control) },
{ "heap_limit", MOD_CTM, MOD_INT, 0, MO(heap_limit) },
{ "heapframes_size", MOD_PND, MOD_CTL, CTL2_HEAPFRAMES_SIZE, PO(control2) },
{ "hex", MOD_PAT, MOD_CTL, CTL_HEXPAT, PO(control) },
{ "hex", MOD_PATP, MOD_CTL, CTL_HEXPAT, PO(control) },
{ "info", MOD_PAT, MOD_CTL, CTL_INFO, PO(control) },
{ "jit", MOD_PAT, MOD_IND, 7, PO(jit) },
{ "jitfast", MOD_PAT, MOD_CTL, CTL_JITFAST, PO(control) },
Expand Down Expand Up @@ -7264,7 +7264,7 @@ while ((c = *p++) != 0)
c = 0;
for (pt++; isdigit(*pt) && *pt < '8'; ++i, pt++)
{
if (c >= 0x20000000l)
if (c >= 0x20000000u)
{
fprintf(outfile, "** \\o{ escape too large\n");
return PR_OK;
Expand Down Expand Up @@ -7397,20 +7397,38 @@ while ((c = *p++) != 0)
"and therefore cannot be encoded as UTF-8\n", c);
return PR_OK;
}
else if (encoding == FORCE_UTF && c > MAX_UTF_CODE_POINT)
fprintf(outfile, "** Warning: character \\N{U+%x} is greater than "
"0x%x and should not be encoded as UTF-8\n",
c, MAX_UTF_CODE_POINT);
q8 += ord2utf8(c, q8);
}
}
#endif
#ifdef SUPPORT_PCRE2_16
if (test_mode == PCRE16_MODE)
{
if (encoding == FORCE_UTF || utf)
/* Unlike the 8-bit code, there are no forced raw suggestions for the
16-bit mode, so assume raw unless utf is preferred */

if (!(encoding == FORCE_UTF || utf))
{
if (c > 0x10ffffu)
if (c > 0xffffu)
{
fprintf(outfile, "** Character \\x{%x} is greater than 0xffff "
"and UTF-16 mode is not enabled.\n", c);
fprintf(outfile, "** Truncation will probably give the wrong "
"result.\n");
}
*q16++ = (uint16_t)c;
}
else
{
if (c > MAX_UTF_CODE_POINT)
{
fprintf(outfile, "** Failed: character \\N{U+%x} is greater than "
"0x10ffff and therefore cannot be encoded as "
"UTF-16\n", c);
"0x%x and therefore cannot be encoded as UTF-16\n",
c, MAX_UTF_CODE_POINT);
return PR_OK;
}
else if (c >= 0x10000u)
Expand All @@ -7419,24 +7437,25 @@ while ((c = *p++) != 0)
*q16++ = 0xD800 | (c >> 10);
*q16++ = 0xDC00 | (c & 0x3ff);
}
else *q16++ = c;
}
else
{
if (c > 0xffffu)
else
{
fprintf(outfile, "** Character \\x{%x} is greater than 0xffff "
"and UTF-16 mode is not enabled.\n", c);
fprintf(outfile, "** Truncation will probably give the wrong "
"result.\n");
if (encoding == FORCE_UTF && 0xe000u > c && c >= 0xd800u)
fprintf(outfile, "** Warning: character \\N{U+%x} is a surrogate "
"and should not be encoded as UTF-16\n", c);
*q16++ = c;
}

*q16++ = (uint16_t)c;
}
}
#endif
#ifdef SUPPORT_PCRE2_32
if (test_mode == PCRE32_MODE) *q32++ = c;
if (test_mode == PCRE32_MODE)
{
if (encoding == FORCE_UTF && c > MAX_UTF_CODE_POINT)
fprintf(outfile, "** Warning: character \\N{U+%x} is greater than "
"0x%x and should not be encoded as UTF-32\n",
c, MAX_UTF_CODE_POINT);
*q32++ = c;
}
#endif
}

Expand Down
3 changes: 3 additions & 0 deletions testdata/testinput1
Original file line number Diff line number Diff line change
Expand Up @@ -6709,4 +6709,7 @@ $/x
\= Expect no match
.a.b.c.

/65 00 64/hex
e\0d

# End of testinput1
10 changes: 8 additions & 2 deletions testdata/testinput11
Original file line number Diff line number Diff line change
Expand Up @@ -356,12 +356,18 @@
# We can use pcre2test's utf8_input modifier to create wide pattern characters,
# even though this test is run when UTF is not supported.

/a\x{d800}b/utf8_input
aí €b
a\x{d800}b
a\o{154000}b
\= Expect warning unless 32bit
a\N{U+d800}b

/a\x{ffff}b/utf8_input
aï¿¿b
a\x{ffff}b
a\o{177777}b
\= Expect no match
a\N{U+ffff}z
a\N{U+ffff}b

/abý¿¿¿¿¿z/utf8_input
abý¿¿¿¿¿z
Expand Down
5 changes: 5 additions & 0 deletions testdata/testinput4
Original file line number Diff line number Diff line change
Expand Up @@ -2908,4 +2908,9 @@
/\p{ ^ L u }/
AbCd

# hex

/c3 b1/hex,utf
\N{U+00F1}

# End of testinput4
1 change: 1 addition & 0 deletions testdata/testinput9
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
a\443b

/fd bf bf bf bf bf/I,hex
\= Expect warning
\N{U+7fffffff}
\= Expect no match # error message (too big char)
\x{7fffffff}
Expand Down
4 changes: 4 additions & 0 deletions testdata/testoutput1
Original file line number Diff line number Diff line change
Expand Up @@ -10580,4 +10580,8 @@ No match
.a.b.c.
No match

/65 00 64/hex
e\0d
0: e\x00d

# End of testinput1
17 changes: 14 additions & 3 deletions testdata/testoutput11-16
Original file line number Diff line number Diff line change
Expand Up @@ -646,16 +646,27 @@ Subject length lower bound = 1
# We can use pcre2test's utf8_input modifier to create wide pattern characters,
# even though this test is run when UTF is not supported.

/a\x{d800}b/utf8_input
aí €b
0: a\x{d800}b
a\x{d800}b
0: a\x{d800}b
a\o{154000}b
0: a\x{d800}b
\= Expect warning unless 32bit
a\N{U+d800}b
** Warning: character \N{U+d800} is a surrogate and should not be encoded as UTF-16
0: a\x{d800}b

/a\x{ffff}b/utf8_input
aï¿¿b
0: a\x{ffff}b
a\x{ffff}b
0: a\x{ffff}b
a\o{177777}b
0: a\x{ffff}b
\= Expect no match
a\N{U+ffff}z
No match
a\N{U+ffff}b
0: a\x{ffff}b

/abý¿¿¿¿¿z/utf8_input
** Failed: character value greater than 0xffff cannot be converted to 16-bit in non-UTF mode
Expand Down
17 changes: 14 additions & 3 deletions testdata/testoutput11-32
Original file line number Diff line number Diff line change
Expand Up @@ -649,16 +649,26 @@ Subject length lower bound = 1
# We can use pcre2test's utf8_input modifier to create wide pattern characters,
# even though this test is run when UTF is not supported.

/a\x{d800}b/utf8_input
aí €b
0: a\x{d800}b
a\x{d800}b
0: a\x{d800}b
a\o{154000}b
0: a\x{d800}b
\= Expect warning unless 32bit
a\N{U+d800}b
0: a\x{d800}b

/a\x{ffff}b/utf8_input
aï¿¿b
0: a\x{ffff}b
a\x{ffff}b
0: a\x{ffff}b
a\o{177777}b
0: a\x{ffff}b
\= Expect no match
a\N{U+ffff}z
No match
a\N{U+ffff}b
0: a\x{ffff}b

/abý¿¿¿¿¿z/utf8_input
abý¿¿¿¿¿z
Expand All @@ -668,6 +678,7 @@ No match
ab\o{17777777777}z
0: ab\x{7fffffff}z
ab\N{U+7fffffff}z
** Warning: character \N{U+7fffffff} is greater than 0x10ffff and should not be encoded as UTF-32
0: ab\x{7fffffff}z

/abÿý¿¿¿¿¿z/utf8_input
Expand Down
6 changes: 6 additions & 0 deletions testdata/testoutput4
Original file line number Diff line number Diff line change
Expand Up @@ -4656,4 +4656,10 @@ No match
AbCd
0: b

# hex

/c3 b1/hex,utf
\N{U+00F1}
0: \x{f1}

# End of testinput4
2 changes: 2 additions & 0 deletions testdata/testoutput9
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ Capture group count = 0
First code unit = \xfd
Last code unit = \xbf
Subject length lower bound = 6
\= Expect warning
\N{U+7fffffff}
** Warning: character \N{U+7fffffff} is greater than 0x10ffff and should not be encoded as UTF-8
0: \xfd\xbf\xbf\xbf\xbf\xbf
\= Expect no match # error message (too big char)
\x{7fffffff}
Expand Down
Loading