Skip to content

Commit

Permalink
Merge e0d5a14 into 1f6b8a9
Browse files Browse the repository at this point in the history
  • Loading branch information
nwc10 committed May 15, 2021
2 parents 1f6b8a9 + e0d5a14 commit 4dd5879
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 28 deletions.
32 changes: 30 additions & 2 deletions dist/Data-Dumper/Changes
Expand Up @@ -6,11 +6,39 @@ Changes - public release history for Data::Dumper

=over 8

=item 2.173
=item 2.179_50 (May 14 2021)

Data::Dumper handles Unicode regex corner cases (GH #18614, GH #18764)

=item 2.179 (May 13 2021)

Revert the changes of 2.177 for the v5.34.0 release to avoid a regression.

=item 2.178 (Apr 7 2021)

Correct documentation of indent Style 2.

=item 2.177 (Mar 3 2021)

Make Data::Dumper mark regex output as UTF-8 if needed. (GH #18614)

=item 2.176 (Sep 30 2020)

Make Data::Dumper strict and warnings compliant.

=item 2.175 (Aug 13 2020)

Avoid some leaks if we call get magic and that throws an exception.

=item 2.174 (Apr 3 2019)

Avoid leaking if we croak due to excessive recursion.

=item 2.173 (Nov 10 2018)

perl #133624: Reinstate support for 5.8.8 and older.

=item 2.172
=item 2.172 (Sep 19 2018)

Prepare recent changes for CPAN release

Expand Down
4 changes: 2 additions & 2 deletions dist/Data-Dumper/Dumper.pm
Expand Up @@ -29,7 +29,7 @@ our ( $Indent, $Trailingcomma, $Purity, $Pad, $Varname, $Useqq, $Terse, $Freezer
our ( @ISA, @EXPORT, @EXPORT_OK, $VERSION );

BEGIN {
$VERSION = '2.178'; # Don't forget to set version and release
$VERSION = '2.179_50'; # Don't forget to set version and release
# date in POD below!

@ISA = qw(Exporter);
Expand Down Expand Up @@ -1476,7 +1476,7 @@ modify it under the same terms as Perl itself.
=head1 VERSION
Version 2.178
Version 2.179_50
=head1 SEE ALSO
Expand Down
87 changes: 69 additions & 18 deletions dist/Data-Dumper/Dumper.xs
Expand Up @@ -2,9 +2,14 @@
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
/* FIXME - we should go through the code and validate what we can remove.
Looks like we could elimiate much of our custom utf8_to_uvchr_buf games in
favour of ppport.h, and likewise if we replace my_sprintf with my_snprintf
some more complexity dies. */
#ifdef USE_PPPORT_H
# define NEED_my_snprintf
# define NEED_sv_2pv_flags
# define NEED_utf8_to_uvchr_buf
# include "ppport.h"
#endif

Expand Down Expand Up @@ -617,9 +622,10 @@ dump_regexp(pTHX_ SV *retval, SV *val)
SV *sv_pattern = NULL;
SV *sv_flags = NULL;
const char *rval;
const char *rend;
const char *slash;
const U8 *rend;
U8 *p;
CV *re_pattern_cv = get_cv("re::regexp_pattern", 0);
int do_utf8;

if (!re_pattern_cv) {
sv_pattern = val;
Expand Down Expand Up @@ -651,29 +657,74 @@ dump_regexp(pTHX_ SV *retval, SV *val)

assert(sv_pattern);

if (SvUTF8(sv_pattern)) {
sv_utf8_upgrade(retval);
}
sv_catpvs(retval, "qr/");

/* The strategy here is from commit 7894fbab1e479c2c (in June 1999) with a
* bug fix in Feb 2012 (commit de5ef703c7d8db65).
* We need to ensure that / is escaped as \/
* To be efficient, we want to avoid copying byte-for-byte, so we scan the
* string looking for "things we need to escape", and each time we find
* something, we copy over the verbatim section, before writing out the
* escaped part. At the end, if there's some verbatim section left, we copy
* that over to finish.
* The complication (perl #58608) is that we must not convert \/ to \\/
* (as that would be a syntax error), so we need to walk the string looking
* for either
* \ and the character immediately after (together)
* a character
* and only for the latter, do we need to escape /
*
* Of course, to add to the fun, we also need to escape Unicode characters
* to \x{...} notation (whether they are "escaped" by \ or stand alone).
* We can do all this in one pass if we are careful...
*/

rval = SvPV(sv_pattern, rlen);
rend = rval+rlen;
slash = rval;
sv_catpvs(retval, "qr/");
p = (U8 *)rval;
rend = p + rlen;
do_utf8 = DO_UTF8(sv_pattern);

while (p < rend) {
UV k = *p;
int saw_backslash = k == '\\';

if (saw_backslash) {
if (++p == rend) {
/* Oh my, \ at the end. Is this possible? */
break;
}
/* Otherwise we look at the next octet */
k = *p;
}

for ( ; slash < rend; slash++) {
if (*slash == '\\') {
++slash;
continue;
if ((k == '/' && !saw_backslash) || (do_utf8 && ! isASCII(k) && k > ' ')) {
STRLEN to_copy = p - (U8 *) rval;
if (to_copy) {
/* If saw_backslash is true, this will copy the \ for us too. */
sv_catpvn(retval, rval, to_copy);
}
if (k == '/') {
sv_catpvs(retval, "\\/");
++p;
}
else {
/* If there was a \, we have copied it already, so all that is
* left to do here is the \x{...} escaping. */
k = utf8_to_uvchr_buf(p, rend, NULL);
sv_catpvf(retval, "\\x{%" UVxf "}", k);
p += UTF8SKIP(p);
}
rval = (const char *) p;
}
if (*slash == '/') {
sv_catpvn(retval, rval, slash-rval);
sv_catpvs(retval, "\\/");
rlen -= slash-rval+1;
rval = slash+1;
else {
++p;
}
}

sv_catpvn(retval, rval, rlen);
rlen = rend - (U8 *) rval;
if (rlen) {
sv_catpvn(retval, rval, rlen);
}
sv_catpvs(retval, "/");

if (sv_flags)
Expand Down
81 changes: 76 additions & 5 deletions dist/Data-Dumper/t/dumper.t
Expand Up @@ -139,7 +139,7 @@ sub SKIP_TEST {
++$TNUM; print "ok $TNUM # skip $reason\n";
}

$TMAX = 474;
$TMAX = 486;

# Force Data::Dumper::Dump to use perl. We test Dumpxs explicitly by calling
# it direct. Out here it lets us knobble the next if to test that the perl
Expand Down Expand Up @@ -1709,6 +1709,7 @@ EOW
#############
{
# [github #18614 - handling of Unicode characters in regexes]
# [github #18764 - ... without breaking subsequent Latin-1]
if ($] lt '5.010') {
SKIP_TEST "Incomplete support for UTF-8 in old perls";
SKIP_TEST "Incomplete support for UTF-8 in old perls";
Expand All @@ -1718,18 +1719,88 @@ $WANT = <<"EOW";
#\$VAR1 = [
# "\\x{41f}",
# qr/\x{8b80}/,
# qr/\x{41f}/
# qr/\x{41f}/,
# qr/\x{e4}/,
# '\xE4'
#];
EOW
$WANT =~ s{/(,?)$}{/u$1}mg if $] gt '5.014';
TEST qq(Data::Dumper->Dump([ [qq/\x{41f}/, qr/\x{8b80}/, qr/\x{41f}/] ])),
if ($] lt '5.010001') {
$WANT =~ s!qr/!qr/(?-xism:!g;
$WANT =~ s!/,!)/,!g;
}
elsif ($] gt '5.014') {
$WANT =~ s{/(,?)$}{/u$1}mg;
}
TEST qq(Data::Dumper->Dump([ [qq/\x{41f}/, qr/\x{8b80}/, qr/\x{41f}/, qr/\x{e4}/, "\xE4"] ])),
"string with Unicode + regexp with Unicode";

TEST qq(Data::Dumper->Dumpxs([ [qq/\x{41f}/, qr/\x{8b80}/, qr/\x{41f}/] ])),
$WANT =~ s/'\xE4'/"\\x{e4}"/;
$WANT =~ s<([^\0-\177])> <sprintf '\\x{%x}', ord $1>ge;
TEST qq(Data::Dumper->Dumpxs([ [qq/\x{41f}/, qr/\x{8b80}/, qr/\x{41f}/, qr/\x{e4}/, "\xE4"] ])),
"string with Unicode + regexp with Unicode, XS"
if $XS;
}
#############
{
# [more perl #58608 tests]
my $bs = "\\\\";
$WANT = <<"EOW";
#\$VAR1 = [
# qr/ \\/ /,
# qr/ \\?\\/ /,
# qr/ $bs\\/ /,
# qr/ $bs:\\/ /,
# qr/ \\?$bs:\\/ /,
# qr/ $bs$bs\\/ /,
# qr/ $bs$bs:\\/ /,
# qr/ $bs$bs$bs\\/ /
#];
EOW
if ($] lt '5.010001') {
$WANT =~ s!qr/!qr/(?-xism:!g;
$WANT =~ s! /! )/!g;
}
TEST qq(Data::Dumper->Dump([ [qr! / !, qr! \\?/ !, qr! $bs/ !, qr! $bs:/ !, qr! \\?$bs:/ !, qr! $bs$bs/ !, qr! $bs$bs:/ !, qr! $bs$bs$bs/ !, ] ])),
"more perl #58608";
TEST qq(Data::Dumper->Dump([ [qr! / !, qr! \\?/ !, qr! $bs/ !, qr! $bs:/ !, qr! \\?$bs:/ !, qr! $bs$bs/ !, qr! $bs$bs:/ !, qr! $bs$bs$bs/ !, ] ])),
"more perl #58608 XS"
if $XS;
}
#############
{
# [github #18614, github #18764, perl #58608 corner cases]
if ($] lt '5.010') {
SKIP_TEST "Incomplete support for UTF-8 in old perls";
SKIP_TEST "Incomplete support for UTF-8 in old perls";
last;
}
my $bs = "\\\\";
$WANT = <<"EOW";
#\$VAR1 = [
# "\\x{2e18}",
# qr/ \x{203d}\\/ /,
# qr/ \\\x{203d}\\/ /,
# qr/ \\\x{203d}$bs:\\/ /,
# '\xA3'
#];
EOW
if ($] lt '5.010001') {
$WANT =~ s!qr/!qr/(?-xism:!g;
$WANT =~ s!/,!)/,!g;
}
elsif ($] gt '5.014') {
$WANT =~ s{/(,?)$}{/u$1}mg;
}
TEST qq(Data::Dumper->Dump([ [ '\x{2e18}', qr! \x{203d}/ !, qr! \\\x{203d}/ !, qr! \\\x{203d}$bs:/ !, "\xa3"] ])),
"github #18614, github #18764, perl #58608 corner cases";

$WANT =~ s/'\x{A3}'/"\\x{a3}"/;
$WANT =~ s/\x{203D}/\\x{203d}/g;
TEST qq(Data::Dumper->Dumpxs([ [ '\x{2e18}', qr! \x{203d}/ !, qr! \\\x{203d}/ !, qr! \\\x{203d}$bs:/ !, "\xa3"] ])),
"github #18614, github #18764, perl #58608 corner cases XS"
if $XS;
}
#############
{
# [perl #82948]
# re::regexp_pattern was moved to universal.c in v5.10.0-252-g192c1e2
Expand Down
4 changes: 3 additions & 1 deletion dist/Devel-PPPort/parts/inc/utf8
Expand Up @@ -359,7 +359,9 @@ utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
* disabled, so this 'if' will be true, and so later on, we know that
* 's' is dereferencible */
if (do_warnings) {
*retlen = (STRLEN) -1;
if (retlen) {
*retlen = (STRLEN) -1;
}
}
else {
ret = D_PPP_utf8_to_uvchr_buf_callee(
Expand Down

0 comments on commit 4dd5879

Please sign in to comment.