From 57b63bab181c98513044b117ee409929dda5c8ed Mon Sep 17 00:00:00 2001 From: Tony Cook Date: Thu, 21 Sep 2023 10:01:10 +1000 Subject: [PATCH 1/6] join: tests from GH #21458 --- t/op/join.t | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/t/op/join.t b/t/op/join.t index 7f9a1968980b..d643023ec023 100644 --- a/t/op/join.t +++ b/t/op/join.t @@ -6,7 +6,7 @@ BEGIN { set_up_inc('../lib'); } -plan tests => 29; +plan tests => 41; @x = (1, 2, 3); is( join(':',@x), '1:2:3', 'join an array with character'); @@ -128,3 +128,54 @@ package o { use overload q|""| => sub { ${$_[0]}++ } } for(1,2) { push @_, \join "x", 1 } isnt $_[1], $_[0], 'join(const, const) still returns a new scalar each time'; + +# tests from GH #21458 +# simple tied variable +{ + package S; + our $fetched; + sub TIESCALAR { my $x = '-'; $fetched = 0; bless \$x } + sub FETCH { my $y = shift; $fetched++; $$y } + + package main; + my $t; + + tie $t, 'S'; + is( join( $t, a .. c ), 'a-b-c', 'tied separator' ); + is( $S::fetched, 1, 'FETCH called once' ); + + tie $t, 'S'; + is( join( $t, 'a' ), 'a', 'tied separator on single item join' ); + is( $S::fetched, 0, 'FETCH not called' ); + + tie $t, 'S'; + is( join( $t, 'a', $t, 'b', $t, 'c' ), + 'a---b---c', 'tied separator also in the join arguments' ); + is( $S::fetched, 3, 'FETCH called 1 + 2 times' ); +} +# self-modifying tied variable +{ + + package SM; + our $fetched; + sub TIESCALAR { my $x = "1"; $fetched = 0; bless \$x } + sub FETCH { my $y = shift; $fetched++; $$y += 3 } + + package main; + my $t; + + tie $t, "SM"; + is( join( $t, a .. c ), 'a4b4c', 'tied separator' ); + is( $SM::fetched, 1, 'FETCH called once' ); + + tie $t, "SM"; + is( join( $t, 'a' ), 'a', 'tied separator on single item join' ); + is( $SM::fetched, 0, 'FETCH not called' ); + + tie $t, "SM"; + { local $TODO = "separator keeps being FETCHed"; + is( join( $t, "a", $t, "b", $t, "c" ), + 'a474b4104c', 'tied separator also in the join arguments' ); + } + is( $SM::fetched, 3, 'FETCH called 1 + 2 times' ); +} From d20bb9925daaa01250b9d02da38fd52d9820d938 Mon Sep 17 00:00:00 2001 From: Tony Cook Date: Thu, 21 Sep 2023 10:13:28 +1000 Subject: [PATCH 2/6] join: add tests from my comments on #21484 --- t/op/join.t | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/t/op/join.t b/t/op/join.t index d643023ec023..3ecaab2247e2 100644 --- a/t/op/join.t +++ b/t/op/join.t @@ -6,7 +6,7 @@ BEGIN { set_up_inc('../lib'); } -plan tests => 41; +plan tests => 43; @x = (1, 2, 3); is( join(':',@x), '1:2:3', 'join an array with character'); @@ -179,3 +179,37 @@ isnt $_[1], $_[0], } is( $SM::fetched, 3, 'FETCH called 1 + 2 times' ); } +{ + # see GH #21484 + local $TODO = "changes to delim have an effect"; + my $expect = "a\x{100}x\x{100}b\n"; + utf8::encode($expect); + fresh_perl_is(<<'CODE', $expect, {}, "modifications delim from magic should be ignored"); +my $n = 1; +my $sep = "\x{100}" x $n; +package MyOver { + use overload '""' => sub { $sep = "\xFF" x $n; "x" }; +} + +my $x = bless {}, "MyOver"; +binmode STDOUT, ":utf8"; +print join($sep, "a", $x, "b"), "\n"; +CODE +} +{ + # see GH #21484 + my $expect = "x\x{100}a\n"; + local $TODO = "modifications to delim PVX caused UB"; + utf8::encode($expect); # fresh_perl() does bytes + fresh_perl_is(<<'CODE', $expect, {}, "modifications to delim PVX shouldn't crash"); +my $n = 1; +my $sep = "\x{100}" x $n; +package MyOver { + use overload '""' => sub { $sep = "\xFF" x ($n+20); "x" }; +} + +my $x = bless {}, "MyOver"; +binmode STDOUT, ":utf8"; +print join($sep, $x, "a"), "\n"; +CODE +} From d1cec229d1b81b9e372732ada63cd0b884465289 Mon Sep 17 00:00:00 2001 From: Tony Cook Date: Thu, 21 Sep 2023 14:58:26 +1000 Subject: [PATCH 3/6] join: save the delimiter string before anything magical happens to it This code had a few problems: - changes to the content of delim from set or overload magic could result in the separator between elements changing during the process of the join. - changes to the content of delim which allocated a new PVX resulted in access to freed memory - changes to the flags of delim, the UTF-8 flag in particular, could result in an invalid joined string, either mojibake or an invalidly encoded upgraded string To avoid that, we copy the separator, either into a local buffer if it's large enough, or an allocated buffer, and save the flag we use, to prevent changes to the delim SV from changing or invalidating the delimpv value. Fixes #21458 and some similar problems. --- doop.c | 30 +++++++++++++++++++++++++++--- t/op/join.t | 4 ---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/doop.c b/doop.c index 89a5e38c5506..4b4c824916d3 100644 --- a/doop.c +++ b/doop.c @@ -633,6 +633,13 @@ Perl_do_trans(pTHX_ SV *sv) } } +#ifdef DEBUGGING +/* make it small to exercise the logic */ +# define JOIN_DELIM_BUFSIZE 2 +#else +# define JOIN_DELIM_BUFSIZE 40 +#endif + /* =for apidoc_section $string =for apidoc do_join @@ -662,10 +669,27 @@ Perl_do_join(pTHX_ SV *sv, SV *delim, SV **mark, SV **sp) I32 items = sp - mark; STRLEN len; STRLEN delimlen; - const char * const delims = SvPV_const(delim, delimlen); + const char * delimpv = SvPV_const(delim, delimlen); + char delim_buf[JOIN_DELIM_BUFSIZE]; + bool delim_do_utf8 = DO_UTF8(delim); PERL_ARGS_ASSERT_DO_JOIN; + if (items >= 2) { + /* Make a copy of the delim, since G or A magic may modify the delim SV. + Use a local buffer if possible to avoid the cost of allocation and + clean up. + */ + if (delimlen <= JOIN_DELIM_BUFSIZE) { + Copy(delimpv, delim_buf, delimlen, char); + delimpv = delim_buf; + } + else { + delimpv = savepvn(delimpv, delimlen); + SAVEFREEPV(delimpv); + } + } + mark++; len = (items > 0 ? (delimlen * (items - 1) ) : 0); SvUPGRADE(sv, SVt_PV); @@ -699,11 +723,11 @@ Perl_do_join(pTHX_ SV *sv, SV *delim, SV **mark, SV **sp) } if (delimlen) { - const U32 delimflag = DO_UTF8(delim) ? SV_CATUTF8 : SV_CATBYTES; + const U32 delimflag = delim_do_utf8 ? SV_CATUTF8 : SV_CATBYTES; for (; items > 0; items--,mark++) { STRLEN len; const char *s; - sv_catpvn_flags(sv,delims,delimlen,delimflag); + sv_catpvn_flags(sv, delimpv, delimlen, delimflag); s = SvPV_const(*mark,len); sv_catpvn_flags(sv,s,len, DO_UTF8(*mark) ? SV_CATUTF8 : SV_CATBYTES); diff --git a/t/op/join.t b/t/op/join.t index 3ecaab2247e2..b27f6014b659 100644 --- a/t/op/join.t +++ b/t/op/join.t @@ -173,15 +173,12 @@ isnt $_[1], $_[0], is( $SM::fetched, 0, 'FETCH not called' ); tie $t, "SM"; - { local $TODO = "separator keeps being FETCHed"; is( join( $t, "a", $t, "b", $t, "c" ), 'a474b4104c', 'tied separator also in the join arguments' ); - } is( $SM::fetched, 3, 'FETCH called 1 + 2 times' ); } { # see GH #21484 - local $TODO = "changes to delim have an effect"; my $expect = "a\x{100}x\x{100}b\n"; utf8::encode($expect); fresh_perl_is(<<'CODE', $expect, {}, "modifications delim from magic should be ignored"); @@ -199,7 +196,6 @@ CODE { # see GH #21484 my $expect = "x\x{100}a\n"; - local $TODO = "modifications to delim PVX caused UB"; utf8::encode($expect); # fresh_perl() does bytes fresh_perl_is(<<'CODE', $expect, {}, "modifications to delim PVX shouldn't crash"); my $n = 1; From 265f70e72f95fc455627ab31a04b33da6145b0eb Mon Sep 17 00:00:00 2001 From: Tony Cook Date: Thu, 21 Sep 2023 15:15:42 +1000 Subject: [PATCH 4/6] do_join: handle more than 2G arguments No tests since I don't have the memory. --- doop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doop.c b/doop.c index 4b4c824916d3..030782b0428c 100644 --- a/doop.c +++ b/doop.c @@ -666,7 +666,7 @@ void Perl_do_join(pTHX_ SV *sv, SV *delim, SV **mark, SV **sp) { SV ** const oldmark = mark; - I32 items = sp - mark; + SSize_t items = sp - mark; STRLEN len; STRLEN delimlen; const char * delimpv = SvPV_const(delim, delimlen); From a7ce2f09f26b153159450cdbd4cfd9eb2a73a8ca Mon Sep 17 00:00:00 2001 From: Tony Cook Date: Thu, 21 Sep 2023 15:34:06 +1000 Subject: [PATCH 5/6] do_join: validate arguments before we start using them --- doop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doop.c b/doop.c index 030782b0428c..0e2cfe19e6df 100644 --- a/doop.c +++ b/doop.c @@ -665,6 +665,8 @@ Magic and tainting are handled. void Perl_do_join(pTHX_ SV *sv, SV *delim, SV **mark, SV **sp) { + PERL_ARGS_ASSERT_DO_JOIN; + SV ** const oldmark = mark; SSize_t items = sp - mark; STRLEN len; @@ -673,8 +675,6 @@ Perl_do_join(pTHX_ SV *sv, SV *delim, SV **mark, SV **sp) char delim_buf[JOIN_DELIM_BUFSIZE]; bool delim_do_utf8 = DO_UTF8(delim); - PERL_ARGS_ASSERT_DO_JOIN; - if (items >= 2) { /* Make a copy of the delim, since G or A magic may modify the delim SV. Use a local buffer if possible to avoid the cost of allocation and From 723c8f5e0145d8dcb0f123a5f3434eaba41dc5c0 Mon Sep 17 00:00:00 2001 From: Tony Cook Date: Tue, 26 Sep 2023 15:21:17 +1000 Subject: [PATCH 6/6] join tests: comments clarifying why we don't do simple assignment I changed $n to 2 in each case as a defence against a reasonable possible optimisation of $x x 1 sharing the PV with $x. --- t/op/join.t | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/t/op/join.t b/t/op/join.t index b27f6014b659..79d6c8a9cba0 100644 --- a/t/op/join.t +++ b/t/op/join.t @@ -179,10 +179,12 @@ isnt $_[1], $_[0], } { # see GH #21484 - my $expect = "a\x{100}x\x{100}b\n"; + my $expect = "a\x{100}\x{100}x\x{100}\x{100}b\n"; utf8::encode($expect); fresh_perl_is(<<'CODE', $expect, {}, "modifications delim from magic should be ignored"); -my $n = 1; +# The x $n here is to ensure the PV of $sep isn't a COW of some other SV +# so the PV of $sep is unlikely to change when the overload assigns to $sep. +my $n = 2; my $sep = "\x{100}" x $n; package MyOver { use overload '""' => sub { $sep = "\xFF" x $n; "x" }; @@ -195,10 +197,14 @@ CODE } { # see GH #21484 - my $expect = "x\x{100}a\n"; + my $expect = "x\x{100}\x{100}a\n"; utf8::encode($expect); # fresh_perl() does bytes fresh_perl_is(<<'CODE', $expect, {}, "modifications to delim PVX shouldn't crash"); -my $n = 1; +# the x $n here is to ensure $sep has it's own PV rather than sharing it +# in a COW sense, This means that when the expanded version ($n+20) is assigned +# the origin PV has been released and valgrind or ASAN can pick up the use +# of the freed buffer. +my $n = 2; my $sep = "\x{100}" x $n; package MyOver { use overload '""' => sub { $sep = "\xFF" x ($n+20); "x" };