Skip to content

Commit

Permalink
regexec.c: PATCH: [perl #114808]
Browse files Browse the repository at this point in the history
Commit c72077c fixed a place where
to_byte_substr() fails, but the code continued as if it had succeeded.

There is yet another place where the return is not checked.  This commit
adds a check there.

However, it turns out that there is another underlying problem to
[perl #114808].  The function to_byte_substr() tries to downgrade the
substr fields in the regex program it is passed.  If it fails (because
something in it is expressible only in UTF-8), it permanently changes
that field to point to PL_sv_undef, thus losing the original
information.  This is fine as long as the program will be used once and
discarded.  However, there are places where the program is re-used, as
in the test case introduced by this commit, and the original value has
been lost.

To solve this, this commit also changes to_byte_substr() from returning
void to instead returning bool, indicating success or failure.   On
failure, the original substrs are left intact.

The calls to this function are correspondingly changed.  One of them had
a trace statement when the failure happens, I reworded it to be more
general and accurate (it was slightly misleading), and added the trace
to every such place, not just the one.

In addition, I found the use of the same ternary operation in 3 or 4
consecutive lines very hard to understand; and is inefficient unless
compiled under C optimization which avoids recalculating things.  So I
expanded the several nearly identical places in the code that do that so
that I could quickly see what is going on.
  • Loading branch information
Karl Williamson committed Oct 6, 2012
1 parent c72077c commit 7e0d5ad
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 41 deletions.
2 changes: 1 addition & 1 deletion embed.fnc
Expand Up @@ -2024,7 +2024,7 @@ ERsn |U8* |reghop4 |NN U8 *s|I32 off|NN const U8 *llim \
ERsn |U8* |reghopmaybe3 |NN U8 *s|I32 off|NN const U8 *lim
ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN const char *strend|NULLOK regmatch_info *reginfo
Es |void |to_utf8_substr |NN regexp * prog
Es |void |to_byte_substr |NN regexp * prog
Es |bool |to_byte_substr |NN regexp * prog
ERs |I32 |reg_check_named_buff_matched |NN const regexp *rex \
|NN const regnode *scan
# ifdef DEBUGGING
Expand Down
2 changes: 1 addition & 1 deletion proto.h
Expand Up @@ -6822,7 +6822,7 @@ STATIC I32 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
#define PERL_ARGS_ASSERT_REGTRY \
assert(reginfo); assert(startposp)

STATIC void S_to_byte_substr(pTHX_ regexp * prog)
STATIC bool S_to_byte_substr(pTHX_ regexp * prog)
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_TO_BYTE_SUBSTR \
assert(prog)
Expand Down
123 changes: 85 additions & 38 deletions regexec.c
Expand Up @@ -37,6 +37,11 @@
#include "re_top.h"
#endif

/* At least one required character in the target string is expressible only in
* UTF-8. */
const char* const non_utf8_target_but_utf8_required
= "Can't match, because target string needs to be in UTF-8\n";

/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
Expand Down Expand Up @@ -630,15 +635,15 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
to_utf8_substr(prog);
check = prog->check_utf8;
} else {
if (!prog->check_substr && prog->check_utf8)
to_byte_substr(prog);
if (!prog->check_substr && prog->check_utf8) {
if (! to_byte_substr(prog)) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
non_utf8_target_but_utf8_required));
goto fail;
}
}
check = prog->check_substr;
}
if (check == &PL_sv_undef) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
"Non-utf8 string cannot match utf8 check string\n"));
goto fail;
}
if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
|| ( (prog->extflags & RXf_ANCH_BOL)
Expand Down Expand Up @@ -2317,11 +2322,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
#ifdef DEBUGGING
int did_match = 0;
#endif
if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];

if (utf8_target) {
if (! prog->anchored_utf8) {
to_utf8_substr(prog);
}
ch = SvPVX_const(prog->anchored_utf8)[0];
REXEC_FBC_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
Expand All @@ -2331,8 +2336,17 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
s += UTF8SKIP(s);
}
);

}
else {
if (! prog->anchored_substr) {
if (! to_byte_substr(prog)) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
non_utf8_target_but_utf8_required));
goto phooey;
}
}
ch = SvPVX_const(prog->anchored_substr)[0];
REXEC_FBC_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
Expand Down Expand Up @@ -2361,23 +2375,44 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
int did_match = 0;
#endif
if (prog->anchored_substr || prog->anchored_utf8) {
if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
if (utf8_target) {
if (! prog->anchored_utf8) {
to_utf8_substr(prog);
}
must = prog->anchored_utf8;
}
else {
if (! prog->anchored_substr) {
if (! to_byte_substr(prog)) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
non_utf8_target_but_utf8_required));
goto phooey;
}
}
must = prog->anchored_substr;
}
back_max = back_min = prog->anchored_offset;
} else {
if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
must = utf8_target ? prog->float_utf8 : prog->float_substr;
if (utf8_target) {
if (! prog->float_utf8) {
to_utf8_substr(prog);
}
must = prog->float_utf8;
}
else {
if (! prog->float_substr) {
if (! to_byte_substr(prog)) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
non_utf8_target_but_utf8_required));
goto phooey;
}
}
must = prog->float_substr;
}
back_max = prog->float_max_offset;
back_min = prog->float_min_offset;
}


if (must == &PL_sv_undef)
/* could not downgrade utf8 check substring, so must fail */
goto phooey;

if (back_min<0) {
last = strend;
} else {
Expand Down Expand Up @@ -2471,16 +2506,22 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
STRLEN len;
const char *little;

if (utf8_target && !prog->float_utf8)
to_utf8_substr(prog);
else if (!utf8_target && !prog->float_substr) {
to_byte_substr(prog);
if (prog->float_substr == &PL_sv_undef)
/* downgrading failed, but target is not utf8, so
* matching must fail */
goto phooey;
}
float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
if (utf8_target) {
if (! prog->float_utf8) {
to_utf8_substr(prog);
}
float_real = prog->float_utf8;
}
else {
if (! prog->float_substr) {
if (! to_byte_substr(prog)) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
non_utf8_target_but_utf8_required));
goto phooey;
}
}
float_real = prog->float_substr;
}

little = SvPV_const(float_real, len);
if (SvTAIL(float_real)) {
Expand Down Expand Up @@ -7357,6 +7398,9 @@ restore_pos(pTHX_ void *arg)
STATIC void
S_to_utf8_substr(pTHX_ register regexp *prog)
{
/* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
* on the converted value */

int i = 1;

PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
Expand Down Expand Up @@ -7385,9 +7429,12 @@ S_to_utf8_substr(pTHX_ register regexp *prog)
} while (i--);
}

STATIC void
STATIC bool
S_to_byte_substr(pTHX_ register regexp *prog)
{
/* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
* on the converted value; returns FALSE if can't be converted. */

dVAR;
int i = 1;

Expand All @@ -7397,7 +7444,9 @@ S_to_byte_substr(pTHX_ register regexp *prog)
if (prog->substrs->data[i].utf8_substr
&& !prog->substrs->data[i].substr) {
SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
if (sv_utf8_downgrade(sv, TRUE)) {
if (! sv_utf8_downgrade(sv, TRUE)) {
return FALSE;
}
if (SvVALID(prog->substrs->data[i].utf8_substr)) {
if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
/* Trim the trailing \n that fbm_compile added last
Expand All @@ -7407,15 +7456,13 @@ S_to_byte_substr(pTHX_ register regexp *prog)
} else
fbm_compile(sv, 0);
}
} else {
SvREFCNT_dec(sv);
sv = &PL_sv_undef;
}
prog->substrs->data[i].substr = sv;
if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
prog->check_substr = sv;
}
} while (i--);

return TRUE;
}

/* These constants are for finding GCB=LV and GCB=LVT. These are for the
Expand Down
16 changes: 15 additions & 1 deletion t/op/split_unicode.t
Expand Up @@ -3,7 +3,7 @@
BEGIN {
require './test.pl';
skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)");
plan(tests => 150);
plan(tests => 151);
}

{
Expand Down Expand Up @@ -61,4 +61,18 @@ BEGIN {
ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2");
is($c3, scalar(@r3), "$msg - /\\s+/ No.2 (count)");
}

{ # RT #114808
warning_is(
sub {
$p=chr(0x100);
for (".","ab\x{101}def") {
@q = split /$p/
}
},
undef,
'no warnings when part of split cant match non-utf8'
);
}

}

0 comments on commit 7e0d5ad

Please sign in to comment.