Skip to content

Commit

Permalink
PATCH: [perl #134034] Assert fail in pattern match
Browse files Browse the repository at this point in the history
This turned out to be a general problem in regexec.c.  The function
regtry() can change the position we are looking at in the target string.
In particular it can point to just past the end of the string.  Previous
to this commit, we were assuming that the returned string contained
valid data in the returned position, and in many places we assumed that
it was before the end.  This commit fixes that in a bunch of places.
  • Loading branch information
khwilliamson committed Apr 15, 2019
1 parent 11b48e7 commit 8c9c272
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 25 deletions.
63 changes: 38 additions & 25 deletions regexec.c
Expand Up @@ -1787,7 +1787,9 @@ STMT_START {
STMT_START { \
while (s < strend) { \
CODE \
s += ((UTF8) ? UTF8SKIP(s) : 1); \
s += ((UTF8) \
? UTF8_SAFE_SKIP(s, reginfo->strend) \
: 1); \
} \
} STMT_END

Expand All @@ -1801,7 +1803,7 @@ STMT_START {
#define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
if (COND) { \
FBC_CHECK_AND_TRY \
s += ((UTF8) ? UTF8SKIP(s) : 1); \
s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
previous_occurrence_end = s; \
} \
else { \
Expand All @@ -1820,12 +1822,13 @@ STMT_START {
* of the one we're looking for. Knowing that, we can see right away if the
* next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
* don't accept the 2nd and succeeding adjacent occurrences */
#define FBC_CHECK_AND_TRY \
if ( ( doevery \
|| s != previous_occurrence_end) \
&& (reginfo->intuit || regtry(reginfo, &s))) \
{ \
goto got_it; \
#define FBC_CHECK_AND_TRY \
if ( ( doevery \
|| s != previous_occurrence_end) \
&& ( reginfo->intuit \
|| (s <= reginfo->strend && regtry(reginfo, &s)))) \
{ \
goto got_it; \
}


Expand Down Expand Up @@ -1858,7 +1861,7 @@ STMT_START {
\
if (COND) { \
FBC_CHECK_AND_TRY \
s += UTF8SKIP(s); \
s += UTF8_SAFE_SKIP(s, reginfo->strend); \
previous_occurrence_end = s; \
} \
else { \
Expand Down Expand Up @@ -1977,7 +1980,7 @@ STMT_START {
* string (which should be zero length without having to look at the string
* contents) */
#define REXEC_FBC_TRYIT \
if ((reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))) \
if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
goto got_it

/* The only difference between the BOUND and NBOUND cases is that
Expand Down Expand Up @@ -2398,7 +2401,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
{
goto got_it;
}
s += (utf8_target) ? UTF8SKIP(s) : 1;
s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
}
break;
}
Expand Down Expand Up @@ -2482,7 +2485,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}

/* Didn't match. Try at the next position (if there is one) */
s += (utf8_target) ? UTF8SKIP(s) : 1;
s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
if (UNLIKELY(s >= reginfo->strend)) {
break;
}
Expand All @@ -2506,7 +2509,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
goto got_it;
}
before = after;
s += UTF8SKIP(s);
s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
}
else { /* Not utf8. Everything is a GCB except between CR and
Expand All @@ -2524,7 +2527,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,

/* And, since this is a bound, it can match after the final
* character in the string */
if ((reginfo->intuit || regtry(reginfo, &s))) {
if ( reginfo->intuit
|| (s <= reginfo->strend && regtry(reginfo, &s)))
{
goto got_it;
}
break;
Expand All @@ -2534,7 +2539,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
s += (utf8_target) ? UTF8SKIP(s) : 1;
s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
if (UNLIKELY(s >= reginfo->strend)) {
break;
}
Expand All @@ -2558,7 +2563,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
goto got_it;
}
before = after;
s += UTF8SKIP(s);
s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
}
else { /* Not utf8. */
Expand All @@ -2580,7 +2585,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
}

if (reginfo->intuit || regtry(reginfo, &s)) {
if ( reginfo->intuit
|| (s <= reginfo->strend && regtry(reginfo, &s)))
{
goto got_it;
}

Expand All @@ -2591,7 +2598,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
s += (utf8_target) ? UTF8SKIP(s) : 1;
s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
if (UNLIKELY(s >= reginfo->strend)) {
break;
}
Expand All @@ -2616,7 +2623,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
goto got_it;
}
before = after;
s += UTF8SKIP(s);
s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
}
else { /* Not utf8. */
Expand All @@ -2641,7 +2648,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
/* Here are at the final position in the target string. The SB
* value is always true here, so matches, depending on other
* constraints */
if (reginfo->intuit || regtry(reginfo, &s)) {
if ( reginfo->intuit
|| (s <= reginfo->strend && regtry(reginfo, &s)))
{
goto got_it;
}

Expand All @@ -2652,7 +2661,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
s += (utf8_target) ? UTF8SKIP(s) : 1;
s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
if (UNLIKELY(s >= reginfo->strend)) {
break;
}
Expand Down Expand Up @@ -2686,7 +2695,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
previous = before;
before = after;
s += UTF8SKIP(s);
s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
}
else { /* Not utf8. */
Expand All @@ -2711,7 +2720,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
}

if (reginfo->intuit || regtry(reginfo, &s)) {
if ( reginfo->intuit
|| (s <= reginfo->strend && regtry(reginfo, &s)))
{
goto got_it;
}
}
Expand Down Expand Up @@ -3028,7 +3039,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
LEAVE;
goto got_it;
}
s = HOPc(s,1);
if (s < reginfo->strend) {
s = HOPc(s,1);
}
DEBUG_TRIE_EXECUTE_r({
Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
});
Expand Down Expand Up @@ -3547,7 +3560,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
if (regtry(reginfo, &s)) goto got_it;
s += UTF8SKIP(s);
s += UTF8_SAFE_SKIP(s, strend);
while (s < strend && *s == ch)
s += UTF8SKIP(s);
}
Expand Down
5 changes: 5 additions & 0 deletions t/re/pat_advanced.t
Expand Up @@ -2519,6 +2519,11 @@ EOF
"Assertion failure with single character wildcard");
}

{ # [perl #134034] Previously assertion failure
fresh_perl_is('use utf8; q!Ȧिम한글💣΢ყაოსაა!=~/(?li)\b{wb}\B(*COMMIT)0/;',
"", {}, "*COMMIT caused positioning beyond EOS");
}


# !!! NOTE that tests that aren't at all likely to crash perl should go
# a ways above, above these last ones. There's a comment there that, like
Expand Down

0 comments on commit 8c9c272

Please sign in to comment.