diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic index 1ea598bbc..6b5fdd404 100644 --- a/src/pcre2.h.generic +++ b/src/pcre2.h.generic @@ -440,6 +440,7 @@ released, the numbers must not be changed. */ #define PCRE2_ERROR_DIFFSUBSSUBJECT (-72) #define PCRE2_ERROR_DIFFSUBSOFFSET (-73) #define PCRE2_ERROR_DIFFSUBSOPTIONS (-74) +#define PCRE2_ERROR_BAD_BACKSLASH_K (-75) /* Request types for pcre2_pattern_info() */ diff --git a/src/pcre2.h.in b/src/pcre2.h.in index 8b1c40ebd..6fb3ceefa 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -440,6 +440,7 @@ released, the numbers must not be changed. */ #define PCRE2_ERROR_DIFFSUBSSUBJECT (-72) #define PCRE2_ERROR_DIFFSUBSOFFSET (-73) #define PCRE2_ERROR_DIFFSUBSOPTIONS (-74) +#define PCRE2_ERROR_BAD_BACKSLASH_K (-75) /* Request types for pcre2_pattern_info() */ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index ca1f0dcd1..f126e41b7 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -8370,6 +8370,10 @@ for (;; pptr++) case ESC_A: if (cb->max_lookbehind == 0) cb->max_lookbehind = 1; break; + + case ESC_K: + cb->external_flags |= PCRE2_HASBSK; /* Record */ + break; } *code++ = meta_arg; diff --git a/src/pcre2_error.c b/src/pcre2_error.c index 3fa429f7c..df26add2b 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -305,6 +305,7 @@ static const unsigned char match_error_texts[] = "substitute subject differs from prior match call\0" "substitute start offset differs from prior match call\0" "substitute options differ from prior match call\0" + "disallowed use of \\K in lookaround\0" ; diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index cad67bb49..4e9c05942 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -538,6 +538,7 @@ bytes in a code unit in that mode. 
*/ #define PCRE2_DUPCAPUSED 0x00200000u /* contains (?| */ #define PCRE2_HASBKC 0x00400000u /* contains \C */ #define PCRE2_HASACCEPT 0x00800000u /* contains (*ACCEPT) */ +#define PCRE2_HASBSK 0x01000000u /* contains \K */ #define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32) diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index f7d88cddb..6cefb61d3 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -969,7 +969,9 @@ typedef struct match_block { uint32_t match_call_count; /* Number of times a new frame is created */ BOOL hitend; /* Hit the end of the subject at some point */ BOOL hasthen; /* Pattern contains (*THEN) */ + BOOL hasbsk; /* Pattern contains \K */ BOOL allowemptypartial; /* Allow empty hard partial */ + BOOL allowlookaroundbsk; /* Allow \K within lookarounds */ const uint8_t *lcc; /* Points to lower casing table */ const uint8_t *fcc; /* Points to case-flipping table */ const uint8_t *ctypes; /* Points to table of type maps */ diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 42810dd1e..bc55056b8 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -13772,6 +13772,31 @@ common->accept_label = LABEL(); if (common->accept != NULL) set_jumps(common->accept, common->accept_label); +/* Fail if we detect that the start position was moved to be either after +the end position (\K in lookahead) or before the start offset (\K in +lookbehind). */ + +if (common->has_set_som && + (common->re->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0) + { + if (HAS_VIRTUAL_REGISTERS) + { + OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); + OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, str)); + } + else + { + OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, str)); + } + OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0)); + + OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_BAD_BACKSLASH_K); + /* (ovector[0] < jit_arguments->str)? 
*/ + add_jump(compiler, &common->abort, CMP(SLJIT_LESS, TMP3, 0, TMP2, 0)); + /* (ovector[0] > STR_PTR)? NB. ovector[1] hasn't yet been set to STR_PTR. */ + add_jump(compiler, &common->abort, CMP(SLJIT_GREATER, TMP3, 0, STR_PTR, 0)); + } + /* This means we have a match. Update the ovector. */ copy_ovector(common, re->top_bracket + 1); common->quit_label = common->abort_label = LABEL(); diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 02faa0dcf..d5574d721 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -1010,11 +1010,28 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, } #ifdef DEBUG_SHOW_OPS - fprintf(stderr, "++ Failed ACCEPT not at end (endanchnored set)\n"); + fprintf(stderr, "++ Failed ACCEPT not at end (endanchored set)\n"); #endif return MATCH_NOMATCH; /* (*ACCEPT) */ } + /* Fail if we detect that the start position was moved to be either after + the end position (\K in lookahead) or before the start offset (\K in + lookbehind). If this occurs, the pattern must have used \K in a somewhat + sneaky way (e.g. by pattern recursion), because if the \K is actually + syntactically inside the lookaround, it's blocked at compile-time. */ + + if (Fstart_match < mb->start_subject + mb->start_offset || + Fstart_match > Feptr) + { + /* The \K expression is fairly rare. We assert it was used so that we + catch any unexpected invalid data in start_match. */ + PCRE2_ASSERT(mb->hasbsk); + + if (!mb->allowlookaroundbsk) + return PCRE2_ERROR_BAD_BACKSLASH_K; + } + /* We have a successful match of the whole pattern. Record the result and then do a direct return from the function. 
If there is space in the offset vector, set any pairs that follow the highest-numbered captured string but @@ -7393,8 +7410,11 @@ mb->start_offset = start_offset; mb->end_subject = end_subject; mb->true_end_subject = true_end_subject; mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; +mb->hasbsk = (re->flags & PCRE2_HASBSK) != 0; mb->allowemptypartial = (re->max_lookbehind > 0) || (re->flags & PCRE2_MATCH_EMPTY) != 0; +mb->allowlookaroundbsk = + (re->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) != 0; mb->poptions = re->overall_options; /* Pattern options */ mb->ignore_skip_arg = 0; mb->mark = mb->nomatch_mark = NULL; /* In case never set */ diff --git a/testdata/testinput2 b/testdata/testinput2 index bd8ef098e..1a95fe228 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -6378,7 +6378,42 @@ a)"xI /^abc(?b\K))a(?=(?&sneaky))/g,allow_lookaround_bsk + ab + +/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g + ab + zz + +/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g,allow_lookaround_bsk + ab + +/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g + ab + zz + +/a|(?(DEFINE)(?<sneaky>\K\Ga))(?<=(?&sneaky))b/g + ab + zz + +/(?=.{10}(?1))x(\K){0}/ + x1234567890 + +/(?=.{10}(.))(*scs:(1)(?2))x(\K){0}/ + x1234567890 + +/(?=.{5}(?1))\d*(\K){0}/ +\= Totally fine - pattern does nothing bad even though \K is reachable + 1234567890 +\= Not fine - the subject now causes the \K to misbehave + abcdefgh + # --------- # Tests for zero-length NULL to be treated as an empty string. diff --git a/testdata/testoutput2 b/testdata/testoutput2 index cff0c9307..31a508d55 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -19062,7 +19062,57 @@ Failed: error 199 at offset 14: \K is not allowed in lookarounds (but see PCRE2_ /^abc(?b\K))a(?=(?&sneaky))/g,allow_lookaround_bsk + ab +Start of matched string is beyond its end - displaying from end to start. 
+ 0: b + +/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g + ab +Failed: error -75: disallowed use of \K in lookaround + zz +No match + +/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g,allow_lookaround_bsk + ab + 0: a + 0: ab + +/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g + ab + 0: a +Failed: error -75: disallowed use of \K in lookaround + zz +No match + +/a|(?(DEFINE)(?<sneaky>\K\Ga))(?<=(?&sneaky))b/g + ab + 0: a + zz +No match + +/(?=.{10}(?1))x(\K){0}/ + x1234567890 +Failed: error -75: disallowed use of \K in lookaround + +/(?=.{10}(.))(*scs:(1)(?2))x(\K){0}/ + x1234567890 +Failed: error -75: disallowed use of \K in lookaround + +/(?=.{5}(?1))\d*(\K){0}/ +\= Totally fine - pattern does nothing bad even though \K is reachable + 1234567890 + 0: 67890 +\= Not fine - the subject now causes the \K to misbehave + abcdefgh +Failed: error -75: disallowed use of \K in lookaround + # --------- # Tests for zero-length NULL to be treated as an empty string.