Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/pcre2.h.generic
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ released, the numbers must not be changed. */
#define PCRE2_ERROR_DIFFSUBSSUBJECT (-72)
#define PCRE2_ERROR_DIFFSUBSOFFSET (-73)
#define PCRE2_ERROR_DIFFSUBSOPTIONS (-74)
#define PCRE2_ERROR_BAD_BACKSLASH_K (-75)


/* Request types for pcre2_pattern_info() */
Expand Down
1 change: 1 addition & 0 deletions src/pcre2.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ released, the numbers must not be changed. */
#define PCRE2_ERROR_DIFFSUBSSUBJECT (-72)
#define PCRE2_ERROR_DIFFSUBSOFFSET (-73)
#define PCRE2_ERROR_DIFFSUBSOPTIONS (-74)
#define PCRE2_ERROR_BAD_BACKSLASH_K (-75)


/* Request types for pcre2_pattern_info() */
Expand Down
4 changes: 4 additions & 0 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -8370,6 +8370,10 @@ for (;; pptr++)
case ESC_A:
if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
break;

case ESC_K:
cb->external_flags |= PCRE2_HASBSK; /* Record */
break;
}

*code++ = meta_arg;
Expand Down
1 change: 1 addition & 0 deletions src/pcre2_error.c
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ static const unsigned char match_error_texts[] =
"substitute subject differs from prior match call\0"
"substitute start offset differs from prior match call\0"
"substitute options differ from prior match call\0"
"disallowed use of \\K in lookaround\0"
;


Expand Down
1 change: 1 addition & 0 deletions src/pcre2_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,7 @@ bytes in a code unit in that mode. */
#define PCRE2_DUPCAPUSED 0x00200000u /* contains (?| */
#define PCRE2_HASBKC 0x00400000u /* contains \C */
#define PCRE2_HASACCEPT 0x00800000u /* contains (*ACCEPT) */
#define PCRE2_HASBSK 0x01000000u /* contains \K */

#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32)

Expand Down
2 changes: 2 additions & 0 deletions src/pcre2_intmodedep.h
Original file line number Diff line number Diff line change
Expand Up @@ -969,7 +969,9 @@ typedef struct match_block {
uint32_t match_call_count; /* Number of times a new frame is created */
BOOL hitend; /* Hit the end of the subject at some point */
BOOL hasthen; /* Pattern contains (*THEN) */
BOOL hasbsk; /* Pattern contains \K */
BOOL allowemptypartial; /* Allow empty hard partial */
BOOL allowlookaroundbsk; /* Allow \K within lookarounds */
const uint8_t *lcc; /* Points to lower casing table */
const uint8_t *fcc; /* Points to case-flipping table */
const uint8_t *ctypes; /* Points to table of type maps */
Expand Down
25 changes: 25 additions & 0 deletions src/pcre2_jit_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -13772,6 +13772,31 @@ common->accept_label = LABEL();
if (common->accept != NULL)
set_jumps(common->accept, common->accept_label);

/* Fail if we detect that the start position was moved to be either after
the end position (\K in lookahead) or before the start offset (\K in
lookbehind). */

if (common->has_set_som &&
(common->re->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
{
if (HAS_VIRTUAL_REGISTERS)
{
OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0);
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, str));
}
else
{
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, str));
}
OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0));

OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_BAD_BACKSLASH_K);
/* (ovector[0] < jit_arguments->str)? */
add_jump(compiler, &common->abort, CMP(SLJIT_LESS, TMP3, 0, TMP2, 0));
/* (ovector[0] > STR_PTR)? NB. ovector[1] hasn't yet been set to STR_PTR. */
add_jump(compiler, &common->abort, CMP(SLJIT_GREATER, TMP3, 0, STR_PTR, 0));
Copy link
Collaborator

@zherczeg zherczeg Oct 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is better. However, TMP3 is emulated on x86-32, so it is better to avoid using it whenever possible:

  OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0));
  if (HAS_VIRTUAL_REGISTERS)
    {
    OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str));
    }
  else
    {
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(ARGUMENTS), SLJIT_OFFSETOF(jit_arguments, str));
    }
  OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP2, 0, TMP1, 0);
  OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_BAD_BACKSLASH_K);
  add_jump(compiler, &common->abort, JUMP(SLJIT_LESS));
  add_jump(compiler, &common->abort, CMP(SLJIT_GREATER, TMP2, 0, STR_PTR, 0));

(Or something similar)
The trick is that the flag register is set, then TMP1 is overwritten (but the flags are not), and then the jump is taken based on the flag.

}

/* This means we have a match. Update the ovector. */
copy_ovector(common, re->top_bracket + 1);
common->quit_label = common->abort_label = LABEL();
Expand Down
22 changes: 21 additions & 1 deletion src/pcre2_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -1010,11 +1010,28 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
}

#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "++ Failed ACCEPT not at end (endanchnored set)\n");
fprintf(stderr, "++ Failed ACCEPT not at end (endanchored set)\n");
#endif
return MATCH_NOMATCH; /* (*ACCEPT) */
}

/* Fail if we detect that the start position was moved to be either after
the end position (\K in lookahead) or before the start offset (\K in
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure I understand the operation.

Why is the start offset a problem? I think the only issue is when ovector(0) > ovector(1), which confuses some simple implementations. Nobody complained about the start offset before.

Does this happen when \K is executed, or as a post check?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • Start offset is a problem, because when you are doing "search all matches" you don't want matches to overlap or go "backwards". The canonical list of matches should be ordered, non-overlapping, and without duplicates. I believe that many clients will expect this. For example, pcre2_substitute itself fails if the list of matches is overlapping.
  • The checks are done as a post check, after a match is accepted (much, much later than when \K is encountered). These checks do not cause backtracking: it simply turns an accepted match into an error.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This rule is strange to me. As long as the ends of the matches are ordered (<=), the beginning should not matter when you do a global match. This is true now regardless of \K. Substitute (string insert) should not care where the match starts as long as it is <= the end.

Copy link
Member Author

@NWilson NWilson Oct 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does matter for substitute.

Here is the check that Philip added, long before my involvement:

if (ovector[1] < ovector[0] || ovector[0] < start_offset)

Here's one way to think about it: a match that ends before it starts is weird. But unmatched text (the text in between matches) that runs backwards is equally weird.

Any application that does something with the text in between matches, rather than just throwing it away, wants the matches to be non overlapping.

Example:
Subject "abcde", and matches "a", "cd", "de".
If you wanted to split the string on matches (very common), you'd get:

Empty unmatched text at start
a, match
b, unmatched
cd, match
!!! Unmatched d backwards
de, match
Empty unmatched text at end

Find and replace is based around the concept of string split. If you wanted to replace all those three matches with "zzz", what would the result even be? After replacing "cd" with "zzz", how do you even begin to then replace "de"?

The invariant must be that for both matches and the unmatched text in between, the substring start is before the end (or "not after").

Finally, even if applications only want a single match, it is surprising (and possibly broken) to get a match starting at offset 3, if you ask for searching to start from offset 4.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using max(start_offset, ovector[0]) is ok for replacing. Throwing errors is not necessary, especially since the pattern cannot detect this case.

lookbehind). If this occurs, the pattern must have used \K in a somewhat
sneaky way (e.g. by pattern recursion), because if the \K is actually
syntactically inside the lookaround, it's blocked at compile-time. */

if (Fstart_match < mb->start_subject + mb->start_offset ||
Fstart_match > Feptr)
{
/* The \K expression is fairly rare. We assert it was used so that we
catch any unexpected invalid data in start_match. */
PCRE2_ASSERT(mb->hasbsk);

if (!mb->allowlookaroundbsk)
return PCRE2_ERROR_BAD_BACKSLASH_K;
}

/* We have a successful match of the whole pattern. Record the result and
then do a direct return from the function. If there is space in the offset
vector, set any pairs that follow the highest-numbered captured string but
Expand Down Expand Up @@ -7393,8 +7410,11 @@ mb->start_offset = start_offset;
mb->end_subject = end_subject;
mb->true_end_subject = true_end_subject;
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
mb->hasbsk = (re->flags & PCRE2_HASBSK) != 0;
mb->allowemptypartial = (re->max_lookbehind > 0) ||
(re->flags & PCRE2_MATCH_EMPTY) != 0;
mb->allowlookaroundbsk =
(re->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) != 0;
mb->poptions = re->overall_options; /* Pattern options */
mb->ignore_skip_arg = 0;
mb->mark = mb->nomatch_mark = NULL; /* In case never set */
Expand Down
37 changes: 36 additions & 1 deletion testdata/testinput2
Original file line number Diff line number Diff line change
Expand Up @@ -6378,7 +6378,42 @@ a)"xI

/^abc(?<!b\Kq)d/,allow_lookaround_bsk
abcd


# PCRE2 now also rejects sneaky cases where the \K is inside a lookaround... but
# it's not always easy to detect this syntactically at compile-time (indeed,
# a conditional expression could dynamically invoke \K via a subroutine, based
# on the subject contents).

/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g,allow_lookaround_bsk
ab

/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g
ab
zz

/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g,allow_lookaround_bsk
ab

/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g
ab
zz

/a|(?(DEFINE)(?<sneaky>\K\Ga))(?<=(?&sneaky))b/g
ab
zz

/(?=.{10}(?1))x(\K){0}/
x1234567890

/(?=.{10}(.))(*scs:(1)(?2))x(\K){0}/
x1234567890

/(?=.{5}(?1))\d*(\K){0}/
\= Totally fine - pattern does nothing bad even though \K is reachable
1234567890
\= Not fine - the subject now causes the \K to misbehave
abcdefgh

# ---------

# Tests for zero-length NULL to be treated as an empty string.
Expand Down
52 changes: 51 additions & 1 deletion testdata/testoutput2
Original file line number Diff line number Diff line change
Expand Up @@ -19062,7 +19062,57 @@ Failed: error 199 at offset 14: \K is not allowed in lookarounds (but see PCRE2_
/^abc(?<!b\Kq)d/,allow_lookaround_bsk
abcd
0: abcd


# PCRE2 now also rejects sneaky cases where the \K is inside a lookaround... but
# it's not always easy to detect this syntactically at compile-time (indeed,
# a conditional expression could dynamically invoke \K via a subroutine, based
# on the subject contents).

/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g,allow_lookaround_bsk
ab
Start of matched string is beyond its end - displaying from end to start.
0: b

/(?(DEFINE)(?<sneaky>b\K))a(?=(?&sneaky))/g
ab
Failed: error -75: disallowed use of \K in lookaround
zz
No match

/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g,allow_lookaround_bsk
ab
0: a
0: ab

/a|(?(DEFINE)(?<sneaky>\Ka))(?<=(?&sneaky))b/g
ab
0: a
Failed: error -75: disallowed use of \K in lookaround
zz
No match

/a|(?(DEFINE)(?<sneaky>\K\Ga))(?<=(?&sneaky))b/g
ab
0: a
zz
No match

/(?=.{10}(?1))x(\K){0}/
x1234567890
Failed: error -75: disallowed use of \K in lookaround

/(?=.{10}(.))(*scs:(1)(?2))x(\K){0}/
x1234567890
Failed: error -75: disallowed use of \K in lookaround

/(?=.{5}(?1))\d*(\K){0}/
\= Totally fine - pattern does nothing bad even though \K is reachable
1234567890
0: 67890
\= Not fine - the subject now causes the \K to misbehave
abcdefgh
Failed: error -75: disallowed use of \K in lookaround

# ---------

# Tests for zero-length NULL to be treated as an empty string.
Expand Down