From a7657d5a750dab565c61414ed414e2e7060112a7 Mon Sep 17 00:00:00 2001 From: Nicholas Wilson Date: Mon, 7 Oct 2024 11:41:03 +0100 Subject: [PATCH] Fix OP_REFI for caseless_restrict --- HACKING | 9 +++- src/pcre2_compile.c | 6 +++ src/pcre2_internal.h | 8 +++- src/pcre2_jit_compile.c | 41 ++++++++++++---- src/pcre2_match.c | 28 +++++++---- src/pcre2_printint.c | 4 ++ src/pcre2_study.c | 4 +- testdata/testinput5 | 46 ++++++++++++++++++ testdata/testoutput5 | 100 ++++++++++++++++++++++++++++++++++++++++ 9 files changed, 222 insertions(+), 24 deletions(-) diff --git a/HACKING b/HACKING index 561698d15..ebe7dba94 100644 --- a/HACKING +++ b/HACKING @@ -365,8 +365,10 @@ Changeable options The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and some others may be changed in the middle of patterns by items such as (?i). Their processing is handled entirely at compile time by generating different -opcodes for the different settings. The runtime functions do not need to keep -track of an option's state. +opcodes for the different settings. Some options are copied into the opcode's +data, for opcodes such as OP_REFI which depends on the (?r) +(PCRE2_EXTRA_CASELESS_RESTRICT) option. The runtime functions do not need to +keep track of an option's state. PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE are tracked and processed during the parsing pre-pass. The others are handled @@ -639,6 +641,9 @@ generates OP_DNREF or OP_DNREFI. These are followed by two counts: the index required name, followed by the number of groups with the same name. The matching code can then search for the first one that is set. +OP_REFI and OP_DNREFI are further followed by an item containing any +case-insensitivity flags. 
+ Repeating character classes and back references ----------------------------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 0baec90f1..08f27088b 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -7188,6 +7188,9 @@ for (;; pptr++) *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF; PUT2INC(code, 0, index); PUT2INC(code, 0, count); + if ((options & PCRE2_CASELESS) != 0) + *code++ = ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? + REFI_FLAG_CASELESS_RESTRICT : 0; } break; @@ -8142,6 +8145,9 @@ for (;; pptr++) if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE; *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF; PUT2INC(code, 0, meta_arg); + if ((options & PCRE2_CASELESS) != 0) + *code++ = ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? + REFI_FLAG_CASELESS_RESTRICT : 0; /* Update the map of back references, and keep the highest one. We could do this in parse_regex() for numerical back references, but not diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index d6133e6ca..2ce93e036 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1780,9 +1780,9 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 1+(32/sizeof(PCRE2_UCHAR)), /* NCLASS */ \ 0, /* XCLASS - variable length */ \ 1+IMM2_SIZE, /* REF */ \ - 1+IMM2_SIZE, /* REFI */ \ + 1+IMM2_SIZE+1, /* REFI */ \ 1+2*IMM2_SIZE, /* DNREF */ \ - 1+2*IMM2_SIZE, /* DNREFI */ \ + 1+2*IMM2_SIZE+1, /* DNREFI */ \ 1+LINK_SIZE, /* RECURSE */ \ 1+2*LINK_SIZE+1, /* CALLOUT */ \ 0, /* CALLOUT_STR - variable length */ \ @@ -1829,6 +1829,10 @@ in UTF-8 mode. The code that uses this table must know about such things. */ #define RREF_ANY 0xffff +/* Constants used by OP_REFI and OP_DNREFI to control matching behaviour. */ + +#define REFI_FLAG_CASELESS_RESTRICT 0x1 + /* ---------- Private structures that are mode-independent. 
---------- */ diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 9c6f335b9..440507b92 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -1149,7 +1149,7 @@ while (cc < ccend) /* Fall through. */ case OP_REF: common->optimized_cbracket[GET2(cc, 1)] = 0; - cc += 1 + IMM2_SIZE; + cc += PRIV(OP_lengths)[*cc]; break; case OP_ASSERT_NA: @@ -1181,8 +1181,16 @@ while (cc < ccend) cc += 1 + IMM2_SIZE; break; - case OP_DNREF: case OP_DNREFI: +#ifdef SUPPORT_UNICODE + if (common->iref_ptr == 0) + { + common->iref_ptr = common->ovector_start; + common->ovector_start += 3 * sizeof(sljit_sw); + } +#endif /* SUPPORT_UNICODE */ + /* Fall through */ + case OP_DNREF: case OP_DNCREF: count = GET2(cc, 1 + IMM2_SIZE); slot = common->name_table + GET2(cc, 1) * common->name_entry_size; @@ -1191,7 +1199,7 @@ while (cc < ccend) common->optimized_cbracket[GET2(slot, 0)] = 0; slot += common->name_entry_size; } - cc += 1 + 2 * IMM2_SIZE; + cc += PRIV(OP_lengths)[*cc]; break; case OP_RECURSE: @@ -9424,6 +9432,10 @@ jump_list *no_match = NULL; int source_reg = COUNT_MATCH; int source_end_reg = ARGUMENTS; int char1_reg = STACK_LIMIT; +PCRE2_UCHAR refi_flag = 0; + +if (*cc == OP_REFI || *cc == OP_DNREFI) + refi_flag = cc[PRIV(OP_lengths)[*cc] - 1]; #endif /* SUPPORT_UNICODE */ if (ref) @@ -9438,7 +9450,7 @@ else OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), 0); #if defined SUPPORT_UNICODE -if (common->utf && *cc == OP_REFI) +if (common->utf && (*cc == OP_REFI || *cc == OP_DNREFI)) { SLJIT_ASSERT(common->iref_ptr != 0); @@ -9491,6 +9503,8 @@ if (common->utf && *cc == OP_REFI) OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0); CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop); + if (refi_flag & REFI_FLAG_CASELESS_RESTRICT) + add_jump(compiler, &no_match, CMP(SLJIT_LESS, char1_reg, 0, SLJIT_IMM, 128)); add_jump(compiler, &no_match, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, 0)); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2); OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 
(sljit_sw)PRIV(ucd_caseless_sets)); @@ -9594,6 +9608,9 @@ if (ref) offset = GET2(cc, 1) << 1; else cc += IMM2_SIZE; + +if (*ccbegin == OP_REFI || *ccbegin == OP_DNREFI) + cc += 1; type = cc[1 + IMM2_SIZE]; SLJIT_COMPILE_ASSERT((OP_CRSTAR & 0x1) == 0, crstar_opcode_must_be_even); @@ -12687,25 +12704,31 @@ while (cc < ccend) case OP_REF: case OP_REFI: - if (cc[1 + IMM2_SIZE] >= OP_CRSTAR && cc[1 + IMM2_SIZE] <= OP_CRPOSRANGE) + { + int op_len = PRIV(OP_lengths)[*cc]; + if (cc[op_len] >= OP_CRSTAR && cc[op_len] <= OP_CRPOSRANGE) cc = compile_ref_iterator_matchingpath(common, cc, parent); else { compile_ref_matchingpath(common, cc, parent->top != NULL ? &parent->top->simple_backtracks : &parent->own_backtracks, TRUE, FALSE); - cc += 1 + IMM2_SIZE; + cc += op_len; } + } break; case OP_DNREF: case OP_DNREFI: - if (cc[1 + 2 * IMM2_SIZE] >= OP_CRSTAR && cc[1 + 2 * IMM2_SIZE] <= OP_CRPOSRANGE) + { + int op_len = PRIV(OP_lengths)[*cc]; + if (cc[op_len] >= OP_CRSTAR && cc[op_len] <= OP_CRPOSRANGE) cc = compile_ref_iterator_matchingpath(common, cc, parent); else { compile_dnref_search(common, cc, parent->top != NULL ? &parent->top->simple_backtracks : &parent->own_backtracks); compile_ref_matchingpath(common, cc, parent->top != NULL ? &parent->top->simple_backtracks : &parent->own_backtracks, TRUE, FALSE); - cc += 1 + 2 * IMM2_SIZE; + cc += op_len; } + } break; case OP_RECURSE: @@ -12992,7 +13015,7 @@ PCRE2_SPTR cc = current->cc; BOOL ref = (*cc == OP_REF || *cc == OP_REFI); PCRE2_UCHAR type; -type = cc[ref ? 1 + IMM2_SIZE : 1 + 2 * IMM2_SIZE]; +type = cc[PRIV(OP_lengths)[*cc]]; if ((type & 0x1) == 0) { diff --git a/src/pcre2_match.c b/src/pcre2_match.c index bff2dfd8e..bae7157d0 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -348,6 +348,7 @@ seems unlikely.) 
Arguments: offset index into the offset vector caseless TRUE if caseless + caseopts bitmask of REFI_FLAG_XYZ values F the current backtracking frame pointer mb points to match block lengthptr pointer for returning the length matched @@ -358,8 +359,8 @@ Returns: = 0 sucessful match; number of code units matched is set */ static int -match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb, - PCRE2_SIZE *lengthptr) +match_ref(PCRE2_SIZE offset, BOOL caseless, int caseopts, heapframe *F, + match_block *mb, PCRE2_SIZE *lengthptr) { PCRE2_SPTR p; PCRE2_SIZE length; @@ -389,6 +390,7 @@ if (caseless) { #if defined SUPPORT_UNICODE BOOL utf = (mb->poptions & PCRE2_UTF) != 0; + BOOL caseless_restrict = (caseopts & REFI_FLAG_CASELESS_RESTRICT) != 0; if (utf || (mb->poptions & PCRE2_UCP) != 0) { @@ -424,6 +426,11 @@ if (caseless) if (c != d && c != (uint32_t)((int)d + ur->other_case)) { const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; + + /* When PCRE2_EXTRA_CASELESS_RESTRICT is set, ignore any caseless sets + that start with an ASCII character. */ + if (caseless_restrict && *pp < 128) return -1; /* No match */ + for (;;) { if (c < *pp) return -1; /* No match */ @@ -5006,16 +5013,18 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, #define Lmin F->temp_32[0] #define Lmax F->temp_32[1] #define Lcaseless F->temp_32[2] +#define Lcaseopts F->temp_32[3] #define Lstart F->temp_sptr[0] #define Loffset F->temp_size case OP_DNREF: case OP_DNREFI: Lcaseless = (Fop == OP_DNREFI); + Lcaseopts = (Fop == OP_DNREFI)? Fecode[1 + 2*IMM2_SIZE] : 0; { int count = GET2(Fecode, 1+IMM2_SIZE); PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; - Fecode += 1 + 2*IMM2_SIZE; + Fecode += 1 + 2*IMM2_SIZE + (Fop == OP_DNREFI? 
1 : 0); while (count-- > 0) { @@ -5029,8 +5038,9 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, case OP_REF: case OP_REFI: Lcaseless = (Fop == OP_REFI); + Lcaseopts = (Fop == OP_REFI)? Fecode[1 + IMM2_SIZE] : 0; Loffset = (GET2(Fecode, 1) << 1) - 2; - Fecode += 1 + IMM2_SIZE; + Fecode += 1 + IMM2_SIZE + (Fop == OP_REFI? 1 : 0); /* Set up for repetition, or handle the non-repeated case. The maximum and minimum must be in the heap frame, but as they are short-term values, we @@ -5062,7 +5072,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, default: /* No repeat follows */ { - rrc = match_ref(Loffset, Lcaseless, F, mb, &length); + rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &length); if (rrc != 0) { if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ @@ -5096,7 +5106,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, for (i = 1; i <= Lmin; i++) { PCRE2_SIZE slength; - rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); + rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); if (rrc != 0) { if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ @@ -5120,7 +5130,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, RMATCH(Fecode, RM20); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); - rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); + rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); if (rrc != 0) { if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ @@ -5145,7 +5155,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, for (i = Lmin; i < Lmax; i++) { PCRE2_SIZE slength; - rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); + rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); if (rrc != 0) { /* Can't use CHECK_PARTIAL because we don't want to update Feptr in @@ -5196,7 +5206,7 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - 
mb->start_code, *Fecode, for (i = Lmin; i < Lmax; i++) { PCRE2_SIZE slength; - (void)match_ref(Loffset, Lcaseless, F, mb, &slength); + (void)match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength); Feptr += slength; } } diff --git a/src/pcre2_printint.c b/src/pcre2_printint.c index 2c7760bf3..086e25e8b 100644 --- a/src/pcre2_printint.c +++ b/src/pcre2_printint.c @@ -633,14 +633,17 @@ for(;;) case OP_REFI: flag = "/i"; + extra = code[1 + IMM2_SIZE]; /* Fall through */ case OP_REF: fprintf(f, " %s \\%d", flag, GET2(code,1)); + if (extra != 0) fprintf(f, " 0x%02x", extra); ccode = code + OP_lengths[*code]; goto CLASS_REF_REPEAT; case OP_DNREFI: flag = "/i"; + extra = code[1 + 2*IMM2_SIZE]; /* Fall through */ case OP_DNREF: { @@ -648,6 +651,7 @@ for(;;) fprintf(f, " %s \\k<", flag); print_custring(f, entry); fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE)); + if (extra != 0) fprintf(f, " 0x%02x", extra); } ccode = code + OP_lengths[*code]; goto CLASS_REF_REPEAT; diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 352dd6dfb..2456b0d65 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -540,7 +540,7 @@ for (;;) } } else d = 0; - cc += 1 + 2*IMM2_SIZE; + cc += PRIV(OP_lengths)[*cc]; goto REPEAT_BACK_REFERENCE; /* Single back reference by number. References by name are converted to by @@ -593,7 +593,7 @@ for (;;) backref_cache[0] = recno; } - cc += 1 + IMM2_SIZE; + cc += PRIV(OP_lengths)[*cc]; /* Handle repeated back references */ diff --git a/testdata/testinput5 b/testdata/testinput5 index 74f9c2202..25bff66f1 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2289,6 +2289,52 @@ s\x{212a}s K\x{17f}K +/(.) \1/i,utf,caseless_restrict + s S + k K +\= Expect no match + s \x{17f} + k \x{212a} + +/(.) (?r:\1)/i,utf + s S + k K +\= Expect no match + s \x{17f} + k \x{212a} + +/(.) 
\1/i,utf + s S + k K + s \x{17f} + k \x{212a} + +/(?:(?ss)|(?kk)) \k/i,utf,dupnames,caseless_restrict + sS Ss + kK Kk +\= Expect no match + sS \x{17f}s + kK \x{212a}k + +/(?:(?ss)|(?kk)) \k/i,utf,dupnames + sS Ss + kK Kk + sS \x{17f}s + kK \x{212a}k + +/(?:(?s)|(?k)) \k{3,}!/i,utf,dupnames,caseless_restrict + s SsSs! + k KkKk! +\= Expect no match + s \x{17f}sSs\x{17f}! + k \x{212a}kKk\x{212a}! + +/(?:(?s)|(?k)) \k{3,}!/i,utf,dupnames + s SsSs! + k KkKk! + s \x{17f}sSs\x{17f}! + k \x{212a}kKk\x{212a}! + # End caseless restrict tests # TESTS for PCRE2_EXTRA_ASCII_xxx - again, tests with and without. diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 90002f194..bdcb1a619 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -5210,6 +5210,106 @@ No match K\x{17f}K No match +/(.) \1/i,utf,caseless_restrict + s S + 0: s S + 1: s + k K + 0: k K + 1: k +\= Expect no match + s \x{17f} +No match + k \x{212a} +No match + +/(.) (?r:\1)/i,utf + s S + 0: s S + 1: s + k K + 0: k K + 1: k +\= Expect no match + s \x{17f} +No match + k \x{212a} +No match + +/(.) \1/i,utf + s S + 0: s S + 1: s + k K + 0: k K + 1: k + s \x{17f} + 0: s \x{17f} + 1: s + k \x{212a} + 0: k \x{212a} + 1: k + +/(?:(?ss)|(?kk)) \k/i,utf,dupnames,caseless_restrict + sS Ss + 0: sS Ss + 1: sS + kK Kk + 0: kK Kk + 1: <unset> + 2: kK +\= Expect no match + sS \x{17f}s +No match + kK \x{212a}k +No match + +/(?:(?ss)|(?kk)) \k/i,utf,dupnames + sS Ss + 0: sS Ss + 1: sS + kK Kk + 0: kK Kk + 1: <unset> + 2: kK + sS \x{17f}s + 0: sS \x{17f}s + 1: sS + kK \x{212a}k + 0: kK \x{212a}k + 1: <unset> + 2: kK + +/(?:(?s)|(?k)) \k{3,}!/i,utf,dupnames,caseless_restrict + s SsSs! + 0: s SsSs! + 1: s + k KkKk! + 0: k KkKk! + 1: <unset> + 2: k +\= Expect no match + s \x{17f}sSs\x{17f}! +No match + k \x{212a}kKk\x{212a}! +No match + +/(?:(?s)|(?k)) \k{3,}!/i,utf,dupnames + s SsSs! + 0: s SsSs! + 1: s + k KkKk! + 0: k KkKk! + 1: <unset> + 2: k + s \x{17f}sSs\x{17f}! + 0: s \x{17f}sSs\x{17f}! + 1: s + k \x{212a}kKk\x{212a}! + 0: k \x{212a}kKk\x{212a}! 
+ 1: <unset> + 2: k + # End caseless restrict tests # TESTS for PCRE2_EXTRA_ASCII_xxx - again, tests with and without.