From 4a818dbc4bed8c818c20d7f4778584752c531187 Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Mon, 23 Dec 2024 16:44:20 +0000 Subject: [PATCH] Move L& conversion --- src/pcre2_compile.c | 12 ++++++++++++ src/pcre2_compile_class.c | 11 ----------- src/pcre2_jit_char_inc.h | 7 ++++--- testdata/testinput4 | 24 +++++++++++++++++------- testdata/testinput5 | 2 ++ testdata/testoutput4 | 31 ++++++++++++++++++++++++------- testdata/testoutput5 | 3 +++ 7 files changed, 62 insertions(+), 28 deletions(-) diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 432a585be..0ffac8939 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -4338,6 +4338,18 @@ while (ptr < ptrend) uint16_t ptype = 0, pdata = 0; if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) goto FAILED; + + /* In caseless matching, particular characteristics Lu, Ll, and Lt + get converted to the general characteristic L&. That is, upper, + lower, and title case letters are all conflated. */ + + if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC && + (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt)) + { + ptype = PT_LAMP; + pdata = 0; + } + if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; *parsed_pattern++ = META_ESCAPE + escape; *parsed_pattern++ = (ptype << 16) | pdata; diff --git a/src/pcre2_compile_class.c b/src/pcre2_compile_class.c index 855b019fb..ca13306fb 100644 --- a/src/pcre2_compile_class.c +++ b/src/pcre2_compile_class.c @@ -1378,17 +1378,6 @@ while (TRUE) continue; } - /* In caseless matching, particular characteristics Lu, Ll, and Lt - get converted to the general characteristic L&. That is, upper, - lower, and title case letters are all conflated. */ - - if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC && - (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt)) - { - ptype = PT_LAMP; - pdata = 0; - } - PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits); if ((xclass_props & XCLASS_HIGH_ANY) == 0) diff --git a/src/pcre2_jit_char_inc.h b/src/pcre2_jit_char_inc.h index c770b5c2e..c9d478ba7 100644 --- a/src/pcre2_jit_char_inc.h +++ b/src/pcre2_jit_char_inc.h @@ -626,6 +626,9 @@ if (category_list == UCPCAT_ALL) add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); return; } + +if (category_list != 0) + compares++; #endif if (*cc != XCL_END) @@ -633,6 +636,7 @@ if (*cc != XCL_END) #if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16) if (common->utf && compares == 0 && !(status & XCLASS_IS_ECLASS)) { + SLJIT_ASSERT(category_list == 0); max = 0; min = (ccbegin[-1] & XCL_MAP) != 0 ? 0 : READ_CHAR_MAX; xclass_update_min_max(common, cc, &min, &max); @@ -701,9 +705,6 @@ if (status & XCLASS_NEEDS_UCD) ccbegin = cc; - if (category_list != 0) - compares++; - if (status & XCLASS_HAS_BIDICL) { OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass)); diff --git a/testdata/testinput4 b/testdata/testinput4 index 08c250cff..2eb5f54a4 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -2918,25 +2918,35 @@ /\p{Lu}\p{Ll}\P{Lu}\P{Ll}/utf >AbbD< >Abb\x{01c5}< -\= Expect no match +\= Expect no match >aBBd< - >aB!!< + >aB!!< /\p{Lu}\p{Ll}\P{Lu}\P{Ll}/i,utf - >aB!!< - >\x{01c5}B!!< -\= Expect no match + >aB!!< + >\x{01c5}B!!< +\= Expect no match >AbbD< >aBBd< >Abb\x{01c5}< /[.\p{Lu}][.\p{Ll}][.\P{Lu}][.\P{Ll}]/i,utf - >aB!!< -\= Expect no match + >aB!!< +\= Expect no match >AbbD< >aBBd< >Abb\x{01c5}< +/[\p{Lt}\x{36b}][\P{Lt}\x{10a0}]/i,utf + >A!< + >\x{3c9}\x{58d}< + >\x{413}\x{940}< +\= Expect no match + \x{3c9}\x{3c9} + \x{58d}\x{58d} + \x{413}\x{413} + \x{940}\x{940} + /^\p{Lt}+/i,utf \x{1c5}AB diff --git a/testdata/testinput5 b/testdata/testinput5 index c1e2847b9..168e70a08 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -339,6 +339,8 @@ /[[:a\x{100}b:]]/utf +/[\p{InvalidOrBadProperty}]/ + /a[^]b/utf,allow_empty_class,match_unset_backref a\x{1234}b a\nb diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 4701d9bd0..b68ad2977 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -4681,18 +4681,18 @@ No match 0: AbbD >Abb\x{01c5}< 0: Abb\x{1c5} -\= Expect no match +\= Expect no match >aBBd< No match - >aB!!< + >aB!!< No match /\p{Lu}\p{Ll}\P{Lu}\P{Ll}/i,utf - >aB!!< + >aB!!< 0: aB!! - >\x{01c5}B!!< + >\x{01c5}B!!< 0: \x{1c5}B!! -\= Expect no match +\= Expect no match >AbbD< No match >aBBd< @@ -4701,9 +4701,9 @@ No match No match /[.\p{Lu}][.\p{Ll}][.\P{Lu}][.\P{Ll}]/i,utf - >aB!!< + >aB!!< 0: aB!! -\= Expect no match +\= Expect no match >AbbD< No match >aBBd< @@ -4711,6 +4711,23 @@ No match >Abb\x{01c5}< No match +/[\p{Lt}\x{36b}][\P{Lt}\x{10a0}]/i,utf + >A!< + 0: A! + >\x{3c9}\x{58d}< + 0: \x{3c9}\x{58d} + >\x{413}\x{940}< + 0: \x{413}\x{940} +\= Expect no match + \x{3c9}\x{3c9} +No match + \x{58d}\x{58d} +No match + \x{413}\x{413} +No match + \x{940}\x{940} +No match + /^\p{Lt}+/i,utf \x{1c5}AB 0: \x{1c5}AB diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 6efa6be5d..4893c5a3b 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -822,6 +822,9 @@ No match /[[:a\x{100}b:]]/utf Failed: error 130 at offset 14: unknown POSIX class name +/[\p{InvalidOrBadProperty}]/ +Failed: error 147 at offset 25: unknown property after \P or \p + /a[^]b/utf,allow_empty_class,match_unset_backref a\x{1234}b 0: a\x{1234}b