PCRE2Project · PhilipHazel · Oct 14, 2024 · Oct 14, 2024 · carenas · Oct 8, 2024
diff --git a/doc/html/pcre2_set_compile_extra_options.html b/doc/html/pcre2_set_compile_extra_options.html
@@ -43,8 +43,10 @@ <h1>pcre2_set_compile_extra_options man page</h1>
   PCRE2_EXTRA_ESCAPED_CR_IS_LF         Interpret \r as \n
   PCRE2_EXTRA_MATCH_LINE               Pattern matches whole lines
   PCRE2_EXTRA_MATCH_WORD               Pattern matches "words"
+  PCRE2_EXTRA_NEVER_CALLOUT            Disallow callouts in pattern
   PCRE2_EXTRA_NO_BS0                   Disallow \0 (but not \00 or \000)
   PCRE2_EXTRA_PYTHON_OCTAL             Use Python rules for octal
+  PCRE2_EXTRA_TURKISH_CASING           Use Turkish I case folding
 </pre>
 There is a complete description of the PCRE2 native API in the
 <a href="pcre2api.html"><b>pcre2api</b></a>

diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
@@ -1697,12 +1697,21 @@ <h1>pcre2api man page</h1>
 changed within a pattern by a (?i) option setting. If either PCRE2_UTF or
 PCRE2_UCP is set, Unicode properties are used for all characters with more than
 one other case, and for all characters whose code points are greater than
-U+007F. Note that there are two ASCII characters, K and S, that, in addition to
+U+007F.
+</P>
+<P>
+Note that there are two ASCII characters, K and S, that, in addition to
 their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin
 sign) and U+017F (long S) respectively. If you do not want this case
 equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT.
 </P>
 <P>
+One language family, Turkish and Azeri, has its own case-insensitivity rules,
+which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the
+behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131
+(small dotless i) characters.
+</P>
+<P>
 For lower valued characters with only one other case, a lookup table is used
 for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used
 for all code points less than 256, and higher code points (available only in
@@ -2037,9 +2046,16 @@ <h1>pcre2api man page</h1>
 upper/lower casing operations, even when PCRE2_UTF is not set. This makes it
 possible to process strings in the 16-bit UCS-2 code. This option is available
 only if PCRE2 has been compiled with Unicode support (which is the default).
-The PCRE2_EXTRA_CASELESS_RESTRICT option (see below) restricts caseless
+</P>
+<P>
+The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless
 matching such that ASCII characters match only ASCII characters and non-ASCII
-characters match only non-ASCII characters.
+characters match only non-ASCII characters. The PCRE2_EXTRA_TURKISH_CASING
+option (see above) alters the matching of the 'i' characters to follow their
+behaviour in the Turkish and Azeri languages. For further details on
+PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING, see the
+<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
+page.
 <pre>
   PCRE2_UNGREEDY
 </pre>
@@ -2176,7 +2192,8 @@ <h1>pcre2api man page</h1>
 ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables
 recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a
 caseless match, both characters must either be ASCII or non-ASCII. The option
-can be changed with a pattern by the (?r) option setting.
+can be changed within a pattern by the (*CASELESS_RESTRICT) or (?r) option
+settings.
 <pre>
   PCRE2_EXTRA_ESCAPED_CR_IS_LF
 </pre>
@@ -2223,6 +2240,14 @@ <h1>pcre2api man page</h1>
 returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the application
 knows that a callout will not be provided to <b>pcre2_match()</b>, so that
 callouts in the pattern are not silently ignored.
+<pre>
+  PCRE2_EXTRA_TURKISH_CASING
+</pre>
+This option alters case-equivalence of the 'i' letters to follow the
+alphabet used by Turkish and Azeri languages. The option can be changed within
+a pattern by the (*TURKISH_CASING) start-of-pattern setting. Either the UTF or
+UCP options must be set. In the 8-bit library, UTF must be set. This option
+cannot be combined with PCRE2_EXTRA_CASELESS_RESTRICT.
 <a name="jitcompiling"></a></P>
 <br><a name="SEC21" href="#TOC1">JUST-IN-TIME (JIT) COMPILATION</a><br>
 <P>

diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html
@@ -302,7 +302,10 @@ <h1>pcre2pattern man page</h1>
 equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F
 (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the
 PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to
-<b>pcre2_compile()</b> or set by (?r) within the pattern).
+<b>pcre2_compile()</b> or set by (*CASELESS_RESTRICT) or (?r) within the
+pattern). If the PCRE2_EXTRA_TURKISH_CASING option is in force (either passed
+to <b>pcre2_compile()</b> or set by (*TURKISH_CASING) within the pattern), then
+the 'i' letters are matched according to Turkish and Azeri casing rules.
 </P>
 <P>
 The power of regular expressions comes from the ability to include wild cards,

diff --git a/doc/html/pcre2syntax.html b/doc/html/pcre2syntax.html
@@ -436,17 +436,19 @@ <h1>pcre2syntax man page</h1>
 of the newline or \R sequences or options with similar syntax. More than one
-of them may appear. For the first three, d is a decimal number.
+of them may appear. For the three LIMIT settings, d is a decimal number.
 <pre>
-  (*LIMIT_DEPTH=d) set the backtracking limit to d
-  (*LIMIT_HEAP=d)  set the heap size limit to d * 1024 bytes
-  (*LIMIT_MATCH=d) set the match limit to d
-  (*NOTEMPTY)      set PCRE2_NOTEMPTY when matching
-  (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching
-  (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS)
+  (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching
+  (*LIMIT_DEPTH=d)     set the backtracking limit to d
+  (*LIMIT_HEAP=d)      set the heap size limit to d * 1024 bytes
+  (*LIMIT_MATCH=d)     set the match limit to d
+  (*NOTEMPTY)          set PCRE2_NOTEMPTY when matching
+  (*NOTEMPTY_ATSTART)  set PCRE2_NOTEMPTY_ATSTART when matching
+  (*NO_AUTO_POSSESS)   no auto-possessification (PCRE2_NO_AUTO_POSSESS)
   (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR)
-  (*NO_JIT)       disable JIT optimization
-  (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE)
-  (*UTF)          set appropriate UTF mode for the library in use
-  (*UCP)          set PCRE2_UCP (use Unicode properties for \d etc)
+  (*NO_JIT)            disable JIT optimization
+  (*NO_START_OPT)      no start-match optimization (PCRE2_NO_START_OPTIMIZE)
+  (*TURKISH_CASING)    set PCRE2_EXTRA_TURKISH_CASING when matching
+  (*UTF)               set appropriate UTF mode for the library in use
+  (*UCP)               set PCRE2_UCP (use Unicode properties for \d etc)
 </pre>
 Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of
 the limits set by the caller of <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>,

diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html
@@ -673,6 +673,7 @@ <h1>pcre2test man page</h1>
       no_start_optimize         set PCRE2_NO_START_OPTIMIZE
       no_utf_check              set PCRE2_NO_UTF_CHECK
       python_octal              set PCRE2_EXTRA_PYTHON_OCTAL
+      turkish_casing            set PCRE2_EXTRA_TURKISH_CASING
       ucp                       set PCRE2_UCP
       ungreedy                  set PCRE2_UNGREEDY
       use_offset_limit          set PCRE2_USE_OFFSET_LIMIT

diff --git a/doc/html/pcre2unicode.html b/doc/html/pcre2unicode.html
@@ -157,6 +157,35 @@ <h1>pcre2unicode man page</h1>
 counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT
 option. When this is set, all characters in a case equivalence must either be
 ASCII or non-ASCII; there can be no mixing.
+<pre>
+    Without PCRE2_EXTRA_CASELESS_RESTRICT:
+      'k' = 'K' = U+212A (Kelvin sign)
+      's' = 'S' = U+017F (long S)
+    With PCRE2_EXTRA_CASELESS_RESTRICT:
+      'k' = 'K'
+      U+212A (Kelvin sign)  only case-equivalent to itself
+      's' = 'S'
+      U+017F (long S)       only case-equivalent to itself
+</PRE>
+</P>
+<P>
+One language family, Turkish and Azeri, has its own case-insensitivity rules,
+which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the
+behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131
+(small dotless i) characters.
+<pre>
+    Without PCRE2_EXTRA_TURKISH_CASING:
+      'i' = 'I'
+      U+0130 (capital I with dot above)  only case-equivalent to itself
+      U+0131 (small dotless i)           only case-equivalent to itself
+    With PCRE2_EXTRA_TURKISH_CASING:
+      'i' = U+0130 (capital I with dot above)
+      U+0131 (small dotless i) = 'I'
+</PRE>
+</P>
+<P>
+It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and
+PCRE2_EXTRA_TURKISH_CASING together.
 </P>
 <P>
 From release 10.45 the Unicode letter properties Lu (upper case), Ll (lower

diff --git a/doc/pcre2_set_compile_extra_options.3 b/doc/pcre2_set_compile_extra_options.3
@@ -43,8 +43,10 @@ options are:
   PCRE2_EXTRA_ESCAPED_CR_IS_LF         Interpret \er as \en
   PCRE2_EXTRA_MATCH_LINE               Pattern matches whole lines
   PCRE2_EXTRA_MATCH_WORD               Pattern matches "words"
+  PCRE2_EXTRA_NEVER_CALLOUT            Disallow callouts in pattern
   PCRE2_EXTRA_NO_BS0                   Disallow \e0 (but not \e00 or \e000)
   PCRE2_EXTRA_PYTHON_OCTAL             Use Python rules for octal
+  PCRE2_EXTRA_TURKISH_CASING           Use Turkish I case folding
 .sp
 There is a complete description of the PCRE2 native API in the
 .\" HREF

diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
@@ -1633,11 +1633,18 @@ letters in the subject. It is equivalent to Perl's /i option, and it can be
 changed within a pattern by a (?i) option setting. If either PCRE2_UTF or
 PCRE2_UCP is set, Unicode properties are used for all characters with more than
 one other case, and for all characters whose code points are greater than
-U+007F. Note that there are two ASCII characters, K and S, that, in addition to
+U+007F.
+.P
+Note that there are two ASCII characters, K and S, that, in addition to
 their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin
 sign) and U+017F (long S) respectively. If you do not want this case
 equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT.
 .P
+One language family, Turkish and Azeri, has its own case-insensitivity rules,
+which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the
+behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131
+(small dotless i) characters.
+.P
 For lower valued characters with only one other case, a lookup table is used
 for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used
 for all code points less than 256, and higher code points (available only in
@@ -1986,9 +1993,17 @@ The second effect of PCRE2_UCP is to force the use of Unicode properties for
 upper/lower casing operations, even when PCRE2_UTF is not set. This makes it
 possible to process strings in the 16-bit UCS-2 code. This option is available
 only if PCRE2 has been compiled with Unicode support (which is the default).
-The PCRE2_EXTRA_CASELESS_RESTRICT option (see below) restricts caseless
+.P
+The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless
 matching such that ASCII characters match only ASCII characters and non-ASCII
-characters match only non-ASCII characters.
+characters match only non-ASCII characters. The PCRE2_EXTRA_TURKISH_CASING
+option (see above) alters the matching of the 'i' characters to follow their
+behaviour in the Turkish and Azeri languages. For further details on
+PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING, see the
+.\" HREF
+\fBpcre2unicode\fP
+.\"
+page.
 .sp
   PCRE2_UNGREEDY
 .sp
@@ -2128,7 +2143,8 @@ characters. The ASCII letter S is case-equivalent to U+017f (long S) and the
 ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables
 recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a
 caseless match, both characters must either be ASCII or non-ASCII. The option
-can be changed with a pattern by the (?r) option setting.
+can be changed within a pattern by the (*CASELESS_RESTRICT) or (?r) option
+settings.
 .sp
   PCRE2_EXTRA_ESCAPED_CR_IS_LF
 .sp
@@ -2177,6 +2193,14 @@ If this option is set, PCRE2 treats callouts in the pattern as a syntax error,
 returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the application
 knows that a callout will not be provided to \fBpcre2_match()\fP, so that
 callouts in the pattern are not silently ignored.
+.sp
+  PCRE2_EXTRA_TURKISH_CASING
+.sp
+This option alters case-equivalence of the 'i' letters to follow the
+alphabet used by Turkish and Azeri languages. The option can be changed within
+a pattern by the (*TURKISH_CASING) start-of-pattern setting. Either the UTF or
+UCP options must be set. In the 8-bit library, UTF must be set. This option
+cannot be combined with PCRE2_EXTRA_CASELESS_RESTRICT.
 .
 .
 .\" HTML <a name="jitcompiling"></a>

diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
@@ -278,7 +278,10 @@ ASCII characters, K and S, that, in addition to their lower case ASCII
 equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F
 (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the
 PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to
-\fBpcre2_compile()\fP or set by (?r) within the pattern).
+\fBpcre2_compile()\fP or set by (*CASELESS_RESTRICT) or (?r) within the
+pattern). If the PCRE2_EXTRA_TURKISH_CASING option is in force (either passed
+to \fBpcre2_compile()\fP or set by (*TURKISH_CASING) within the pattern), then
+the 'i' letters are matched according to Turkish and Azeri casing rules.
 .P
 The power of regular expressions comes from the ability to include wild cards,
 character classes, alternatives, and repetitions in the pattern. These are

diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3
@@ -411,17 +411,19 @@ The following are recognized only at the very start of a pattern or after one
 of the newline or \eR sequences or options with similar syntax. More than one
-of them may appear. For the first three, d is a decimal number.
+of them may appear. For the three LIMIT settings, d is a decimal number.
 .sp
-  (*LIMIT_DEPTH=d) set the backtracking limit to d
-  (*LIMIT_HEAP=d)  set the heap size limit to d * 1024 bytes
-  (*LIMIT_MATCH=d) set the match limit to d
-  (*NOTEMPTY)      set PCRE2_NOTEMPTY when matching
-  (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching
-  (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS)
+  (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching
+  (*LIMIT_DEPTH=d)     set the backtracking limit to d
+  (*LIMIT_HEAP=d)      set the heap size limit to d * 1024 bytes
+  (*LIMIT_MATCH=d)     set the match limit to d
+  (*NOTEMPTY)          set PCRE2_NOTEMPTY when matching
+  (*NOTEMPTY_ATSTART)  set PCRE2_NOTEMPTY_ATSTART when matching
+  (*NO_AUTO_POSSESS)   no auto-possessification (PCRE2_NO_AUTO_POSSESS)
   (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR)
-  (*NO_JIT)       disable JIT optimization
-  (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE)
-  (*UTF)          set appropriate UTF mode for the library in use
-  (*UCP)          set PCRE2_UCP (use Unicode properties for \ed etc)
+  (*NO_JIT)            disable JIT optimization
+  (*NO_START_OPT)      no start-match optimization (PCRE2_NO_START_OPTIMIZE)
+  (*TURKISH_CASING)    set PCRE2_EXTRA_TURKISH_CASING when matching
+  (*UTF)               set appropriate UTF mode for the library in use
+  (*UCP)               set PCRE2_UCP (use Unicode properties for \ed etc)
 .sp
 Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of
 the limits set by the caller of \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP,

diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
@@ -628,6 +628,7 @@ for a description of the effects of these options.
       no_start_optimize         set PCRE2_NO_START_OPTIMIZE
       no_utf_check              set PCRE2_NO_UTF_CHECK
       python_octal              set PCRE2_EXTRA_PYTHON_OCTAL
+      turkish_casing            set PCRE2_EXTRA_TURKISH_CASING
       ucp                       set PCRE2_UCP
       ungreedy                  set PCRE2_UNGREEDY
       use_offset_limit          set PCRE2_USE_OFFSET_LIMIT

diff --git a/doc/pcre2unicode.3 b/doc/pcre2unicode.3
@@ -147,6 +147,31 @@ Recognition of these non-ASCII characters as case-equivalent to their ASCII
 counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT
 option. When this is set, all characters in a case equivalence must either be
 ASCII or non-ASCII; there can be no mixing.
+.sp
+    Without PCRE2_EXTRA_CASELESS_RESTRICT:
+      'k' = 'K' = U+212A (Kelvin sign)
+      's' = 'S' = U+017F (long S)
+    With PCRE2_EXTRA_CASELESS_RESTRICT:
+      'k' = 'K'
+      U+212A (Kelvin sign)  only case-equivalent to itself
+      's' = 'S'
+      U+017F (long S)       only case-equivalent to itself
+.P
+One language family, Turkish and Azeri, has its own case-insensitivity rules,
+which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the
+behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131
+(small dotless i) characters.
+.sp
+    Without PCRE2_EXTRA_TURKISH_CASING:
+      'i' = 'I'
+      U+0130 (capital I with dot above)  only case-equivalent to itself
+      U+0131 (small dotless i)           only case-equivalent to itself
+    With PCRE2_EXTRA_TURKISH_CASING:
+      'i' = U+0130 (capital I with dot above)
+      U+0131 (small dotless i) = 'I'
+.P
+It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and
+PCRE2_EXTRA_TURKISH_CASING together.
 .P
 From release 10.45 the Unicode letter properties Lu (upper case), Ll (lower
 case), and Lt (title case) are all treated as Lc (cased letter) when caseless

diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py
@@ -737,6 +737,12 @@ def write_bitsets(list, item_size):
     if x > 127 and x + other_case[x] < 128:
       other_case[x] = 0  
 
+# Append a couple of extra caseless sets (unreferenced by the record objects)
+# to hold the optional Turkish case equivalences.
+turkish_dotted_i_index = offset
+caseless_sets.append([0x69, 0x0130])
+caseless_sets.append([0x49, 0x0131])
+
 # Combine all the tables
 
 table, records = combine_tables(script, category, break_props,
@@ -855,6 +861,17 @@ def write_bitsets(list, item_size):
   f.write('  NOTACHAR,\n')
 f.write('};\n\n')
 
+# --- Output the indices of the Turkish caseless character sets ---
+
+f.write("""\
+/* This is the index, within ucd_caseless_sets, of the additional
+Turkish case-equivalences. The dotted I ones are this offset; the
+dotless I are +3 from here. */
+
+const uint32_t PRIV(ucd_turkish_dotted_i_caseset) = %d;
+
+""" % (turkish_dotted_i_index))
+
 # --- Other tables are not needed by pcre2test ---
 
 f.write("""\
@@ -867,7 +884,7 @@ def write_bitsets(list, item_size):
 # --- Output the nocase sets ---
 
 f.write("""\
-/* This table contains character ranges, where the characters in the range has
+/* This table contains character ranges, where the characters in the range have
 no other case. Both start and end values are excluded from the range. */
 
 const uint32_t PRIV(ucd_nocase_ranges)[] = {
@@ -880,7 +897,7 @@ def write_bitsets(list, item_size):
 total = 0
 
 for c in range(1, MAX_UNICODE):
-  if other_case[c] != 0:
+  if other_case[c] != 0 or c in [0x0130, 0x0131]: # add the two chars that gain casing in Turkish
     if c - range_start > expected_size:
       range_size = c - range_start - 1
       f.write('  0x%04x, 0x%04x, /* %d */\n' % (range_start, c, range_size))
@@ -980,6 +997,6 @@ def write_bitsets(list, item_size):
 /* End of pcre2_ucd.c */
 """)
 
-f.close
+f.close()
 
 # End
diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic
@@ -162,6 +162,7 @@ D   is inspected during pcre2_dfa_match() execution
 #define PCRE2_EXTRA_PYTHON_OCTAL             0x00002000u  /* C */
 #define PCRE2_EXTRA_NO_BS0                   0x00004000u  /* C */
 #define PCRE2_EXTRA_NEVER_CALLOUT            0x00008000u  /* C */
+#define PCRE2_EXTRA_TURKISH_CASING           0x00010000u  /* C */
 
 /* These are for pcre2_jit_compile(). */
 
@@ -328,6 +329,9 @@ pcre2_pattern_convert(). */
 #define PCRE2_ERROR_PATTERN_COMPILED_SIZE_TOO_BIG  201
 #define PCRE2_ERROR_OVERSIZE_PYTHON_OCTAL          202
 #define PCRE2_ERROR_CALLOUT_CALLER_DISABLED        203
+#define PCRE2_ERROR_EXTRA_CASING_REQUIRES_UNICODE  204
+#define PCRE2_ERROR_TURKISH_CASING_REQUIRES_UTF    205
+#define PCRE2_ERROR_EXTRA_CASING_INCOMPATIBLE      206
 
 
 /* "Expected" matching error codes: no match and partial match. */