From 179a454aa34ba3b33690aefc4b5ffd5552705fac Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Fri, 3 Oct 2025 12:38:35 -0600
Subject: [PATCH 1/2] perldiag, perlre: Clarify capture group name rules

---
 pod/perldiag.pod | 5 +++--
 pod/perlre.pod   | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index e6250bd970ee..40bf32ae0572 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -2867,8 +2867,9 @@ has since been undefined.
 S<<-- HERE> in m/%s/
 
 (F) Group names must follow the rules for perl identifiers, meaning
-they must start with a non-digit word character.  A common cause of
-this error is using (?&0) instead of (?0).  See L<perlre>.
+that ASCII-range ones must start with a non-digit word character.  A
+common cause of this error is using (?&0) instead of (?0).  See
+L<perlre> and L<perldata/Identifier parsing>.
 
 =item ()-group starts with a count
 
diff --git a/pod/perlre.pod b/pod/perlre.pod
index f128eb717688..6f898c3bf178 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -1198,7 +1198,9 @@ You can dispense with numbers altogether and create named capture groups.
 The notation is C<(?E<lt>I<name>E<gt>...)> to declare and C<\g{I<name>}> to
 reference.  (To be compatible with .Net regular expressions, C<\g{I<name>}> may
 also be written as C<\k{I<name>}>, C<\kE<lt>I<name>E<gt>> or C<\k'I<name>'>.)
-I<name> must not begin with a number, nor contain hyphens.
+I<name> must follow the rules for perl identifiers
+(L<perldata/Identifier parsing>), which means, for example, that it
+can't begin with a digit, nor contain hyphens.
 When different groups within the same pattern have the same name, any reference
 to that name assumes the leftmost defined group.  Named groups count in
 absolute and relative numbering, and so can also be referred to by those

From c11897a3cbc571854ae0d6e8a244fe5e05c8ea18 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Fri, 3 Oct 2025 09:39:28 -0600
Subject: [PATCH 2/2] regex capture group names must use identifier syntax

Prior to this commit the non-first characters could be any \w character.
But identifiers exclude a few \w characters from appearing in them.
This commit tightens what is allowed.

Commit d1e2a852fbc901b45fba20906a8f42ca227ae462 gave a list of them,
but I forgot a couple details in generating that list, so it wasn't
quite right.

The complete corrected list is:
GREEK YPOGEGRAMMENI
COMBINING CYRILLIC HUNDRED THOUSANDS SIGN
COMBINING CYRILLIC MILLIONS SIGN
COMBINING PARENTHESES OVERLAY
COMBINING ENCLOSING CIRCLE
COMBINING ENCLOSING SQUARE
COMBINING ENCLOSING DIAMOND
COMBINING ENCLOSING CIRCLE BACKSLASH
COMBINING ENCLOSING SCREEN
COMBINING ENCLOSING KEYCAP
COMBINING ENCLOSING UPWARD POINTING TRIANGLE
CIRCLED LATIN CAPITAL LETTER A - Z
CIRCLED LATIN SMALL LETTER A - Z
VERTICAL TILDE
COMBINING CYRILLIC TEN MILLIONS SIGN
COMBINING CYRILLIC HUNDRED MILLIONS SIGN
COMBINING CYRILLIC THOUSAND MILLIONS SIGN
ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
ARABIC LIGATURE JALLAJALALOUHOU
ARABIC FATHATAN ISOLATED FORM
ARABIC DAMMATAN ISOLATED FORM
ARABIC KASRATAN ISOLATED FORM
ARABIC FATHA ISOLATED FORM
ARABIC DAMMA ISOLATED FORM
ARABIC KASRA ISOLATED FORM
ARABIC SHADDA ISOLATED FORM
ARABIC SUKUN ISOLATED FORM
SQUARED LATIN CAPITAL LETTER A - Z
NEGATIVE CIRCLED LATIN CAPITAL LETTER A - Z
NEGATIVE SQUARED LATIN CAPITAL LETTER A - Z
---
 pod/perldelta.pod | 35 +++++++++++++++++++++++++++++++++++
 regcomp.c         |  2 +-
 t/re/pat.t        | 24 +++++++++++++++++++++++-
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 147ab795c621..9aca18c31470 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -373,6 +373,41 @@ consisted of only ASCII characters.  The real upper limit was as few as
 Chinese or Osage.  Now an identifier in any language may contain at
 least 255 characters.
 
+=item *
+
+The allowed characters for regular expression capture group names have
+been corrected to conform to Perl identifier syntax, which in turn is
+based on public Unicode rules.  The net result of this change is that,
+as of Unicode 17.0, about 160 characters that formerly were allowed to
+be in a capture group name no longer are.  Only programs that do
+L<C<use utf8>|utf8> can be affected, and then only by characters that
+appear in the 2nd or later positions of the name.  The characters that
+a capture group name can begin with are unchanged.
+
+130 of the now unacceptable characters are 5 sets of 26 Latin letters
+that are enclosed by some shape, such as CIRCLED LATIN CAPITAL LETTER N.
+Another 8 are generic modifiers that add shapes around other characters;
+5 are modifiers to Cyrillic numbers; and 16 are Arabic ligatures and
+isolated forms.  The other two are GREEK YPOGEGRAMMENI and VERTICAL
+TILDE.
+
+You can get a complete list of them by running the following program:
+
+ perl -le 'use re qw(Debug COMPILE); qr/(?[ \w - \p{XIDC} ])/'
+
+Look near the final line.  The one that begins C<stclass> contains a
+list of 4- and 5-digit hexadecimal numbers.  These are the Unicode code
+points that were previously allowed, but no longer are.
+
+(Long after Perl identifier rules were formed, Unicode has added
+recommendations to further restrict legal identifier names.  These were
+added to counter cases where, for example, programmers snuck code past
+reviewers using characters that look like other ones.  The two
+properties are C<Identifier_Status> and C<Identifier_Type>; see
+L<https://www.unicode.org/reports/tr39/>.  Perl currently doesn't do
+anything with these, except to furnish you the ability to use them in
+regular expressions.)
+
 =back
 
 =head1 Known Problems
diff --git a/regcomp.c b/regcomp.c
index 134a59fd03c0..0dbea26a4ffd 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2530,7 +2530,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
         do {
             RExC_parse_advance(advance);
         } while (   RExC_parse < RExC_end
-                 && (advance = isWORDCHAR_utf8_safe( (U8 *) RExC_parse,
+                 && (advance = isIDCONT_utf8_safe( (U8 *) RExC_parse,
                                                      (U8 *) RExC_end)));
     } else {
         RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
diff --git a/t/re/pat.t b/t/re/pat.t
index ce826a5730d1..544348e78288 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -28,7 +28,7 @@ skip_all_without_unicode_tables();
 my $has_locales = locales_enabled('LC_CTYPE');
 my $utf8_locale = find_utf8_ctype_locale();
 
-plan tests => 1296;  # Update this when adding/deleting tests.
+plan tests => 1298;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1388,6 +1388,28 @@ EOP
             fresh_perl_like($prog, qr!Group name must start with a non-digit word character!, {},
                         sprintf("'U+%04X not legal IDFirst'", ord($char)));
         }
+
+        foreach my $char (chr(0x2115), chr(0x24B7)) {
+            my $prog = <<"EOP";
+use utf8;;
+no warnings 'utf8';
+print 0 + "abc" =~ qr/(?<a${char}b>abc)/;
+EOP
+            utf8::encode($prog);
+            if ($char =~ /\p{XID_Continue}/) {
+                fresh_perl_is($prog, 1,
+                                {},
+                                sprintf("U+%04X is legal IDCont",
+                                        ord($char)));
+            }
+            else {
+                fresh_perl_like($prog,
+                                qr/Sequence .* not terminated/,
+                                {},
+                                sprintf("U+%04X not legal IDCont",
+                                ord($char)));
+            }
+        }
     }
 
     { # [perl #101710]