Perl · khwilliamson · Oct 3, 2025 · Oct 3, 2025 · jkeenan · Oct 6, 2025
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
@@ -373,6 +373,41 @@ consisted of only ASCII characters.  The real upper limit was as few as
 Chinese or Osage.  Now an identifier in any language may contain at
 least 255 characters.
 
+=item *
+
+The set of characters allowed in regular expression capture group names has
+been corrected to conform to Perl identifier syntax, which in turn is
+based on public Unicode rules.  The net result of this change is that,
+as of Unicode 17.0, about 160 characters that formerly were allowed to
+be in an identifier no longer are.  Only programs that do
+L<C<use utf8>|utf8> can be affected, and then only by characters that
+appear in the 2nd or later positions of the name.  The characters that
+an identifier name can begin with are unchanged.
+
+130 of the now unacceptable characters are 5 sets of 26 Latin letters
+that are enclosed by some shape, such as CIRCLED LATIN CAPITAL LETTER N.
+Another 8 are generic modifiers that add shapes around other characters;
+5 are modifiers to Cyrillic numbers; and 16 are Arabic ligatures and
+isolated forms.  The other two are GREEK YPOGEGRAMMENI and VERTICAL
+TILDE.
+
+You can get a complete list of them by running the following program:
+
+ perl -le 'use re qw(Debug COMPILE); qr/(?[ \w - \p{XIDC} ])/'
+
+Look near the final line.  The one that begins with C<stclass> contains
+a list of 4- and 5-digit hexadecimal numbers.  These are the Unicode
+code points that were previously allowed, but no longer are.
+
+(Long after Perl's identifier rules were formed, Unicode added
+recommendations to further restrict legal identifier names.  These were
+added to counter cases where, for example, programmers snuck code past
+reviewers using characters that look like other ones.  The two relevant
+properties are C<Identifier_Status> and C<Identifier_Type>; see
+L<https://www.unicode.org/reports/tr39/>.  Perl currently doesn't do
+anything with these, except to furnish you the ability to use them in
+regular expressions.)
+
 =back
 
 =head1 Known Problems

diff --git a/pod/perldiag.pod b/pod/perldiag.pod
@@ -2867,8 +2867,9 @@ has since been undefined.
 S<<-- HERE> in m/%s/
 
 (F) Group names must follow the rules for perl identifiers, meaning
-they must start with a non-digit word character.  A common cause of
-this error is using (?&0) instead of (?0).  See L<perlre>.
+that ASCII-range ones must start with a non-digit word character.  A
+common cause of this error is using (?&0) instead of (?0).  See
+L<perlre> and L<perldata/Identifier parsing>.
 
 =item ()-group starts with a count
 

diff --git a/pod/perlre.pod b/pod/perlre.pod
@@ -1198,7 +1198,9 @@ You can dispense with numbers altogether and create named capture groups.
 The notation is C<(?E<lt>I<name>E<gt>...)> to declare and C<\g{I<name>}> to
 reference.  (To be compatible with .Net regular expressions, C<\g{I<name>}> may
 also be written as C<\k{I<name>}>, C<\kE<lt>I<name>E<gt>> or C<\k'I<name>'>.)
-I<name> must not begin with a number, nor contain hyphens.
+I<name> must follow the rules for perl identifiers
+(L<perldata/Identifier parsing>) which means, for example, that it
+can't begin with a digit, nor contain hyphens.
 When different groups within the same pattern have the same name, any reference
 to that name assumes the leftmost defined group.  Named groups count in
 absolute and relative numbering, and so can also be referred to by those

diff --git a/regcomp.c b/regcomp.c
@@ -2530,7 +2530,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
         do {
             RExC_parse_advance(advance);
         } while (   RExC_parse < RExC_end
-                 && (advance = isWORDCHAR_utf8_safe( (U8 *) RExC_parse,
+                 && (advance = isIDCONT_utf8_safe( (U8 *) RExC_parse,
                                                      (U8 *) RExC_end)));
     } else {
         RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending

diff --git a/t/re/pat.t b/t/re/pat.t
@@ -28,7 +28,7 @@ skip_all_without_unicode_tables();
 my $has_locales = locales_enabled('LC_CTYPE');
 my $utf8_locale = find_utf8_ctype_locale();
 
-plan tests => 1296;  # Update this when adding/deleting tests.
+plan tests => 1298;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1388,6 +1388,28 @@ EOP
             fresh_perl_like($prog, qr!Group name must start with a non-digit word character!, {},
                         sprintf("'U+%04X not legal IDFirst'", ord($char)));
         }
+
+        foreach my $char (chr(0x2115), chr(0x24B7)) {
+            my $prog = <<"EOP";
+use utf8;;
+no warnings 'utf8';
+print 0 + "abc" =~ qr/(?<a${char}b>abc)/;
+EOP
+            utf8::encode($prog);
+            if ($char =~ /\p{XID_Continue}/) {
+                fresh_perl_is($prog, 1,
+                                {},
+                                sprintf("U+%04X is legal IDCont",
+                                        ord($char)));
+            }
+            else {
+                fresh_perl_like($prog,
+                                qr/Sequence .* not terminated/,
+                                {},
+                                sprintf("U+%04X not legal IDCont",
+                                ord($char)));
+            }
+        }
     }
 
     { # [perl #101710]