toke.c - improve handling of $00 and ${00}

In 60267e1 I patched toke.c to refuse $00 but did not properly handle ${00} and related cases when the code was unicode. Part of the reason was the confusing macro VALID_LEN_ONE_IDENT() which despite its name does not restrict what it matches to things which are one character long. Since the VALID_LEN_ONE_IDENT() macro is used in only one place and its name and placement is confusing I have moved it back into the code inline as part of this fix. I have also added more comments about what is going on, and moved the related comment directly next to the code that it affects. If it moved out of this code then we should think of a better name and be more careful and clear about checking things like length. I would argue the logic is used to parse what might be called a variable "description", and thus it is not identical to code which might validate an actual parsed variable name. Eg, ${^Var} is a description of the variable whose "name" is "\026ar". The exception of course is $^ whose name actually is "^". This includes more tests for leading zero checks. See Issue #12948, Issue #19986, and Issue #19989.
Perl · Jul 28, 2022 · 1c55371 · 1c55371
1 parent f217bf5
commit 1c55371
Show file tree

Hide file tree

Showing 3 changed files with 119 additions and 41 deletions.
diff --git a/t/comp/parser_run.t b/t/comp/parser_run.t
@@ -10,7 +10,7 @@ BEGIN {
     set_up_inc( qw(. ../lib ) );
 }
 
-plan(7);
+plan(62);
 
 # [perl #130814] can reallocate lineptr while looking ahead for
 # "Missing $ on loop variable" diagnostic.
@@ -22,11 +22,38 @@ is($result . "\n", <<EXPECT);
 Identifier too long at - line 2.
 EXPECT
 
-fresh_perl_is(<<'EOS', <<'EXPECT', {}, "check zero vars");
-print $001;
-EOS
-Numeric variables with more than one digit may not start with '0' at - line 1.
-EXPECT
+for my $var ('$00','${00}','$001','${001}','$01','${01}','$09324', '${09324}') {
+    for my $utf8 ("","use utf8;") {
+        for my $strict ("","use strict;") {
+            fresh_perl_is(
+                "${strict}${utf8}print $var;",
+                "Numeric variables with more than one digit may not start with '0' at - line 1.",
+                {},
+                sprintf("check %s is illegal%s%s", $var,
+                    $utf8   ? " under utf8" : "",
+                    $strict ? " under strict" : ""
+                ),
+            );
+        }
+    }
+}
+
+for my $var ('$0', '${0}', '$1', '${1}', '$10', '${10}', '$9324', '${9324}') {
+    for my $utf8 ("","use utf8;") {
+        for my $strict ("","use strict;") {
+            fresh_perl_is(
+                "${strict}${utf8} print '$var' if $var or !$var;",
+                $var,
+                {},
+                sprintf("check %s is legal%s%s", $var,
+                    $utf8   ? " under utf8" : "",
+                    $strict ? " under strict" : ""
+                )
+            );
+        }
+    }
+}
+
 
 fresh_perl_is(<<EOS, <<'EXPECT', {}, "linestart before bufptr");
 \${ \xB6eeeeeeeeeeee

diff --git a/t/re/pat.t b/t/re/pat.t
@@ -27,14 +27,42 @@ skip_all_without_unicode_tables();
 
 my $has_locales = locales_enabled('LC_CTYPE');
 
-plan tests => 1046;  # Update this when adding/deleting tests.
+plan tests => 1214;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
 #
 # Tests start here.
 #
 sub run_tests {
+    {
+        # see https://github.com/Perl/perl5/issues/12948
+        my $string="ABCDEFGHIJKL";
+        my $pat= "(.)" x length($string);
+        my $ok= $string=~/^$pat\z/;
+        foreach my $n (1 .. length($string)) {
+            $ok= eval sprintf 'is $%d, "%s", q($%d = %s); 1', ($n, substr($string,$n-1,1))x2;
+            ok($ok, "eval for \$$n test");
+            $ok= eval sprintf 'is ${%d}, "%s", q(${%d} = %s); 1', ($n, substr($string,$n-1,1))x2;
+            ok($ok, "eval for \${$n} test");
+
+            $ok= eval sprintf 'is $0%d, "%s", q($0%d = %s); 1', ($n, substr($string,$n-1,1))x2;
+            ok(!$ok, "eval failed as expected for \$0$n test");
+            $ok= eval sprintf 'is ${0%d}, "%s", q(${0%d} = %s); 1', ($n, substr($string,$n-1,1))x2;
+            ok(!$ok, "eval failed as expected for \${0$n} test");
+
+            no strict 'refs';
+            $ok= eval sprintf 'is ${0b%b}, "%s", q(${0b%b} = %s); 1', ($n, substr($string,$n-1,1))x2;
+            ok($ok, sprintf "eval for \${0b%b} test", $n);
+            $ok= eval sprintf 'is ${0x%x}, "%s", q(${0x%x} = %s); 1', ($n, substr($string,$n-1,1))x2;
+            ok($ok, sprintf "eval for \${0x%x} test", $n);
+            $ok= eval sprintf 'is ${0b%08b}, "%s", q(${0b%08b} = %s); 1', ($n, substr($string,$n-1,1))x2;
+            ok($ok, sprintf "eval for \${0b%b} test", $n);
+            $ok= eval sprintf 'is ${0x%04x}, "%s", q(${0x%04x} = %s); 1', ($n, substr($string,$n-1,1))x2;
+            ok($ok, sprintf "eval for \${0x%04x} test", $n);
+        }
+    }
+
     my $sharp_s = uni_to_native("\xdf");
 
     {

diff --git a/toke.c b/toke.c
@@ -10076,25 +10076,14 @@ Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STR
     return s;
 }
 
-/* Is the byte 'd' a legal single character identifier name?  'u' is true
- * iff Unicode semantics are to be used.  The legal ones are any of:
- *  a) all ASCII characters except:
- *          1) control and space-type ones, like NUL, SOH, \t, and SPACE;
- *          2) '{'
- *     The final case currently doesn't get this far in the program, so we
- *     don't test for it.  If that were to change, it would be ok to allow it.
- *  b) When not under Unicode rules, any upper Latin1 character
- *  c) Otherwise, when unicode rules are used, all XIDS characters.
- *
- *      Because all ASCII characters have the same representation whether
- *      encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
- *      '{' without knowing if is UTF-8 or not. */
-#define VALID_LEN_ONE_IDENT(s, e, is_utf8)                                  \
-    (isGRAPH_A(*(s)) || ((is_utf8)                                          \
-                         ? isIDFIRST_utf8_safe(s, e)                        \
-                         : (isGRAPH_L1(*s)                                  \
-                            && LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD)))))
 
+/* scan s and extract an identifier ($var) from it if possible
+ * into dest.
+ * XXX: This function has subtle implications on parsing, and
+ * changing how it behaves can cause a variable to change from
+ * being a run time rv2sv call or a compile time binding to a
+ * specific variable name.
+ */
 STATIC char *
 S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
 {
@@ -10158,11 +10147,41 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
             s = skipspace(s);
         }
     }
+
+
+    /* Extract the first character of the variable name from 's' and
+     * copy it, null terminated into 'd'. Note that this does not
+     * involve checking for just IDFIRST characters, as it allows the
+     * '^' for ${^FOO} type variable names, and it allows all the
+     * characters that are legal in a single character variable name.
+     *
+     * The legal ones are any of:
+     *  a) all ASCII characters except:
+     *          1) control and space-type ones, like NUL, SOH, \t, and SPACE;
+     *          2) '{'
+     *     The final case currently doesn't get this far in the program, so we
+     *     don't test for it.  If that were to change, it would be ok to allow it.
+     *  b) When not under Unicode rules, any upper Latin1 character
+     *  c) Otherwise, when unicode rules are used, all XIDS characters.
+     *
+     *      Because all ASCII characters have the same representation whether
+     *      encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
+     *      '{' without knowing if is UTF-8 or not. */
+
     if ((s <= PL_bufend - ((is_utf8)
                           ? UTF8SKIP(s)
                           : 1))
-        && VALID_LEN_ONE_IDENT(s, PL_bufend, is_utf8))
-    {
+        && (
+            isGRAPH_A(*s)
+            ||
+            ( is_utf8
+              ? isIDFIRST_utf8_safe(s, PL_bufend)
+              : (isGRAPH_L1(*s)
+                 && LIKELY((U8) *s != LATIN1_TO_NATIVE(0xAD))
+                )
+            )
+        )
+    ){
         if (is_utf8) {
             const STRLEN skip = UTF8SKIP(s);
             STRLEN i;
@@ -10172,24 +10191,27 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
         }
         else {
             *d = *s++;
-            /* special case to handle ${10}, ${11} the same way we handle ${1} etc */
-            if (isDIGIT(*d)) {
-                bool is_zero= *d == '0' ? TRUE : FALSE;
-                char *digit_start= d;
-                while (s < PL_bufend && isDIGIT(*s)) {
-                    d++;
-                    if (d >= e)
-                        Perl_croak(aTHX_ "%s", ident_too_long);
-                    *d= *s++;
-                }
-                if (is_zero && d - digit_start > 1)
-                    Perl_croak(aTHX_ ident_var_zero_multi_digit);
-            }
             d[1] = '\0';
         }
     }
+
+    /* special case to handle ${10}, ${11} the same way we handle ${1} etc */
+    if (isDIGIT(*d)) {
+        bool is_zero= *d == '0' ? TRUE : FALSE;
+        char *digit_start= d;
+        while (s < PL_bufend && isDIGIT(*s)) {
+            d++;
+            if (d >= e)
+                Perl_croak(aTHX_ "%s", ident_too_long);
+            *d= *s++;
+        }
+        if (is_zero && d - digit_start >= 1) /* d points at the last digit */
+            Perl_croak(aTHX_ ident_var_zero_multi_digit);
+        d[1] = '\0';
+    }
+
     /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
-    if (*d == '^' && *s && isCONTROLVAR(*s)) {
+    else if (*d == '^' && *s && isCONTROLVAR(*s)) {
         *d = toCTRL(*s);
         s++;
     }
@@ -10198,6 +10220,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
        about when not to warn.  */
     else if (ck_uni && bracket == -1)
         check_uni();
+
     if (bracket != -1) {
         bool skip;
         char *s2;