toke.c - improve handling of $00 and ${00}

In 60267e1 I patched toke.c to refuse $00 but did not properly handle ${00} and related cases when the code was unicode. Part of the reason was the confusing macro VALID_LEN_ONE_IDENT() which despite its name does not restrict what it matches to things which are one character long. Since the VALID_LEN_ONE_IDENT() macro is used in only one place and its name and placement is confusing I have moved it back into the code inline as part of this fix. I have also added more comments about what is going on, and moved the related comment directly next to the code that it affects. If it moved out of this code then we should think of a better name and be more careful and clear about checking things like length. I would argue the logic is used to parse what might be called a variable "description", and thus it is not identical to code which might validate an actual parsed variable name. Eg, ${^Var} is a description of the variable whose "name" is "\026ar". The exception of course is $^ whose name actually is "^". A byproduct of this change is that the logic to detect duplicated leading zeros is now quite a bit simpler. This includes more tests for leading zero checks. See Issue #12948, Issue #19986, and Issue #19989.
Perl · Jul 27, 2022 · 1ce6429 · 1ce6429
1 parent 8290d9d
commit 1ce6429
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 39 deletions.
diff --git a/t/comp/parser_run.t b/t/comp/parser_run.t
@@ -10,7 +10,7 @@ BEGIN {
     set_up_inc( qw(. ../lib ) );
 }
 
-plan(7);
+plan(14);
 
 # [perl #130814] can reallocate lineptr while looking ahead for
 # "Missing $ on loop variable" diagnostic.
@@ -22,11 +22,16 @@ is($result . "\n", <<EXPECT);
 Identifier too long at - line 2.
 EXPECT
 
-fresh_perl_is(<<'EOS', <<'EXPECT', {}, "check zero vars");
-print $001;
-EOS
-Numeric variables with more than one digit may not start with '0' at - line 1.
-EXPECT
+for my $var ('$00','${00}','$001','${001}') {
+    for my $utf8 ("","use utf8;") {
+        fresh_perl_is(
+              "${utf8}print $var;",
+              "Numeric variables with more than one digit may not start with '0' at - line 1.",
+              {},
+              sprintf("check %s is illegal%s", $var, $utf8 ? " under utf8" : "") );
+    }
+}
+
 
 fresh_perl_is(<<EOS, <<'EXPECT', {}, "linestart before bufptr");
 \${ \xB6eeeeeeeeeeee

diff --git a/toke.c b/toke.c
@@ -10076,24 +10076,7 @@ Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STR
     return s;
 }
 
-/* Is the byte 'd' a legal single character identifier name?  'u' is true
- * iff Unicode semantics are to be used.  The legal ones are any of:
- *  a) all ASCII characters except:
- *          1) control and space-type ones, like NUL, SOH, \t, and SPACE;
- *          2) '{'
- *     The final case currently doesn't get this far in the program, so we
- *     don't test for it.  If that were to change, it would be ok to allow it.
- *  b) When not under Unicode rules, any upper Latin1 character
- *  c) Otherwise, when unicode rules are used, all XIDS characters.
- *
- *      Because all ASCII characters have the same representation whether
- *      encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
- *      '{' without knowing if is UTF-8 or not. */
-#define VALID_LEN_ONE_IDENT(s, e, is_utf8)                                  \
-    (isGRAPH_A(*(s)) || ((is_utf8)                                          \
-                         ? isIDFIRST_utf8_safe(s, e)                        \
-                         : (isGRAPH_L1(*s)                                  \
-                            && LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD)))))
+
 
 STATIC char *
 S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
@@ -10158,11 +10141,41 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
             s = skipspace(s);
         }
     }
+
+
+    /* Extract the first character of the variable name from 's' and
+     * copy it, null terminated into 'd'. Note that this does not
+     * involve checking for just IDFIRST characters, as it allows the
+     * '^' for ${^FOO} type variable names, and it allows all the
+     * characters that are legal in a single character variable name.
+     *
+     * The legal ones are any of:
+     *  a) all ASCII characters except:
+     *          1) control and space-type ones, like NUL, SOH, \t, and SPACE;
+     *          2) '{'
+     *     The final case currently doesn't get this far in the program, so we
+     *     don't test for it.  If that were to change, it would be ok to allow it.
+     *  b) When not under Unicode rules, any upper Latin1 character
+     *  c) Otherwise, when unicode rules are used, all XIDS characters.
+     *
+     *      Because all ASCII characters have the same representation whether
+     *      encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
+     *      '{' without knowing if is UTF-8 or not. */
+
     if ((s <= PL_bufend - ((is_utf8)
                           ? UTF8SKIP(s)
                           : 1))
-        && VALID_LEN_ONE_IDENT(s, PL_bufend, is_utf8))
-    {
+        && (
+            isGRAPH_A(*s)
+            ||
+            ( is_utf8
+              ? isIDFIRST_utf8_safe(s, PL_bufend)
+              : (isGRAPH_L1(*s)
+                 && LIKELY((U8) *s != LATIN1_TO_NATIVE(0xAD))
+                )
+            )
+        )
+    ){
         if (is_utf8) {
             const STRLEN skip = UTF8SKIP(s);
             STRLEN i;
@@ -10172,22 +10185,16 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
         }
         else {
             *d = *s++;
-            /* special case to handle ${10}, ${11} the same way we handle ${1} etc */
-            if (isDIGIT(*d)) {
-                bool is_zero= *d == '0' ? TRUE : FALSE;
-                char *digit_start= d;
-                while (s < PL_bufend && isDIGIT(*s)) {
-                    d++;
-                    if (d >= e)
-                        Perl_croak(aTHX_ "%s", ident_too_long);
-                    *d= *s++;
-                }
-                if (is_zero && d - digit_start > 1)
-                    Perl_croak(aTHX_ ident_var_zero_multi_digit);
-            }
             d[1] = '\0';
         }
     }
+
+    /* make sure that we dont have a var starting with two zero digits
+     * which is not allowed as it introduces confusion with octal and
+     * decimal. */
+    if (*d == '0' && *s == '0')
+        Perl_croak(aTHX_ ident_var_zero_multi_digit);
+
     /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
     if (*d == '^' && *s && isCONTROLVAR(*s)) {
         *d = toCTRL(*s);