Skip to content

Commit

Permalink
toke.c - improve handling of $00 and ${00}
Browse files Browse the repository at this point in the history
In 60267e1 I patched toke.c to refuse
$00 but did not properly handle ${00} and related cases when the code
was unicode. Part of the reason was the confusing macro
VALID_LEN_ONE_IDENT() which despite its name does not restrict what it
matches to things which are one character long.

Since the VALID_LEN_ONE_IDENT() macro is used in only one place and its
name and placement is confusing I have moved it back into the code
inline as part of this fix. I have also added more comments about what
is going on, and moved the related comment directly next to the code
that it affects. If it moved out of this code then we should think of a
better name and be more careful and clear about checking things like
length. I would argue the logic is used to parse what might be called a
variable "description", and thus it is not identical to code which might
validate an actual parsed variable name. Eg, ${^Var} is a description of
the variable whose "name" is "\026ar". The exception of course is $^
whose name actually is "^".

This includes more tests for leading zero checks.

See Issue #12948, Issue #19986, and Issue #19989.
  • Loading branch information
demerphq committed Jul 28, 2022
1 parent f217bf5 commit 1c55371
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 41 deletions.
39 changes: 33 additions & 6 deletions t/comp/parser_run.t
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ BEGIN {
set_up_inc( qw(. ../lib ) );
}

plan(7);
plan(62);

# [perl #130814] can reallocate lineptr while looking ahead for
# "Missing $ on loop variable" diagnostic.
Expand All @@ -22,11 +22,38 @@ is($result . "\n", <<EXPECT);
Identifier too long at - line 2.
EXPECT

fresh_perl_is(<<'EOS', <<'EXPECT', {}, "check zero vars");
print $001;
EOS
Numeric variables with more than one digit may not start with '0' at - line 1.
EXPECT
for my $var ('$00','${00}','$001','${001}','$01','${01}','$09324', '${09324}') {
for my $utf8 ("","use utf8;") {
for my $strict ("","use strict;") {
fresh_perl_is(
"${strict}${utf8}print $var;",
"Numeric variables with more than one digit may not start with '0' at - line 1.",
{},
sprintf("check %s is illegal%s%s", $var,
$utf8 ? " under utf8" : "",
$strict ? " under strict" : ""
),
);
}
}
}

for my $var ('$0', '${0}', '$1', '${1}', '$10', '${10}', '$9324', '${9324}') {
for my $utf8 ("","use utf8;") {
for my $strict ("","use strict;") {
fresh_perl_is(
"${strict}${utf8} print '$var' if $var or !$var;",
$var,
{},
sprintf("check %s is legal%s%s", $var,
$utf8 ? " under utf8" : "",
$strict ? " under strict" : ""
)
);
}
}
}


fresh_perl_is(<<EOS, <<'EXPECT', {}, "linestart before bufptr");
\${ \xB6eeeeeeeeeeee
Expand Down
30 changes: 29 additions & 1 deletion t/re/pat.t
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,42 @@ skip_all_without_unicode_tables();

my $has_locales = locales_enabled('LC_CTYPE');

plan tests => 1046; # Update this when adding/deleting tests.
plan tests => 1214; # Update this when adding/deleting tests.

run_tests() unless caller;

#
# Tests start here.
#
sub run_tests {
{
# see https://github.com/Perl/perl5/issues/12948
my $string="ABCDEFGHIJKL";
my $pat= "(.)" x length($string);
my $ok= $string=~/^$pat\z/;
foreach my $n (1 .. length($string)) {
$ok= eval sprintf 'is $%d, "%s", q($%d = %s); 1', ($n, substr($string,$n-1,1))x2;
ok($ok, "eval for \$$n test");
$ok= eval sprintf 'is ${%d}, "%s", q(${%d} = %s); 1', ($n, substr($string,$n-1,1))x2;
ok($ok, "eval for \${$n} test");

$ok= eval sprintf 'is $0%d, "%s", q($0%d = %s); 1', ($n, substr($string,$n-1,1))x2;
ok(!$ok, "eval failed as expected for \$0$n test");
$ok= eval sprintf 'is ${0%d}, "%s", q(${0%d} = %s); 1', ($n, substr($string,$n-1,1))x2;
ok(!$ok, "eval failed as expected for \${0$n} test");

no strict 'refs';
$ok= eval sprintf 'is ${0b%b}, "%s", q(${0b%b} = %s); 1', ($n, substr($string,$n-1,1))x2;
ok($ok, sprintf "eval for \${0b%b} test", $n);
$ok= eval sprintf 'is ${0x%x}, "%s", q(${0x%x} = %s); 1', ($n, substr($string,$n-1,1))x2;
ok($ok, sprintf "eval for \${0x%x} test", $n);
$ok= eval sprintf 'is ${0b%08b}, "%s", q(${0b%08b} = %s); 1', ($n, substr($string,$n-1,1))x2;
ok($ok, sprintf "eval for \${0b%b} test", $n);
$ok= eval sprintf 'is ${0x%04x}, "%s", q(${0x%04x} = %s); 1', ($n, substr($string,$n-1,1))x2;
ok($ok, sprintf "eval for \${0x%04x} test", $n);
}
}

my $sharp_s = uni_to_native("\xdf");

{
Expand Down
91 changes: 57 additions & 34 deletions toke.c
Original file line number Diff line number Diff line change
Expand Up @@ -10076,25 +10076,14 @@ Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STR
return s;
}

/* Is the byte 'd' a legal single character identifier name? 'u' is true
* iff Unicode semantics are to be used. The legal ones are any of:
* a) all ASCII characters except:
* 1) control and space-type ones, like NUL, SOH, \t, and SPACE;
* 2) '{'
* The final case currently doesn't get this far in the program, so we
* don't test for it. If that were to change, it would be ok to allow it.
* b) When not under Unicode rules, any upper Latin1 character
* c) Otherwise, when unicode rules are used, all XIDS characters.
*
* Because all ASCII characters have the same representation whether
* encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
* '{' without knowing if is UTF-8 or not. */
#define VALID_LEN_ONE_IDENT(s, e, is_utf8) \
(isGRAPH_A(*(s)) || ((is_utf8) \
? isIDFIRST_utf8_safe(s, e) \
: (isGRAPH_L1(*s) \
&& LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD)))))

/* scan s and extract an identifier ($var) from it if possible
* into dest.
* XXX: This function has subtle implications on parsing, and
* changing how it behaves can cause a variable to change from
* being a run time rv2sv call or a compile time binding to a
* specific variable name.
*/
STATIC char *
S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
{
Expand Down Expand Up @@ -10158,11 +10147,41 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
s = skipspace(s);
}
}


/* Extract the first character of the variable name from 's' and
* copy it, null terminated into 'd'. Note that this does not
* involve checking for just IDFIRST characters, as it allows the
* '^' for ${^FOO} type variable names, and it allows all the
* characters that are legal in a single character variable name.
*
* The legal ones are any of:
* a) all ASCII characters except:
* 1) control and space-type ones, like NUL, SOH, \t, and SPACE;
* 2) '{'
* The final case currently doesn't get this far in the program, so we
* don't test for it. If that were to change, it would be ok to allow it.
* b) When not under Unicode rules, any upper Latin1 character
* c) Otherwise, when unicode rules are used, all XIDS characters.
*
* Because all ASCII characters have the same representation whether
* encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
* '{' without knowing if is UTF-8 or not. */

if ((s <= PL_bufend - ((is_utf8)
? UTF8SKIP(s)
: 1))
&& VALID_LEN_ONE_IDENT(s, PL_bufend, is_utf8))
{
&& (
isGRAPH_A(*s)
||
( is_utf8
? isIDFIRST_utf8_safe(s, PL_bufend)
: (isGRAPH_L1(*s)
&& LIKELY((U8) *s != LATIN1_TO_NATIVE(0xAD))
)
)
)
){
if (is_utf8) {
const STRLEN skip = UTF8SKIP(s);
STRLEN i;
Expand All @@ -10172,24 +10191,27 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
}
else {
*d = *s++;
/* special case to handle ${10}, ${11} the same way we handle ${1} etc */
if (isDIGIT(*d)) {
bool is_zero= *d == '0' ? TRUE : FALSE;
char *digit_start= d;
while (s < PL_bufend && isDIGIT(*s)) {
d++;
if (d >= e)
Perl_croak(aTHX_ "%s", ident_too_long);
*d= *s++;
}
if (is_zero && d - digit_start > 1)
Perl_croak(aTHX_ ident_var_zero_multi_digit);
}
d[1] = '\0';
}
}

/* special case to handle ${10}, ${11} the same way we handle ${1} etc */
if (isDIGIT(*d)) {
bool is_zero= *d == '0' ? TRUE : FALSE;
char *digit_start= d;
while (s < PL_bufend && isDIGIT(*s)) {
d++;
if (d >= e)
Perl_croak(aTHX_ "%s", ident_too_long);
*d= *s++;
}
if (is_zero && d - digit_start >= 1) /* d points at the last digit */
Perl_croak(aTHX_ ident_var_zero_multi_digit);
d[1] = '\0';
}

/* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
if (*d == '^' && *s && isCONTROLVAR(*s)) {
else if (*d == '^' && *s && isCONTROLVAR(*s)) {
*d = toCTRL(*s);
s++;
}
Expand All @@ -10198,6 +10220,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
about when not to warn. */
else if (ck_uni && bracket == -1)
check_uni();

if (bracket != -1) {
bool skip;
char *s2;
Expand Down

0 comments on commit 1c55371

Please sign in to comment.