Skip to content
Permalink
Browse files

PATCH: [perl #126310] single quote UTF-8 malformation detection

This adds UTF-8 wellformedness checking in Perl_lex_next_chunk, which
should get called for all program text, so this makes sure the entire
program is well-formed, not just single- or double-quoted strings.
  • Loading branch information
khwilliamson committed Dec 2, 2016
1 parent 86ae6e9 commit 6cdc5cd8f36f88172b0fcefdcadec75f5b6600b2
Showing with 33 additions and 7 deletions.
  1. +2 −1 pod/perldelta.pod
  2. +16 −6 t/lib/warnings/utf8
  3. +15 −0 toke.c
@@ -343,7 +343,8 @@ files in F<ext/> and F<lib/> are best summarized in L</Modules and Pragmata>.

=item *

XXX
Under C<use utf8>, the entire Perl program is now checked that the UTF-8
is wellformed. This resolves [perl #126310].

=back

@@ -15,23 +15,33 @@

__END__
# utf8.c [utf8_to_uvchr_buf] -W
# NAME Malformed under 'use utf8' in double-quoted string
BEGIN {
if (ord('A') == 193) {
print "SKIPPED\n# ebcdic platforms generates different Malformed UTF-8 warnings.";
exit 0;
}
}
use utf8 ;
no warnings; # Malformed is a fatal error, so gets output anyway.
my $a = "sn�storm" ;
{
no warnings 'utf8' ;
my $a = "sn�storm";
use warnings 'utf8' ;
my $a = "sn�storm";
EXPECT
Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 10.
Malformed UTF-8 character (fatal) at - line 10.
########
# NAME Malformed under 'use utf8' in single-quoted string
BEGIN {
if (ord('A') == 193) {
print "SKIPPED\n# ebcdic platforms generates different Malformed UTF-8 warnings.";
exit 0;
}
}
use utf8 ;
no warnings; # Malformed is a fatal error, so gets output anyway.
my $a = 'sn�storm' ;
EXPECT
Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 9.
Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 14.
Malformed UTF-8 character (fatal) at - line 9.
########
use warnings 'utf8';
my $d7ff = uc(chr(0xD7FF));
15 toke.c
@@ -1286,6 +1286,8 @@ Perl_lex_next_chunk(pTHX_ U32 flags)
STRLEN linestart_pos, last_uni_pos, last_lop_pos;
bool got_some_for_debugger = 0;
bool got_some;
const U8* first_bad_char_loc;

if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
@@ -1350,6 +1352,19 @@ Perl_lex_next_chunk(pTHX_ U32 flags)
new_bufend_pos = SvCUR(linestr);
PL_parser->bufend = buf + new_bufend_pos;
PL_parser->bufptr = buf + bufptr_pos;

if (UTF && ! is_utf8_string_loc((U8 *) PL_parser->bufptr,
PL_parser->bufend - PL_parser->bufptr,
&first_bad_char_loc))
{

_force_out_malformed_utf8_message(first_bad_char_loc,
(U8 *) PL_parser->bufend,
0,
1 /* 1 means die */ );
NOT_REACHED; /* NOTREACHED */
}

PL_parser->oldbufptr = buf + oldbufptr_pos;
PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
PL_parser->linestart = buf + linestart_pos;

0 comments on commit 6cdc5cd

Please sign in to comment.
You can’t perform that action at this time.