From 3485f29ea37a541f6455993922c5d8c8c575870f Mon Sep 17 00:00:00 2001 From: Leon Timmermans Date: Mon, 9 Apr 2012 21:49:11 +0200 Subject: [PATCH] Made :utf8 an actual layer It will check the input for validity, by default strict validity though less strict forms are provided. This also means PerlIO::get_layers doesn't return a "utf8" pseudo-layer anymore, which can break some code making that assumption. --- cpan/CPAN-Meta-YAML/t/11_read_string.t | 5 +- lib/PerlIO.pm | 21 +- perlio.c | 418 ++++++++++++++++++++++++- perliol.h | 1 + pod/perldiag.pod | 8 +- pod/perlfunc.pod | 7 +- pod/perliol.pod | 6 +- pod/perlrun.pod | 8 + pod/perlunifaq.pod | 17 +- pod/perluniintro.pod | 5 +- t/io/crlf.t | 37 +-- t/io/layers.t | 13 +- t/io/utf8.t | 24 +- t/op/print.t | 2 +- universal.c | 8 - 15 files changed, 486 insertions(+), 94 deletions(-) diff --git a/cpan/CPAN-Meta-YAML/t/11_read_string.t b/cpan/CPAN-Meta-YAML/t/11_read_string.t index 491fd8e8ee2a..f4ef869dc41b 100644 --- a/cpan/CPAN-Meta-YAML/t/11_read_string.t +++ b/cpan/CPAN-Meta-YAML/t/11_read_string.t @@ -40,7 +40,10 @@ subtest 'invalid UTF-8' => sub { # get invalid UTF-8 by reading Latin-1 with lax :utf8 layer my $string = do { local $SIG{__WARN__} = sub {}; - slurp( test_data_file('latin1.yml'), ":utf8" ); + my $ret = slurp( test_data_file('latin1.yml'), ":raw" ); + require Encode; + Encode::_utf8_on($ret); + $ret; }; my $obj = eval { CPAN::Meta::YAML->read_string($string); }; is( $obj, undef, "read_string should return undef" ); diff --git a/lib/PerlIO.pm b/lib/PerlIO.pm index 85dd083cc02a..ec64d85ebc1c 100644 --- a/lib/PerlIO.pm +++ b/lib/PerlIO.pm @@ -1,6 +1,6 @@ package PerlIO; -our $VERSION = '1.11'; +our $VERSION = '1.12'; # Map layer name to package that defines it our %alias; @@ -173,6 +173,8 @@ instead produce UTF-EBCDIC on EBCDIC systems. The C<:encoding(UTF-8)> layer (hyphen is significant) is preferred as it will ensure translation between valid UTF-8 bytes and valid Unicode characters. +Note that before perl 5.36, this layer did not validate byte sequences. + =item :bytes This is the inverse of the C<:utf8> pseudo-layer. It turns off the flag @@ -220,10 +222,9 @@ but then enable UTF-8 translation. =item :pop -A pseudo-layer that removes the top-most layer. Gives Perl code a -way to manipulate the layer stack. Note that C<:pop> only works on -real layers and will not undo the effects of pseudo-layers or flags -like C<:utf8>. An example of a possible use might be: +A pseudo layer that removes the top-most layer. Gives perl code +a way to manipulate the layer stack. An example of a possible use +might be: open(my $fh,...) or die "open failed: $!"; ... @@ -254,9 +255,8 @@ Some custom layers come with the Perl distribution. =item :encoding Use C<:encoding(ENCODING)> to transparently do character set and encoding -transformations, for example from Shift-JIS to Unicode. Note that an -C<:encoding> also enables C<:utf8>. See L for more -information. +transformations, for example from Shift-JIS to Unicode. See +L for more information. =item :mmap @@ -372,9 +372,8 @@ You are supposed to use open() and binmode() to manipulate the stack. B The arguments to layers are by default returned in parentheses after -the name of the layer, and certain layers (like C<:utf8>) are not real -layers but instead flags on real layers; to get all of these returned -separately, use the optional C
argument: +the name of the layer; to get all of these returned separately, use the +optional C
argument: my @layer_and_args_and_flags = PerlIO::get_layers($fh, details => 1); diff --git a/perlio.c b/perlio.c index 38312e8cb410..cb69637860b3 100644 --- a/perlio.c +++ b/perlio.c @@ -1076,6 +1076,7 @@ PerlIO_default_layers(pTHX) PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_stdio)); PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_crlf)); PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8)); + PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8_lax)); PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_remove)); PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_byte)); PerlIO_list_push(aTHX_ PL_def_layerlist, (PerlIO_funcs *)osLayer, @@ -1828,16 +1829,23 @@ PerlIOUtf8_pushed(pTHX_ PerlIO *f, const char *mode, SV *arg, PerlIO_funcs *tab) if (PerlIOValid(f)) { if (tab && tab->kind & PERLIO_K_UTF8) PerlIOBase(f)->flags |= PERLIO_F_UTF8; + else { + if (PerlIOBase(f)->tab->kind & PERLIO_K_UTF8) + while (PerlIOBase(f)->tab->kind & PERLIO_K_UTF8) { + PerlIO_flush(f); + PerlIO_pop(aTHX_ f); + } else PerlIOBase(f)->flags &= ~PERLIO_F_UTF8; + } return 0; } return -1; } -PERLIO_FUNCS_DECL(PerlIO_utf8) = { +PERLIO_FUNCS_DECL(PerlIO_utf8_lax) = { sizeof(PerlIO_funcs), - "utf8", + "utf8_lax", 0, PERLIO_K_DUMMY | PERLIO_K_UTF8 | PERLIO_K_MULTIARG, PerlIOUtf8_pushed, @@ -4141,6 +4149,8 @@ PerlIOBuf_readdelim(pTHX_ PerlIO *f, STDCHAR *vbuf, Size_t count, STDCHAR delim) next = PerlIO_readdelim(f, vbuf + read, count - read, delim); if (next >= 0) return read + next; + if (read > 0) + return read; else return next; /* XXX */ } @@ -4912,6 +4922,410 @@ PERLIO_FUNCS_DECL(PerlIO_crlf) = { PerlIOBuf_readdelim, }; +#define UTF8_MAX_BYTES 4 + +static const U8 utf8_sequence_len[0x100] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */ + 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */ + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */ + 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */ +}; + + +typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags; + + +static STRLEN skip_sequence(const U8 *cur, const STRLEN len) { + STRLEN i, n = utf8_sequence_len[*cur]; + + if (n < 1 || len < 2) + return 1; + + switch (cur[0]) { + case 0xE0: if ((cur[1] & 0xE0) != 0xA0) return 1; break; + case 0xED: if ((cur[1] & 0xE0) != 0x80) return 1; break; + case 0xF4: if ((cur[1] & 0xF0) != 0x80) return 1; break; + case 0xF0: if ((cur[1] & 0xF0) == 0x80) return 1; /* FALLTROUGH */ + default: if ((cur[1] & 0xC0) != 0x80) return 1; break; + } + + if (n > len) + n = len; + for (i = 2; i < n; i++) + if ((cur[i] & 0xC0) != 0x80) + break; + return i; +} + +static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) { + static const char *hex = "0123456789ABCDEF"; + const char *fmt; + char seq[UTF8_MAX_BYTES * 3]; + char *d = seq; + + if (eof) + fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file"; + else + fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>"; + + while (len-- > 0) { + const U8 c = *cur++; + *d++ = hex[c >> 4]; + *d++ = hex[c & 15]; + if (len) + *d++ = ' '; + } + *d = 0; + Perl_croak(aTHX_ fmt, seq); +} + +static void report_noncharacter(pTHX_ UV usv) { + static const char *fmt = "Can't interchange noncharacter code point U+%"UVXf; + Perl_croak(aTHX_ fmt, usv); +} + +static STRLEN validate(pTHX_ const U8 *buf, const U8 *end, const int flags, PerlIO* handle) { + const bool eof = PerlIO_eof(handle); + const U8 *cur = buf; + const U8 *end4 = end - UTF8_MAX_BYTES; + STRLEN skip = 0; + U32 v; + + while (cur < end4) { + while (cur < end4 && *cur < 0x80) + cur++; + + check: + switch (utf8_sequence_len[*cur]) { + case 0: + goto illformed; + case 1: + cur += 1; + break; + case 2: + /* 110xxxxx 10xxxxxx */ + if ((cur[1] & 0xC0) != 0x80) + goto illformed; + cur += 2; + break; + case 3: + v = ((U32)cur[0] << 16) + | ((U32)cur[1] << 8) + | ((U32)cur[2]); + /* 1110xxxx 10xxxxxx 10xxxxxx */ + if ((v & 0x00F0C0C0) != 0x00E08080 || + /* Non-shortest form */ + v < 0x00E0A080) + goto illformed; + /* Surrogates U+D800..U+DFFF */ + if (!(flags & ALLOW_SURROGATES) && (v & 0x00EFA080) == 0x00EDA080) + goto illformed; + /* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */ + if (!(flags & ALLOW_NONCHARACTERS) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE)) + goto noncharacter; + cur += 3; + break; + case 4: + v = ((U32)cur[0] << 24) + | ((U32)cur[1] << 16) + | ((U32)cur[2] << 8) + | ((U32)cur[3]); + /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if ((v & 0xF8C0C0C0) != 0xF0808080 || + /* Non-shortest form */ + v < 0xF0908080 || + /* Greater than U+10FFFF */ + v > 0xF48FBFBF) + goto illformed; + /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */ + if (!(flags & ALLOW_NONCHARACTERS) && (v & 0x000FBFBE) == 0x000FBFBE) + goto noncharacter; + cur += 4; + break; + } + } + + if (cur < end) { + if (cur + utf8_sequence_len[*cur] <= end) + goto check; + skip = skip_sequence(cur, end - cur); + if (eof || cur + skip < end) + goto illformed; + } + return cur - buf; + +illformed: + if (!skip) + skip = skip_sequence(cur, end - cur); + PerlIOBase(handle)->flags |= PERLIO_F_ERROR; + report_illformed(aTHX_ cur, skip, eof); + +noncharacter: + if (v < 0xF0808080) + v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x0F0000) >> 4; + else + v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x3F0000) >> 4 | (v & 0x07000000) >> 6; + PerlIOBase(handle)->flags |= PERLIO_F_ERROR; + report_noncharacter(aTHX_ v); +} + +typedef struct { + PerlIOBuf buf; + STDCHAR leftovers[UTF8_MAX_BYTES]; + size_t leftover_length; + utf8_flags flags; +} PerlIOUnicode; + +static struct { + const char* name; + size_t length; + utf8_flags value; +} map[] = { + { STR_WITH_LEN("allow_surrogates"), ALLOW_SURROGATES }, + { STR_WITH_LEN("allow_noncharacters"), ALLOW_NONCHARACTERS }, + { STR_WITH_LEN("allow_nonshortest"), ALLOW_NONSHORTEST }, + { STR_WITH_LEN("strict"), STRICT_UTF8 }, + { STR_WITH_LEN("loose"), ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST }, +}; + +static utf8_flags lookup_parameter(pTHX_ const char* ptr, size_t len) { + unsigned i; + for (i = 0; i < sizeof map / sizeof *map; ++i) { + if (map[i].length == len && memcmp(ptr, map[i].name, len) == 0) + return map[i].value; + } + Perl_croak(aTHX_ "Unknown argument to :utf8: %*s", (int)len, ptr); +} +static utf8_flags parse_parameters(pTHX_ SV* param) { + STRLEN len; + const char *begin, *delim; + if (!param || !SvOK(param)) + return 0; + + begin = SvPV(param, len); + delim = strchr(begin, ','); + if(delim) { + utf8_flags ret = 0; + const char* end = begin + len; + do { + ret |= lookup_parameter(aTHX_ begin, delim - begin); + begin = delim + 1; + delim = strchr(begin, ','); + } while (delim); + if (begin < end) + ret |= lookup_parameter(aTHX_ begin, end - begin); + return ret; + } + else { + return lookup_parameter(aTHX_ begin, len); + } +} + +static IV PerlIOUnicode_pushed(pTHX_ PerlIO* f, const char* mode, SV* arg, PerlIO_funcs* tab) { + utf8_flags flags = parse_parameters(aTHX_ arg); + if (PerlIOBuf_pushed(aTHX_ f, mode, arg, tab) == 0) { + PerlIOBase(f)->flags |= PERLIO_F_UTF8; + PerlIOSelf(f, PerlIOUnicode)->flags = flags; + return 0; + } + return -1; +} + +static IV PerlIOUnicode_fill(pTHX_ PerlIO* f) { + PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode); + PerlIOBuf * const b = &u->buf; + PerlIO *n = PerlIONext(f); + SSize_t avail; + Size_t read_bytes = 0; + STDCHAR *end; + SSize_t fit; + + if (PerlIO_flush(f) != 0) + return -1; + if (PerlIOBase(f)->flags & PERLIO_F_TTY) + PerlIOBase_flush_linebuf(aTHX); + + if (!b->buf) + PerlIO_get_base(f); + + assert(b->buf); + + if (u->leftover_length) { + Copy(u->leftovers, b->buf, u->leftover_length, STDCHAR); + b->end = b->buf + u->leftover_length; + read_bytes = u->leftover_length; + u->leftover_length = 0; + } + else { + b->ptr = b->end = b->buf; + } + fit = (SSize_t)b->bufsiz - (b->end - b->buf); + + if (!PerlIOValid(n)) { + PerlIOBase(f)->flags |= PERLIO_F_EOF; + return -1; + } + + if (PerlIO_fast_gets(n)) { + /* + * Layer below is also buffered. We do _NOT_ want to call its + * ->Read() because that will loop till it gets what we asked for + * which may hang on a pipe etc. Instead take anything it has to + * hand, or ask it to fill _once_. + */ + avail = PerlIO_get_cnt(n); + if (avail <= 0) { + avail = PerlIO_fill(n); + if (avail == 0) + avail = PerlIO_get_cnt(n); + else { + if (!PerlIO_error(n) && PerlIO_eof(n)) + avail = 0; + } + } + if (avail > 0) { + STDCHAR *ptr = PerlIO_get_ptr(n); + const SSize_t cnt = avail; + if (avail > fit) + avail = fit; + Copy(ptr, b->end, avail, STDCHAR); + PerlIO_set_ptrcnt(n, ptr + avail, cnt - avail); + read_bytes += avail; + } + } + else { + avail = PerlIO_read(n, b->end, fit); + if (avail > 0) + read_bytes += avail; + } + if (avail <= 0) { + if (avail < 0 || (read_bytes == 0 && PerlIO_eof(n))) { + PerlIOBase(f)->flags |= (avail == 0) ? PERLIO_F_EOF : PERLIO_F_ERROR; + return -1; + } + } + end = b->buf + read_bytes; + b->end = b->buf + validate(aTHX_ (const U8 *)b->buf, (const U8 *)end, u->flags, n); + if (b->end < end) { + size_t len = b->buf + read_bytes - b->end; + Copy(b->end, u->leftovers, len, char); + u->leftover_length = len; + } + PerlIOBase(f)->flags |= PERLIO_F_RDBUF; + + return 0; +} + +SSize_t +PerlIOUnicode_readdelim(pTHX_ PerlIO *f, STDCHAR *vbuf, Size_t count, STDCHAR delim) +{ + PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode); + PerlIOBuf * const b = &u->buf; + PerlIO *n = PerlIONext(f); + SSize_t avail = PerlIO_get_cnt(f); + Size_t wanted = MIN(avail, count); + Size_t read = 0, also = 0; + STDCHAR *validated = NULL, *end = NULL; + int seen = FALSE; + + if (avail == 0) { + if (PerlIO_flush(f) != 0) + return -1; + if (PerlIOBase(f)->flags & PERLIO_F_TTY) + PerlIOBase_flush_linebuf(aTHX); + + if (!b->buf) + PerlIO_get_base(f); + + assert(b->buf); + + if (u->leftover_length) { + Copy(u->leftovers, vbuf, u->leftover_length, STDCHAR); + read += u->leftover_length; + u->leftover_length = 0; + } + + if (!PerlIOValid(n)) { + PerlIOBase(f)->flags |= PERLIO_F_EOF; + return -1; + } + } + else { + STDCHAR* ptr = PerlIO_get_ptr(f); + STDCHAR* found = (STDCHAR*) memchr(ptr, delim, wanted); + if (found) { + Size_t len = found + 1 - ptr; + Copy(ptr, vbuf, len, char); + read += len; + PerlIO_set_ptrcnt(f, found + 1, avail - len); + seen = TRUE; + } + else { + Copy(ptr, vbuf, wanted, char); + read += wanted; + PerlIO_set_ptrcnt(f, ptr + wanted, avail - wanted); + } + } + + if (seen == FALSE && count - read) + also = PerlIO_readdelim(n, vbuf + read, count - read, delim); + read += also; + end = vbuf + read; + validated = vbuf + validate(aTHX_ (const U8 *)vbuf, (const U8 *) end, u->flags, n); + if (validated < end) { + size_t len = end - validated; + Copy(validated, u->leftovers, len, char); + u->leftover_length = len; + read -= len; + } + PerlIOBase(f)->flags |= PERLIO_F_RDBUF; + return read; +} + +PERLIO_FUNCS_DECL(PerlIO_utf8) = { + sizeof(PerlIO_funcs), + "utf8", + sizeof(PerlIOUnicode), + PERLIO_K_BUFFERED|PERLIO_K_UTF8, + PerlIOUnicode_pushed, + PerlIOBuf_popped, + PerlIOBuf_open, + PerlIOBase_binmode, + NULL, + PerlIOBase_fileno, + PerlIOBuf_dup, + PerlIOBuf_read, + PerlIOBuf_unread, + PerlIOBuf_write, + PerlIOBuf_seek, + PerlIOBuf_tell, + PerlIOBuf_close, + PerlIOBuf_flush, + PerlIOUnicode_fill, + PerlIOBase_eof, + PerlIOBase_error, + PerlIOBase_clearerr, + PerlIOBase_setlinebuf, + PerlIOBuf_get_base, + PerlIOBuf_bufsiz, + PerlIOBuf_get_ptr, + PerlIOBuf_get_cnt, + PerlIOBuf_set_ptrcnt, + PerlIOUnicode_readdelim, +}; + PerlIO * Perl_PerlIO_stdin(pTHX) { diff --git a/perliol.h b/perliol.h index c77cc6a60e53..0c9159eb25b7 100644 --- a/perliol.h +++ b/perliol.h @@ -112,6 +112,7 @@ EXTCONST PerlIO_funcs PerlIO_perlio; EXTCONST PerlIO_funcs PerlIO_stdio; EXTCONST PerlIO_funcs PerlIO_crlf; EXTCONST PerlIO_funcs PerlIO_utf8; +EXTCONST PerlIO_funcs PerlIO_utf8_lax; EXTCONST PerlIO_funcs PerlIO_byte; EXTCONST PerlIO_funcs PerlIO_raw; EXTCONST PerlIO_funcs PerlIO_pending; diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 88df9485059e..e5776442b6d0 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -3591,8 +3591,8 @@ you thought to be in UTF-8 but it wasn't (it was for example legacy 8-bit data). To guard against this, you can use C. If you use the C<:encoding(UTF-8)> PerlIO layer for input, invalid byte -sequences are handled gracefully, but if you use C<:utf8>, the flag is set -without validating the data, possibly resulting in this error message. +sequences are handled gracefully, but if you use C<:utf8>, an exception +will be thrown. See also L. @@ -6784,6 +6784,10 @@ problems when being input or output, which is likely where this message came from. If you really really know what you are doing you can turn off this warning by C. +=item Unknown argument to :utf8: %s + +(F) The :utf8 was given an unknown argument and dies because of it. + =item Unknown charname '%s' (F) The name you used inside C<\N{}> is unknown to Perl. Check the diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 47958b285174..2703602acdac 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -834,9 +834,6 @@ therefore refers to "layers" rather than to "disciplines". Now back to the regularly scheduled documentation...> To mark FILEHANDLE as UTF-8, use C<:utf8> or C<:encoding(UTF-8)>. -C<:utf8> just marks the data as UTF-8 without further checking, -while C<:encoding(UTF-8)> checks the data for actually being valid -UTF-8. More details can be found in L. In general, L|/binmode FILEHANDLE, LAYER> should be called after L|/open FILEHANDLE,MODE,EXPR> but before any I/O is done on the @@ -845,9 +842,7 @@ flushes any pending buffered output data (and perhaps pending input data) on the handle. An exception to this is the C<:encoding> layer that changes the default character encoding of the handle. The C<:encoding> layer sometimes needs to be called in -mid-stream, and it doesn't flush the stream. C<:encoding> -also implicitly pushes on top of itself the C<:utf8> layer because -internally Perl operates on UTF8-encoded Unicode characters. +mid-stream, and it doesn't flush the stream. The operating system, device drivers, C libraries, and Perl run-time system all conspire to let the programmer treat a single diff --git a/pod/perliol.pod b/pod/perliol.pod index 2f6f801d9f41..7d4086440d7a 100644 --- a/pod/perliol.pod +++ b/pod/perliol.pod @@ -332,7 +332,7 @@ for the layers class. Data written to this layer should be UTF-8 encoded; data provided by this layer should be considered UTF-8 encoded. Can be set on any layer -by ":utf8" dummy layer. Also set on ":encoding" layer. +by ":utf8-lax" dummy layer. =item PERLIO_F_UNBUF @@ -952,9 +952,7 @@ their own Binmode entry. =item "utf8" -Another dummy layer. When pushed it pops itself and sets the -C flag on the layer which was (and now is once more) -the top of the stack. +This layer reads utf-8 encodede unicode data and automatically decodes it. =back diff --git a/pod/perlrun.pod b/pod/perlrun.pod index 5d3aa3eb3c47..d56cee5b3fc8 100644 --- a/pod/perlrun.pod +++ b/pod/perlrun.pod @@ -1098,6 +1098,14 @@ X<:unix> Low-level layer that calls C, C, C, etc. +<<<<<<< HEAD +======= +=item :utf8 +X<:utf8> + +This layer reads utf-8 encodede unicode data and automatically decodes it. + +>>>>>>> 9a81523725 (Made :utf8 an actual layer) =item :win32 X<:win32> diff --git a/pod/perlunifaq.pod b/pod/perlunifaq.pod index 262585d47864..28342581c387 100644 --- a/pod/perlunifaq.pod +++ b/pod/perlunifaq.pod @@ -277,19 +277,10 @@ based on the user's locale, C. =head2 What is the difference between C<:encoding> and C<:utf8>? -Because UTF-8 is one of Perl's internal formats, you can often just skip the -encoding or decoding step, and manipulate the UTF8 flag directly. - -Instead of C<:encoding(UTF-8)>, you can simply use C<:utf8>, which skips the -encoding step if the data was already represented as UTF8 internally. This is -widely accepted as good behavior when you're writing, but it can be dangerous -when reading, because it causes internal inconsistency when you have invalid -byte sequences. Using C<:utf8> for input can sometimes result in security -breaches, so please use C<:encoding(UTF-8)> instead. - -Instead of C and C, you could use C<_utf8_on> and C<_utf8_off>, -but this is considered bad style. Especially C<_utf8_on> can be dangerous, for -the same reason that C<:utf8> can. +C<:encoding> is a generic conversion layer, that converts a file from a variety +of encodings to perl's internal encoding utf8 and vice versa. C<:utf8> is a +validation layer that checks if input data is correct UTF-8 but doesn't change +the bytestream in any way. There are some shortcuts for oneliners; see L<-C in perlrun|perlrun/-C [numberElist]>. diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index 16f349fed7b8..1747cd2d43ea 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -398,10 +398,7 @@ and on already open streams, use C: The matching of encoding names is loose: case does not matter, and many encodings have several aliases. Note that the C<:utf8> layer must always be specified exactly like that; it is I subject to -the loose matching of encoding names. Also note that currently C<:utf8> is unsafe for -input, because it accepts the data without validating that it is indeed valid -UTF-8; you should instead use C<:encoding(UTF-8)> (with or without a -hyphen). +the loose matching of encoding names. See L for the C<:utf8> layer, L and L for the C<:encoding()> layer, and diff --git a/t/io/crlf.t b/t/io/crlf.t index d0275af9a0fd..3be7320ebfff 100644 --- a/t/io/crlf.t +++ b/t/io/crlf.t @@ -18,7 +18,7 @@ my $crcr = uni_to_native("\x0d\x0d"); my $ungetc_count = 8200; # Somewhat over the likely buffer size { - plan(tests => 21 + 2 * $ungetc_count); + plan(tests => 17 + 2 * $ungetc_count); ok(open(FOO,">:crlf",$file)); ok(print FOO 'a'.((('a' x 14).qq{\n}) x 2000) || close(FOO)); ok(open(FOO,"<:crlf",$file)); @@ -66,26 +66,21 @@ my $ungetc_count = 8200; # Somewhat over the likely buffer size # binmode :crlf should not cumulate. # Try it first once and then twice so that even UNIXy boxes # get to exercise this, for DOSish boxes even once is enough. - # Try also pushing :utf8 first so that there are other layers - # in between (this should not matter: CRLF layers still should - # not accumulate). - for my $utf8 ('', ':utf8') { - for my $binmode (1..2) { - open(FOO, ">$file"); - # require PerlIO; print PerlIO::get_layers(FOO), "\n"; - binmode(FOO, "$utf8:crlf") for 1..$binmode; - # require PerlIO; print PerlIO::get_layers(FOO), "\n"; - print FOO "Hello\n"; - close FOO; - open(FOO, "<$file"); - binmode(FOO); - my $foo = scalar ; - close FOO; - print join(" ", "#", map { sprintf("%02x", $_) } unpack("C*", $foo)), - "\n"; - like($foo, qr/$crlf$/); - unlike($foo, qr/$crcr/); - } + for my $binmode (1..2) { + open(FOO, ">$file"); + # require PerlIO; print PerlIO::get_layers(FOO), "\n"; + binmode(FOO, ":crlf") for 1..$binmode; + # require PerlIO; print PerlIO::get_layers(FOO), "\n"; + print FOO "Hello\n"; + close FOO; + open(FOO, "<$file"); + binmode(FOO); + my $foo = scalar ; + close FOO; + print join(" ", "#", map { sprintf("%02x", $_) } unpack("C*", $foo)), + "\n"; + like($foo, qr/\x0d\x0a$/); + unlike($foo, qr/\x0d\x0d/); } { diff --git a/t/io/layers.t b/t/io/layers.t index 541b4775bc75..de1247a2f773 100644 --- a/t/io/layers.t +++ b/t/io/layers.t @@ -35,11 +35,9 @@ if (${^UNICODE} & 1) { } else { $UTF8_STDIN = 0; } -my $NTEST = 60 - (($DOSISH || !$FASTSTDIO) ? 7 : 0) - ($DOSISH ? 7 : 0) +my $NTEST = 55 - (($DOSISH || !$FASTSTDIO) ? 7 : 0) - ($DOSISH ? 7 : 0) + $UTF8_STDIN; -sub PerlIO::F_UTF8 () { 0x00008000 } # from perliol.h - plan tests => $NTEST; print <<__EOH__; @@ -54,6 +52,7 @@ __EOH__ { sub check { + local $Test::Builder::Level = $Test::Builder::Level + 1; my ($result, $expected, $id) = @_; # An interesting dance follows where we try to make the following # IO layer stack setups to compare equal: @@ -128,13 +127,13 @@ __EOH__ binmode(F, ":encoding(cp1047)"); check([ PerlIO::get_layers(F) ], - [ qw[stdio crlf encoding(cp1047) utf8] ], + [ qw[stdio crlf encoding(cp1047)] ], ":encoding(cp1047)"); binmode(F, ":crlf"); check([ PerlIO::get_layers(F) ], - [ qw[stdio crlf encoding(cp1047) utf8 crlf utf8] ], + [ qw[stdio crlf encoding(cp1047) crlf ] ], ":encoding(cp1047):crlf"); binmode(F, ":pop:pop"); @@ -164,7 +163,7 @@ __EOH__ binmode(F, ":encoding(utf8)"); check([ PerlIO::get_layers(F) ], - [ qw[stdio encoding(utf8) utf8] ], + [ qw[stdio encoding(utf8)] ], ":encoding(utf8)"); binmode(F, ":raw :crlf"); @@ -214,7 +213,7 @@ __EOH__ "use open IN"); check([ PerlIO::get_layers(G, output => 1) ], - [ qw[stdio encoding(cp1252) utf8] ], + [ qw[stdio encoding(cp1252)] ], "use open OUT"); close F; diff --git a/t/io/utf8.t b/t/io/utf8.t index 2ea7429f4dc8..9a18ff936214 100644 --- a/t/io/utf8.t +++ b/t/io/utf8.t @@ -11,7 +11,7 @@ skip_all_without_perlio(); no utf8; # needed for use utf8 not griping about the raw octets -plan(tests => 62); +plan(tests => 61); $| = 1; @@ -169,13 +169,9 @@ SKIP: { if ($::IS_EBCDIC) { skip("EBCDIC The file isn't deformed in UTF-EBCDIC", 2); } else { - my @warnings; open F, "<:utf8", $a_file or die $!; - $x = ; chomp $x; - local $SIG{__WARN__} = sub { push @warnings, $_[0]; }; - eval { sprintf "%vd\n", $x }; - is (scalar @warnings, 1); - like ($warnings[0], qr/Malformed UTF-8 character: \\x82 \(unexpected continuation byte 0x82, with no preceding start byte/); + eval { $x = ; chomp $x; }; + like ($@, qr/^Can't decode ill-formed UTF-8 octet sequence <82>/); } } @@ -326,7 +322,6 @@ is($failed, undef); # if it finds bad UTF-8 (:encoding(utf8) works this way) use warnings 'utf8'; undef $@; - local $SIG{__WARN__} = sub { $@ = shift }; open F, ">$a_file"; binmode F; my ($chrE4, $chrF6) = (chr(0xE4), chr(0xF6)); @@ -337,14 +332,16 @@ is($failed, undef); close F; open F, "<:utf8", $a_file; undef $@; + eval { my $line = ; + }; my ($chrE4, $chrF6) = ("E4", "F6"); if ($::IS_EBCDIC) { ($chrE4, $chrF6) = ("43", "EC"); } # EBCDIC - like( $@, qr/utf8 "\\x$chrE4" does not map to Unicode .+ line 1/, + like( $@, qr/^Can't decode ill-formed UTF-8 octet sequence /, "<:utf8 readline must warn about bad utf8"); undef $@; - $line .= ; - like( $@, qr/utf8 "\\x$chrF6" does not map to Unicode .+ line 2/, + eval { $line .= }; + like( $@, qr/Can\'t decode ill-formed UTF-8 octet sequence /, "<:utf8 rcatline must warn about bad utf8"); close F; } @@ -381,10 +378,9 @@ is($failed, undef); open F, "<:utf8", $a_file; undef $@; local $SIG{__WARN__} = sub { $@ = shift }; - $line = ; + $line = eval { }; - like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ chunk 1/, - "<:utf8 readline (fixed) must warn about bad utf8"); + like( $@, qr/Can\'t decode ill-formed UTF-8 octet sequence at end of file/); close F; } diff --git a/t/op/print.t b/t/op/print.t index 55b341aa3e46..71c4b4133fe5 100644 --- a/t/op/print.t +++ b/t/op/print.t @@ -21,7 +21,7 @@ use warnings; no warnings 'utf8'; # These form overlong "oops" -open my $fh, "<:utf8", \"\xC1\xAF\xC1\xAF\xC1\xB0\xC1\xB3" +open my $fh, "<:utf8_lax", \"\xC1\xAF\xC1\xAF\xC1\xB0\xC1\xB3" or die "Could not open\n"; read($fh, my $s, 10) or die "Could not read\n"; print $s; diff --git a/universal.c b/universal.c index 5932767bdf3a..b86b3fd18c8b 100644 --- a/universal.c +++ b/universal.c @@ -826,14 +826,6 @@ XS(XS_PerlIO_get_layers) else PUSHs(&PL_sv_undef); nitem++; - if (flgok) { - const IV flags = SvIVX(*flgsvp); - - if (flags & PERLIO_F_UTF8) { - PUSHs(newSVpvs_flags("utf8", SVs_TEMP)); - nitem++; - } - } } }