From 3485f29ea37a541f6455993922c5d8c8c575870f Mon Sep 17 00:00:00 2001
From: Leon Timmermans <fawaka@gmail.com>
Date: Mon, 9 Apr 2012 21:49:11 +0200
Subject: [PATCH] Made :utf8 an actual layer

It will check the input for validity, by default strict validity though less strict forms are provided. This also means PerlIO::get_layers doesn't return a "utf8" pseudo-layer anymore, which can break some code making that assumption.
---
 cpan/CPAN-Meta-YAML/t/11_read_string.t |   5 +-
 lib/PerlIO.pm                          |  21 +-
 perlio.c                               | 418 ++++++++++++++++++++++++-
 perliol.h                              |   1 +
 pod/perldiag.pod                       |   8 +-
 pod/perlfunc.pod                       |   7 +-
 pod/perliol.pod                        |   6 +-
 pod/perlrun.pod                        |   8 +
 pod/perlunifaq.pod                     |  17 +-
 pod/perluniintro.pod                   |   5 +-
 t/io/crlf.t                            |  37 +--
 t/io/layers.t                          |  13 +-
 t/io/utf8.t                            |  24 +-
 t/op/print.t                           |   2 +-
 universal.c                            |   8 -
 15 files changed, 486 insertions(+), 94 deletions(-)
diff --git a/cpan/CPAN-Meta-YAML/t/11_read_string.t b/cpan/CPAN-Meta-YAML/t/11_read_string.t
index 491fd8e8ee2a..f4ef869dc41b 100644
--- a/cpan/CPAN-Meta-YAML/t/11_read_string.t
+++ b/cpan/CPAN-Meta-YAML/t/11_read_string.t
@@ -40,7 +40,10 @@ subtest 'invalid UTF-8' => sub {
     # get invalid UTF-8 by reading Latin-1 with lax :utf8 layer
     my $string = do {
         local $SIG{__WARN__} = sub {};
-        slurp( test_data_file('latin1.yml'), ":utf8" );
+        my $ret = slurp( test_data_file('latin1.yml'), ":raw" );
+		require Encode;
+		Encode::_utf8_on($ret);
+		$ret;
     };
     my $obj = eval { CPAN::Meta::YAML->read_string($string); };
     is( $obj, undef, "read_string should return undef" );
diff --git a/lib/PerlIO.pm b/lib/PerlIO.pm
index 85dd083cc02a..ec64d85ebc1c 100644
--- a/lib/PerlIO.pm
+++ b/lib/PerlIO.pm
@@ -1,6 +1,6 @@
 package PerlIO;
 
-our $VERSION = '1.11';
+our $VERSION = '1.12';
 
 # Map layer name to package that defines it
 our %alias;
@@ -173,6 +173,8 @@ instead produce UTF-EBCDIC on EBCDIC systems.  The C<:encoding(UTF-8)>
 layer (hyphen is significant) is preferred as it will ensure translation
 between valid UTF-8 bytes and valid Unicode characters.
 
+Note that before perl 5.36, this layer did not validate byte sequences.
+
 =item :bytes
 
 This is the inverse of the C<:utf8> pseudo-layer.  It turns off the flag
@@ -220,10 +222,9 @@ but then enable UTF-8 translation.
 
 =item :pop
 
-A pseudo-layer that removes the top-most layer. Gives Perl code a
-way to manipulate the layer stack.  Note that C<:pop> only works on
-real layers and will not undo the effects of pseudo-layers or flags
-like C<:utf8>.  An example of a possible use might be:
+A pseudo layer that removes the top-most layer. Gives perl code
+a way to manipulate the layer stack. An example of a possible use
+might be:
 
     open(my $fh,...) or die "open failed: $!";
     ...
@@ -254,9 +255,8 @@ Some custom layers come with the Perl distribution.
 =item :encoding
 
 Use C<:encoding(ENCODING)> to transparently do character set and encoding
-transformations, for example from Shift-JIS to Unicode.  Note that an
-C<:encoding> also enables C<:utf8>.  See L<PerlIO::encoding> for more
-information.
+transformations, for example from Shift-JIS to Unicode.  See 
+L<PerlIO::encoding> for more information.
 
 =item :mmap
 
@@ -372,9 +372,8 @@ You are supposed to use open() and binmode() to manipulate the stack.
 B<Implementation details follow, please close your eyes.>
 
 The arguments to layers are by default returned in parentheses after
-the name of the layer, and certain layers (like C<:utf8>) are not real
-layers but instead flags on real layers; to get all of these returned
-separately, use the optional C<details> argument:
+the name of the layer; to get all of these returned separately, use the
+optional C<details> argument:
 
    my @layer_and_args_and_flags = PerlIO::get_layers($fh, details => 1);
 
diff --git a/perlio.c b/perlio.c
index 38312e8cb410..cb69637860b3 100644
--- a/perlio.c
+++ b/perlio.c
@@ -1076,6 +1076,7 @@ PerlIO_default_layers(pTHX)
         PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_stdio));
         PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_crlf));
         PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8));
+	PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8_lax));
         PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_remove));
         PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_byte));
         PerlIO_list_push(aTHX_ PL_def_layerlist, (PerlIO_funcs *)osLayer,
@@ -1828,16 +1829,23 @@ PerlIOUtf8_pushed(pTHX_ PerlIO *f, const char *mode, SV *arg, PerlIO_funcs *tab)
     if (PerlIOValid(f)) {
         if (tab && tab->kind & PERLIO_K_UTF8)
             PerlIOBase(f)->flags |= PERLIO_F_UTF8;
+	else {
+            if (PerlIOBase(f)->tab->kind & PERLIO_K_UTF8)
+                while (PerlIOBase(f)->tab->kind & PERLIO_K_UTF8) {
+                    PerlIO_flush(f);
+                    PerlIO_pop(aTHX_ f);
+                }
         else
             PerlIOBase(f)->flags &= ~PERLIO_F_UTF8;
+        }
         return 0;
     }
     return -1;
 }
 
-PERLIO_FUNCS_DECL(PerlIO_utf8) = {
+PERLIO_FUNCS_DECL(PerlIO_utf8_lax) = {
     sizeof(PerlIO_funcs),
-    "utf8",
+    "utf8_lax",
     0,
     PERLIO_K_DUMMY | PERLIO_K_UTF8 | PERLIO_K_MULTIARG,
     PerlIOUtf8_pushed,
@@ -4141,6 +4149,8 @@ PerlIOBuf_readdelim(pTHX_ PerlIO *f, STDCHAR *vbuf, Size_t count, STDCHAR delim)
             next = PerlIO_readdelim(f, vbuf + read, count - read, delim);
             if (next >= 0)
                 return read + next;
+            if (read > 0)
+                return read;
             else
                 return next; /* XXX */
         }
@@ -4912,6 +4922,410 @@ PERLIO_FUNCS_DECL(PerlIO_crlf) = {
     PerlIOBuf_readdelim,
 };
 
+#define UTF8_MAX_BYTES 4
+
+static const U8 utf8_sequence_len[0x100] = {
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
+    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
+    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
+};
+
+
+typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags;
+
+
+static STRLEN skip_sequence(const U8 *cur, const STRLEN len) {
+    STRLEN i, n = utf8_sequence_len[*cur];
+
+    if (n < 1 || len < 2)
+            return 1;
+
+    switch (cur[0]) {
+        case 0xE0: if ((cur[1] & 0xE0) != 0xA0) return 1; break;
+        case 0xED: if ((cur[1] & 0xE0) != 0x80) return 1; break;
+        case 0xF4: if ((cur[1] & 0xF0) != 0x80) return 1; break;
+        case 0xF0: if ((cur[1] & 0xF0) == 0x80) return 1; /* FALLTROUGH */
+        default:   if ((cur[1] & 0xC0) != 0x80) return 1; break;
+    }
+
+    if (n > len)
+        n = len;
+    for (i = 2; i < n; i++)
+        if ((cur[i] & 0xC0) != 0x80)
+            break;
+    return i;
+}
+
+static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) {
+    static const char *hex = "0123456789ABCDEF";
+    const char *fmt;
+    char seq[UTF8_MAX_BYTES * 3];
+    char *d = seq;
+
+    if (eof)
+        fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file";
+    else
+        fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>";
+
+    while (len-- > 0) {
+        const U8 c = *cur++;
+        *d++ = hex[c >> 4];
+        *d++ = hex[c & 15];
+        if (len)
+            *d++ = ' ';
+    }
+    *d = 0;
+    Perl_croak(aTHX_ fmt, seq);
+}
+
+static void report_noncharacter(pTHX_ UV usv) {
+    static const char *fmt = "Can't interchange noncharacter code point U+%"UVXf;
+    Perl_croak(aTHX_ fmt, usv);
+}
+
+static STRLEN validate(pTHX_ const U8 *buf, const U8 *end, const int flags, PerlIO* handle) {
+    const bool eof = PerlIO_eof(handle);
+    const U8 *cur = buf;
+    const U8 *end4 = end - UTF8_MAX_BYTES;
+    STRLEN skip = 0;
+    U32 v;
+
+    while (cur < end4) {
+        while (cur < end4 && *cur < 0x80)
+            cur++;
+
+      check:
+        switch (utf8_sequence_len[*cur]) {
+            case 0:
+                goto illformed;
+            case 1:
+                cur += 1;
+                break;
+            case 2:
+                /* 110xxxxx 10xxxxxx */
+                if ((cur[1] & 0xC0) != 0x80)
+                    goto illformed;
+                cur += 2;
+                break;
+            case 3:
+                v = ((U32)cur[0] << 16)
+                  | ((U32)cur[1] <<  8)
+                  | ((U32)cur[2]);
+                /* 1110xxxx 10xxxxxx 10xxxxxx */
+                if ((v & 0x00F0C0C0) != 0x00E08080 ||
+                    /* Non-shortest form */
+                    v < 0x00E0A080)
+                    goto illformed;
+                /* Surrogates U+D800..U+DFFF */
+                if (!(flags & ALLOW_SURROGATES) && (v & 0x00EFA080) == 0x00EDA080)
+                    goto illformed;
+                /* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */
+                if (!(flags & ALLOW_NONCHARACTERS) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE))
+                    goto noncharacter;
+                cur += 3;
+                break;
+            case 4:
+                v = ((U32)cur[0] << 24)
+                  | ((U32)cur[1] << 16)
+                  | ((U32)cur[2] <<  8)
+                  | ((U32)cur[3]);
+                /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+                if ((v & 0xF8C0C0C0) != 0xF0808080 ||
+                    /* Non-shortest form */
+                    v < 0xF0908080 ||
+                    /* Greater than U+10FFFF */
+                    v > 0xF48FBFBF)
+                    goto illformed;
+                /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
+                if (!(flags & ALLOW_NONCHARACTERS) && (v & 0x000FBFBE) == 0x000FBFBE)
+                    goto noncharacter;
+                cur += 4;
+                break;
+        }
+    }
+    
+    if (cur < end) {
+        if (cur + utf8_sequence_len[*cur] <= end)
+            goto check;
+        skip = skip_sequence(cur, end - cur);
+        if (eof || cur + skip < end)
+            goto illformed;
+    }
+    return cur - buf;
+
+illformed:
+    if (!skip)
+            skip = skip_sequence(cur, end - cur);
+    PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
+    report_illformed(aTHX_ cur, skip, eof);
+
+noncharacter:
+    if (v < 0xF0808080)
+        v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x0F0000) >> 4;
+    else
+        v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x3F0000) >> 4 | (v & 0x07000000) >> 6;
+    PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
+    report_noncharacter(aTHX_ v);
+}
+
+typedef struct {
+    PerlIOBuf buf;
+    STDCHAR leftovers[UTF8_MAX_BYTES];
+    size_t leftover_length;
+    utf8_flags flags;
+} PerlIOUnicode;
+
+static struct {
+    const char* name;
+    size_t length;
+    utf8_flags value;
+} map[] = {
+    { STR_WITH_LEN("allow_surrogates"), ALLOW_SURROGATES },
+    { STR_WITH_LEN("allow_noncharacters"), ALLOW_NONCHARACTERS },
+    { STR_WITH_LEN("allow_nonshortest"), ALLOW_NONSHORTEST },
+    { STR_WITH_LEN("strict"), STRICT_UTF8 },
+    { STR_WITH_LEN("loose"), ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST },
+};
+
+static utf8_flags lookup_parameter(pTHX_ const char* ptr, size_t len) {
+    unsigned i;
+    for (i = 0; i < sizeof map / sizeof *map; ++i) {
+        if (map[i].length == len && memcmp(ptr, map[i].name, len) == 0)
+            return map[i].value;
+    }
+    Perl_croak(aTHX_ "Unknown argument to :utf8: %*s", (int)len, ptr);
+}
+static utf8_flags parse_parameters(pTHX_ SV* param) {
+    STRLEN len;
+    const char *begin, *delim;
+    if (!param || !SvOK(param))
+        return 0;
+
+    begin = SvPV(param, len);
+    delim = strchr(begin, ',');
+    if(delim) {
+        utf8_flags ret = 0;
+        const char* end = begin + len;
+        do {
+            ret |= lookup_parameter(aTHX_ begin, delim - begin);
+            begin = delim + 1;
+            delim = strchr(begin, ',');
+        } while (delim);
+        if (begin < end)
+            ret |= lookup_parameter(aTHX_ begin, end - begin);
+        return ret;
+    }
+    else {
+            return lookup_parameter(aTHX_ begin, len);
+    }
+}
+
+static IV PerlIOUnicode_pushed(pTHX_ PerlIO* f, const char* mode, SV* arg, PerlIO_funcs* tab) {
+    utf8_flags flags = parse_parameters(aTHX_ arg);
+    if (PerlIOBuf_pushed(aTHX_ f, mode, arg, tab) == 0) {
+        PerlIOBase(f)->flags |= PERLIO_F_UTF8;
+        PerlIOSelf(f, PerlIOUnicode)->flags = flags;
+        return 0;
+    }
+    return -1;
+}
+
+static IV PerlIOUnicode_fill(pTHX_ PerlIO* f) {
+    PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode);
+    PerlIOBuf * const b = &u->buf;
+    PerlIO *n = PerlIONext(f);
+    SSize_t avail;
+    Size_t read_bytes = 0;
+    STDCHAR *end;
+    SSize_t fit;
+
+    if (PerlIO_flush(f) != 0)
+        return -1;
+    if (PerlIOBase(f)->flags & PERLIO_F_TTY)
+        PerlIOBase_flush_linebuf(aTHX);
+
+    if (!b->buf)
+        PerlIO_get_base(f);
+
+    assert(b->buf);
+
+    if (u->leftover_length) {
+        Copy(u->leftovers, b->buf, u->leftover_length, STDCHAR);
+        b->end = b->buf + u->leftover_length;
+        read_bytes = u->leftover_length;
+        u->leftover_length = 0;
+    }
+    else {
+        b->ptr = b->end = b->buf;
+    }
+    fit = (SSize_t)b->bufsiz - (b->end - b->buf);
+
+    if (!PerlIOValid(n)) {
+        PerlIOBase(f)->flags |= PERLIO_F_EOF;
+        return -1;
+    }
+
+    if (PerlIO_fast_gets(n)) {
+        /*
+         * Layer below is also buffered. We do _NOT_ want to call its
+         * ->Read() because that will loop till it gets what we asked for
+         * which may hang on a pipe etc. Instead take anything it has to
+         * hand, or ask it to fill _once_.
+         */
+        avail = PerlIO_get_cnt(n);
+        if (avail <= 0) {
+            avail = PerlIO_fill(n);
+            if (avail == 0)
+                avail = PerlIO_get_cnt(n);
+            else {
+                if (!PerlIO_error(n) && PerlIO_eof(n))
+                    avail = 0;
+            }
+        }
+        if (avail > 0) {
+            STDCHAR *ptr = PerlIO_get_ptr(n);
+            const SSize_t cnt = avail;
+            if (avail > fit)
+                avail = fit;
+            Copy(ptr, b->end, avail, STDCHAR);
+            PerlIO_set_ptrcnt(n, ptr + avail, cnt - avail);
+            read_bytes += avail;
+        }
+    }
+    else {
+        avail = PerlIO_read(n, b->end, fit);
+        if (avail > 0)
+            read_bytes += avail;
+    }
+    if (avail <= 0) {
+        if (avail < 0 || (read_bytes == 0 && PerlIO_eof(n))) {
+            PerlIOBase(f)->flags |= (avail == 0) ? PERLIO_F_EOF : PERLIO_F_ERROR;
+            return -1;
+        }
+    }
+    end = b->buf + read_bytes;
+    b->end = b->buf + validate(aTHX_ (const U8 *)b->buf, (const U8 *)end, u->flags, n);
+    if (b->end < end) {
+        size_t len = b->buf + read_bytes - b->end;
+        Copy(b->end, u->leftovers, len, char);
+        u->leftover_length = len;
+    }
+    PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
+    
+    return 0;
+}
+
+SSize_t
+PerlIOUnicode_readdelim(pTHX_ PerlIO *f, STDCHAR *vbuf, Size_t count, STDCHAR delim)
+{
+    PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode);
+    PerlIOBuf * const b = &u->buf;
+    PerlIO *n = PerlIONext(f);
+    SSize_t avail = PerlIO_get_cnt(f);
+    Size_t wanted = MIN(avail, count);
+    Size_t read = 0, also = 0;
+    STDCHAR *validated = NULL, *end = NULL;
+    int seen = FALSE;
+
+    if (avail == 0) {
+        if (PerlIO_flush(f) != 0)
+            return -1;
+        if (PerlIOBase(f)->flags & PERLIO_F_TTY)
+            PerlIOBase_flush_linebuf(aTHX);
+
+        if (!b->buf)
+            PerlIO_get_base(f);
+
+        assert(b->buf);
+
+        if (u->leftover_length) {
+            Copy(u->leftovers, vbuf, u->leftover_length, STDCHAR);
+            read += u->leftover_length;
+            u->leftover_length = 0;
+        }
+
+        if (!PerlIOValid(n)) {
+            PerlIOBase(f)->flags |= PERLIO_F_EOF;
+            return -1;
+        }
+    }
+    else {
+        STDCHAR* ptr = PerlIO_get_ptr(f);
+        STDCHAR* found = (STDCHAR*) memchr(ptr, delim, wanted);
+        if (found) {
+            Size_t len = found + 1 - ptr;
+            Copy(ptr, vbuf, len, char);
+            read += len;
+            PerlIO_set_ptrcnt(f, found + 1, avail - len);
+            seen = TRUE;
+        }
+        else {
+            Copy(ptr, vbuf, wanted, char);
+            read += wanted;
+            PerlIO_set_ptrcnt(f, ptr + wanted, avail - wanted);
+        }
+    }
+
+    if (seen == FALSE && count - read)
+        also = PerlIO_readdelim(n, vbuf + read, count - read, delim);
+    read += also;
+    end = vbuf + read;
+    validated = vbuf + validate(aTHX_ (const U8 *)vbuf, (const U8 *) end, u->flags, n);
+    if (validated < end) {
+        size_t len = end - validated;
+        Copy(validated, u->leftovers, len, char);
+        u->leftover_length = len;
+        read -= len;
+    }
+    PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
+    return read;
+}
+
+PERLIO_FUNCS_DECL(PerlIO_utf8) = {
+    sizeof(PerlIO_funcs),
+    "utf8",
+    sizeof(PerlIOUnicode),
+    PERLIO_K_BUFFERED|PERLIO_K_UTF8,
+    PerlIOUnicode_pushed,
+    PerlIOBuf_popped,
+    PerlIOBuf_open,
+    PerlIOBase_binmode,
+    NULL,
+    PerlIOBase_fileno,
+    PerlIOBuf_dup,
+    PerlIOBuf_read,
+    PerlIOBuf_unread,
+    PerlIOBuf_write,
+    PerlIOBuf_seek,
+    PerlIOBuf_tell,
+    PerlIOBuf_close,
+    PerlIOBuf_flush,
+    PerlIOUnicode_fill,
+    PerlIOBase_eof,
+    PerlIOBase_error,
+    PerlIOBase_clearerr,
+    PerlIOBase_setlinebuf,
+    PerlIOBuf_get_base,
+    PerlIOBuf_bufsiz,
+    PerlIOBuf_get_ptr,
+    PerlIOBuf_get_cnt,
+    PerlIOBuf_set_ptrcnt,
+    PerlIOUnicode_readdelim,
+};
+
 PerlIO *
 Perl_PerlIO_stdin(pTHX)
 {
diff --git a/perliol.h b/perliol.h
index c77cc6a60e53..0c9159eb25b7 100644
--- a/perliol.h
+++ b/perliol.h
@@ -112,6 +112,7 @@ EXTCONST PerlIO_funcs PerlIO_perlio;
 EXTCONST PerlIO_funcs PerlIO_stdio;
 EXTCONST PerlIO_funcs PerlIO_crlf;
 EXTCONST PerlIO_funcs PerlIO_utf8;
+EXTCONST PerlIO_funcs PerlIO_utf8_lax;
 EXTCONST PerlIO_funcs PerlIO_byte;
 EXTCONST PerlIO_funcs PerlIO_raw;
 EXTCONST PerlIO_funcs PerlIO_pending;
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 88df9485059e..e5776442b6d0 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3591,8 +3591,8 @@ you thought to be in UTF-8 but it wasn't (it was for example legacy 8-bit
 data).  To guard against this, you can use C<Encode::decode('UTF-8', ...)>.
 
 If you use the C<:encoding(UTF-8)> PerlIO layer for input, invalid byte
-sequences are handled gracefully, but if you use C<:utf8>, the flag is set
-without validating the data, possibly resulting in this error message.
+sequences are handled gracefully, but if you use C<:utf8>, an exception
+will be thrown.
 
 See also L<Encode/"Handling Malformed Data">.
 
@@ -6784,6 +6784,10 @@ problems when being input or output, which is likely where this message
 came from.  If you really really know what you are doing you can turn
 off this warning by C<no warnings 'surrogate';>.
 
+=item Unknown argument to :utf8: %s
+
+(F) The :utf8 was given an unknown argument and dies because of it.
+
 =item Unknown charname '%s'
 
 (F) The name you used inside C<\N{}> is unknown to Perl.  Check the
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
index 47958b285174..2703602acdac 100644
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -834,9 +834,6 @@ therefore refers to "layers" rather than to "disciplines".  Now back to
 the regularly scheduled documentation...>
 
 To mark FILEHANDLE as UTF-8, use C<:utf8> or C<:encoding(UTF-8)>.
-C<:utf8> just marks the data as UTF-8 without further checking,
-while C<:encoding(UTF-8)> checks the data for actually being valid
-UTF-8.  More details can be found in L<PerlIO::encoding>.
 
 In general, L<C<binmode>|/binmode FILEHANDLE, LAYER> should be called
 after L<C<open>|/open FILEHANDLE,MODE,EXPR> but before any I/O is done on the
@@ -845,9 +842,7 @@ flushes any pending buffered output data (and perhaps pending input
 data) on the handle.  An exception to this is the C<:encoding> layer
 that changes the default character encoding of the handle.
 The C<:encoding> layer sometimes needs to be called in
-mid-stream, and it doesn't flush the stream.  C<:encoding>
-also implicitly pushes on top of itself the C<:utf8> layer because
-internally Perl operates on UTF8-encoded Unicode characters.
+mid-stream, and it doesn't flush the stream.
 
 The operating system, device drivers, C libraries, and Perl run-time
 system all conspire to let the programmer treat a single
diff --git a/pod/perliol.pod b/pod/perliol.pod
index 2f6f801d9f41..7d4086440d7a 100644
--- a/pod/perliol.pod
+++ b/pod/perliol.pod
@@ -332,7 +332,7 @@ for the layers class.
 
 Data written to this layer should be UTF-8 encoded; data provided
 by this layer should be considered UTF-8 encoded. Can be set on any layer
-by ":utf8" dummy layer. Also set on ":encoding" layer.
+by ":utf8-lax" dummy layer.
 
 =item PERLIO_F_UNBUF
 
@@ -952,9 +952,7 @@ their own Binmode entry.
 
 =item "utf8"
 
-Another dummy layer. When pushed it pops itself and sets the
-C<PERLIO_F_UTF8> flag on the layer which was (and now is once more)
-the top of the stack.
+This layer reads utf-8 encodede unicode data and automatically decodes it.
 
 =back
 
diff --git a/pod/perlrun.pod b/pod/perlrun.pod
index 5d3aa3eb3c47..d56cee5b3fc8 100644
--- a/pod/perlrun.pod
+++ b/pod/perlrun.pod
@@ -1098,6 +1098,14 @@ X<:unix>
 
 Low-level layer that calls C<read>, C<write>, C<lseek>, etc.
 
+<<<<<<< HEAD
+=======
+=item :utf8
+X<:utf8>
+
+This layer reads utf-8 encodede unicode data and automatically decodes it.
+
+>>>>>>> 9a81523725 (Made :utf8 an actual layer)
 =item :win32
 X<:win32>
 
diff --git a/pod/perlunifaq.pod b/pod/perlunifaq.pod
index 262585d47864..28342581c387 100644
--- a/pod/perlunifaq.pod
+++ b/pod/perlunifaq.pod
@@ -277,19 +277,10 @@ based on the user's locale, C<use open>.
 
 =head2 What is the difference between C<:encoding> and C<:utf8>?
 
-Because UTF-8 is one of Perl's internal formats, you can often just skip the
-encoding or decoding step, and manipulate the UTF8 flag directly.
-
-Instead of C<:encoding(UTF-8)>, you can simply use C<:utf8>, which skips the
-encoding step if the data was already represented as UTF8 internally. This is
-widely accepted as good behavior when you're writing, but it can be dangerous
-when reading, because it causes internal inconsistency when you have invalid
-byte sequences. Using C<:utf8> for input can sometimes result in security
-breaches, so please use C<:encoding(UTF-8)> instead.
-
-Instead of C<decode> and C<encode>, you could use C<_utf8_on> and C<_utf8_off>,
-but this is considered bad style. Especially C<_utf8_on> can be dangerous, for
-the same reason that C<:utf8> can.
+C<:encoding> is a generic conversion layer, that converts a file from a variety
+of encodings to perl's internal encoding utf8 and vice versa. C<:utf8> is a
+validation layer that checks if input data is correct UTF-8 but doesn't change
+the bytestream in any way.
 
 There are some shortcuts for oneliners;
 see L<-C in perlrun|perlrun/-C [numberE<sol>list]>.
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
index 16f349fed7b8..1747cd2d43ea 100644
--- a/pod/perluniintro.pod
+++ b/pod/perluniintro.pod
@@ -398,10 +398,7 @@ and on already open streams, use C<binmode()>:
 The matching of encoding names is loose: case does not matter, and
 many encodings have several aliases.  Note that the C<:utf8> layer
 must always be specified exactly like that; it is I<not> subject to
-the loose matching of encoding names. Also note that currently C<:utf8> is unsafe for
-input, because it accepts the data without validating that it is indeed valid
-UTF-8; you should instead use C<:encoding(UTF-8)> (with or without a
-hyphen).
+the loose matching of encoding names.
 
 See L<PerlIO> for the C<:utf8> layer, L<PerlIO::encoding> and
 L<Encode::PerlIO> for the C<:encoding()> layer, and
diff --git a/t/io/crlf.t b/t/io/crlf.t
index d0275af9a0fd..3be7320ebfff 100644
--- a/t/io/crlf.t
+++ b/t/io/crlf.t
@@ -18,7 +18,7 @@ my $crcr = uni_to_native("\x0d\x0d");
 my $ungetc_count = 8200;    # Somewhat over the likely buffer size
 
 {
-    plan(tests => 21 + 2 * $ungetc_count);
+    plan(tests => 17 + 2 * $ungetc_count);
     ok(open(FOO,">:crlf",$file));
     ok(print FOO 'a'.((('a' x 14).qq{\n}) x 2000) || close(FOO));
     ok(open(FOO,"<:crlf",$file));
@@ -66,26 +66,21 @@ my $ungetc_count = 8200;    # Somewhat over the likely buffer size
     # binmode :crlf should not cumulate.
     # Try it first once and then twice so that even UNIXy boxes
     # get to exercise this, for DOSish boxes even once is enough.
-    # Try also pushing :utf8 first so that there are other layers
-    # in between (this should not matter: CRLF layers still should
-    # not accumulate).
-    for my $utf8 ('', ':utf8') {
-	for my $binmode (1..2) {
-	    open(FOO, ">$file");
-	    # require PerlIO; print PerlIO::get_layers(FOO), "\n";
-	    binmode(FOO, "$utf8:crlf") for 1..$binmode;
-	    # require PerlIO; print PerlIO::get_layers(FOO), "\n";
-	    print FOO "Hello\n";
-	    close FOO;
-	    open(FOO, "<$file");
-	    binmode(FOO);
-	    my $foo = scalar <FOO>;
-	    close FOO;
-	    print join(" ", "#", map { sprintf("%02x", $_) } unpack("C*", $foo)),
-	    "\n";
-	    like($foo, qr/$crlf$/);
-	    unlike($foo, qr/$crcr/);
-	}
+    for my $binmode (1..2) {
+	open(FOO, ">$file");
+	# require PerlIO; print PerlIO::get_layers(FOO), "\n";
+	binmode(FOO, ":crlf") for 1..$binmode;
+	# require PerlIO; print PerlIO::get_layers(FOO), "\n";
+	print FOO "Hello\n";
+	close FOO;
+	open(FOO, "<$file");
+	binmode(FOO);
+	my $foo = scalar <FOO>;
+	close FOO;
+	print join(" ", "#", map { sprintf("%02x", $_) } unpack("C*", $foo)),
+	"\n";
+	like($foo, qr/\x0d\x0a$/);
+	unlike($foo, qr/\x0d\x0d/);
     }
 
     {
diff --git a/t/io/layers.t b/t/io/layers.t
index 541b4775bc75..de1247a2f773 100644
--- a/t/io/layers.t
+++ b/t/io/layers.t
@@ -35,11 +35,9 @@ if (${^UNICODE} & 1) {
 } else {
     $UTF8_STDIN = 0;
 }
-my $NTEST = 60 - (($DOSISH || !$FASTSTDIO) ? 7 : 0) - ($DOSISH ? 7 : 0)
+my $NTEST = 55 - (($DOSISH || !$FASTSTDIO) ? 7 : 0) - ($DOSISH ? 7 : 0)
     + $UTF8_STDIN;
 
-sub PerlIO::F_UTF8 () { 0x00008000 } # from perliol.h
-
 plan tests => $NTEST;
 
 print <<__EOH__;
@@ -54,6 +52,7 @@ __EOH__
 
 {
     sub check {
+	local $Test::Builder::Level = $Test::Builder::Level + 1;
 	my ($result, $expected, $id) = @_;
 	# An interesting dance follows where we try to make the following
 	# IO layer stack setups to compare equal:
@@ -128,13 +127,13 @@ __EOH__
     binmode(F, ":encoding(cp1047)"); 
 
     check([ PerlIO::get_layers(F) ],
-	  [ qw[stdio crlf encoding(cp1047) utf8] ],
+	  [ qw[stdio crlf encoding(cp1047)] ],
 	  ":encoding(cp1047)");
 
     binmode(F, ":crlf");
 
     check([ PerlIO::get_layers(F) ],
-	  [ qw[stdio crlf encoding(cp1047) utf8 crlf utf8] ],
+	  [ qw[stdio crlf encoding(cp1047) crlf ] ],
 	  ":encoding(cp1047):crlf");
     
     binmode(F, ":pop:pop");
@@ -164,7 +163,7 @@ __EOH__
     binmode(F, ":encoding(utf8)");
 
     check([ PerlIO::get_layers(F) ],
-	    [ qw[stdio encoding(utf8) utf8] ],
+	    [ qw[stdio encoding(utf8)] ],
 	    ":encoding(utf8)");
 
     binmode(F, ":raw :crlf");
@@ -214,7 +213,7 @@ __EOH__
 	      "use open IN");
 	
 	check([ PerlIO::get_layers(G, output => 1) ],
-	      [ qw[stdio encoding(cp1252) utf8] ],
+	      [ qw[stdio encoding(cp1252)] ],
 	      "use open OUT");
 
 	close F;
diff --git a/t/io/utf8.t b/t/io/utf8.t
index 2ea7429f4dc8..9a18ff936214 100644
--- a/t/io/utf8.t
+++ b/t/io/utf8.t
@@ -11,7 +11,7 @@ skip_all_without_perlio();
 no utf8; # needed for use utf8 not griping about the raw octets
 
 
-plan(tests => 62);
+plan(tests => 61);
 
 $| = 1;
 
@@ -169,13 +169,9 @@ SKIP: {
     if ($::IS_EBCDIC) {
 	skip("EBCDIC The file isn't deformed in UTF-EBCDIC", 2);
     } else {
-	my @warnings;
 	open F, "<:utf8", $a_file or die $!;
-	$x = <F>; chomp $x;
-	local $SIG{__WARN__} = sub { push @warnings, $_[0]; };
-	eval { sprintf "%vd\n", $x };
-	is (scalar @warnings, 1);
-	like ($warnings[0], qr/Malformed UTF-8 character: \\x82 \(unexpected continuation byte 0x82, with no preceding start byte/);
+	eval { $x = <F>; chomp $x; };
+	like ($@, qr/^Can't decode ill-formed UTF-8 octet sequence <82>/);
     }
 }
 
@@ -326,7 +322,6 @@ is($failed, undef);
     # if it finds bad UTF-8 (:encoding(utf8) works this way)
     use warnings 'utf8';
     undef $@;
-    local $SIG{__WARN__} = sub { $@ = shift };
     open F, ">$a_file";
     binmode F;
     my ($chrE4, $chrF6) = (chr(0xE4), chr(0xF6));
@@ -337,14 +332,16 @@ is($failed, undef);
     close F;
     open F, "<:utf8", $a_file;
     undef $@;
+    eval { 
     my $line = <F>;
+    };
     my ($chrE4, $chrF6) = ("E4", "F6");
     if ($::IS_EBCDIC) { ($chrE4, $chrF6) = ("43", "EC"); } # EBCDIC
-    like( $@, qr/utf8 "\\x$chrE4" does not map to Unicode .+ <F> line 1/,
+    like( $@, qr/^Can't decode ill-formed UTF-8 octet sequence <E4>/,
 	  "<:utf8 readline must warn about bad utf8");
     undef $@;
-    $line .= <F>;
-    like( $@, qr/utf8 "\\x$chrF6" does not map to Unicode .+ <F> line 2/,
+    eval { $line .= <F> };
+    like( $@, qr/Can\'t decode ill-formed UTF-8 octet sequence <F6>/, 
 	  "<:utf8 rcatline must warn about bad utf8");
     close F;
 }
@@ -381,10 +378,9 @@ is($failed, undef);
     open F, "<:utf8", $a_file;
     undef $@;
     local $SIG{__WARN__} = sub { $@ = shift };
-    $line = <F>;
+	$line = eval { <F> };
 
-    like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ <F> chunk 1/,
-	  "<:utf8 readline (fixed) must warn about bad utf8");
+    like( $@, qr/Can\'t decode ill-formed UTF-8 octet sequence <EF AC> at end of file/);
     close F;
 }
 
diff --git a/t/op/print.t b/t/op/print.t
index 55b341aa3e46..71c4b4133fe5 100644
--- a/t/op/print.t
+++ b/t/op/print.t
@@ -21,7 +21,7 @@ use warnings;
 no warnings 'utf8';
 
 # These form overlong "oops"
-open my $fh, "<:utf8", \"\xC1\xAF\xC1\xAF\xC1\xB0\xC1\xB3"
+open my $fh, "<:utf8_lax", \"\xC1\xAF\xC1\xAF\xC1\xB0\xC1\xB3"
     or die "Could not open\n";
 read($fh, my $s, 10) or die "Could not read\n";
 print $s;
diff --git a/universal.c b/universal.c
index 5932767bdf3a..b86b3fd18c8b 100644
--- a/universal.c
+++ b/universal.c
@@ -826,14 +826,6 @@ XS(XS_PerlIO_get_layers)
                        else
                             PUSHs(&PL_sv_undef);
                        nitem++;
-                       if (flgok) {
-                            const IV flags = SvIVX(*flgsvp);
-
-                            if (flags & PERLIO_F_UTF8) {
-                                 PUSHs(newSVpvs_flags("utf8", SVs_TEMP));
-                                 nitem++;
-                            }
-                       }
                   }
              }