From fdb0b3504f401a218083c7d250f5b042bd9c702a Mon Sep 17 00:00:00 2001
From: ScottPJones <scottjones@alum.mit.edu>
Date: Wed, 3 Jun 2015 02:48:59 +0200
Subject: [PATCH] Fix #10959 bugs with UTF-16 conversions

Rewrote a number of the conversions between ASCIIString, UTF8String, and UTF16String.
Rewrote length() for UTF16String().
Improved reverse() for UTF16String().

Added over 150 lines of testing code to detect the above conversion problems

Added (in a gist) code to show other conversion problems not yet fixed:
https://gist.github.com/ScottPJones/4e6e8938f0559998f9fc

Added (in a gist) code to benchmark the performance, to ensure that adding the extra validity
checking did not adversely affect performance (in fact, performance was greatly improved).
https://gist.github.com/ScottPJones/79ed895f05f85f333d84

Updated based on review comments

Changes to error handling and check_string

Rebased against #11575
Updated comment to go before function, not indented by 4

Updated to use unsafe_checkstring

Removed redundant argument documentation
---
 base/utf16.jl    | 282 +++++++++++++++++++++++++++++++++++++----------
 base/utf32.jl    |  15 +--
 base/utferror.jl |   2 +-
 test/strings.jl  | 138 ++++++++++++++++++++++-
 4 files changed, 371 insertions(+), 66 deletions(-)

diff --git a/base/utf16.jl b/base/utf16.jl
index 8caa3fd14f19d..5023eea20ca83 100644
--- a/base/utf16.jl
+++ b/base/utf16.jl
@@ -1,9 +1,34 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
-utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
-utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
-utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)
+# Copy `len` (or `len+1` when `flag` is false) code units of `dat`, then force a trailing \0
+@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}(
+        ::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
+    S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1))
+end
+
+# Decode a 3-byte UTF-8 sequence: ch is the lead byte, pos indexes its last byte in dat
+@inline function get_utf8_3byte(dat, pos, ch)
+    @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f)
+end
+# Decode a 4-byte UTF-8 sequence: ch is the lead byte, pos indexes its last byte in dat
+@inline function get_utf8_4byte(dat, pos, ch)
+    @inbounds return (((ch & 0x7) << 18)
+                        | (UInt32(dat[pos-2] & 0x3f) << 12)
+                        | (UInt32(dat[pos-1] & 0x3f) << 6)
+                        | (dat[pos] & 0x3f))
+end
+
+# Write ch as a 4-byte UTF-8 sequence into buf[out+1] .. buf[out+4]; out is not advanced here
+@inline function output_utf8_4byte!(buf, out, ch)
+    @inbounds begin
+        buf[out + 1] = 0xf0 | (ch >>> 18)
+        buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f)
+        buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f)
+        buf[out + 4] = 0x80 | (ch & 0x3f)
+    end
+end
+
+const empty_utf16 = UTF16String(UInt16[0])
 
 function length(s::UTF16String)
     d = s.data
@@ -11,7 +36,7 @@ function length(s::UTF16String)
     len == 0 && return 0
     cnum = 0
     for i = 1:len
-        @inbounds cnum += !utf16_is_trail(d[i])
+        @inbounds cnum += !is_surrogate_trail(d[i])
     end
     cnum
 end
@@ -20,100 +45,240 @@ function endof(s::UTF16String)
     d = s.data
     i = length(d) - 1
     i == 0 && return i
-    utf16_is_surrogate(d[i]) ? i-1 : i
+    return is_surrogate_codeunit(d[i]) ? i-1 : i
 end
 
+get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)
+
 function next(s::UTF16String, i::Int)
-    if !utf16_is_surrogate(s.data[i])
-        return Char(s.data[i]), i+1
-    elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
-        return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
-    end
-    throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0))
+    ch = s.data[i]
+    !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
+    # a surrogate in the last position (just before the terminating \0) has no partner
+    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
+    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
+    ct = s.data[i+1]
+    !is_surrogate_trail(ct) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, i, ch))
+    Char(get_supplementary(ch, ct)), i+2
 end
 
 function reverseind(s::UTF16String, i::Integer)
     j = length(s.data) - i
-    return Base.utf16_is_trail(s.data[j]) ? j-1 : j
+    return is_surrogate_trail(s.data[j]) ? j-1 : j
 end
 
 lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
 
 function reverse(s::UTF16String)
-    d =s.data
+    d = s.data
     out = similar(d)
     out[end] = 0 # NULL termination
     n = length(d)
-    for i = 1:n-1
-        out[i] = d[n-i]
-        if Base.utf16_is_lead(out[i])
-            out[i],out[i-1] = out[i-1],out[i]
+    @inbounds for i = 1:n-1
+        ch = d[n-i]
+        if is_surrogate_lead(ch)
+            out[i],out[i-1] = out[i-1],ch
+        else
+            out[i] = ch
+        end
+    end
+    UTF16String(out)
+end
+
+sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
+
+function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
+    i = 1
+    n = length(data) # this may include NULL termination; that's okay
+    @inbounds while i < n # check for unpaired surrogates
+        if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
+            i += 2
+        elseif is_surrogate_codeunit(data[i])
+            return false
+        else
+            i += 1
         end
     end
-    return UTF16String(out)
+    return i > n || !is_surrogate_codeunit(data[i])
 end
 
-# TODO: optimize this
-function encode16(s::AbstractString)
-    buf = UInt16[]
-    for ch in s
-        c = reinterpret(UInt32, ch)
+"
+Converts an `AbstractString` to a `UTF16String`
+
+### Returns:
+*   `UTF16String`
+
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF16String}, str::AbstractString)
+    len, flags, num4byte = unsafe_checkstring(str)
+    buf = Vector{UInt16}(len+num4byte+1)
+    out = 0
+    @inbounds for ch in str
+        c = UInt32(ch)
         if c < 0x10000
-            push!(buf, UInt16(c))
-        elseif c <= 0x10ffff
-            push!(buf, UInt16(0xd7c0 + (c>>10)))
-            push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
+            buf[out += 1] = UInt16(c)
         else
-            throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
+            # output surrogate pair (computed from the UInt32 code point c, not the Char)
+            buf[out += 1] = UInt16(0xd7c0 + (c >>> 10))
+            buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff))
         end
     end
-    push!(buf, 0) # NULL termination
+    @inbounds buf[out + 1] = 0 # NULL termination
     UTF16String(buf)
 end
 
-utf16(x) = convert(UTF16String, x)
-convert(::Type{UTF16String}, s::UTF16String) = s
-convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
-convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
-convert(::Type{Array{UInt16}}, s::UTF16String) = s.data
+"
+Converts a `UTF8String` to a `UTF16String`
 
-# TODO: optimize this
-convert(::Type{UTF8String}, s::UTF16String) =
-    sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)
+### Returns:
+*   `UTF16String`
 
-sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
-unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
-    convert(Ptr{T}, pointer(s))
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF16String}, str::UTF8String)
+    dat = str.data
+    # an empty string needs no checking; return the shared empty UTF16String
+    sizeof(dat) == 0 && return empty_utf16
+    # Check for correct UTF-8 encoding; each 4-byte sequence needs one extra 16-bit word
+    len, flags, num4byte = unsafe_checkstring(dat)
+    len += num4byte
+    buf = Vector{UInt16}(len+1)
+    @inbounds buf[len+1] = 0 # NULL termination
+    # Optimize case where no characters > 0x7f
+    flags == 0 && @inbounds return UTF16String(copy!(buf, dat))
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle range 0x80-0x7ff
+        elseif ch < 0xe0
+            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle range 0x800-0xffff
+        elseif ch < 0xf0
+            pos += 2
+            buf[out += 1] = get_utf8_3byte(dat, pos, ch)
+        # Handle range 0x10000-0x10ffff
+        else
+            pos += 3
+            ch = get_utf8_4byte(dat, pos, ch)
+            # output surrogate pair
+            buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
+            buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
+        end
+    end
+    UTF16String(buf)
+end
 
-function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
-    i = 1
-    n = length(data) # this may include NULL termination; that's okay
-    while i < n # check for unpaired surrogates
-        if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
-            i += 2
-        elseif utf16_is_surrogate(data[i])
-            return false
+"
+Converts a UTF-16 encoded vector of `UInt16` to a `UTF8String`
+
+### Returns:
+*   `UTF8String`
+
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF8String}, dat::Vector{UInt16})
+    len = sizeof(dat)
+    # handle zero length input quickly
+    len == 0 && return empty_utf8
+    # check all len>>>1 code units and get the per-size character counts
+    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len>>>1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
+    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+"
+Converts a `UTF16String` to a `UTF8String`
+
+### Returns:
+*   `UTF8String`
+
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF8String}, str::UTF16String)
+    dat = str.data
+    len = sizeof(dat) >>> 1
+    # at most one code unit stored (the terminating \0): the string is empty
+    len <= 1 && return empty_utf8
+    # check the code units before the terminator and get the per-size character counts
+    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+"
+Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
+
+### Input Arguments:
+* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
+* `len` length of output in bytes
+
+### Returns:
+* `UTF8String`
+"
+function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
+    buf = Vector{UInt8}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle 0x80-0x7ff
+        elseif ch < 0x800
+            buf[out += 1] = 0xc0 | (ch >>> 6)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
+        # Handle 0x10000-0x10ffff (if input is UInt32)
+        elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
+            output_utf8_4byte!(buf, out, ch)
+            out += 4
+        # Handle surrogate pairs
+        elseif is_surrogate_codeunit(ch)
+            output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
+            out += 4
+        # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
         else
-            i += 1
+            buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
+            buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
         end
     end
-    return i > n || !utf16_is_surrogate(data[i])
+    UTF8String(buf)
 end
 
-function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
-    !isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
-    len = length(data)
-    d = Array(UInt16, len + 1)
-    d[end] = 0 # NULL terminate
-    UTF16String(copy!(d,1, data,1, len))
+function convert(::Type{UTF16String}, str::ASCIIString)
+    dat = str.data
+    @inbounds return fast_utf_copy(UTF16String, UInt16, length(dat), dat, true)
 end
 
+convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data
+convert(::Type{Array{UInt16}},  str::UTF16String) = str.data
+
+convert(::Type{UTF16String}, str::UTF16String)    = str
+
+unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
+    convert(Ptr{T}, pointer(s))
+
 convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
     convert(T, reshape(data, length(data)))
 
 convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
     convert(T, reinterpret(UInt16, data))
 
+function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
+    !isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
+    len = length(data)
+    @inbounds return UTF16String(setindex!(copy!(Vector{UInt16}(len+1),1,data,1,len),0,len+1))
+end
+
 function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF16String(UInt16[0])
     isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
@@ -136,6 +301,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     UTF16String(d)
 end
 
+convert(::Type{UTF16String}, str::UTF16String)    = str
+
+utf16(x) = convert(UTF16String, x)
 utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
 utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
 function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}})
diff --git a/base/utf32.jl b/base/utf32.jl
index b85213e747db1..0d481bfda353c 100644
--- a/base/utf32.jl
+++ b/base/utf32.jl
@@ -1,5 +1,6 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
+# UTF-32 basic functions
 next(s::UTF32String, i::Int) = (s.data[i], i+1)
 endof(s::UTF32String) = length(s.data) - 1
 length(s::UTF32String) = length(s.data) - 1
@@ -40,8 +41,8 @@ function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
     convert(T, takebuf_string(s))
 end
 
-convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
-convert(::Type{Array{Char}}, s::UTF32String) = s.data
+convert(::Type{Vector{Char}}, str::UTF32String) = str.data
+convert(::Type{Array{Char}},  str::UTF32String) = str.data
 
 reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
 
@@ -60,19 +61,19 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
     elseif data[1] == Char(0xfffe0000) # byte-swapped
         d = Array(Char, length(data))
         for i = 2:length(data)
-            d[i-1] = bswap(data[i])
+            @inbounds d[i-1] = bswap(data[i])
         end
     else
         d = Array(Char, length(data) + 1)
         copy!(d, 1, data, 1, length(data)) # assume native byte order
     end
-    d[end] = Char(0) # NULL terminate
+    d[end] = 0 # NULL terminate
     UTF32String(d)
 end
 
 function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
     for i=1:length(str)
-        @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
+        @inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end
     end
     return true
 end
@@ -89,9 +90,9 @@ end
 function map(f, s::UTF32String)
     d = s.data
     out = similar(d)
-    out[end] = Char(0)
+    out[end] = 0
 
-    for i = 1:(length(d)-1)
+    @inbounds for i = 1:(length(d)-1)
         c2 = f(d[i])
         if !isa(c2, Char)
             throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
diff --git a/base/utferror.jl b/base/utferror.jl
index 27b36e45b44fb..352bd03a16308 100644
--- a/base/utferror.jl
+++ b/base/utferror.jl
@@ -1,6 +1,6 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-##\brief      Error messages for Unicode / UTF support
+##    Error messages for Unicode / UTF support
 
 const UTF_ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)"
 const UTF_ERR_CONT              = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
diff --git a/test/strings.jl b/test/strings.jl
index 3701567bae68e..9d400c8ec4302 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -1732,7 +1732,7 @@ d = UTF32String(c)
 c[1] = 'A'
 @test d=="A"
 
-# 11575
+# Issue #11575
 # Test invalid sequences
 
 byt = 0x0 # Needs to be defined outside the try block!
@@ -1897,3 +1897,139 @@ end
 @test [c for c in "ḟøøƀäṙ"] == ['ḟ', 'ø', 'ø', 'ƀ', 'ä', 'ṙ']
 @test [i for i in eachindex("ḟøøƀäṙ")] == [1, 4, 6, 8, 10, 12]
 @test [x for x in enumerate("ḟøøƀäṙ")] == [(1, 'ḟ'), (2, 'ø'), (3, 'ø'), (4, 'ƀ'), (5, 'ä'), (6, 'ṙ')]
+
+# issue #11551 (#11004,#10959)
+function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String)
+    @test utf16(strUTF8) == strUTF16
+    @test utf8(strUTF16) == strUTF8
+end
+
+# Create some ASCII, UTF8 and UTF16
+strAscii = "abcdefgh"
+strA_UTF8 = ("abcdefgh\uff")[1:8]
+strL_UTF8 = "abcdef\uff\uff"
+str2_UTF8 = "abcd\uff\uff\u7ff\u7ff"
+str3_UTF8 = "abcd\uff\uff\u7fff\u7fff"
+str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff"
+strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80")
+strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000")
+strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80")
+strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0")
+
+strA_UTF16 = utf16(strA_UTF8)
+strL_UTF16 = utf16(strL_UTF8)
+str2_UTF16 = utf16(str2_UTF8)
+str3_UTF16 = utf16(str3_UTF8)
+str4_UTF16 = utf16(str4_UTF8)
+strS_UTF16 = utf16(strS_UTF8)
+
+@test utf8(strAscii) == strAscii
+@test utf16(strAscii) == strAscii
+
+tstcvt(strA_UTF8,strA_UTF16)
+tstcvt(strL_UTF8,strL_UTF16)
+tstcvt(str2_UTF8,str2_UTF16)
+tstcvt(str3_UTF8,str3_UTF16)
+tstcvt(str4_UTF8,str4_UTF16)
+
+# Test converting surrogate pairs
+@test utf16(strS_UTF8) == strC_UTF8
+@test utf8(strS_UTF16) == strC_UTF8
+
+# Test converting overlong \0
+# @test utf8(strZ_UTF8)  == strz_UTF8   # currently broken! (in utf8.jl)
+@test utf16(strZ_UTF8) == strz_UTF8
+
+# Test invalid sequences
+
+byt = 0x0
+for T in (UTF16String,) # UTF32String
+    try
+    # Continuation byte not after lead
+    for byt in 0x80:0xbf
+        @test_throws UnicodeError convert(T,  UTF8String(UInt8[byt]))
+    end
+
+    # Test lead bytes
+    for byt in 0xc0:0xff
+        # Single lead byte at end of string
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt]))
+        # Lead followed by non-continuation character < 0x80
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0]))
+        # Lead followed by non-continuation character > 0xbf
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0]))
+    end
+
+    # Test overlong 2-byte
+    for byt in 0x81:0xbf
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt]))
+    end
+    for byt in 0x80:0xbf
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt]))
+    end
+
+    # Test overlong 3-byte
+    for byt in 0x80:0x9f
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80]))
+    end
+
+    # Test overlong 4-byte (lead byte 0xf0 with second byte below 0x90)
+    for byt in 0x80:0x8f
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf0,byt,0x80,0x80]))
+    end
+
+    # Test 4-byte > 0x10ffff
+    for byt in 0x90:0xbf
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80]))
+    end
+    for byt in 0xf5:0xf7
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80]))
+    end
+
+    # Test 5-byte
+    for byt in 0xf8:0xfb
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80]))
+    end
+
+    # Test 6-byte
+    for byt in 0xfc:0xfd
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
+    end
+
+    # Test 7-byte
+    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
+
+    # Three and above byte sequences
+    for byt in 0xe0:0xef
+        # Lead followed by only 1 continuation byte
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80]))
+        # Lead ended by non-continuation character < 0x80
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0]))
+        # Lead ended by non-continuation character > 0xbf
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0]))
+    end
+
+    # 3-byte encoded surrogate character(s)
+    # Single surrogate
+    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80]))
+    # Not followed by surrogate
+    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
+    # Trailing surrogate first
+    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
+    # Followed by lead surrogate
+    @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
+
+    # Four byte sequences
+    for byt in 0xf0:0xf4
+        # Lead followed by only 2 continuation bytes
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80]))
+        # Lead followed by non-continuation character < 0x80
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0]))
+        # Lead followed by non-continuation character > 0xbf
+        @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0]))
+    end
+    catch exp ;
+        println("Error checking $T: $byt")
+        throw(exp)
+    end
+end