Fix JuliaLang#10959 bugs with UTF-16 conversions

Rewrote a number of the conversions between ASCIIString, UTF8String, and UTF16String. Rewrote length() for UTF16String(). Improved reverse() for UTF16String(). Added over 150 lines of testing code to detect the above conversion problems Added (in a gist) code to show other conversion problems not yet fixed: https://gist.github.com/ScottPJones/4e6e8938f0559998f9fc Added (in a gist) code to benchmark the performance, to ensure that adding the extra validity checking did not adversely affect performance (in fact, performance was greatly improved). https://gist.github.com/ScottPJones/79ed895f05f85f333d84 Updated based on review comments Changes to error handling and check_string Rebased against JuliaLang#11575 Updated comment to go before function, not indented by 4 Updated to use unsafe_checkstring Removed redundant argument documentation
ScottPJones · Jun 22, 2015 · fdb0b35 · fdb0b35
1 parent 1e081b7
commit fdb0b35
Show file tree

Hide file tree

Showing 4 changed files with 371 additions and 66 deletions.
diff --git a/base/utf16.jl b/base/utf16.jl
@@ -1,17 +1,42 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
-utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
-utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
-utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
-utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)
+# Quickly copy and set trailing \0
+@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}(
+			      ::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
+     S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1))
+end
+
+# Get rest of character ch from 3-byte UTF-8 sequence in dat
+@inline function get_utf8_3byte(dat, pos, ch)
+    @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f)
+end
+# Get rest of character ch from 4-byte UTF-8 sequence in dat
+@inline function get_utf8_4byte(dat, pos, ch)
+    @inbounds return (((ch & 0x7) << 18)
+                        | (UInt32(dat[pos-2] & 0x3f) << 12)
+                        | (UInt32(dat[pos-1] & 0x3f) << 6)
+                        | (dat[pos] & 0x3f))
+end
+
+# Output a character as a 4-byte UTF-8 sequence
+@inline function output_utf8_4byte!(buf, out, ch)
+    @inbounds begin
+        buf[out + 1] = 0xf0 | (ch >>> 18)
+        buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f)
+        buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f)
+        buf[out + 4] = 0x80 | (ch & 0x3f)
+    end
+end
+
+const empty_utf16 = UTF16String(UInt16[0])
 
 function length(s::UTF16String)
     d = s.data
     len = length(d) - 1
     len == 0 && return 0
     cnum = 0
     for i = 1:len
-        @inbounds cnum += !utf16_is_trail(d[i])
+        @inbounds cnum += !is_surrogate_trail(d[i])
     end
     cnum
 end
@@ -20,100 +45,240 @@ function endof(s::UTF16String)
     d = s.data
     i = length(d) - 1
     i == 0 && return i
-    utf16_is_surrogate(d[i]) ? i-1 : i
+    return is_surrogate_codeunit(d[i]) ? i-1 : i
 end
 
+get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)
+
 function next(s::UTF16String, i::Int)
-    if !utf16_is_surrogate(s.data[i])
-        return Char(s.data[i]), i+1
-    elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
-        return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
-    end
-    throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0))
+    ch = s.data[i]
+    !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
+    # check length, account for terminating \0
+    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
+    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
+    ct = s.data[i+1]
+    !is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch))
+    Char(get_supplementary(ch, ct)), i+2
 end
 
 function reverseind(s::UTF16String, i::Integer)
     j = length(s.data) - i
-    return Base.utf16_is_trail(s.data[j]) ? j-1 : j
+    return is_surrogate_trail(s.data[j]) ? j-1 : j
 end
 
 lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
 
 function reverse(s::UTF16String)
-    d =s.data
+    d = s.data
     out = similar(d)
     out[end] = 0 # NULL termination
     n = length(d)
-    for i = 1:n-1
-        out[i] = d[n-i]
-        if Base.utf16_is_lead(out[i])
-            out[i],out[i-1] = out[i-1],out[i]
+    @inbounds for i = 1:n-1
+        ch = d[n-i]
+        if is_surrogate_lead(ch)
+            out[i],out[i-1] = out[i-1],ch
+        else
+            out[i] = ch
+        end
+    end
+    UTF16String(out)
+end
+
+sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
+
+function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
+    i = 1
+    n = length(data) # this may include NULL termination; that's okay
+    @inbounds while i < n # check for unpaired surrogates
+        if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
+            i += 2
+        elseif is_surrogate_codeunit(data[i])
+            return false
+        else
+            i += 1
         end
     end
-    return UTF16String(out)
+    return i > n || !is_surrogate_codeunit(data[i])
 end
 
-# TODO: optimize this
-function encode16(s::AbstractString)
-    buf = UInt16[]
-    for ch in s
-        c = reinterpret(UInt32, ch)
+"
+Converts an `AbstractString` to a `UTF16String`
+
+### Returns:
+*   `UTF16String`
+
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF16String}, str::AbstractString)
+    len, flags, num4byte = unsafe_checkstring(str)
+    buf = Vector{UInt16}(len+num4byte+1)
+    out = 0
+    @inbounds for ch in str
+        c = UInt32(ch)
         if c < 0x10000
-            push!(buf, UInt16(c))
-        elseif c <= 0x10ffff
-            push!(buf, UInt16(0xd7c0 + (c>>10)))
-            push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
+            buf[out += 1] = UInt16(c)
         else
-            throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
+            # output surrogate pair
+            buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
+            buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
         end
     end
-    push!(buf, 0) # NULL termination
+    @inbounds buf[out + 1] = 0 # NULL termination
     UTF16String(buf)
 end
 
-utf16(x) = convert(UTF16String, x)
-convert(::Type{UTF16String}, s::UTF16String) = s
-convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
-convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
-convert(::Type{Array{UInt16}}, s::UTF16String) = s.data
+"
+Converts a `UTF8String` to a `UTF16String`
 
-# TODO: optimize this
-convert(::Type{UTF8String}, s::UTF16String) =
-    sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)
+### Returns:
+*   `UTF16String`
 
-sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
-unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
-    convert(Ptr{T}, pointer(s))
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF16String}, str::UTF8String)
+    dat = str.data
+    # handle zero length string quickly
+    sizeof(dat) == 0 && return empty_utf16
+    # Check that is correct UTF-8 encoding and get number of words needed
+    len, flags, num4byte = unsafe_checkstring(dat)
+    len += num4byte
+    buf = Vector{UInt16}(len+1)
+    @inbounds buf[len+1] = 0
+    # Optimize case where no characters > 0x7f
+    flags == 0 && @inbounds return UTF16String(copy!(buf, dat))
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle range 0x80-0x7ff
+        elseif ch < 0xe0
+            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
+        # Handle range 0x800-0xffff
+        elseif ch < 0xf0
+            pos += 2
+            buf[out += 1] = get_utf8_3byte(dat, pos, ch)
+        # Handle range 0x10000-0x10ffff
+        else
+            pos += 3
+            ch = get_utf8_4byte(dat, pos, ch)
+            # output surrogate pair
+            buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
+            buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
+        end
+    end
+    UTF16String(buf)
+end
 
-function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
-    i = 1
-    n = length(data) # this may include NULL termination; that's okay
-    while i < n # check for unpaired surrogates
-        if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
-            i += 2
-        elseif utf16_is_surrogate(data[i])
-            return false
+"
+Converts a UTF-16 encoded vector of `UInt16` to a `UTF8String`
+
+### Returns:
+*   `UTF8String`
+
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF8String}, dat::Vector{UInt16})
+    len = sizeof(dat)
+    # handle zero length string quickly
+    len == 0 && return emtpy_utf8
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len>>>1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
+    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+"
+Converts a `UTF16String` to a `UTF8String`
+
+### Returns:
+*   `UTF8String`
+
+### Throws:
+*   `UnicodeError`
+"
+function convert(::Type{UTF8String}, str::UTF16String)
+    dat = str.data
+    len = sizeof(dat) >>> 1
+    # handle zero length string quickly
+    len <= 1 && return empty_utf8
+    # get number of bytes to allocate
+    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1)
+    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
+    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
+end
+
+"
+Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
+
+### Input Arguments:
+* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
+* `len` length of output in bytes
+
+### Returns:
+* `UTF8String`
+"
+function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
+    buf = Vector{UInt8}(len)
+    out = 0
+    pos = 0
+    @inbounds while out < len
+        ch::UInt32 = dat[pos += 1]
+        # Handle ASCII characters
+        if ch <= 0x7f
+            buf[out += 1] = ch
+        # Handle 0x80-0x7ff
+        elseif ch < 0x800
+            buf[out += 1] = 0xc0 | (ch >>> 6)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
+        # Handle 0x10000-0x10ffff (if input is UInt32)
+        elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
+            output_utf8_4byte!(buf, out, ch)
+            out += 4
+        # Handle surrogate pairs
+        elseif is_surrogate_codeunit(ch)
+            output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
+            out += 4
+        # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
         else
-            i += 1
+            buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
+            buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
+            buf[out += 1] = 0x80 | (ch & 0x3f)
         end
     end
-    return i > n || !utf16_is_surrogate(data[i])
+    UTF8String(buf)
 end
 
-function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
-    !isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
-    len = length(data)
-    d = Array(UInt16, len + 1)
-    d[end] = 0 # NULL terminate
-    UTF16String(copy!(d,1, data,1, len))
+function convert(::Type{UTF16String}, str::ASCIIString)
+    dat = str.data
+    @inbounds return fast_utf_copy(UTF16String, UInt16, length(dat), dat, true)
 end
 
+convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data
+convert(::Type{Array{UInt16}},  str::UTF16String) = str.data
+
+convert(::Type{UTF16String}, str::UTF16String)    = str
+
+unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
+    convert(Ptr{T}, pointer(s))
+
 convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
     convert(T, reshape(data, length(data)))
 
 convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
     convert(T, reinterpret(UInt16, data))
 
+function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
+    !isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
+    len = length(data)
+    @inbounds return UTF16String(setindex!(copy!(Vector{UInt16}(len+1),1,data,1,len),0,len+1))
+end
+
 function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     isempty(bytes) && return UTF16String(UInt16[0])
     isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
@@ -136,6 +301,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
     UTF16String(d)
 end
 
+convert(::Type{UTF16String}, str::UTF16String)    = str
+
+utf16(x) = convert(UTF16String, x)
 utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
 utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
 function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}})