From 231b53fb8b1f643decb000c97f1d99a268340187 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Wed, 3 Jun 2015 02:48:59 +0200 Subject: [PATCH] Fix #10959 UTF-32 conversion errors Added new `convert` methods that use the `checkstring` function to validate input Added tests for many sorts of valid/invalid data Depends on PR #11551 and #11575 --- base/utf32.jl | 277 ++++++++++++++++++++++++++-- base/utfcheck.jl | 10 + base/utfconvert.jl | 444 +++++++++++++++++++++++++++++++++++++++++++++ test/strings.jl | 38 +++- 4 files changed, 751 insertions(+), 18 deletions(-) create mode 100644 base/utfconvert.jl diff --git a/base/utf32.jl b/base/utf32.jl index 0d481bfda353c..e9af613f3f32e 100644 --- a/base/utf32.jl +++ b/base/utf32.jl @@ -5,25 +5,277 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1) endof(s::UTF32String) = length(s.data) - 1 length(s::UTF32String) = length(s.data) - 1 +reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) + +sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char) + +const empty_utf32 = UTF32String(UInt32[0]) + utf32(x) = convert(UTF32String, x) convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) convert(::Type{UTF32String}, s::UTF32String) = s -function convert(::Type{UTF32String}, s::AbstractString) - a = Array(Char, length(s) + 1) - i = 0 - for c in s - a[i += 1] = c +" +Converts an `AbstractString` to a `UTF16String` + +### Input Arguments: +* `::Type{UTF32String}` +* `str::AbstractString` + +### Returns: +* `::UTF32String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF32String}, str::AbstractString) + len, flags = check_string(str) + buf = Vector{Char}(len+1) + out = 0 + @inbounds for ch in str ; buf[out += 1] = ch ; end + @inbounds buf[out + 1] = 0 # NULL termination + UTF32String(buf) +end + +" +Converts a UTF-32 encoded vector of `UInt32` to a `UTF8String` + +### Input Arguments: +* `::Type{UTF8String}` +* `dat::Vector{UInt32}` + +### Returns: +* `::UTF8String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF8String}, dat::Vector{UInt32}) + len = sizeof(dat) + # handle zero length string quickly + len == 0 && return empty_utf8 + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>2) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +" +Converts a `UTF32String` to a `UTF8String` + +### Input Arguments: +* `::Type{UTF8String}` +* `str::UTF32String` + +### Returns: +* `::UTF8String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF8String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) >>> 2 + # handle zero length string quickly + len <= 1 && return empty_utf8 + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +" +Converts a `UTF8String` to a `UTF32String` + +### Input Arguments: +* `::Type{UTF32String}` +* `str::UTF8String` + +### Returns: +* `::UTF32String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF32String}, str::UTF8String) + dat = str.data + # handle zero length string quickly + sizeof(dat) == 0 && return empty_utf32 + # Validate UTF-8 encoding, and get number of words to create + len, flags = check_string(dat) + # Optimize case where no characters > 0x7f + flags == 0 && @inbounds return fast_utf_copy(UTF32String, Char, len, dat, true) + # has multi-byte UTF-8 sequences + buf = Vector{Char}(len+1) + @inbounds buf[len+1] = 0 # NULL termination + local ch::UInt32, surr::UInt32 + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle range 0x80-0x7ff + elseif ch < 0xe0 + buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) + # Handle range 0x800-0xffff + elseif ch < 0xf0 + pos += 2 + ch = get_utf8_3byte(dat, pos, ch) + # Handle surrogate pairs (should have been encoded in 4 bytes) + if is_surrogate_lead(ch) + # Build up 32-bit character from ch and trailing surrogate in next 3 bytes + pos += 3 + surr = ((UInt32(dat[pos-2] & 0xf) << 12) + | (UInt32(dat[pos-1] & 0x3f) << 6) + | (dat[pos] & 0x3f)) + ch = get_supplementary(ch, surr) + end + buf[out += 1] = ch + # Handle range 0x10000-0x10ffff + else + pos += 3 + buf[out += 1] = get_utf8_4byte(dat, pos, ch) + end + end + UTF32String(buf) +end + +" +Converts a `UTF16String` to `UTF32String` + +### Input Arguments: +* `::Type{UTF32String}` +* `str::UTF16String` + +### Returns: +* `::UTF32String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF32String}, str::UTF16String) + dat = str.data + len = sizeof(dat) + # handle zero length string quickly (account for trailing \0) + len <= 2 && return empty_utf32 + # get number of words to create + len, flags, num4byte = check_string(dat, len>>>1) + # No surrogate pairs, do optimized copy + (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat)) + local ch::UInt32 + buf = Vector{Char}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # check for surrogate pair + if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end + buf[out += 1] = ch end - a[end] = Char(0) # NULL terminate - UTF32String(a) + UTF32String(buf) +end + +" +Converts a UTF-32 encoded vector of `UInt32` to a `UTF16String` + +### Input Arguments: +* `::Type{UTF16String}` +* `dat::Vector{UInt32}` + +### Returns: +* `::UTF16String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF16String}, dat::Vector{UInt32}) + len = sizeof(dat) + # handle zero length string quickly + len <= 4 && return empty_utf16 + # get number of words to allocate + len, flags, num4byte = check_string(dat, len>>>2) + len += num4byte + 1 + # optimized path, no surrogates + num4byte == 0 && @inbounds return fast_utf_copy(UTF16String, UInt16, len, dat) + return encode_to_utf16(dat, len) +end + +" +Converts a `UTF32String` to `UTF16String` + +### Input Arguments: +* `::Type{UTF16String}` +* `str::UTF32String` + +### Returns: +* `::UTF16String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF16String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) + # handle zero length string quickly + len <= 4 && return empty_utf16 + # get number of words to allocate + len, flags, num4byte = check_string(dat, len>>>2) + # optimized path, no surrogates + num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat)) + return encode_to_utf16(dat, len + num4byte) +end + +" +Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String` + +### Input Arguments: +* `dat::Vector{UInt32}` UTF-32 encoded data +* `len` length of output in 16-bit words + +### Returns: +* `::UTF16String` +" +function encode_to_utf16(dat, len) + buf = Vector{UInt16}(len) + @inbounds buf[len] = 0 # NULL termination + out = 0 + pos = 0 + @inbounds while out < len + ch = UInt32(dat[pos += 1]) + if ch > 0xffff + # Output surrogate pair for 0x10000-0x10ffff + buf[out += 1] = 0xd7c0 + (ch >>> 10) + ch = 0xdc00 + (ch & 0x3ff) + end + buf[out += 1] = ch + end + UTF16String(buf) +end + +convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat)) + +convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat)) +convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) + +function convert(::Type{UTF32String}, str::ASCIIString) + dat = str.data + @inbounds return fast_utf_copy(UTF32String, Char, length(dat), dat, true) +end + +function convert(::Type{UTF32String}, dat::AbstractVector{Char}) + @inbounds return fast_utf_copy(UTF32String, Char, length(dat), dat, true) end function convert(::Type{UTF32String}, data::AbstractVector{Char}) len = length(data) - d = Array(Char, len + 1) - d[end] = Char(0) # NULL terminate - UTF32String(copy!(d,1, data,1, len)) + @inbounds return UTF32String(setindex!(copy!(Vector{Char}(len+1),1,data,1,len),0,len+1)) +end + +function convert(::Type{UTF32String}, data::AbstractVector{Char}) + len = length(data) + @inbounds return UTF32String(setindex!(copy!(Vector{Char}(len+1),1,data,1,len),0,len+1)) end convert{T<:Union{Int32,UInt32}}(::Type{UTF32String}, data::AbstractVector{T}) = @@ -46,12 +298,11 @@ convert(::Type{Array{Char}}, str::UTF32String) = str.data reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) -sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char) unsafe_convert{T<:Union{Int32,UInt32,Char}}(::Type{Ptr{T}}, s::UTF32String) = convert(Ptr{T}, pointer(s)) function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) - isempty(bytes) && return UTF32String(Char[0]) + isempty(bytes) && return empty_utf32 length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0)) data = reinterpret(Char, bytes) # check for byte-order mark (BOM): @@ -79,6 +330,8 @@ function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}}) end isvalid(str::Vector{Char}) = isvalid(UTF32String, str) +utf32(x) = convert(UTF32String, x) + utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len)) utf32(p::Union{Ptr{UInt32}, Ptr{Int32}}, len::Integer) = utf32(convert(Ptr{Char}, p), len) function utf32(p::Union{Ptr{Char}, Ptr{UInt32}, Ptr{Int32}}) diff --git a/base/utfcheck.jl b/base/utfcheck.jl index 8483c05d9f2c5..9893495dcc8ed 100644 --- a/base/utfcheck.jl +++ b/base/utfcheck.jl @@ -194,7 +194,11 @@ end " Validates and calculates number of characters in a UTF-8,UTF-16 or UTF-32 encoded vector/string +<<<<<<< HEAD This function checks the bounds of the start and end positions +======= +This function checks the bounds of the start or end positions +>>>>>>> 0a652d1... Add Unicode validation function and fix UTF-16 conversion bugs Use `unsafe_checkstring` to avoid that overhead if the bounds have already been checked ### Input Arguments: @@ -221,9 +225,15 @@ function checkstring end checkstring(dat; kwargs...) = unsafe_checkstring(dat, start(dat), endof(dat); kwargs...) # Make sure that beginning and end positions are bounds checked +<<<<<<< HEAD function checkstring(dat, startpos, endpos = endof(dat); kwargs...) checkbounds(dat,startpos) checkbounds(dat,endpos) endpos < startpos && throw(ArgumentError("End position ($endpos) is less than start position ($startpos)")) +======= +function checkstring(dat, startpos = start(dat), endpos = endof(dat); kwargs...) + startpos < 1 && throw(BoundsError(dat, startpos)) + (startpos <= endpos <= endof(dat)) || throw(BoundsError(dat, endpos)) +>>>>>>> 0a652d1... Add Unicode validation function and fix UTF-16 conversion bugs unsafe_checkstring(dat, startpos, endpos; kwargs...) end diff --git a/base/utfconvert.jl b/base/utfconvert.jl new file mode 100644 index 0000000000000..cd5b12cb8b068 --- /dev/null +++ b/base/utfconvert.jl @@ -0,0 +1,444 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +# Functions to convert to different UTF encodings + +# Quickly copy and set trailing \0 +@inline function fast_utf_copy(T::Type{UInt16}, len, dat) + @inbounds return UTF16String(setindex!(copy!(Vector{T}(len), dat), 0, len)) +end +@inline function fast_utf_copy(T::Type{Char}, len, dat) + @inbounds return UTF32String(setindex!(copy!(Vector{T}(len), dat), 0, len)) +end + +# Get rest of character ch from 3-byte UTF-8 sequence in dat +@inline function get_utf8_3(dat, pos, ch) + @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f) +end + +# Get rest of character ch from 4-byte UTF-8 sequence in dat +@inline function get_utf8_4(dat, pos, ch) + @inbounds return (((ch & 0x7) << 18) + | (UInt32(dat[pos-2] & 0x3f) << 12) + | (UInt32(dat[pos-1] & 0x3f) << 6) + | (dat[pos] & 0x3f)) +end + +# Output a character as a 4-byte UTF-8 sequence +@inline function output_utf8_4(buf, out, ch) + @inbounds begin + buf[out + 1] = 0xf0 | (ch >>> 18) + buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f) + buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f) + buf[out + 4] = 0x80 | (ch & 0x3f) + end +end + +#= +""" +@brief Converts an AbstractString to a UTF16String + +@param[in] ::Type{UTF16String} +@param[in] str::AbstractString + +@return ::UTF16String +@throws ArgumentError +""" +=# +function convert(::Type{UTF16String}, str::AbstractString) + len, flags, num4byte = check_string_abs(str) + buf = Vector{UInt16}(len+num4byte+1) + out = 0 + @inbounds for ch in str + c = UInt32(ch) + if c < 0x10000 + buf[out += 1] = UInt16(c) + else + # output surrogate pair + buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) + buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) + end + end + @inbounds buf[out + 1] = 0 # NULL termination + UTF16String(buf) +end + +#= +""" +@brief Converts an AbstractString to a UTF32String + +@param[in] ::Type{UTF32String} +@param[in] str::AbstractString + +@return ::UTF32String +@throws ArgumentError +""" +=# +function convert(::Type{UTF32String}, str::AbstractString) + len, flags = check_string_abs(str) + buf = Vector{Char}(len+1) + out = 0 + @inbounds for ch in str ; buf[out += 1] = ch ; end + @inbounds buf[out + 1] = 0 # NULL termination + UTF32String(buf) +end + +#= +@doc """ +@brief Converts a UTF8String to a UTF16String + +@param[in] ::Type{UTF16String} +@param[in] str::UTF8String + +@return ::UTF16String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF16String}, str::UTF8String) + dat = str.data + # handle zero length string quickly + sizeof(dat) == 0 && return empty_utf16 + # Check that is correct UTF-8 encoding and get number of words needed + len, flags, num4byte = check_string_utf8(dat) + len += num4byte + buf = Vector{UInt16}(len+1) + @inbounds buf[len+1] = 0 + # Optimize case where no characters > 0x7f + flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) + out = 0 + pos = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle range 0x80-0x7ff + elseif ch < 0xe0 + buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) + # Handle range 0x800-0xffff + elseif ch < 0xf0 + pos += 2 + buf[out += 1] = get_utf8_3(dat, pos, ch) + # Handle range 0x10000-0x10ffff + else + pos += 3 + ch = get_utf8_4(dat, pos, ch) + # output surrogate pair + buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) + buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) + end + end + UTF16String(buf) +end + +#= +@doc """ +@brief Converts a UTF-16 encoded vector of UInt16 to a UTF8String + +@param[in] ::Type{UTF8String} +@param[in] dat::Vector{UInt16} + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, dat::Vector{UInt16}) + len = sizeof(dat) + # handle zero length string quickly + len == 0 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat)) + return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Converts a UTF16String to a UTF8String + +@param[in] ::Type{UTF8String} +@param[in] str::UTF16String + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, str::UTF16String) + dat = str.data + len = sizeof(dat) >>> 1 + # handle zero length string quickly + len <= 1 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Encodes a UTF-32 encoded vector of UInt32 to a UTF8String + +@param[in] ::Type{UTF8String} +@param[in] dat::Vector{UInt32} + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, dat::Vector{UInt32}) + len = sizeof(dat) + # handle zero length string quickly + len == 0 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len>>>2) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Converts a UTF32String to a UTF8String + +@param[in] ::Type{UTF8String} +@param[in] str::UTF32String + +@return ::UTF8String +@throws ArgumentError +""" -> +=# +function convert(::Type{UTF8String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) >>> 2 + # handle zero length string quickly + len <= 1 && return UTF8String("") + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len-1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +#= +@doc """ +@brief Converts an already validated vector of UInt16 or UInt32 to a UTF8String + +@param[in] T type (UInt16 or UInt32) +@param[in] dat Vector{T} +@param[in] len length of output in bytes + +@return ::UTF8String +""" -> +=# +function encode_to_utf8{T<:Union(UInt16, UInt32)}(::Type{T}, dat, len) + buf = Vector{UInt8}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle 0x80-0x7ff + elseif ch < 0x800 + buf[out += 1] = 0xc0 | (ch >>> 6) + buf[out += 1] = 0x80 | (ch & 0x3f) + # Handle 0x10000-0x10ffff (if input is UInt32) + elseif T == UInt32 && ch > 0xffff + output_utf8_4(buf, out, ch) + out += 4 + # Handle surrogate pairs + elseif is_surrogate_codeunit(ch) + output_utf8_4(buf, out, get_supplementary(ch, dat[pos += 1])) + out += 4 + # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters + else + buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) + buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) + buf[out += 1] = 0x80 | (ch & 0x3f) + end + end + UTF8String(buf) +end + +#= +""" +@brief Converts a UTF8String to a UTF32String + +@param[in] ::Type{UTF32String} +@param[in] str::UTF8String + +@return ::UTF32String +@throws ArgumentError +""" +=# +function convert(::Type{UTF32String}, str::UTF8String) + dat = str.data + # handle zero length string quickly + sizeof(dat) == 0 && return empty_utf32 + # Validate UTF-8 encoding, and get number of words to create + len, flags = check_string_utf8(dat) + # Optimize case where no characters > 0x7f + totlen = len+1 + flags == 0 && return fast_utf_copy(Char, totlen, dat) + # has multi-byte UTF-8 sequences + buf = Vector{Char}(totlen) + @inbounds buf[totlen] = 0 # NULL termination + local ch::UInt32, surr::UInt32 + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle range 0x80-0x7ff + elseif ch < 0xe0 + buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) + # Handle range 0x800-0xffff + elseif ch < 0xf0 + pos += 2 + ch = get_utf8_3(dat, pos, ch) + # Handle surrogate pairs (should have been encoded in 4 bytes) + if is_surrogate_lead(ch) + # Build up 32-bit character from ch and trailing surrogate in next 3 bytes + pos += 3 + surr = ((UInt32(dat[pos-2] & 0xf) << 12) + | (UInt32(dat[pos-1] & 0x3f) << 6) + | (dat[pos] & 0x3f)) + ch = get_supplementary(ch, surr) + end + buf[out += 1] = ch + # Handle range 0x10000-0x10ffff + else + pos += 3 + buf[out += 1] = get_utf8_4(dat, pos, ch) + end + end + UTF32String(buf) +end + +#= +""" +@brief Converts a UTF16String to UTF32String + +@param[in] ::Type{UTF32String} +@param[in] str::UTF16String + +@return ::UTF32String +@throws ArgumentError +""" +=# +function convert(::Type{UTF32String}, str::UTF16String) + dat = str.data + len = sizeof(dat) + # handle zero length string quickly (account for trailing \0) + len <= 2 && return empty_utf32 + # get number of words to create + len, flags, num4byte = check_string_utf16(dat, len>>>1) + # No surrogate pairs, do optimized copy + (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat)) + local ch::UInt32 + buf = Vector{Char}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # check for surrogate pair + if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end + buf[out += 1] = ch + end + UTF32String(buf) +end + +#= +""" +@brief Converts a UTF-32 encoded vector of UInt32 to a UTF16String + +@param[in] ::Type{UTF16String} +@param[in] dat::Vector{UInt32} + +@return ::UTF16String +@throws ArgumentError +""" +=# +function convert(::Type{UTF16String}, dat::Vector{UInt32}) + len = sizeof(dat) + # handle zero length string quickly + len <= 4 && return empty_utf16 + # get number of words to allocate + len, flags, num4byte = check_string_utf32(dat, len>>>2) + len += num4byte + 1 + # optimized path, no surrogates + num4byte == 0 && return fast_utf_copy(UInt16, len, dat) + return encode_to_utf16(dat, len) +end + +#= +""" +@brief Converts a UTF32String to UTF16String + +@param[in] ::Type{UTF16String} +@param[in] str::UTF32String + +@return ::UTF16String +@throws ArgumentError +""" +=# +function convert(::Type{UTF16String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) + # handle zero length string quickly + len <= 4 && return empty_utf16 + # get number of words to allocate + len, flags, num4byte = check_string_utf32(dat, len>>>2) + # optimized path, no surrogates + num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat)) + return encode_to_utf16(dat, len + num4byte) +end + +#= +@doc """ +@brief Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String + +@param[in] dat::Vector{UInt32} UTF-32 encoded data +@param[in] len length of output in 16-bit words + +@return ::UTF16String +""" -> +=# +function encode_to_utf16(dat, len) + buf = Vector{UInt16}(len) + @inbounds buf[len] = 0 # NULL termination + out = 0 + pos = 0 + @inbounds while out < len + ch = UInt32(dat[pos += 1]) + if ch > 0xffff + # Output surrogate pair for 0x10000-0x10ffff + buf[out += 1] = 0xd7c0 + (ch >>> 10) + ch = 0xdc00 + (ch & 0x3ff) + end + buf[out += 1] = ch + end + UTF16String(buf) +end + +convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat)) + +function convert(::Type{UTF16String}, str::ASCIIString) + dat = str.data + fast_utf_copy(UInt16, length(dat)+1, dat) +end + +function convert(::Type{UTF32String}, str::ASCIIString) + dat = str.data + fast_utf_copy(Char, length(dat)+1, dat) +end + +convert(::Type{UTF16String}, str::UTF16String) = str +convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat)) + +convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data +convert(::Type{Array{UInt16}}, str::UTF16String) = str.data + +convert(::Type{UTF32String}, str::UTF32String) = str + +convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) diff --git a/test/strings.jl b/test/strings.jl index cc304f48a097a..200c5f780c640 100644 --- a/test/strings.jl +++ b/test/strings.jl @@ -1955,6 +1955,19 @@ function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String) end # Create some ASCII, UTF8 and UTF16 + +# issue #11551 (#11004,#10959) +function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String) + @test utf16(strUTF8) == strUTF16 + @test utf32(strUTF8) == strUTF32 + @test utf8(strUTF16) == strUTF8 + @test utf32(strUTF16) == strUTF32 + @test utf8(strUTF32) == strUTF8 + @test utf16(strUTF32) == strUTF16 +end + +# Create some ASCII, UTF8, UTF16, and UTF32 strings + strAscii = "abcdefgh" strA_UTF8 = ("abcdefgh\uff")[1:8] strL_UTF8 = "abcdef\uff\uff" @@ -1973,27 +1986,40 @@ str3_UTF16 = utf16(str3_UTF8) str4_UTF16 = utf16(str4_UTF8) strS_UTF16 = utf16(strS_UTF8) +strA_UTF32 = utf32(strA_UTF8) +strL_UTF32 = utf32(strL_UTF8) +str2_UTF32 = utf32(str2_UTF8) +str3_UTF32 = utf32(str3_UTF8) +str4_UTF32 = utf32(str4_UTF8) +strS_UTF32 = utf32(strS_UTF8) + @test utf8(strAscii) == strAscii @test utf16(strAscii) == strAscii +@test utf32(strAscii) == strAscii -tstcvt(strA_UTF8,strA_UTF16) -tstcvt(strL_UTF8,strL_UTF16) -tstcvt(str2_UTF8,str2_UTF16) -tstcvt(str3_UTF8,str3_UTF16) -tstcvt(str4_UTF8,str4_UTF16) +tstcvt(strA_UTF8,strA_UTF16,strA_UTF32) +tstcvt(strL_UTF8,strL_UTF16,strL_UTF32) +tstcvt(str2_UTF8,str2_UTF16,str2_UTF32) +tstcvt(str3_UTF8,str3_UTF16,str3_UTF32) +tstcvt(str4_UTF8,str4_UTF16,str4_UTF32) # Test converting surrogate pairs @test utf16(strS_UTF8) == strC_UTF8 +@test utf32(strS_UTF8) == strC_UTF8 @test utf8(strS_UTF16) == strC_UTF8 +@test utf32(strS_UTF16) == strC_UTF8 +@test utf8(strS_UTF32) == strC_UTF8 +@test utf16(strS_UTF32) == strC_UTF8 # Test converting overlong \0 # @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl) @test utf16(strZ_UTF8) == strz_UTF8 +@test utf32(strZ_UTF8) == strz_UTF8 # Test invalid sequences byt = 0x0 -for T in (UTF16String,) # UTF32String +for T in (UTF16String, UTF32String) try # Continuation byte not after lead for byt in 0x80:0xbf