From 34952a24ea48df04765daab82a4426724255b09d Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Wed, 3 Jun 2015 02:48:59 +0200 Subject: [PATCH] Fix #10959 UTF-32 conversion errors Added new `convert` methods that use the `checkstring` function to validate input Added tests for many sorts of valid/invalid data Depends on PR #11551 and #11575 Updated to use unsafe_checkstring, fix comments Remove conversions from Vector{UInt32} Move some code from utf32.jl to utf16.jl and utf8.jl, hopefully more logical --- base/unicode/utf16.jl | 64 +++++++++------- base/unicode/utf32.jl | 173 ++++++++++++++++++++++++++++++++++++++---- base/unicode/utf8.jl | 61 +++++++++++++++ test/unicode/utf32.jl | 154 +++++++++++++++++++++++++++++++++++++ 4 files changed, 410 insertions(+), 42 deletions(-) diff --git a/base/unicode/utf16.jl b/base/unicode/utf16.jl index a8a520a6bf4c2..7f97a46db4acf 100644 --- a/base/unicode/utf16.jl +++ b/base/unicode/utf16.jl @@ -195,44 +195,52 @@ function convert(::Type{UTF8String}, str::UTF16String) end " -Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String` +Converts a vector of `Char` to a `UTF16String` + +### Returns: +* `::UTF16String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF16String}, chrs::Vector{Char}) + len = sizeof(chrs) + # handle zero length string quickly + len == 0 && return empty_utf16 + dat = reinterpret(UInt32, chrs) + # get number of words to allocate + len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2) + len += num4byte + 1 + # optimized path, no surrogates + num4byte == 0 && @inbounds return fast_utf_copy(UTF16String, UInt16, len, dat) + return encode_to_utf16(dat, len) +end + +" +Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String` ### Input Arguments: -* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted -* `len` length of output in bytes +* `dat::Vector{UInt32}` UTF-32 encoded data +* `len` length of output in 16-bit words ### Returns: -* `UTF8String` +* `::UTF16String` " -function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len) - buf = Vector{UInt8}(len) +function encode_to_utf16(dat, len) + buf = Vector{UInt16}(len) + @inbounds buf[len] = 0 # NULL termination out = 0 pos = 0 @inbounds while out < len - ch::UInt32 = dat[pos += 1] - # Handle ASCII characters - if ch <= 0x7f - buf[out += 1] = ch - # Handle 0x80-0x7ff - elseif ch < 0x800 - buf[out += 1] = 0xc0 | (ch >>> 6) - buf[out += 1] = 0x80 | (ch & 0x3f) - # Handle 0x10000-0x10ffff (if input is UInt32) - elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16 - output_utf8_4byte!(buf, out, ch) - out += 4 - # Handle surrogate pairs - elseif is_surrogate_codeunit(ch) - output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1])) - out += 4 - # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters - else - buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) - buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) - buf[out += 1] = 0x80 | (ch & 0x3f) + ch = UInt32(dat[pos += 1]) + if ch > 0xffff + # Output surrogate pair for 0x10000-0x10ffff + buf[out += 1] = 0xd7c0 + (ch >>> 10) + ch = 0xdc00 + (ch & 0x3ff) end + buf[out += 1] = ch end - UTF8String(buf) + UTF16String(buf) end function convert(::Type{UTF16String}, str::ASCIIString) diff --git a/base/unicode/utf32.jl b/base/unicode/utf32.jl index 612a3bbe4d061..c738fec577410 100644 --- a/base/unicode/utf32.jl +++ b/base/unicode/utf32.jl @@ -5,25 +5,169 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1) endof(s::UTF32String) = length(s.data) - 1 length(s::UTF32String) = length(s.data) - 1 +reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) + +sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char) + +const empty_utf32 = UTF32String(UInt32[0]) + utf32(x) = convert(UTF32String, x) convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) convert(::Type{UTF32String}, s::UTF32String) = s -function convert(::Type{UTF32String}, s::AbstractString) - a = Array(Char, length(s) + 1) - i = 0 - for c in s - a[i += 1] = c +" +Converts an `AbstractString` to a `UTF32String` + +### Returns: +* `UTF32String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF32String}, str::AbstractString) + len, flags = unsafe_checkstring(str) + buf = Vector{Char}(len+1) + out = 0 + @inbounds for ch in str ; buf[out += 1] = ch ; end + @inbounds buf[out + 1] = 0 # NULL termination + UTF32String(buf) +end + +" +Converts a `UTF32String` to a `UTF8String` + +### Returns: +* `UTF8String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF8String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) >>> 2 + # handle zero length string quickly + len <= 1 && return empty_utf8 + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +" +Converts a `UTF8String` to a `UTF32String` + +### Returns: +* `::UTF32String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF32String}, str::UTF8String) + dat = str.data + # handle zero length string quickly + sizeof(dat) == 0 && return empty_utf32 + # Validate UTF-8 encoding, and get number of words to create + len, flags = unsafe_checkstring(dat) + # Optimize case where no characters > 0x7f + flags == 0 && @inbounds return fast_utf_copy(UTF32String, Char, len, dat, true) + # has multi-byte UTF-8 sequences + buf = Vector{Char}(len+1) + @inbounds buf[len+1] = 0 # NULL termination + local ch::UInt32, surr::UInt32 + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle range 0x80-0x7ff + elseif ch < 0xe0 + buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) + # Handle range 0x800-0xffff + elseif ch < 0xf0 + pos += 2 + ch = get_utf8_3byte(dat, pos, ch) + # Handle surrogate pairs (should have been encoded in 4 bytes) + if is_surrogate_lead(ch) + # Build up 32-bit character from ch and trailing surrogate in next 3 bytes + pos += 3 + surr = ((UInt32(dat[pos-2] & 0xf) << 12) + | (UInt32(dat[pos-1] & 0x3f) << 6) + | (dat[pos] & 0x3f)) + ch = get_supplementary(ch, surr) + end + buf[out += 1] = ch + # Handle range 0x10000-0x10ffff + else + pos += 3 + buf[out += 1] = get_utf8_4byte(dat, pos, ch) + end end - a[end] = Char(0) # NULL terminate - UTF32String(a) + UTF32String(buf) end -function convert(::Type{UTF32String}, data::AbstractVector{Char}) - len = length(data) - d = Array(Char, len + 1) - d[end] = Char(0) # NULL terminate - UTF32String(copy!(d,1, data,1, len)) +" +Converts a `UTF16String` to `UTF32String` + +### Returns: +* `::UTF32String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF32String}, str::UTF16String) + dat = str.data + len = sizeof(dat) + # handle zero length string quickly (account for trailing \0) + len <= 2 && return empty_utf32 + # get number of words to create + len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>1) + # No surrogate pairs, do optimized copy + (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat)) + local ch::UInt32 + buf = Vector{Char}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch = dat[pos += 1] + # check for surrogate pair + if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end + buf[out += 1] = ch + end + UTF32String(buf) +end + +" +Converts a `UTF32String` to `UTF16String` + +### Returns: +* `::UTF16String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF16String}, str::UTF32String) + dat = reinterpret(UInt32, str.data) + len = sizeof(dat) + # handle zero length string quickly + len <= 4 && return empty_utf16 + # get number of words to allocate + len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2) + # optimized path, no surrogates + num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat)) + return encode_to_utf16(dat, len + num4byte) +end + +convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)]) + +function convert(::Type{UTF32String}, str::ASCIIString) + dat = str.data + @inbounds return fast_utf_copy(UTF32String, Char, length(dat), dat, true) +end + +function convert(::Type{UTF32String}, dat::AbstractVector{Char}) + @inbounds return fast_utf_copy(UTF32String, Char, length(dat), dat, true) end convert{T<:Union{Int32,UInt32}}(::Type{UTF32String}, data::AbstractVector{T}) = @@ -46,12 +190,11 @@ convert(::Type{Array{Char}}, str::UTF32String) = str.data reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) -sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char) unsafe_convert{T<:Union{Int32,UInt32,Char}}(::Type{Ptr{T}}, s::UTF32String) = convert(Ptr{T}, pointer(s)) function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) - isempty(bytes) && return UTF32String(Char[0]) + isempty(bytes) && return empty_utf32 length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0)) data = reinterpret(Char, bytes) # check for byte-order mark (BOM): @@ -79,6 +222,8 @@ function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}}) end isvalid(str::Vector{Char}) = isvalid(UTF32String, str) +utf32(x) = convert(UTF32String, x) + utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len)) utf32(p::Union{Ptr{UInt32}, Ptr{Int32}}, len::Integer) = utf32(convert(Ptr{Char}, p), len) function utf32(p::Union{Ptr{Char}, Ptr{UInt32}, Ptr{Int32}}) diff --git a/base/unicode/utf8.jl b/base/unicode/utf8.jl index e94a988777521..f38c2149179d5 100644 --- a/base/unicode/utf8.jl +++ b/base/unicode/utf8.jl @@ -238,6 +238,67 @@ function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractStr end convert(::Type{UTF8String}, s::AbstractString) = utf8(bytestring(s)) +" +Converts a vector of `Char` to a `UTF8String` + +### Returns: +* `UTF8String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF8String}, chrs::Vector{Char}) + len = sizeof(chrs) + # handle zero length string quickly + len == 0 && return empty_utf8 + dat = reinterpret(UInt32, chrs) + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len>>>2) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +" +Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String` + +### Input Arguments: +* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted +* `len` length of output in bytes + +### Returns: +* `UTF8String` +" +function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len) + buf = Vector{UInt8}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle 0x80-0x7ff + elseif ch < 0x800 + buf[out += 1] = 0xc0 | (ch >>> 6) + buf[out += 1] = 0x80 | (ch & 0x3f) + # Handle 0x10000-0x10ffff (if input is UInt32) + elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16 + output_utf8_4byte!(buf, out, ch) + out += 4 + # Handle surrogate pairs + elseif is_surrogate_codeunit(ch) + output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1])) + out += 4 + # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters + else + buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) + buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) + buf[out += 1] = 0x80 | (ch & 0x3f) + end + end + UTF8String(buf) +end + utf8(p::Ptr{UInt8}) = UTF8String(bytestring(p)) utf8(p::Ptr{UInt8}, len::Integer) = utf8(pointer_to_array(p, len)) diff --git a/test/unicode/utf32.jl b/test/unicode/utf32.jl index 15ddb1da56f74..f554558e146e9 100644 --- a/test/unicode/utf32.jl +++ b/test/unicode/utf32.jl @@ -12,6 +12,160 @@ u32 = utf32(u8) @test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32))) @test_throws UnicodeError utf32(UInt8[1,2,3]) +# issue #11551 (#11004,#10959) +function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String) + @test utf16(strUTF8) == strUTF16 + @test utf32(strUTF8) == strUTF32 + @test utf8(strUTF16) == strUTF8 + @test utf32(strUTF16) == strUTF32 + @test utf8(strUTF32) == strUTF8 + @test utf16(strUTF32) == strUTF16 +end + +# Create some ASCII, UTF8, UTF16, and UTF32 strings + +strAscii = "abcdefgh" +strA_UTF8 = ("abcdefgh\uff")[1:8] +strL_UTF8 = "abcdef\uff\uff" +str2_UTF8 = "abcd\uff\uff\u7ff\u7ff" +str3_UTF8 = "abcd\uff\uff\u7fff\u7fff" +str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff" +strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80") +strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000") +strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80") +strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0") + +strA_UTF16 = utf16(strA_UTF8) +strL_UTF16 = utf16(strL_UTF8) +str2_UTF16 = utf16(str2_UTF8) +str3_UTF16 = utf16(str3_UTF8) +str4_UTF16 = utf16(str4_UTF8) +strS_UTF16 = utf16(strS_UTF8) + +strA_UTF32 = utf32(strA_UTF8) +strL_UTF32 = utf32(strL_UTF8) +str2_UTF32 = utf32(str2_UTF8) +str3_UTF32 = utf32(str3_UTF8) +str4_UTF32 = utf32(str4_UTF8) +strS_UTF32 = utf32(strS_UTF8) + +@test utf8(strAscii) == strAscii +@test utf16(strAscii) == strAscii +@test utf32(strAscii) == strAscii + +tstcvt(strA_UTF8,strA_UTF16,strA_UTF32) +tstcvt(strL_UTF8,strL_UTF16,strL_UTF32) +tstcvt(str2_UTF8,str2_UTF16,str2_UTF32) +tstcvt(str3_UTF8,str3_UTF16,str3_UTF32) +tstcvt(str4_UTF8,str4_UTF16,str4_UTF32) + +# Test converting surrogate pairs +@test utf16(strS_UTF8) == strC_UTF8 +@test utf32(strS_UTF8) == strC_UTF8 +@test utf8(strS_UTF16) == strC_UTF8 +@test utf32(strS_UTF16) == strC_UTF8 +@test utf8(strS_UTF32) == strC_UTF8 +@test utf16(strS_UTF32) == strC_UTF8 + +# Test converting overlong \0 +# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl) +@test utf16(strZ_UTF8) == strz_UTF8 +@test utf32(strZ_UTF8) == strz_UTF8 + +# Test invalid sequences + +byt = 0x0 +for T in (UTF16String, UTF32String) + try + # Continuation byte not after lead + for byt in 0x80:0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) + end + + # Test lead bytes + for byt in 0xc0:0xff + # Single lead byte at end of string + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) + # Lead followed by non-continuation character < 0x80 + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0])) + # Lead followed by non-continuation character > 0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0])) + end + + # Test overlong 2-byte + for byt in 0x81:0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt])) + end + for byt in 0x80:0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt])) + end + + # Test overlong 3-byte + for byt in 0x80:0x9f + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80])) + end + + # Test overlong 4-byte + for byt in 0x80:0x8f + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80])) + end + + # Test 4-byte > 0x10ffff + for byt in 0x90:0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80])) + end + for byt in 0xf5:0xf7 + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80])) + end + + # Test 5-byte + for byt in 0xf8:0xfb + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80])) + end + + # Test 6-byte + for byt in 0xfc:0xfd + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80])) + end + + # Test 7-byte + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])) + + # Three and above byte sequences + for byt in 0xe0:0xef + # Lead followed by only 1 continuation byte + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80])) + # Lead ended by non-continuation character < 0x80 + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0])) + # Lead ended by non-continuation character > 0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0])) + end + + # 3-byte encoded surrogate character(s) + # Single surrogate + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80])) + # Not followed by surrogate + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])) + # Trailing surrogate first + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])) + # Followed by lead surrogate + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])) + + # Four byte sequences + for byt in 0xf0:0xf4 + # Lead followed by only 2 continuation bytes + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80])) + # Lead followed by non-continuation character < 0x80 + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0])) + # Lead followed by non-continuation character > 0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0])) + end + catch exp ; + println("Error checking $T: $byt") + throw(exp) + end +end + # Wstring u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" w = wstring(u8)