diff --git a/base/utf16.jl b/base/utf16.jl index 8caa3fd14f19d..5023eea20ca83 100644 --- a/base/utf16.jl +++ b/base/utf16.jl @@ -1,9 +1,34 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license -utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800 -utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00 -utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800 -utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail) +# Quickly copy and set trailing \0 +@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}( + ::Type{S}, ::Type{T}, len, dat, flag::Bool=false) + S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1)) +end + +# Get rest of character ch from 3-byte UTF-8 sequence in dat +@inline function get_utf8_3byte(dat, pos, ch) + @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f) +end +# Get rest of character ch from 4-byte UTF-8 sequence in dat +@inline function get_utf8_4byte(dat, pos, ch) + @inbounds return (((ch & 0x7) << 18) + | (UInt32(dat[pos-2] & 0x3f) << 12) + | (UInt32(dat[pos-1] & 0x3f) << 6) + | (dat[pos] & 0x3f)) +end + +# Output a character as a 4-byte UTF-8 sequence +@inline function output_utf8_4byte!(buf, out, ch) + @inbounds begin + buf[out + 1] = 0xf0 | (ch >>> 18) + buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f) + buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f) + buf[out + 4] = 0x80 | (ch & 0x3f) + end +end + +const empty_utf16 = UTF16String(UInt16[0]) function length(s::UTF16String) d = s.data @@ -11,7 +36,7 @@ function length(s::UTF16String) len == 0 && return 0 cnum = 0 for i = 1:len - @inbounds cnum += !utf16_is_trail(d[i]) + @inbounds cnum += !is_surrogate_trail(d[i]) end cnum end @@ -20,100 +45,240 @@ function endof(s::UTF16String) d = s.data i = length(d) - 1 i == 0 && return i - utf16_is_surrogate(d[i]) ? i-1 : i + return is_surrogate_codeunit(d[i]) ? i-1 : i end +get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) + function next(s::UTF16String, i::Int) - if !utf16_is_surrogate(s.data[i]) - return Char(s.data[i]), i+1 - elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1]) - return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2 - end - throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0)) + ch = s.data[i] + !is_surrogate_codeunit(ch) && return (Char(ch), i+1) + # check length, account for terminating \0 + i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))) + !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch)) + ct = s.data[i+1] + !is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch)) + Char(get_supplementary(ch, ct)), i+2 end function reverseind(s::UTF16String, i::Integer) j = length(s.data) - i - return Base.utf16_is_trail(s.data[j]) ? j-1 : j + return is_surrogate_trail(s.data[j]) ? j-1 : j end lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator function reverse(s::UTF16String) - d =s.data + d = s.data out = similar(d) out[end] = 0 # NULL termination n = length(d) - for i = 1:n-1 - out[i] = d[n-i] - if Base.utf16_is_lead(out[i]) - out[i],out[i-1] = out[i-1],out[i] + @inbounds for i = 1:n-1 + ch = d[n-i] + if is_surrogate_lead(ch) + out[i],out[i-1] = out[i-1],ch + else + out[i] = ch + end + end + UTF16String(out) +end + +sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) + +function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) + i = 1 + n = length(data) # this may include NULL termination; that's okay + @inbounds while i < n # check for unpaired surrogates + if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1]) + i += 2 + elseif is_surrogate_codeunit(data[i]) + return false + else + i += 1 end end - return UTF16String(out) + return i > n || !is_surrogate_codeunit(data[i]) end -# TODO: optimize this -function encode16(s::AbstractString) - buf = UInt16[] - for ch in s - c = reinterpret(UInt32, ch) +" +Converts an `AbstractString` to a `UTF16String` + +### Returns: +* `UTF16String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF16String}, str::AbstractString) + len, flags, num4byte = unsafe_checkstring(str) + buf = Vector{UInt16}(len+num4byte+1) + out = 0 + @inbounds for ch in str + c = UInt32(ch) if c < 0x10000 - push!(buf, UInt16(c)) - elseif c <= 0x10ffff - push!(buf, UInt16(0xd7c0 + (c>>10))) - push!(buf, UInt16(0xdc00 + (c & 0x3ff))) + buf[out += 1] = UInt16(c) else - throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch)) + # output surrogate pair + buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) + buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) end end - push!(buf, 0) # NULL termination + @inbounds buf[out + 1] = 0 # NULL termination UTF16String(buf) end -utf16(x) = convert(UTF16String, x) -convert(::Type{UTF16String}, s::UTF16String) = s -convert(::Type{UTF16String}, s::AbstractString) = encode16(s) -convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data -convert(::Type{Array{UInt16}}, s::UTF16String) = s.data +" +Converts a `UTF8String` to a `UTF16String` -# TODO: optimize this -convert(::Type{UTF8String}, s::UTF16String) = - sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end) +### Returns: +* `UTF16String` -sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) -unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) = - convert(Ptr{T}, pointer(s)) +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF16String}, str::UTF8String) + dat = str.data + # handle zero length string quickly + sizeof(dat) == 0 && return empty_utf16 + # Check that is correct UTF-8 encoding and get number of words needed + len, flags, num4byte = unsafe_checkstring(dat) + len += num4byte + buf = Vector{UInt16}(len+1) + @inbounds buf[len+1] = 0 + # Optimize case where no characters > 0x7f + flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) + out = 0 + pos = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle range 0x80-0x7ff + elseif ch < 0xe0 + buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) + # Handle range 0x800-0xffff + elseif ch < 0xf0 + pos += 2 + buf[out += 1] = get_utf8_3byte(dat, pos, ch) + # Handle range 0x10000-0x10ffff + else + pos += 3 + ch = get_utf8_4byte(dat, pos, ch) + # output surrogate pair + buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) + buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) + end + end + UTF16String(buf) +end -function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) - i = 1 - n = length(data) # this may include NULL termination; that's okay - while i < n # check for unpaired surrogates - if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1]) - i += 2 - elseif utf16_is_surrogate(data[i]) - return false +" +Converts a UTF-16 encoded vector of `UInt16` to a `UTF8String` + +### Returns: +* `UTF8String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF8String}, dat::Vector{UInt16}) + len = sizeof(dat) + # handle zero length string quickly + len == 0 && return emtpy_utf8 + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len>>>1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat)) + return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +" +Converts a `UTF16String` to a `UTF8String` + +### Returns: +* `UTF8String` + +### Throws: +* `UnicodeError` +" +function convert(::Type{UTF8String}, str::UTF16String) + dat = str.data + len = sizeof(dat) >>> 1 + # handle zero length string quickly + len <= 1 && return empty_utf8 + # get number of bytes to allocate + len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1) + flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) + return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) +end + +" +Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String` + +### Input Arguments: +* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted +* `len` length of output in bytes + +### Returns: +* `UTF8String` +" +function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len) + buf = Vector{UInt8}(len) + out = 0 + pos = 0 + @inbounds while out < len + ch::UInt32 = dat[pos += 1] + # Handle ASCII characters + if ch <= 0x7f + buf[out += 1] = ch + # Handle 0x80-0x7ff + elseif ch < 0x800 + buf[out += 1] = 0xc0 | (ch >>> 6) + buf[out += 1] = 0x80 | (ch & 0x3f) + # Handle 0x10000-0x10ffff (if input is UInt32) + elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16 + output_utf8_4byte!(buf, out, ch) + out += 4 + # Handle surrogate pairs + elseif is_surrogate_codeunit(ch) + output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1])) + out += 4 + # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters else - i += 1 + buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) + buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) + buf[out += 1] = 0x80 | (ch & 0x3f) end end - return i > n || !utf16_is_surrogate(data[i]) + UTF8String(buf) end -function convert(::Type{UTF16String}, data::AbstractVector{UInt16}) - !isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0)) - len = length(data) - d = Array(UInt16, len + 1) - d[end] = 0 # NULL terminate - UTF16String(copy!(d,1, data,1, len)) +function convert(::Type{UTF16String}, str::ASCIIString) + dat = str.data + @inbounds return fast_utf_copy(UTF16String, UInt16, length(dat), dat, true) end +convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data +convert(::Type{Array{UInt16}}, str::UTF16String) = str.data + +convert(::Type{UTF16String}, str::UTF16String) = str + +unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) = + convert(Ptr{T}, pointer(s)) + convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) = convert(T, reshape(data, length(data))) convert(T::Type{UTF16String}, data::AbstractArray{Int16}) = convert(T, reinterpret(UInt16, data)) +function convert(::Type{UTF16String}, data::AbstractVector{UInt16}) + !isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0)) + len = length(data) + @inbounds return UTF16String(setindex!(copy!(Vector{UInt16}(len+1),1,data,1,len),0,len+1)) +end + function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) isempty(bytes) && return UTF16String(UInt16[0]) isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0)) @@ -136,6 +301,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) UTF16String(d) end +convert(::Type{UTF16String}, str::UTF16String) = str + +utf16(x) = convert(UTF16String, x) utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len)) utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len) function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}}) diff --git a/base/utf32.jl b/base/utf32.jl index b85213e747db1..0d481bfda353c 100644 --- a/base/utf32.jl +++ b/base/utf32.jl @@ -1,5 +1,6 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license +# UTF-32 basic functions next(s::UTF32String, i::Int) = (s.data[i], i+1) endof(s::UTF32String) = length(s.data) - 1 length(s::UTF32String) = length(s.data) - 1 @@ -40,8 +41,8 @@ function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char}) convert(T, takebuf_string(s)) end -convert(::Type{Array{Char,1}}, s::UTF32String) = s.data -convert(::Type{Array{Char}}, s::UTF32String) = s.data +convert(::Type{Vector{Char}}, str::UTF32String) = str.data +convert(::Type{Array{Char}}, str::UTF32String) = str.data reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) @@ -60,19 +61,19 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) elseif data[1] == Char(0xfffe0000) # byte-swapped d = Array(Char, length(data)) for i = 2:length(data) - d[i-1] = bswap(data[i]) + @inbounds d[i-1] = bswap(data[i]) end else d = Array(Char, length(data) + 1) copy!(d, 1, data, 1, length(data)) # assume native byte order end - d[end] = Char(0) # NULL terminate + d[end] = 0 # NULL terminate UTF32String(d) end function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}}) for i=1:length(str) - @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end + @inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end end return true end @@ -89,9 +90,9 @@ end function map(f, s::UTF32String) d = s.data out = similar(d) - out[end] = Char(0) + out[end] = 0 - for i = 1:(length(d)-1) + @inbounds for i = 1:(length(d)-1) c2 = f(d[i]) if !isa(c2, Char) throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0)) diff --git a/base/utferror.jl b/base/utferror.jl index 27b36e45b44fb..352bd03a16308 100644 --- a/base/utferror.jl +++ b/base/utferror.jl @@ -1,6 +1,6 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license -##\brief Error messages for Unicode / UTF support +## Error messages for Unicode / UTF support const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)" const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)" diff --git a/test/strings.jl b/test/strings.jl index 3701567bae68e..9d400c8ec4302 100644 --- a/test/strings.jl +++ b/test/strings.jl @@ -1732,7 +1732,7 @@ d = UTF32String(c) c[1] = 'A' @test d=="A" -# 11575 +# Issue #11575 # Test invalid sequences byt = 0x0 # Needs to be defined outside the try block! @@ -1897,3 +1897,139 @@ end @test [c for c in "ḟøøƀäṙ"] == ['ḟ', 'ø', 'ø', 'ƀ', 'ä', 'ṙ'] @test [i for i in eachindex("ḟøøƀäṙ")] == [1, 4, 6, 8, 10, 12] @test [x for x in enumerate("ḟøøƀäṙ")] == [(1, 'ḟ'), (2, 'ø'), (3, 'ø'), (4, 'ƀ'), (5, 'ä'), (6, 'ṙ')] + +# issue #11551 (#11004,#10959) +function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String) + @test utf16(strUTF8) == strUTF16 + @test utf8(strUTF16) == strUTF8 +end + +# Create some ASCII, UTF8 and UTF16 +strAscii = "abcdefgh" +strA_UTF8 = ("abcdefgh\uff")[1:8] +strL_UTF8 = "abcdef\uff\uff" +str2_UTF8 = "abcd\uff\uff\u7ff\u7ff" +str3_UTF8 = "abcd\uff\uff\u7fff\u7fff" +str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff" +strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80") +strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000") +strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80") +strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0") + +strA_UTF16 = utf16(strA_UTF8) +strL_UTF16 = utf16(strL_UTF8) +str2_UTF16 = utf16(str2_UTF8) +str3_UTF16 = utf16(str3_UTF8) +str4_UTF16 = utf16(str4_UTF8) +strS_UTF16 = utf16(strS_UTF8) + +@test utf8(strAscii) == strAscii +@test utf16(strAscii) == strAscii + +tstcvt(strA_UTF8,strA_UTF16) +tstcvt(strL_UTF8,strL_UTF16) +tstcvt(str2_UTF8,str2_UTF16) +tstcvt(str3_UTF8,str3_UTF16) +tstcvt(str4_UTF8,str4_UTF16) + +# Test converting surrogate pairs +@test utf16(strS_UTF8) == strC_UTF8 +@test utf8(strS_UTF16) == strC_UTF8 + +# Test converting overlong \0 +# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl) +@test utf16(strZ_UTF8) == strz_UTF8 + +# Test invalid sequences + +byt = 0x0 +for T in (UTF16String,) # UTF32String + try + # Continuation byte not after lead + for byt in 0x80:0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) + end + + # Test lead bytes + for byt in 0xc0:0xff + # Single lead byte at end of string + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) + # Lead followed by non-continuation character < 0x80 + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0])) + # Lead followed by non-continuation character > 0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0])) + end + + # Test overlong 2-byte + for byt in 0x81:0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt])) + end + for byt in 0x80:0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt])) + end + + # Test overlong 3-byte + for byt in 0x80:0x9f + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80])) + end + + # Test overlong 4-byte + for byt in 0x80:0x8f + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80])) + end + + # Test 4-byte > 0x10ffff + for byt in 0x90:0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80])) + end + for byt in 0xf5:0xf7 + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80])) + end + + # Test 5-byte + for byt in 0xf8:0xfb + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80])) + end + + # Test 6-byte + for byt in 0xfc:0xfd + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80])) + end + + # Test 7-byte + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])) + + # Three and above byte sequences + for byt in 0xe0:0xef + # Lead followed by only 1 continuation byte + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80])) + # Lead ended by non-continuation character < 0x80 + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0])) + # Lead ended by non-continuation character > 0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0])) + end + + # 3-byte encoded surrogate character(s) + # Single surrogate + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80])) + # Not followed by surrogate + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])) + # Trailing surrogate first + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])) + # Followed by lead surrogate + @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])) + + # Four byte sequences + for byt in 0xf0:0xf4 + # Lead followed by only 2 continuation bytes + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80])) + # Lead followed by non-continuation character < 0x80 + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0])) + # Lead followed by non-continuation character > 0xbf + @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0])) + end + catch exp ; + println("Error checking $T: $byt") + throw(exp) + end +end