Skip to content

Commit

Permalink
Merge pull request JuliaLang#11551 from ScottPJones/spj/fixutf
Browse files Browse the repository at this point in the history
Fix JuliaLang#10959 bugs with UTF-16 conversions
  • Loading branch information
tkelman committed Jul 1, 2015
2 parents 1c4dae4 + 2ab9334 commit 9071f14
Show file tree
Hide file tree
Showing 4 changed files with 350 additions and 67 deletions.
264 changes: 206 additions & 58 deletions base/utf16.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,42 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)
# Quickly copy and set trailing \0
@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}(
::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1))
end

# Get rest of character ch from 3-byte UTF-8 sequence in dat
@inline function get_utf8_3byte(dat, pos, ch)
@inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f)
end
# Get rest of character ch from 4-byte UTF-8 sequence in dat
@inline function get_utf8_4byte(dat, pos, ch)
@inbounds return (((ch & 0x7) << 18)
| (UInt32(dat[pos-2] & 0x3f) << 12)
| (UInt32(dat[pos-1] & 0x3f) << 6)
| (dat[pos] & 0x3f))
end

# Output a character as a 4-byte UTF-8 sequence
@inline function output_utf8_4byte!(buf, out, ch)
@inbounds begin
buf[out + 1] = 0xf0 | (ch >>> 18)
buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f)
buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f)
buf[out + 4] = 0x80 | (ch & 0x3f)
end
end

const empty_utf16 = UTF16String(UInt16[0])

function length(s::UTF16String)
d = s.data
len = length(d) - 1
len == 0 && return 0
cnum = 0
for i = 1:len
@inbounds cnum += !utf16_is_trail(d[i])
@inbounds cnum += !is_surrogate_trail(d[i])
end
cnum
end
Expand All @@ -20,100 +45,220 @@ function endof(s::UTF16String)
d = s.data
i = length(d) - 1
i == 0 && return i
utf16_is_surrogate(d[i]) ? i-1 : i
return is_surrogate_codeunit(d[i]) ? i-1 : i
end

get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)

function next(s::UTF16String, i::Int)
if !utf16_is_surrogate(s.data[i])
return Char(s.data[i]), i+1
elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
end
throw(UnicodeError(UTF_ERR_INVALID_INDEX,0,0))
ch = s.data[i]
!is_surrogate_codeunit(ch) && return (Char(ch), i+1)
# check length, account for terminating \0
i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
!is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
ct = s.data[i+1]
!is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch))
Char(get_supplementary(ch, ct)), i+2
end

function reverseind(s::UTF16String, i::Integer)
j = length(s.data) - i
return Base.utf16_is_trail(s.data[j]) ? j-1 : j
return is_surrogate_trail(s.data[j]) ? j-1 : j
end

lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator

function reverse(s::UTF16String)
d =s.data
d = s.data
out = similar(d)
out[end] = 0 # NULL termination
n = length(d)
for i = 1:n-1
out[i] = d[n-i]
if Base.utf16_is_lead(out[i])
out[i],out[i-1] = out[i-1],out[i]
@inbounds for i = 1:n-1
ch = d[n-i]
if is_surrogate_lead(ch)
out[i],out[i-1] = out[i-1],ch
else
out[i] = ch
end
end
return UTF16String(out)
UTF16String(out)
end

# TODO: optimize this
function encode16(s::AbstractString)
buf = UInt16[]
for ch in s
c = reinterpret(UInt32, ch)
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)

function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
i = 1
n = length(data) # this may include NULL termination; that's okay
@inbounds while i < n # check for unpaired surrogates
if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
i += 2
elseif is_surrogate_codeunit(data[i])
return false
else
i += 1
end
end
return i > n || !is_surrogate_codeunit(data[i])
end

"
Converts an `AbstractString` to a `UTF16String`
### Returns:
* `UTF16String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, str::AbstractString)
len, flags, num4byte = unsafe_checkstring(str)
buf = Vector{UInt16}(len+num4byte+1)
out = 0
@inbounds for ch in str
c = UInt32(ch)
if c < 0x10000
push!(buf, UInt16(c))
elseif c <= 0x10ffff
push!(buf, UInt16(0xd7c0 + (c>>10)))
push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
buf[out += 1] = UInt16(c)
else
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, ch))
# output surrogate pair
buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
end
end
push!(buf, 0) # NULL termination
@inbounds buf[out + 1] = 0 # NULL termination
UTF16String(buf)
end

utf16(x) = convert(UTF16String, x)
convert(::Type{UTF16String}, s::UTF16String) = s
convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
convert(::Type{Array{UInt16}}, s::UTF16String) = s.data
"
Converts a `UTF8String` to a `UTF16String`
# TODO: optimize this
convert(::Type{UTF8String}, s::UTF16String) =
sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)
### Returns:
* `UTF16String`
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
convert(Ptr{T}, pointer(s))
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, str::UTF8String)
dat = str.data
# handle zero length string quickly
sizeof(dat) == 0 && return empty_utf16
# Check that is correct UTF-8 encoding and get number of words needed
len, flags, num4byte = unsafe_checkstring(dat)
len += num4byte
buf = Vector{UInt16}(len+1)
@inbounds buf[len+1] = 0
# Optimize case where no characters > 0x7f
flags == 0 && @inbounds return UTF16String(copy!(buf, dat))
out = 0
pos = 0
@inbounds while out < len
ch::UInt32 = dat[pos += 1]
# Handle ASCII characters
if ch <= 0x7f
buf[out += 1] = ch
# Handle range 0x80-0x7ff
elseif ch < 0xe0
buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
# Handle range 0x800-0xffff
elseif ch < 0xf0
pos += 2
buf[out += 1] = get_utf8_3byte(dat, pos, ch)
# Handle range 0x10000-0x10ffff
else
pos += 3
ch = get_utf8_4byte(dat, pos, ch)
# output surrogate pair
buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
end
end
UTF16String(buf)
end

function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
i = 1
n = length(data) # this may include NULL termination; that's okay
while i < n # check for unpaired surrogates
if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
i += 2
elseif utf16_is_surrogate(data[i])
return false
"
Converts a `UTF16String` to a `UTF8String`
### Returns:
* `UTF8String`
### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, str::UTF16String)
dat = str.data
len = sizeof(dat) >>> 1
# handle zero length string quickly
len <= 1 && return empty_utf8
# get number of bytes to allocate
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1)
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
end

"
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
### Input Arguments:
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
* `len` length of output in bytes
### Returns:
* `UTF8String`
"
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
buf = Vector{UInt8}(len)
out = 0
pos = 0
@inbounds while out < len
ch::UInt32 = dat[pos += 1]
# Handle ASCII characters
if ch <= 0x7f
buf[out += 1] = ch
# Handle 0x80-0x7ff
elseif ch < 0x800
buf[out += 1] = 0xc0 | (ch >>> 6)
buf[out += 1] = 0x80 | (ch & 0x3f)
# Handle 0x10000-0x10ffff (if input is UInt32)
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
output_utf8_4byte!(buf, out, ch)
out += 4
# Handle surrogate pairs
elseif is_surrogate_codeunit(ch)
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
out += 4
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
else
i += 1
buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
buf[out += 1] = 0x80 | (ch & 0x3f)
end
end
return i > n || !utf16_is_surrogate(data[i])
UTF8String(buf)
end

function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
!isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
len = length(data)
d = Array(UInt16, len + 1)
d[end] = 0 # NULL terminate
UTF16String(copy!(d,1, data,1, len))
function convert(::Type{UTF16String}, str::ASCIIString)
dat = str.data
@inbounds return fast_utf_copy(UTF16String, UInt16, length(dat), dat, true)
end

convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data
convert(::Type{Array{UInt16}}, str::UTF16String) = str.data

convert(::Type{UTF16String}, str::UTF16String) = str

unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
convert(Ptr{T}, pointer(s))

convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
convert(T, reshape(data, length(data)))

convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
convert(T, reinterpret(UInt16, data))

function convert(::Type{UTF16String}, dat::AbstractVector{UInt16})
len, flags, num4byte = unsafe_checkstring(dat)
@inbounds return fast_utf_copy(UTF16String, UInt16, len+num4byte, dat, true)
end

function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
isempty(bytes) && return UTF16String(UInt16[0])
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
Expand All @@ -136,6 +281,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
UTF16String(d)
end

convert(::Type{UTF16String}, str::UTF16String) = str

utf16(x) = convert(UTF16String, x)
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}})
Expand All @@ -154,7 +302,7 @@ function map(fun, str::UTF16String)
end
uc = reinterpret(UInt32, c2)
if uc < 0x10000
if utf16_is_surrogate(UInt16(uc))
if is_surrogate_codeunit(UInt16(uc))
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc))
end
push!(buf, UInt16(uc))
Expand Down
15 changes: 8 additions & 7 deletions base/utf32.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

# UTF-32 basic functions
next(s::UTF32String, i::Int) = (s.data[i], i+1)
endof(s::UTF32String) = length(s.data) - 1
length(s::UTF32String) = length(s.data) - 1
Expand Down Expand Up @@ -40,8 +41,8 @@ function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
convert(T, takebuf_string(s))
end

convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
convert(::Type{Array{Char}}, s::UTF32String) = s.data
convert(::Type{Vector{Char}}, str::UTF32String) = str.data
convert(::Type{Array{Char}}, str::UTF32String) = str.data

reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))

Expand All @@ -60,19 +61,19 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
elseif data[1] == Char(0xfffe0000) # byte-swapped
d = Array(Char, length(data))
for i = 2:length(data)
d[i-1] = bswap(data[i])
@inbounds d[i-1] = bswap(data[i])
end
else
d = Array(Char, length(data) + 1)
copy!(d, 1, data, 1, length(data)) # assume native byte order
end
d[end] = Char(0) # NULL terminate
d[end] = 0 # NULL terminate
UTF32String(d)
end

function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
for i=1:length(str)
@inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
@inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end
end
return true
end
Expand All @@ -89,9 +90,9 @@ end
function map(f, s::UTF32String)
d = s.data
out = similar(d)
out[end] = Char(0)
out[end] = 0

for i = 1:(length(d)-1)
@inbounds for i = 1:(length(d)-1)
c2 = f(d[i])
if !isa(c2, Char)
throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
Expand Down
Loading

0 comments on commit 9071f14

Please sign in to comment.