Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix kernel recompilation issue in Yao #362 #74

Merged
merged 4 commits into from
Apr 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 206 additions & 28 deletions src/gpuapplys.jl
Original file line number Diff line number Diff line change
@@ -1,54 +1,214 @@
using Yao.YaoBase
import Yao.YaoArrayRegister: u1rows!, unrows!, autostatic, instruct!, swaprows!

include("kernels.jl")
# get index
macro idx(shape, grididx=1, ctxsym=:ctx)
quote
x = $(esc(shape))
i = $linear_index($(esc(ctxsym)), $(esc(grididx)))
i > $prod2(x) && return
@inbounds Base.CartesianIndices(x)[i].I
end
end
replace_first(x::NTuple{2}, v) = (v, x[2])
replace_first(x::NTuple{1}, v) = (v,)
prod2(x::NTuple{2}) = x[1] * x[2]
prod2(x::NTuple{1}) = x[1]

gpu_compatible(A::AbstractVecOrMat) = A |> staticize
gpu_compatible(A::StaticArray) = A

###################### unapply! ############################
function instruct!(::Val{2}, state::DenseCuVecOrMat, U0::AbstractMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M}
@debug "The generic U(N) matrix of size ($(size(U0))), on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
nbit = log2dim1(state)
U0 = gpu_compatible(U0)
# reorder a unirary matrix.
D, kf = un_kernel(log2dim1(state), clocs, cvals, U0, locs)
U = (all(TupleTools.diff(locs).>0) ? U0 : reorder(U0, collect(locs)|>sortperm)) |> staticize
locked_bits = [clocs..., locs...]
locked_vals = [cvals..., zeros(Int, M)...]
locs_raw = [i+1 for i in itercontrol(nbit, setdiff(1:nbit, locs), zeros(Int, nbit-M))] |> staticize
configs = itercontrol(nbit, locked_bits, locked_vals)

len = length(configs)
@inline function kernel(ctx, state, locs_raw, U, configs, len)
inds = @idx replace_first(size(state), len)
x = @inbounds configs[inds[1]]
@inbounds unrows!(piecewise(state, inds), x .+ locs_raw, U)
return
end

gpu_call(kf, state; elements=D*size(state,2))
gpu_call(kernel, state, locs_raw, U, configs, len; elements=len*size(state,2))
state
end
instruct!(::Val{2}, state::DenseCuVecOrMat, U0::IMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = state
instruct!(::Val{2}, state::DenseCuVecOrMat, U0::SDSparseMatrixCSC, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = instruct!(Val(2), state, U0 |> Matrix, locs, clocs, cvals)

################## General U1 apply! ###################
for MT in [:SDDiagonal, :SDPermMatrix, :AbstractMatrix, :SDSparseMatrixCSC]
@eval function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::$MT, ibit::Tuple{Int})
D,kf = u1_kernel(log2dim1(state), U1, ibit...)
gpu_call(kf, state; elements=D*size(state,2))
state
function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::SDSparseMatrixCSC, locs::Tuple{Int})
instruct!(Val(2), state, Matrix(U1), locs, clocs, cval)
end
function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::AbstractMatrix, locs::Tuple{Int})
@debug "The generic U(2) matrix of size ($(size(U1))), on: GPU, locations: $(locs)."
a, c, b, d = U1
nbit = log2dim1(state)
step = 1<<(locs[1]-1)
configs = itercontrol(nbit, [locs[1]], [0])

len = length(configs)
@inline function kernel(ctx, state, a, b, c, d, len)
inds = @idx replace_first(size(state), len)
i = @inbounds configs[inds[1]]+1
@inbounds u1rows!(piecewise(state, inds), i, i+step, a, b, c, d)
return
end
gpu_call(kernel, state, a, b, c, d, len; elements=len*size(state,2))
return state
end

function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::SDPermMatrix, locs::Tuple{Int})
@debug "The single qubit permutation matrix of size ($(size(U1))), on: GPU, locations: $(locs)."
nbit = log2dim1(state)
b, c = U1.vals
step = 1<<(locs[1]-1)
configs = itercontrol(nbit, [locs[1]], [0])

len = length(configs)
function kernel(ctx, state, b, c, step, len, configs)
inds = @idx replace_first(size(state), len)
x = @inbounds configs[inds[1]] + 1
@inbounds swaprows!(piecewise(state, inds), x, x+step, c, b)
return
end
gpu_call(kernel, state, b, c, step, len, configs; elements=len*size(state,2))
return state
end

function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::SDDiagonal, locs::Tuple{Int})
@debug "The single qubit diagonal matrix of size ($(size(U1))), on: GPU, locations: $(locs)."
a, d = U1.diag
nbit = log2dim1(state)
mask = bmask(locs...)
@inline function kernel(ctx, state, a, d, mask)
inds = @cartesianidx state
i = inds[1]
piecewise(state, inds)[i] *= anyone(i-1, mask) ? d : a
return
end
gpu_call(kernel, state, a, d, mask; elements=length(state))
return state
end

instruct!(::Val{2}, state::DenseCuVecOrMat, U::IMatrix, locs::Tuple{Int}) = state

################## XYZ #############
using Yao.ConstGate: S, T, Sdag, Tdag

for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
KERNEL = Symbol(G |> string |> lowercase, :_kernel)
_instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}) where {L} = _instruct!(state, Val(:X), locs, (), ())
function _instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {L,C}
@debug "The X gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
length(locs) == 0 && return state
nbit = log2dim1(state)
configs = itercontrol(nbit, [clocs..., locs[1]], [cvals..., 0])
mask = bmask(locs...)
len = length(configs)
@inline function kernel(ctx, state, mask, len, configs)
inds = @idx replace_first(size(state), len)
b = @inbounds configs[inds[1]]
@inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1)
return
end
gpu_call(kernel, state, mask, len, configs; elements=len*size(state,2))
return state
end

function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::NTuple{C,Int}) where C
@debug "The Y gate, on: GPU, locations: $(locs)."
length(locs) == 0 && return state
nbit = log2dim1(state)
mask = bmask(Int, locs...)
configs = itercontrol(nbit, [locs[1]], [0])
bit_parity = length(locs)%2 == 0 ? 1 : -1
factor = (-im)^length(locs)
len = length(configs)
@inline function kernel(ctx, state, mask, bit_parity, configs, len)
inds = @idx replace_first(size(state), len)
b = @inbounds configs[inds[1]]
i_ = flip(b, mask) + 1
factor1 = count_ones(b&mask)%2 == 1 ? -factor : factor
factor2 = factor1*bit_parity
@inbounds swaprows!(piecewise(state, inds), b+1, i_, factor2, factor1)
return
end
gpu_call(kernel, state, mask, bit_parity, configs, len; elements=len*size(state,2))
return state
end

@eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
length(locs) == 0 && return state
function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
@debug "The Y gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
length(locs) == 0 && return state
nbit = log2dim1(state)
configs = itercontrol(nbit, [clocs..., locs...], [cvals..., 0])
mask = bmask(locs...)
len = length(configs)
@inline function kernel(ctx, state, configs, mask, len)
inds = @idx replace_first(size(state), len)
b = @inbounds configs[inds[1]]
@inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1, im, -im)
return
end
gpu_call(kernel, state, configs, mask, len; elements=len*size(state,2))
return state
end

D, kf = $KERNEL(log2dim1(state), locs)
gpu_call(kf, state; elements=D*size(state,2))
state
function _instruct!(state::DenseCuVecOrMat, ::Val{:Z}, locs::NTuple{C,Int}) where C
@debug "The Z gate, on: GPU, locations: $(locs)."
length(locs) == 0 && return state
nbit = log2dim1(state)
mask = bmask(locs...)
@inline function kernel(ctx, state, mask)
inds = @cartesianidx state
i = inds[1]
piecewise(state, inds)[i] *= count_ones((i-1)&mask)%2==1 ? -1 : 1
return
end
gpu_call(kernel, state, mask; elements=length(state))
return state
end


CKERNEL = Symbol(:c, KERNEL)
@eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, loc::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
D,kf = $CKERNEL(log2dim1(state), clocs, cvals, loc...)
gpu_call(kf, state; elements=D*size(state,2))
state
for (G, FACTOR) in zip([:Z, :S, :T, :Sdag, :Tdag], [:(-one(Int32)), :(1f0im), :($(exp(im*π/4))), :(-1f0im), :($(exp(-im*π/4)))])
if G !== :Z
@eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
@debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs)."
length(locs) == 0 && return state
nbit = log2dim1(state)
mask = bmask(Int32, locs...)
@inline function kernel(ctx, state, mask)
inds = @cartesianidx state
i = inds[1]
piecewise(state, inds)[i] *= $FACTOR ^ count_ones(Int32(i-1)&mask)
return
end
gpu_call(kernel, state, mask; elements=length(state))
return state
end
end
@eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
@debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
ctrl = controller((clocs..., locs...), (cvals..., 1))
@inline function kernel(ctx, state, ctrl)
inds = @cartesianidx state
i = inds[1]
piecewise(state, inds)[i] *= ctrl(i-1) ? $FACTOR : one($FACTOR)
return
end
gpu_call(kernel, state, ctrl; elements=length(state))
return state
end
end

for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
@eval begin
function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
_instruct!(state, g, locs)
Expand All @@ -58,18 +218,19 @@ for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
_instruct!(state, g, locs)
end

function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, loc::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
_instruct!(state, g, loc, clocs, cvals)
function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
_instruct!(state, g, locs, clocs, cvals)
end

function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, vg::Val{$(QuoteNode(G))}, loc::Tuple{Int}, cloc::Tuple{Int}, cval::Tuple{Int})
_instruct!(state, vg, loc, cloc, cval)
function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, vg::Val{$(QuoteNode(G))}, locs::Tuple{Int}, cloc::Tuple{Int}, cval::Tuple{Int})
_instruct!(state, vg, locs, cloc, cval)
end
end

end

function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:SWAP}, locs::Tuple{Int,Int})
@debug "The SWAP gate, on: GPU, locations: $(locs)."
b1, b2 = locs
mask1 = bmask(b1)
mask2 = bmask(b2)
Expand All @@ -93,8 +254,25 @@ end
using Yao.ConstGate: SWAPGate

function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:PSWAP}, locs::Tuple{Int, Int}, θ::Real)
D, kf = pswap_kernel(log2dim1(state),locs..., θ)
gpu_call(kf, state; elements=D*size(state,2))
@debug "The PSWAP gate, on: GPU, locations: $(locs)."
nbit = log2dim1(state)
mask1 = bmask(locs[1])
mask2 = bmask(locs[2])
mask12 = mask1|mask2
a, c, b_, d = mat(Rx(θ))
e = exp(-im/2*θ)
configs = itercontrol(nbit, [locs...], [0,0])
len = length(configs)
@inline function kernel(ctx, state, mask2, mask12, configs, a, b_, c, d)
inds = @idx replace_first(size(state), len)
@inbounds x = configs[inds[1]]
piecewise(state, inds)[x+1] *= e
piecewise(state, inds)[x⊻mask12+1] *= e
y = x ⊻ mask2
@inbounds u1rows!(piecewise(state, inds), y+1, y⊻mask12+1, a, b_, c, d)
return
end
gpu_call(kernel, state, mask2, mask12, configs, a, b_, c, d; elements=len*size(state,2))
state
end

Expand All @@ -113,7 +291,7 @@ for RG in [:Rx, :Ry, :Rz]
locs::Tuple{Int},
theta::Number
) where {T, N}
YaoArrayRegister.instruct!(state, Val($(QuoteNode(RG))), locs, (), (), theta)
YaoArrayRegister.instruct!(Val(2), state, Val($(QuoteNode(RG))), locs, (), (), theta)
return state
end
end
end
Loading