Skip to content

Commit

Permalink
Merge pull request #74 from QuantumBFS/fix-dispatch
Browse files Browse the repository at this point in the history
Fix kernel recompilation issue in Yao #362
  • Loading branch information
GiggleLiu committed Apr 22, 2022
2 parents b46608d + 023c337 commit 93dc626
Show file tree
Hide file tree
Showing 2 changed files with 206 additions and 207 deletions.
234 changes: 206 additions & 28 deletions src/gpuapplys.jl
Original file line number Diff line number Diff line change
@@ -1,54 +1,214 @@
using Yao.YaoBase
import Yao.YaoArrayRegister: u1rows!, unrows!, autostatic, instruct!, swaprows!

include("kernels.jl")
# get index
macro idx(shape, grididx=1, ctxsym=:ctx)
quote
x = $(esc(shape))
i = $linear_index($(esc(ctxsym)), $(esc(grididx)))
i > $prod2(x) && return
@inbounds Base.CartesianIndices(x)[i].I
end
end
replace_first(x::NTuple{2}, v) = (v, x[2])
replace_first(x::NTuple{1}, v) = (v,)
prod2(x::NTuple{2}) = x[1] * x[2]
prod2(x::NTuple{1}) = x[1]

gpu_compatible(A::AbstractVecOrMat) = A |> staticize
gpu_compatible(A::StaticArray) = A

###################### unapply! ############################
function instruct!(::Val{2}, state::DenseCuVecOrMat, U0::AbstractMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M}
@debug "The generic U(N) matrix of size ($(size(U0))), on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
nbit = log2dim1(state)
U0 = gpu_compatible(U0)
# reorder a unirary matrix.
D, kf = un_kernel(log2dim1(state), clocs, cvals, U0, locs)
U = (all(TupleTools.diff(locs).>0) ? U0 : reorder(U0, collect(locs)|>sortperm)) |> staticize
locked_bits = [clocs..., locs...]
locked_vals = [cvals..., zeros(Int, M)...]
locs_raw = [i+1 for i in itercontrol(nbit, setdiff(1:nbit, locs), zeros(Int, nbit-M))] |> staticize
configs = itercontrol(nbit, locked_bits, locked_vals)

len = length(configs)
@inline function kernel(ctx, state, locs_raw, U, configs, len)
inds = @idx replace_first(size(state), len)
x = @inbounds configs[inds[1]]
@inbounds unrows!(piecewise(state, inds), x .+ locs_raw, U)
return
end

gpu_call(kf, state; elements=D*size(state,2))
gpu_call(kernel, state, locs_raw, U, configs, len; elements=len*size(state,2))
state
end
instruct!(::Val{2}, state::DenseCuVecOrMat, U0::IMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = state
instruct!(::Val{2}, state::DenseCuVecOrMat, U0::SDSparseMatrixCSC, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = instruct!(Val(2), state, U0 |> Matrix, locs, clocs, cvals)

################## General U1 apply! ###################
for MT in [:SDDiagonal, :SDPermMatrix, :AbstractMatrix, :SDSparseMatrixCSC]
@eval function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::$MT, ibit::Tuple{Int})
D,kf = u1_kernel(log2dim1(state), U1, ibit...)
gpu_call(kf, state; elements=D*size(state,2))
state
function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::SDSparseMatrixCSC, locs::Tuple{Int})
instruct!(Val(2), state, Matrix(U1), locs, clocs, cval)
end
function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::AbstractMatrix, locs::Tuple{Int})
@debug "The generic U(2) matrix of size ($(size(U1))), on: GPU, locations: $(locs)."
a, c, b, d = U1
nbit = log2dim1(state)
step = 1<<(locs[1]-1)
configs = itercontrol(nbit, [locs[1]], [0])

len = length(configs)
@inline function kernel(ctx, state, a, b, c, d, len)
inds = @idx replace_first(size(state), len)
i = @inbounds configs[inds[1]]+1
@inbounds u1rows!(piecewise(state, inds), i, i+step, a, b, c, d)
return
end
gpu_call(kernel, state, a, b, c, d, len; elements=len*size(state,2))
return state
end

function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::SDPermMatrix, locs::Tuple{Int})
@debug "The single qubit permutation matrix of size ($(size(U1))), on: GPU, locations: $(locs)."
nbit = log2dim1(state)
b, c = U1.vals
step = 1<<(locs[1]-1)
configs = itercontrol(nbit, [locs[1]], [0])

len = length(configs)
function kernel(ctx, state, b, c, step, len, configs)
inds = @idx replace_first(size(state), len)
x = @inbounds configs[inds[1]] + 1
@inbounds swaprows!(piecewise(state, inds), x, x+step, c, b)
return
end
gpu_call(kernel, state, b, c, step, len, configs; elements=len*size(state,2))
return state
end

function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::SDDiagonal, locs::Tuple{Int})
@debug "The single qubit diagonal matrix of size ($(size(U1))), on: GPU, locations: $(locs)."
a, d = U1.diag
nbit = log2dim1(state)
mask = bmask(locs...)
@inline function kernel(ctx, state, a, d, mask)
inds = @cartesianidx state
i = inds[1]
piecewise(state, inds)[i] *= anyone(i-1, mask) ? d : a
return
end
gpu_call(kernel, state, a, d, mask; elements=length(state))
return state
end

instruct!(::Val{2}, state::DenseCuVecOrMat, U::IMatrix, locs::Tuple{Int}) = state

################## XYZ #############
using Yao.ConstGate: S, T, Sdag, Tdag

for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
KERNEL = Symbol(G |> string |> lowercase, :_kernel)
_instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}) where {L} = _instruct!(state, Val(:X), locs, (), ())
function _instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {L,C}
@debug "The X gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
length(locs) == 0 && return state
nbit = log2dim1(state)
configs = itercontrol(nbit, [clocs..., locs[1]], [cvals..., 0])
mask = bmask(locs...)
len = length(configs)
@inline function kernel(ctx, state, mask, len, configs)
inds = @idx replace_first(size(state), len)
b = @inbounds configs[inds[1]]
@inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1)
return
end
gpu_call(kernel, state, mask, len, configs; elements=len*size(state,2))
return state
end

function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::NTuple{C,Int}) where C
@debug "The Y gate, on: GPU, locations: $(locs)."
length(locs) == 0 && return state
nbit = log2dim1(state)
mask = bmask(Int, locs...)
configs = itercontrol(nbit, [locs[1]], [0])
bit_parity = length(locs)%2 == 0 ? 1 : -1
factor = (-im)^length(locs)
len = length(configs)
@inline function kernel(ctx, state, mask, bit_parity, configs, len)
inds = @idx replace_first(size(state), len)
b = @inbounds configs[inds[1]]
i_ = flip(b, mask) + 1
factor1 = count_ones(b&mask)%2 == 1 ? -factor : factor
factor2 = factor1*bit_parity
@inbounds swaprows!(piecewise(state, inds), b+1, i_, factor2, factor1)
return
end
gpu_call(kernel, state, mask, bit_parity, configs, len; elements=len*size(state,2))
return state
end

@eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
length(locs) == 0 && return state
function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
@debug "The Y gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
length(locs) == 0 && return state
nbit = log2dim1(state)
configs = itercontrol(nbit, [clocs..., locs...], [cvals..., 0])
mask = bmask(locs...)
len = length(configs)
@inline function kernel(ctx, state, configs, mask, len)
inds = @idx replace_first(size(state), len)
b = @inbounds configs[inds[1]]
@inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1, im, -im)
return
end
gpu_call(kernel, state, configs, mask, len; elements=len*size(state,2))
return state
end

D, kf = $KERNEL(log2dim1(state), locs)
gpu_call(kf, state; elements=D*size(state,2))
state
function _instruct!(state::DenseCuVecOrMat, ::Val{:Z}, locs::NTuple{C,Int}) where C
@debug "The Z gate, on: GPU, locations: $(locs)."
length(locs) == 0 && return state
nbit = log2dim1(state)
mask = bmask(locs...)
@inline function kernel(ctx, state, mask)
inds = @cartesianidx state
i = inds[1]
piecewise(state, inds)[i] *= count_ones((i-1)&mask)%2==1 ? -1 : 1
return
end
gpu_call(kernel, state, mask; elements=length(state))
return state
end


CKERNEL = Symbol(:c, KERNEL)
@eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, loc::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
D,kf = $CKERNEL(log2dim1(state), clocs, cvals, loc...)
gpu_call(kf, state; elements=D*size(state,2))
state
for (G, FACTOR) in zip([:Z, :S, :T, :Sdag, :Tdag], [:(-one(Int32)), :(1f0im), :($(exp(im*π/4))), :(-1f0im), :($(exp(-im*π/4)))])
if G !== :Z
@eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
@debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs)."
length(locs) == 0 && return state
nbit = log2dim1(state)
mask = bmask(Int32, locs...)
@inline function kernel(ctx, state, mask)
inds = @cartesianidx state
i = inds[1]
piecewise(state, inds)[i] *= $FACTOR ^ count_ones(Int32(i-1)&mask)
return
end
gpu_call(kernel, state, mask; elements=length(state))
return state
end
end
@eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
@debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
ctrl = controller((clocs..., locs...), (cvals..., 1))
@inline function kernel(ctx, state, ctrl)
inds = @cartesianidx state
i = inds[1]
piecewise(state, inds)[i] *= ctrl(i-1) ? $FACTOR : one($FACTOR)
return
end
gpu_call(kernel, state, ctrl; elements=length(state))
return state
end
end

for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
@eval begin
function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
_instruct!(state, g, locs)
Expand All @@ -58,18 +218,19 @@ for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
_instruct!(state, g, locs)
end

function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, loc::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
_instruct!(state, g, loc, clocs, cvals)
function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
_instruct!(state, g, locs, clocs, cvals)
end

function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, vg::Val{$(QuoteNode(G))}, loc::Tuple{Int}, cloc::Tuple{Int}, cval::Tuple{Int})
_instruct!(state, vg, loc, cloc, cval)
function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, vg::Val{$(QuoteNode(G))}, locs::Tuple{Int}, cloc::Tuple{Int}, cval::Tuple{Int})
_instruct!(state, vg, locs, cloc, cval)
end
end

end

function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:SWAP}, locs::Tuple{Int,Int})
@debug "The SWAP gate, on: GPU, locations: $(locs)."
b1, b2 = locs
mask1 = bmask(b1)
mask2 = bmask(b2)
Expand All @@ -93,8 +254,25 @@ end
using Yao.ConstGate: SWAPGate

function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:PSWAP}, locs::Tuple{Int, Int}, θ::Real)
D, kf = pswap_kernel(log2dim1(state),locs..., θ)
gpu_call(kf, state; elements=D*size(state,2))
@debug "The PSWAP gate, on: GPU, locations: $(locs)."
nbit = log2dim1(state)
mask1 = bmask(locs[1])
mask2 = bmask(locs[2])
mask12 = mask1|mask2
a, c, b_, d = mat(Rx(θ))
e = exp(-im/2*θ)
configs = itercontrol(nbit, [locs...], [0,0])
len = length(configs)
@inline function kernel(ctx, state, mask2, mask12, configs, a, b_, c, d)
inds = @idx replace_first(size(state), len)
@inbounds x = configs[inds[1]]
piecewise(state, inds)[x+1] *= e
piecewise(state, inds)[xmask12+1] *= e
y = x mask2
@inbounds u1rows!(piecewise(state, inds), y+1, ymask12+1, a, b_, c, d)
return
end
gpu_call(kernel, state, mask2, mask12, configs, a, b_, c, d; elements=len*size(state,2))
state
end

Expand All @@ -113,7 +291,7 @@ for RG in [:Rx, :Ry, :Rz]
locs::Tuple{Int},
theta::Number
) where {T, N}
YaoArrayRegister.instruct!(state, Val($(QuoteNode(RG))), locs, (), (), theta)
YaoArrayRegister.instruct!(Val(2), state, Val($(QuoteNode(RG))), locs, (), (), theta)
return state
end
end
end
Loading

0 comments on commit 93dc626

Please sign in to comment.