module DynamicExpressionsCUDAExt
#! format: off
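# This extension provides GPU evaluation of expression trees: each tree is
# flattened into index arrays with `as_array`, and one CUDA kernel is launched
# per level of the execution order (see `_launch_gpu_kernel!` below).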
using CUDA
using DynamicExpressions: OperatorEnum, AbstractExpressionNode
using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna
using DynamicExpressions.AsArrayModule: as_array
import DynamicExpressions.EvaluateEquationModule: eval_tree_array
function eval_tree_array(
tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; _...
) where {T<:Number}
(outs, is_good) = eval_tree_array((tree,), gcX, operators)
return (only(outs), only(is_good))
end
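# A minimal usage sketch (illustrative only; assumes a CUDA-capable device and
# the standard `OperatorEnum`/`Node` constructors from DynamicExpressions):
#
#     using CUDA, DynamicExpressions
#     operators = OperatorEnum(; binary_operators=[+, *, -], unary_operators=[cos])
#     x1 = Node{Float32}(; feature=1)
#     tree = cos(x1 * 3.2f0)
#     gcX = CUDA.rand(Float32, 3, 1000)  # 3 features, 1000 rows
#     out, is_good = eval_tree_array(tree, gcX, operators)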
function eval_tree_array(
trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _...
) where {T<:Number,N<:AbstractExpressionNode{T},M}
(; val, execution_order, roots, buffer) = as_array(Int32, trees...)
num_launches = maximum(execution_order)
num_elem = size(gcX, 2)
num_nodes = size(buffer, 2)
## Floating point arrays:
gworkspace = CuArray{T}(undef, num_elem, num_nodes)
gval = CuArray(val)
## Index arrays (much faster to copy one buffer and take `@view`s of its rows):
gbuffer = CuArray(buffer)
gdegree = @view gbuffer[1, :]
gfeature = @view gbuffer[2, :]
gop = @view gbuffer[3, :]
gexecution_order = @view gbuffer[4, :]
gidx_self = @view gbuffer[5, :]
gidx_l = @view gbuffer[6, :]
gidx_r = @view gbuffer[7, :]
gconstant = @view gbuffer[8, :]
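    ## One thread per (element, node) pair: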
num_threads = 256
num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads))
_launch_gpu_kernel!(
num_threads, num_blocks, num_launches, gworkspace,
# Thread info:
num_elem, num_nodes, gexecution_order,
# Input data and tree
operators, gcX, gidx_self, gidx_l, gidx_r,
gdegree, gconstant, gval, gfeature, gop,
)
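    ## Each tree's output is a view of the workspace column for its root node: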
out = ntuple(
i -> @view(gworkspace[:, roots[i]]),
Val(M)
)
is_good = ntuple(
        i -> true, # Validity checks (e.g., for NaNs) are left to the caller
Val(M)
)
return (out, is_good)
end
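# Multiple trees may be fused into a single evaluation by passing a tuple:
# e.g., `outs, flags = eval_tree_array((tree1, tree2), gcX, operators)`,
# where `outs[i]` is a view of the workspace for tree `i`.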
function _launch_gpu_kernel!(
num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2},
# Thread info:
num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I},
# Input data and tree
operators::OperatorEnum, cX::AbstractArray{T,2}, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
degree::AbstractArray, constant::AbstractArray, val::AbstractArray{T,1}, feature::AbstractArray, op::AbstractArray,
) where {I,T}
nuna = get_nuna(typeof(operators))
nbin = get_nbin(typeof(operators))
(nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.")
gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin))
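    # One launch per level of the execution order; a node in launch `k` only
    # reads child results written by launches `1:k-1`: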
for launch in one(I):I(num_launches)
@cuda threads=num_threads blocks=num_blocks gpu_kernel!(
buffer,
launch, num_elem, num_nodes, execution_order,
cX, idx_self, idx_l, idx_r,
degree, constant, val, feature, op
)
end
return nothing
end
# We pre-compile the GPU kernels with an `@eval` for each possible number of operators:
# 1. We need to use an `@nif` over the operators, as GPU kernels
#    can't index into arrays of operators.
# 2. `@nif` is evaluated at parse time and needs to know the number of
#    branches to generate at that time, so we can't simply rely on specialization.
# 3. We can't use `@generated` functions, because closures can't be created
#    inside them.
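# As an illustration of (1): for `nbin = 2`, the binary `@nif` branch in the
# kernel below expands to roughly
#
#     if cur_op == 1
#         buffer[elem, cur_idx] = operators.binops[1](buffer[elem, l_idx], buffer[elem, r_idx])
#     else
#         buffer[elem, cur_idx] = operators.binops[2](buffer[elem, l_idx], buffer[elem, r_idx])
#     end
#
# i.e., a static branch per operator rather than a dynamic lookup.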
for nuna in 0:10, nbin in 0:10
@eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin})
function (
# Storage:
buffer,
# Thread info:
launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray,
# Input data and tree
cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray,
)
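        # Flat thread index over all (element, node) pairs: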
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if i > num_elem * num_nodes
return nothing
end
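        # Decompose the flat index into a node and an element, and skip nodes
        # that are not scheduled for this launch: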
node = (i - 1) % num_nodes + 1
elem = (i - node) ÷ num_nodes + 1
if execution_order[node] != launch
return nothing
end
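        # Leaves (degree 0) load a constant or a feature column; interior nodes
        # apply their operator to child results already in the buffer: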
cur_degree = degree[node]
cur_idx = idx_self[node]
if cur_degree == 0
if constant[node] == 1
cur_val = val[node]
buffer[elem, cur_idx] = cur_val
else
cur_feature = feature[node]
buffer[elem, cur_idx] = cX[cur_feature, elem]
end
else
if cur_degree == 1
cur_op = op[node]
l_idx = idx_l[node]
Base.Cartesian.@nif(
$nuna,
i -> i == cur_op,
i -> let op = operators.unaops[i]
buffer[elem, cur_idx] = op(buffer[elem, l_idx])
end
)
else
cur_op = op[node]
l_idx = idx_l[node]
r_idx = idx_r[node]
Base.Cartesian.@nif(
$nbin,
i -> i == cur_op,
i -> let op = operators.binops[i]
buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx])
end
)
end
end
return nothing
end
end
end
#! format: on
end