|
| 1 | +module DynamicExpressionsCUDAExt |
| 2 | + |
| 3 | +# TODO: Switch to KernelAbstractions.jl (once they hit v1.0) |
| 4 | +using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx |
| 5 | +using DynamicExpressions: OperatorEnum, AbstractExpressionNode |
| 6 | +using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna |
| 7 | +using DynamicExpressions.AsArrayModule: as_array |
| 8 | + |
| 9 | +import DynamicExpressions.EvaluateEquationModule: eval_tree_array |
| 10 | + |
# Host-backed stand-in for a `CuArray`, used exclusively in tests: it takes the
# same dispatch role as a device array while keeping data in ordinary memory,
# so the GPU code paths can be exercised without actual CUDA hardware.
struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N}
    a::A
end
# Minimal AbstractArray interface, forwarding everything to the wrapped array:
Base.size(fc::FakeCuArray) = size(fc.a)
Base.getindex(fc::FakeCuArray, idx::Int...) = getindex(fc.a, idx...)
Base.setindex!(fc::FakeCuArray, value, idx::Int...) = setindex!(fc.a, value, idx...)
Base.similar(fc::FakeCuArray, dims::Integer...) = FakeCuArray(similar(fc.a, dims...))
| 19 | + |
# Either a real device array or the test-only fake; both are accepted by the
# `eval_tree_array` methods below.
const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}}

# Copy host data `a` to the same "device" as the second argument
# (real `CuArray`, or the host-backed fake used in tests).
to_device(a, ::CuArray) = CuArray(a)
to_device(a, ::FakeCuArray) = FakeCuArray(a)
| 24 | + |
"""
    eval_tree_array(tree, gcX, operators; kws...)

Evaluate a single expression `tree` over the (possibly GPU-resident) data
matrix `gcX` by delegating to the batched method with a one-element tuple,
then unwrapping the single output and its success flag.
"""
function eval_tree_array(
    tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws...
) where {T<:Number}
    outputs, flags = eval_tree_array((tree,), gcX, operators; kws...)
    return (only(outputs), only(flags))
end
| 31 | + |
"""
    eval_tree_array(trees, gcX, operators; kws...)

Evaluate a collection of expression `trees` over the data matrix `gcX`
(laid out features × rows, per the `cX[feature, elem]` access in the kernel)
on the GPU, or on the host when given a `FakeCuArray`.

The keyword arguments allow buffers to be pre-allocated and re-used across
calls. When `update_buffers=Val(false)`, the host-side flattening via
`as_array` is skipped entirely, and the caller must supply `gpu_workspace`,
`gpu_buffer`, `roots`, `num_nodes`, and `num_launches` (no validation is
performed here — missing values will error downstream).

Returns `(out, is_good)`: one workspace view per tree root, and a matching
collection of success flags (always `true` — see note below).
"""
function eval_tree_array(
    trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}},
    gcX::MaybeCuArray{T,2},
    operators::OperatorEnum;
    buffer=nothing,          # host-side Int32 buffer for `as_array`, re-usable across calls
    gpu_workspace=nothing,   # pre-allocated device workspace, re-usable across calls
    gpu_buffer=nothing,      # pre-allocated device copy of the index buffer
    roots=nothing,           # root-node indices; required when `update_buffers=Val(false)`
    num_nodes=nothing,       # total node count; required when `update_buffers=Val(false)`
    num_launches=nothing,    # kernel-launch count; required when `update_buffers=Val(false)`
    update_buffers::Val{_update_buffers}=Val(true),
    kws...,
) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers}
    if _update_buffers
        # Flatten the trees into contiguous host arrays (node metadata, topology,
        # constants, and the launch schedule); this overwrites the corresponding
        # keyword-argument locals above.
        (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer)
    end
    num_elem = size(gcX, 2)

    ## The following array is our "workspace" for
    ## the GPU kernel, with size equal to the number of rows
    ## in the input data by the number of nodes in the tree.
    ## It has one extra row to store the constant values.
    gworkspace = if gpu_workspace === nothing
        similar(gcX, num_elem + 1, num_nodes)
    else
        gpu_workspace
    end
    # The extra (last) row of the workspace holds each node's constant value.
    gval = @view gworkspace[end, :]
    if _update_buffers
        copyto!(gval, val)
    end

    ## Index arrays (much faster to have `@view` here)
    gbuffer = if !_update_buffers
        gpu_buffer  # caller guarantees this is already populated on-device
    elseif gpu_buffer === nothing
        to_device(buffer, gcX)
    else
        copyto!(gpu_buffer, buffer)  # `copyto!` returns the destination array
    end
    # One row of per-node metadata each; row layout must match `as_array`:
    gdegree = @view gbuffer[1, :]
    gfeature = @view gbuffer[2, :]
    gop = @view gbuffer[3, :]
    gexecution_order = @view gbuffer[4, :]
    gidx_self = @view gbuffer[5, :]
    gidx_l = @view gbuffer[6, :]
    gidx_r = @view gbuffer[7, :]
    gconstant = @view gbuffer[8, :]

    # One thread per (element, node) pair; block count rounded up to a power of two.
    num_threads = 256
    num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads))

    #! format: off
    _launch_gpu_kernel!(
        num_threads, num_blocks, num_launches, gworkspace,
        # Thread info:
        num_elem, num_nodes, gexecution_order,
        # Input data and tree
        operators, gcX, gidx_self, gidx_l, gidx_r,
        gdegree, gconstant, gval, gfeature, gop,
    )
    #! format: on

    # Each root's workspace column (excluding the constants row) is its output.
    out = (r -> @view(gworkspace[begin:(end - 1), r])).(roots)
    # NOTE(review): unlike the CPU evaluator, this kernel does not detect
    # NaN/Inf failures, so every tree is reported as successful.
    is_good = (_ -> true).(trees)

    return (out, is_good)
end
| 100 | + |
#! format: off
"""
    _launch_gpu_kernel!(num_threads, num_blocks, num_launches, buffer, ...)

Run the evaluation kernel once per scheduling "level": nodes whose
`execution_order` equals the current `launch` index depend only on nodes
evaluated in earlier launches, so each launch is internally data-parallel.
Dispatches to a real `@cuda` launch for `CuArray` buffers, or emulates the
CUDA grid with CPU threads otherwise (the `FakeCuArray` test path).
"""
function _launch_gpu_kernel!(
    num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2},
    # Thread info:
    num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I},
    # Input data and tree
    operators::OperatorEnum, cX::AbstractArray{T,2}, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
    degree::AbstractArray, constant::AbstractArray, val::AbstractArray{T,1}, feature::AbstractArray, op::AbstractArray,
) where {I,T}
    #! format: on
    nuna = get_nuna(typeof(operators))
    nbin = get_nbin(typeof(operators))
    # Kernels are pre-generated below only for 0:10 operators of each arity.
    (nuna > 10 || nbin > 10) &&
        error("Too many operators. Kernels are only compiled up to 10.")
    gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin))
    # Iterate in `I` (the eltype of `execution_order`) so the comparison inside
    # the kernel stays type-stable.
    for launch in one(I):I(num_launches)
        #! format: off
        if buffer isa CuArray
            @cuda threads=num_threads blocks=num_blocks gpu_kernel!(
                buffer,
                launch, num_elem, num_nodes, execution_order,
                cX, idx_self, idx_l, idx_r,
                degree, constant, val, feature, op
            )
        else
            # CPU fallback: pass the flat grid index `i` explicitly, since
            # there is no CUDA thread context to derive it from.
            Threads.@threads for i in 1:(num_threads * num_blocks)
                gpu_kernel!(
                    buffer,
                    launch, num_elem, num_nodes, execution_order,
                    cX, idx_self, idx_l, idx_r,
                    degree, constant, val, feature, op,
                    i
                )
            end
        end
        #! format: on
    end
    return nothing
end
| 140 | + |
# Need to pre-compute the GPU kernels with an `@eval` for each number of operators
# 1. We need to use an `@nif` over operators, as GPU kernels
#    can't index into arrays of operators.
# 2. `@nif` is evaluated at parse time and needs to know the number of
#    ifs to generate at that time, so we can't simply use specialization.
# 3. We can't use `@generated` because we can't create closures in those.
for nuna in 0:10, nbin in 0:10
    # Each `create_gpu_kernel` method returns an anonymous kernel closure that
    # evaluates one (element, node) pair per thread.
    @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin})
        #! format: off
        function (
            # Storage:
            buffer,
            # Thread info:
            launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray,
            # Input data and tree
            cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
            degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray,
            # Override for unittesting:
            i=nothing,
        )
            #! format: on
            # Flat index: derived from the CUDA grid on-device, or passed
            # explicitly by the CPU fallback in `_launch_gpu_kernel!`.
            i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i
            if i > num_elem * num_nodes
                return nothing
            end

            # Decode the flat index into a (node, data-element) pair;
            # inverse of `i = (elem - 1) * num_nodes + node`.
            node = (i - 1) % num_nodes + 1
            elem = (i - node) ÷ num_nodes + 1

            # Only evaluate nodes scheduled for this launch; their children
            # were filled in by earlier launches.
            if execution_order[node] != launch
                return nothing
            end

            cur_degree = degree[node]
            cur_idx = idx_self[node]
            if cur_degree == 0
                # Leaf node: either a stored constant or a feature column of `cX`.
                if constant[node] == 1
                    cur_val = val[node]
                    buffer[elem, cur_idx] = cur_val
                else
                    cur_feature = feature[node]
                    buffer[elem, cur_idx] = cX[cur_feature, elem]
                end
            else
                if cur_degree == 1 && $nuna > 0
                    # Unary operator: `@nif` unrolls one branch per operator,
                    # since kernels cannot dynamically index a tuple of functions.
                    cur_op = op[node]
                    l_idx = idx_l[node]
                    Base.Cartesian.@nif(
                        $nuna,
                        i -> i == cur_op,
                        i -> let op = operators.unaops[i]
                            buffer[elem, cur_idx] = op(buffer[elem, l_idx])
                        end
                    )
                elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty
                    # Binary operator, dispatched via the same unrolled `@nif`.
                    cur_op = op[node]
                    l_idx = idx_l[node]
                    r_idx = idx_r[node]
                    Base.Cartesian.@nif(
                        $nbin,
                        i -> i == cur_op,
                        i -> let op = operators.binops[i]
                            buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx])
                        end
                    )
                end
            end
            return nothing
        end
    end
end
| 212 | + |
| 213 | +end |
0 commit comments