Tractables · YitaoLiang · Sep 3, 2020 · Aug 6, 2020 · Aug 6, 2020 · Aug 7, 2020
diff --git a/Project.toml b/Project.toml
@@ -5,6 +5,7 @@ version = "0.1.1"
 [deps]
 BlossomV = "6c721016-9dae-5d90-abf6-67daaccb2332"
 Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 LightGraphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
 LogicCircuits = "a7847b3b-b7f1-4dd5-83c3-60e0aa0f8599"
@@ -19,10 +20,13 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
+MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 
 [compat]
 BlossomV = "0.4"
 Clustering = "0.14"
+CUDA = "1.2"
 DataStructures = "0.17"
 LightGraphs = "1.3"
 LogicCircuits = "0.1.1"
@@ -32,4 +36,6 @@ Reexport = "0.2"
 SimpleWeightedGraphs = "1.1"
 StatsBase = "0.33"
 StatsFuns = "0.9"
+LoopVectorization = "0.8.20"
+MLDatasets = "0.4, 0.5"
 julia = "1.5"
diff --git a/src/LoadSave/circuit_line_compiler.jl b/src/LoadSave/circuit_line_compiler.jl
@@ -111,7 +111,7 @@ function decorate_logistic(lines::CircuitFormatLines, logic_circuit::LogicCircui
 
     function compile(ln::BiasLine)
         root = id2logisticnode(ln.node_id)::Logistic⋁Node
-        # @assert length(node.thetas) == 1
+        # @assert length(root.thetas) == 1
         root.thetas[1,:] .= ln.weights
     end
 

diff --git a/src/Logistic/Logistic.jl b/src/Logistic/Logistic.jl
@@ -5,7 +5,9 @@ using ..Utils
 
 include("logistic_nodes.jl")
 include("queries.jl")
+# include("parameter_circuit.jl")
+# include("learn_parameters.jl")
 
-# TODO learning
+# TODO structure learning
 
 end
diff --git a/src/Logistic/learn_parameters.jl b/src/Logistic/learn_parameters.jl
@@ -0,0 +1,49 @@
+export learn_parameters
+
+using LogicCircuits: compute_flows, or_nodes
+using LoopVectorization: @avx
+
+"""
+Maximum likilihood estimation of parameters given data through gradient descent
+"""
+function learn_parameters(lc::LogisticCircuit, classes::Int, data, labels; num_epochs=30, step_size=0.1, flows_computed=false)
+
+    @inline function one_hot(labels::Vector, classes::Int)        
+        one_hot_labels = zeros(length(labels), classes)
+        for (i, j) in enumerate(labels)
+            one_hot_labels[i, j] = 1.0
+        end
+        one_hot_labels
+    end
+
+    one_hot_labels = one_hot(labels, classes)
+    if !flows_computed
+        compute_flows(lc, data)
+    end
+
+    for _ = 1:num_epochs
+        class_probs = class_likelihood_per_instance(lc, classes, data; flows_computed=true)
+        update_parameters(lc, class_probs, one_hot_labels)
+    end
+
+    nothing
+end
+
+
+@inline function update_parameters(lc::LogisticCircuit, class_probs, one_hot_labels; step_size=0.1)
+    num_samples = Float64(size(one_hot_labels)[1])
+    error = class_probs .- one_hot_labels
+
+    foreach(or_nodes(lc)) do ln
+        foreach(eachrow(ln.thetas), children(ln)) do theta, c
+            flow = Float64.(downflow(ln, c))
+            @avx update_amount = flow' * error / num_samples * step_size
+            update_amount = dropdims(update_amount; dims=1)
+            @avx @. theta -= update_amount
+        end
+    end
+
+    nothing
+end
+
+
diff --git a/src/Logistic/logistic_nodes.jl b/src/Logistic/logistic_nodes.jl
@@ -1,12 +1,8 @@
 export 
     LogisticCircuit, 
-    LogisticLeafNode, 
-    LogisticInnerNode, 
-    LogisticLiteral,
-    Logistic⋀Node,
-    Logistic⋁Node,
-    classes,
-    num_parameters_perclass
+    LogisticLeafNode, LogisticInnerNode, 
+    LogisticLiteral, Logistic⋀Node, Logistic⋁Node,
+    num_classes, num_parameters_per_class
 
 #####################
 # Infrastructure for logistic circuit nodes
@@ -54,11 +50,11 @@ A logistic disjunction node (Or node)
 """
 mutable struct Logistic⋁Node <: LogisticInnerNode
     children::Vector{<:LogisticCircuit}
-    thetas::Array{Float64, 2}
+    thetas::Matrix{Float64}
     data
     counter::UInt32
     Logistic⋁Node(children, class::Int) = begin
-        new(convert(Vector{LogisticCircuit}, children), init_array(Float64, length(children), class), nothing, 0)
+        new(convert(Vector{LogisticCircuit}, children), init_array(Float32, length(children), class), nothing, 0)
     end
 end
 
@@ -77,11 +73,11 @@ import LogicCircuits.GateType # make available for extension
 
 import LogicCircuits: children # make available for extension
 @inline children(n::LogisticInnerNode) = n.children
-@inline classes(n::Logistic⋁Node) = size(n.thetas)[2]
+@inline num_classes(n::Logistic⋁Node) = size(n.thetas)[2]
 
 import ..Utils: num_parameters
 @inline num_parameters(c::LogisticCircuit) = sum(n -> num_children(n) * classes(n), ⋁_nodes(c))
-@inline num_parameters_perclass(c::LogisticCircuit) = sum(n -> num_children(n), ⋁_nodes(c))
+@inline num_parameters_per_class(c::LogisticCircuit) = sum(n -> num_children(n), ⋁_nodes(c))
 
 #####################
 # constructors and conversions

diff --git a/src/Logistic/parameter_circuit.jl b/src/Logistic/parameter_circuit.jl
@@ -0,0 +1,225 @@
+using CUDA
+using LogicCircuits
+
+export LayeredParameterCircuit, CuLayeredParameterCircuit
+export class_likelihood, class_weights
+export one_hot, learn_parameters, update_parameters
+
+# in a parameter circuit
+# 1 is true, 2 is false
+const TRUE_ID = Int32(1)
+const FALSE_ID = Int32(2)
+
+struct LayeredParameterCircuit
+    layered_circuit::LayeredBitCircuit
+    layered_parameters::Vector{Matrix{Float32}}
+end
+
+LayeredParameterCircuit(circuit::LogisticCircuit, nc::Integer, num_features::Integer) = begin
+    @assert is⋁gate(circuit)
+    decisions::Vector{Vector{Int32}} = Vector{Vector{Int32}}()
+    elements::Vector{Vector{Int32}} = Vector{Vector{Int32}}()
+    parameters::Vector{Vector{Float32}} = Vector{Vector{Float32}}()
+    num_decisions::Int32 = 2 * num_features + 2
+    num_elements::Vector{Int32} = Vector{Int32}()
+    # num_parameters always equals num_elements
+
+    ensure_layer(i) = begin
+        if length(decisions) < i
+            # add a new layer
+            push!(decisions, Int32[])
+            push!(elements, Int32[])
+            push!(parameters, Float32[])
+            push!(num_elements, 0)
+        end
+    end
+
+    f_con(n) = LayeredDecisionId(0, istrue(n) ? TRUE_ID : FALSE_ID)
+    f_lit(n) = LayeredDecisionId(0, 
+        ispositive(n) ? Int32(2 + variable(n)) : Int32(2 + num_features + variable(n)))
+
+    f_and(n, cs) = begin
+        @assert length(cs) == 2
+        LayeredDecisionId[cs[1], cs[2]]
+    end
+    f_or(n, cs) = begin
+        num_decisions += 1
+        # determine layer
+        layer_id = zero(Int32)
+        for c in cs
+            if c isa Vector{LayeredDecisionId}
+                @assert length(c) == 2
+                layer_id = max(layer_id, c[1].layer_id, c[2].layer_id)
+            else
+                @assert c isa LayeredDecisionId
+                layer_id = max(layer_id, c.layer_id)
+            end
+        end
+        layer_id += 1
+        ensure_layer(layer_id)
+        first_element = num_elements[layer_id] + 1
+        foreach(cs, eachrow(n.thetas)) do c, theta
+            @assert size(theta)[1] == nc
+            append!(parameters[layer_id], theta)
+            num_elements[layer_id] += 1
+            if c isa Vector{LayeredDecisionId}
+                push!(elements[layer_id], c[1].decision_id, c[2].decision_id)
+            else
+                push!(elements[layer_id], c.decision_id, TRUE_ID)
+            end
+        end
+        push!(decisions[layer_id], num_decisions, first_element, num_elements[layer_id])
+        LayeredDecisionId(layer_id, num_decisions)
+    end
+
+    foldup_aggregate(circuit, f_con, f_lit, f_and, f_or, 
+        Union{LayeredDecisionId,Vector{LayeredDecisionId}})
+
+    circuit_layers = map(decisions, elements) do d, e
+        Layer(reshape(d, 3, :), reshape(e, 2, :))
+    end
+    parameter_layers = map(parameters) do p
+        reshape(p, nc, :)
+    end
+    return LayeredParameterCircuit(LayeredBitCircuit(circuit_layers), parameter_layers)
+end
+
+struct CuLayeredParameterCircuit
+    layered_circuit::CuLayeredBitCircuit
+    layered_parameters::Vector{CuMatrix{Float32}}
+    CuLayeredParameterCircuit(l::LayeredParameterCircuit) = new(CuLayeredBitCircuit(l.layered_circuit), map(CuMatrix, l.layered_parameters))
+end
+
+
+
+function class_likelihood(circuit::CuLayeredParameterCircuit, nc::Integer, data::CuMatrix{Float32}, reuse_up=nothing, reuse_down=nothing, reuse_cp=nothing)
+    cw, flow, v = class_weights(circuit, nc, data, reuse_up, reuse_down, reuse_cp)
+    one = Float32(1.0)
+    return @. one / (one + exp(-cw)), flow, v
+end
+
+function class_weights(circuit::CuLayeredParameterCircuit, nc::Integer, data::CuMatrix{Float32}, reuse_up=nothing, reuse_down=nothing, reuse_cw=nothing)
+    flow, v = compute_flows2(circuit.layered_circuit, data, reuse_up, reuse_down)
+    cw = calculate_class_weights(circuit, nc, data, v, flow, reuse_cw)
+    return cw, flow, v
+end
+
+function calculate_class_weights(circuit::CuLayeredParameterCircuit, nc::Integer, data::CuMatrix{Float32}, v, flow, reuse_cw=nothing)
+    ne = num_examples(data)
+    cw = if reuse_cw isa CuMatrix{Float32} && size(reuse_cw) == (ne, nc)
+        reuse_cw .= zero(Float32)
+        reuse_cw
+    else
+        CUDA.zeros(Float32, ne, nc)
+    end
+
+    dec_per_thread = 4
+    CUDA.@sync for i = 1:length(circuit.layered_circuit.layers)
+        circuit_layer = circuit.layered_circuit.layers[i]
+        parameter_layer = circuit.layered_parameters[i]
+        ndl = num_decisions(circuit_layer)
+        num_threads = balance_threads(ne, ndl / dec_per_thread, 8)
+        num_blocks = ceil(Int, ne / num_threads[1]), ceil(Int, ndl / num_threads[2] / dec_per_thread)
+        @cuda threads=num_threads blocks=num_blocks calculate_class_weights_layer_kernel_cuda(cw, v, flow, circuit_layer.decisions, circuit_layer.elements, parameter_layer)
+    end
+
+    return cw
+end
+
+function calculate_class_weights_layer_kernel_cuda(cw, v, flow, decisions, elements, parameters)
+    index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    index_y = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+    stride_x = blockDim().x * gridDim().x
+    stride_y = blockDim().y * gridDim().y
+    ne, nc = size(cw)
+    _, num_decisions = size(decisions)
+
+    for j = index_x:stride_x:ne
+        for i = index_y:stride_y:num_decisions
+            decision_id = @inbounds decisions[1, i]
+            n_up = @inbounds v[j, decision_id]
+            if n_up > zero(Float32)
+                first_elem = @inbounds decisions[2, i]
+                last_elem = @inbounds decisions[3, i]
+                n_down = @inbounds flow[j, decision_id]
+                for e = first_elem:last_elem
+                    e1 = @inbounds elements[1, first_elem]
+                    e2 = @inbounds elements[2, first_elem]
+                    e_up = @inbounds (v[j, e1] * v[j, e2])
+                    edge_flow = e_up / n_up * n_down
+                    # following needs to be memory safe
+                    for class=1:nc
+                        @CUDA.atomic cw[j, class] += edge_flow * parameters[class, e] # atomic is automatically inbounds
+                    end
+                end
+            end
+        end
+    end
+
+    return nothing
+end
+
+
+
+function one_hot(labels::Vector, nc::Integer)    
+    ne = length(labels) 
+    one_hot_labels = zeros(Float32, ne, nc)
+    for (i, j) in enumerate(labels)
+        one_hot_labels[i, j + 1] = 1.0
+    end
+    one_hot_labels
+end
+
+function learn_parameters(circuit::CuLayeredParameterCircuit, nc::Integer, data::CuMatrix{Float32}, labels::CuMatrix{Float32}, reuse_up=nothing, reuse_down=nothing, reuse_cp=nothing, num_epochs=20, step_size=0.0001)
+    cp, flow, v = class_likelihood(circuit, nc, data, reuse_up, reuse_down, reuse_cp)
+    update_parameters(circuit, labels, cp, flow, step_size)
+    for _ = 2:num_epochs
+        cp, flow, v = class_likelihood(circuit, nc, data, v, flow, cp)
+        update_parameters(circuit, labels, cp, v, flow, step_size)
+    end
+    return nothing
+end
+
+function update_parameters(circuit::CuLayeredParameterCircuit, labels, cp, v, flow, step_size=0.0001)
+    _, nc = size(labels)
+    step_size = Float32(step_size)
+    CUDA.@sync for i = 1:length(circuit.layered_circuit.layers)
+        circuit_layer = circuit.layered_circuit.layers[i]
+        flow_layer = flow[i]
+        parameter_layer = circuit.layered_parameters[i]
+        ndl = num_decisions(circuit_layer)
+        num_threads = balance_threads(ndl, nc, 6)
+        num_threads = num_threads[1], num_threads[2], 
+        num_blocks = ceil(Int, ndl / num_threads[1]), ceil(Int, nc / num_threads[2]), 4
+        @cuda threads=num_threads blocks=num_blocks update_parameters_layer_kernel_cuda(labels, cp, flow_layer, circuit_layer.decisions, parameter_layer, step_size)
+    end
+    return nothing
+end
+
+function update_parameters_layer_kernel_cuda(labels, cp, flow, decisions, parameters, step_size)
+    index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    index_y = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+    index_z = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+    stride_x = blockDim().x * gridDim().x
+    stride_y = blockDim().y * gridDim().y
+    stride_z = blockDim().z * gridDim().z
+    ne, nc = size(labels)
+    _, num_decisions = size(decisions)
+
+    for class = index_y:stride_y:nc
+        for i = index_x:stride_x:num_decisions
+            first_elem = @inbounds decisions[2, i]
+            last_elem = @inbounds decisions[3, i]
+            for e = first_elem:last_elem
+                for j = index_z:stride_z:ne
+                    edge_flow = e_up / n_up * n_down
+                    u = @inbounds edge_flow * (cp[j, class] - labels[j, class]) * step_size
+                    # following needs to be memory safe
+                    @inbounds parameters[class, e] -= u 
+                end
+            end
+        end
+    end
+
+    return nothing
+end