diff --git a/.gitignore b/.gitignore index 2ec4662b..b4bd018e 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ *.vscode **checkpoint.ipynb *Manifest.toml -docs/build/ \ No newline at end of file +docs/build/ +scratch/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 1c3624fc..753ff389 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,22 +12,23 @@ jobs: include: - stage: "Unit Tests" os: linux - julia: 1.3 + julia: 1.5 install: - - julia -e 'using Pkg; Pkg.activate("."); Pkg.add(PackageSpec(url="https://github.com/Juice-jl/LogicCircuits.jl")); Pkg.instantiate(); Pkg.precompile()' + - julia -e 'using Pkg; Pkg.activate("."); Pkg.add(PackageSpec(url="https://github.com/Juice-jl/LogicCircuits.jl")); Pkg.instantiate(); Pkg.precompile();' - julia -e 'using Pkg; Pkg.activate("./test"); Pkg.add(PackageSpec(url="https://github.com/Juice-jl/LogicCircuits.jl")); Pkg.develop(PackageSpec(path = pwd())); Pkg.instantiate(); Pkg.precompile();' script: - julia --code-coverage --color=yes -p2 test/runtests.jl after_success: - - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())' + - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder());' - stage: "Deploy Documentation" os: linux - julia: 1.3 + julia: 1.5 script: - - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' + - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(url="https://github.com/Juice-jl/LogicCircuits.jl")); Pkg.instantiate(); Pkg.precompile();' + - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate();' - julia --project=docs/ docs/make.jl after_success: skip @@ -38,4 +39,4 @@ notifications: on_start: always on_error: always rooms: - - secure: 
VMXOgM9g758gZiU06/Gaahns6CFpoSuDYMnl9g0LMv165HEe7tZPlF1IFbTEXk6svr+tAuSEd3oxs/kAyK7onI3hIpP0PSc+Y7/+rnOMk8zU+z7R6JEzQKHHb1M6pQ6MjzOia9BM7SfcfVqedPREVXZx+XJPmVuR4BgTOxUnnyfltZzW0ldSbyeJ37FdDSd9SDRRf7Q4UzbEMN33GfVsTKMZoRqASrZXhvqAVp7deXMdGp1kNlvIbbwVkeICLYTIYrm5zd0HkH2yEhk0AtgeTpyx/kkR1T0Fs2+PCDsLRPhP1EEJs7FdsdQJuP0SueJ92GpPd7yLYZVVWWQkGWudNb6H3iYp2xtbZCoeCBLEUgusrawwdxp0OlNOgP/aeJDc+zNy59ikraluI0sNCV1Pl8dIXu8Ihu6e7W6hoiTQ8K9PjwcXSmBgUsR+kXD8NcCx73RTxynokv+24Xk0M1pkJhu6mjNjZBDIegXVM/CnNew1LSMoMjdi43asuDiDbkZg2uCxfHwaMxlgWuM/M38r662FbOjEfgr13fhCyuUQZRFOKvvqU17HbA+ewC/J40C2g0sBDGPu/uOJsDJaQGPXDpXsh4G+8R7uZRNunhwNPK4OnVdY+uVnYlD+9TG9T1IothaDSRJvYU8HwAcUOJhMNYDDQosWOy+01NQtX0IYRgk= \ No newline at end of file + - secure: VMXOgM9g758gZiU06/Gaahns6CFpoSuDYMnl9g0LMv165HEe7tZPlF1IFbTEXk6svr+tAuSEd3oxs/kAyK7onI3hIpP0PSc+Y7/+rnOMk8zU+z7R6JEzQKHHb1M6pQ6MjzOia9BM7SfcfVqedPREVXZx+XJPmVuR4BgTOxUnnyfltZzW0ldSbyeJ37FdDSd9SDRRf7Q4UzbEMN33GfVsTKMZoRqASrZXhvqAVp7deXMdGp1kNlvIbbwVkeICLYTIYrm5zd0HkH2yEhk0AtgeTpyx/kkR1T0Fs2+PCDsLRPhP1EEJs7FdsdQJuP0SueJ92GpPd7yLYZVVWWQkGWudNb6H3iYp2xtbZCoeCBLEUgusrawwdxp0OlNOgP/aeJDc+zNy59ikraluI0sNCV1Pl8dIXu8Ihu6e7W6hoiTQ8K9PjwcXSmBgUsR+kXD8NcCx73RTxynokv+24Xk0M1pkJhu6mjNjZBDIegXVM/CnNew1LSMoMjdi43asuDiDbkZg2uCxfHwaMxlgWuM/M38r662FbOjEfgr13fhCyuUQZRFOKvvqU17HbA+ewC/J40C2g0sBDGPu/uOJsDJaQGPXDpXsh4G+8R7uZRNunhwNPK4OnVdY+uVnYlD+9TG9T1IothaDSRJvYU8HwAcUOJhMNYDDQosWOy+01NQtX0IYRgk= diff --git a/Project.toml b/Project.toml index 3dd5e37d..94501d5f 100644 --- a/Project.toml +++ b/Project.toml @@ -4,10 +4,15 @@ version = "0.1.1" [deps] BlossomV = "6c721016-9dae-5d90-abf6-67daaccb2332" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" LightGraphs = "093fc24a-ae57-5d10-9952-331d41423f4d" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogicCircuits = "a7847b3b-b7f1-4dd5-83c3-60e0aa0f8599" 
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458" MetaGraphs = "626554b9-1ddb-594c-aa3c-2596fe9399a5" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" @@ -19,17 +24,22 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +TikzGraphs = "b4f28e30-c73f-5eaf-a395-8a9db949a742" [compat] BlossomV = "0.4" +CUDA = "1.2" Clustering = "0.14" +DataFrames = "0.21" DataStructures = "0.17" LightGraphs = "1.3" LogicCircuits = "0.1.1" +LoopVectorization = "0.8.20" +MLDatasets = "0.4, 0.5" MetaGraphs = "0.6" Metis = "1.0" Reexport = "0.2" SimpleWeightedGraphs = "1.1" StatsBase = "0.33" StatsFuns = "0.9" -julia = "1.3" +julia = "1.5" diff --git a/docs/src/api/internals/io.md b/docs/src/api/internals/io.md deleted file mode 100644 index 972aa218..00000000 --- a/docs/src/api/internals/io.md +++ /dev/null @@ -1,6 +0,0 @@ - -# IO - -```@autodocs -Modules = [ProbabilisticCircuits.IO] -``` \ No newline at end of file diff --git a/docs/src/api/internals/loadsave.md b/docs/src/api/internals/loadsave.md new file mode 100644 index 00000000..3f74274e --- /dev/null +++ b/docs/src/api/internals/loadsave.md @@ -0,0 +1,6 @@ + +# LoadSave + +```@autodocs +Modules = [ProbabilisticCircuits.LoadSave] +``` \ No newline at end of file diff --git a/docs/src/api/internals/logistic.md b/docs/src/api/internals/logistic.md deleted file mode 100644 index 6fc07a5e..00000000 --- a/docs/src/api/internals/logistic.md +++ /dev/null @@ -1,5 +0,0 @@ -# Logistic - -```@autodocs -Modules = [Logistic] -``` diff --git a/docs/src/api/internals/probabilistic.md b/docs/src/api/internals/probabilistic.md deleted file mode 100644 index 2d862da9..00000000 --- a/docs/src/api/internals/probabilistic.md +++ /dev/null @@ -1,5 +0,0 @@ -# Probabilistic - -```@autodocs 
-Modules = [Probabilistic] -``` \ No newline at end of file diff --git a/docs/src/api/internals/reasoning.md b/docs/src/api/internals/reasoning.md deleted file mode 100644 index 65d5400a..00000000 --- a/docs/src/api/internals/reasoning.md +++ /dev/null @@ -1,5 +0,0 @@ -# Reasoning - -```@autodocs -Modules = [Reasoning] -``` diff --git a/docs/src/api/internals/structureLearner.md b/docs/src/api/internals/structureLearner.md deleted file mode 100644 index 72719021..00000000 --- a/docs/src/api/internals/structureLearner.md +++ /dev/null @@ -1,5 +0,0 @@ -# StructureLearner - -```@autodocs -Modules = [StructureLearner] -``` diff --git a/docs/src/api/internals/utils.md b/docs/src/api/internals/utils.md index c8dae019..57998984 100644 --- a/docs/src/api/internals/utils.md +++ b/docs/src/api/internals/utils.md @@ -1,5 +1,5 @@ # Utils ```@autodocs -Modules = [Utils] +Modules = [ProbabilisticCircuits.Utils] ``` diff --git a/src/IO/CircuitLineCompiler.jl b/src/IO/CircuitLineCompiler.jl deleted file mode 100644 index 32abf1a2..00000000 --- a/src/IO/CircuitLineCompiler.jl +++ /dev/null @@ -1,117 +0,0 @@ -using LogicCircuits.IO: constant - -##################### -# Compilers to ProbabilisticCircuits data structures starting from already parsed line objects -##################### - -# reuse some internal infrastructure of LogicCircuits' IO module -using LogicCircuits.IO: CircuitFormatLines, CircuitFormatLine, VtreeFormatLines, CircuitHeaderLine, UnweightedLiteralLine, WeightedLiteralLine, DecisionLine, LCElement, BiasLine, WeightedNamedConstantLine, PSDDElement, CircuitCommentLine, ID, -compile_smooth_struct_logical_m, compile_smooth_logical_m - -""" -Compile lines into a probabilistic circuit. -""" -function compile_prob(lines::CircuitFormatLines)::ProbΔ - # first compile a logical circuit - logical_circuit, id2lognode = compile_smooth_logical_m(lines) - decorate_prob(lines, logical_circuit, id2lognode) -end - -""" -Compile lines into a logistic circuit. 
-""" -function compile_logistic(lines::CircuitFormatLines, classes::Int)::LogisticΔ - # first compile a logical circuit - logical_circuit, id2lognode = compile_smooth_logical_m(lines) - decorate_logistic(lines, logical_circuit, classes, id2lognode) -end - -""" -Compile circuit and vtree lines into a structured probabilistic circuit (one whose logical circuit origin is structured). -""" -function compile_struct_prob(circuit_lines::CircuitFormatLines, vtree_lines::VtreeFormatLines) - logical_circuit, vtree, id2vtree, id2lognode = compile_smooth_struct_logical_m(circuit_lines, vtree_lines) - prob_circuit = decorate_prob(circuit_lines, logical_circuit, id2lognode) - return prob_circuit, vtree -end - -function decorate_prob(lines::CircuitFormatLines, logical_circuit::LogicalΔ, id2lognode::Dict{ID,<:LogicalΔNode})::ProbΔ - # set up cache mapping logical circuit nodes to their probabilistic decorator - lognode2probnode = ProbCache() - # build a corresponding probabilistic circuit - prob_circuit = ProbΔ(logical_circuit,lognode2probnode) - # map from line node ids to probabilistic circuit nodes - id2probnode(id) = lognode2probnode[id2lognode[id]] - - # go through lines again and update the probabilistic circuit node parameters - - function compile(ln::CircuitFormatLine) - error("Compilation of line $ln into probabilistic circuit is not supported") - end - function compile(::Union{CircuitHeaderLine,CircuitCommentLine,UnweightedLiteralLine}) - # do nothing - end - function compile(ln::WeightedNamedConstantLine) - @assert constant(ln) == true - node = id2probnode(ln.node_id)::Prob⋁ - node.log_thetas .= [ln.weight, log1p(-exp(ln.weight)) ] - end - function compile(ln::DecisionLine{<:PSDDElement}) - node = id2probnode(ln.node_id)::Prob⋁ - node.log_thetas .= [x.weight for x in ln.elements] - end - for ln in lines - compile(ln) - end - - prob_circuit -end - - -function decorate_logistic(lines::CircuitFormatLines, logical_circuit::LogicalΔ, - classes::Int, 
id2lognode::Dict{ID,<:LogicalΔNode})::LogisticΔ - - # set up cache mapping logical circuit nodes to their logistic decorator - log2logistic = LogisticCache() - # build a corresponding probabilistic circuit - logistic_circuit = LogisticΔ(logical_circuit, classes, log2logistic) - # map from line node ids to probabilistic circuit nodes - id2logisticnode(id) = log2logistic[id2lognode[id]] - - # go through lines again and update the probabilistic circuit node parameters - - function compile(ln::CircuitFormatLine) - error("Compilation of line $ln into logistic circuit is not supported") - end - function compile(::Union{CircuitHeaderLine,CircuitCommentLine,UnweightedLiteralLine}) - # do nothing - end - - function compile(ln::CircuitHeaderLine) - # do nothing - end - - function compile(ln::WeightedLiteralLine) - node = id2logisticnode(ln.node_id)::Logistic⋁ - node.thetas[1, :] .= ln.weights - end - - function compile(ln::DecisionLine{<:LCElement}) - node = id2logisticnode(ln.node_id)::Logistic⋁ - for (ind, elem) in enumerate(ln.elements) - node.thetas[ind, :] .= elem.weights - end - end - - function compile(ln::BiasLine) - node = id2logisticnode(ln.node_id)::Logistic⋁ - # @assert length(node.thetas) == 1 - node.thetas[1,:] .= ln.weights - end - - for ln in lines - compile(ln) - end - - logistic_circuit -end \ No newline at end of file diff --git a/src/IO/IO.jl b/src/IO/IO.jl deleted file mode 100644 index c670cac8..00000000 --- a/src/IO/IO.jl +++ /dev/null @@ -1,33 +0,0 @@ -module IO - -using LogicCircuits -using ..Utils -using ..Probabilistic -using ..Logistic - -export - -# CircuitParser -load_prob_circuit, -load_struct_prob_circuit, -load_psdd_prob_circuit, -load_logistic_circuit, -parse_clt, - -# CircuitSaver -save_as_dot, is_true_node, save_circuit, -# get_node2id,get_vtree2id,vtree_node, decompile, make_element, save_lines, save_psdd_comment_line, save_sdd_comment_line, -# save_line, to_string - - -# Loaders -zoo_psdd, zoo_lc, zoo_clt, -zoo_clt_file - 
-include("CircuitLineCompiler.jl") -include("CircuitParser.jl") -include("CircuitSaver.jl") - -include("Loaders.jl") - -end \ No newline at end of file diff --git a/src/IO/Loaders.jl b/src/IO/Loaders.jl deleted file mode 100644 index 719a1a60..00000000 --- a/src/IO/Loaders.jl +++ /dev/null @@ -1,22 +0,0 @@ -using LogicCircuits -using Pkg.Artifacts - - -##################### -# Circuit loaders -##################### - -zoo_lc(name, num_classes) = - load_logistic_circuit(zoo_lc_file(name), num_classes) - -zoo_clt_file(name) = - artifact"circuit_model_zoo" * "/Circuit-Model-Zoo-0.1.2/clts/$name" - -zoo_clt(name) = - parse_clt(zoo_clt_file(name)) - -zoo_psdd_file(name) = - artifact"circuit_model_zoo" * "/Circuit-Model-Zoo-0.1.2/psdds/$name" - -zoo_psdd(name) = - load_prob_circuit(zoo_psdd_file(name)) diff --git a/src/LoadSave/LoadSave.jl b/src/LoadSave/LoadSave.jl new file mode 100644 index 00000000..26ae08f2 --- /dev/null +++ b/src/LoadSave/LoadSave.jl @@ -0,0 +1,11 @@ +module LoadSave + +using LogicCircuits +using ...ProbabilisticCircuits + +include("circuit_line_compiler.jl") +include("circuit_loaders.jl") +include("circuit_savers.jl") +include("plot.jl") + +end \ No newline at end of file diff --git a/src/LoadSave/circuit_line_compiler.jl b/src/LoadSave/circuit_line_compiler.jl new file mode 100644 index 00000000..4d7687dd --- /dev/null +++ b/src/LoadSave/circuit_line_compiler.jl @@ -0,0 +1,121 @@ +##################### +# Compilers to ProbabilisticCircuits data structures starting from already parsed line objects +##################### + +# reuse some internal infrastructure of LogicCircuits' LoadSave module +using LogicCircuits.LoadSave: CircuitFormatLines, CircuitFormatLine, lnconstant, +VtreeFormatLines, CircuitHeaderLine, UnweightedLiteralLine, WeightedLiteralLine, +DecisionLine, LCElement, BiasLine, WeightedNamedConstantLine, PSDDElement, +CircuitCommentLine, ID, compile_smooth_struct_logical_m, compile_smooth_logical_m + +""" +Compile lines into a 
probabilistic circuit +""" +function compile_prob(lines::CircuitFormatLines)::ProbCircuit + # first compile a logic circuit + logic_circuit, id2lognode = compile_smooth_logical_m(lines) + decorate_prob(lines, logic_circuit, id2lognode) +end + +""" +Compile lines into a logistic circuit. +""" +function compile_logistic(lines::CircuitFormatLines, classes::Int)::LogisticCircuit + # first compile a logic circuit + logic_circuit, id2lognode = compile_smooth_logical_m(lines) + decorate_logistic(lines, logic_circuit, classes, id2lognode) +end + +""" +Compile circuit and vtree lines into a structured probabilistic circuit (one whose logic circuit origin is structured). +""" +function compile_struct_prob(circuit_lines::CircuitFormatLines, vtree_lines::VtreeFormatLines) + logic_circuit, vtree, id2lognode, id2vtree = compile_smooth_struct_logical_m(circuit_lines, vtree_lines) + prob_circuit = decorate_prob(circuit_lines, logic_circuit, id2lognode) + return prob_circuit, vtree +end + +function decorate_prob(lines::CircuitFormatLines, logic_circuit::LogicCircuit, id2lognode::Dict{ID,<:LogicCircuit})::ProbCircuit + # set up cache mapping logic circuit nodes to their probabilistic decorator + + prob_circuit = ProbCircuit(logic_circuit) + lognode2probnode = Dict{LogicCircuit, ProbCircuit}() + + prob_lin = linearize(prob_circuit) # TODO better implementation + logic_lin = linearize(logic_circuit) + + foreach(i -> lognode2probnode[logic_lin[i]] = prob_lin[i], 1 : num_nodes(logic_circuit)) + + # map from line node ids to probabilistic circuit nodes + id2probnode(id) = lognode2probnode[id2lognode[id]] + + root = nothing + + # go through lines again and update the probabilistic circuit node parameters + + function compile(ln::CircuitFormatLine) + error("Compilation of line $ln into probabilistic circuit is not supported") + end + function compile(::Union{CircuitHeaderLine,CircuitCommentLine,UnweightedLiteralLine}) + # do nothing + end + function compile(ln::WeightedNamedConstantLine) + 
@assert lnconstant(ln) == true + root = id2probnode(ln.node_id) + root.log_probs .= [ln.weight, log1p(-exp(ln.weight))] + end + function compile(ln::DecisionLine{<:PSDDElement}) + root = id2probnode(ln.node_id) + root.log_probs .= [x.weight for x in ln.elements] + end + + foreach(compile, lines) + + root +end + +function decorate_logistic(lines::CircuitFormatLines, logic_circuit::LogicCircuit, + classes::Int, id2lognode::Dict{ID,<:LogicCircuit})::LogisticCircuit + + # set up cache mapping logic circuit nodes to their logistic decorator + logistic_circuit = LogisticCircuit(logic_circuit, classes) + log2logistic = Dict{LogicCircuit, LogisticCircuit}() + logistic_lin = linearize(logistic_circuit) + logic_lin = linearize(logic_circuit) + + foreach(i -> log2logistic[logic_lin[i]] = logistic_lin[i], 1 : length(logic_lin)) + id2logisticnode(id) = log2logistic[id2lognode[id]] + + root = nothing + # go through lines again and update the probabilistic circuit node parameters + + function compile(ln::CircuitFormatLine) + error("Compilation of line $ln into logistic circuit is not supported") + end + + function compile(::Union{CircuitHeaderLine,CircuitCommentLine,UnweightedLiteralLine}) + # do nothing + end + + function compile(ln::WeightedLiteralLine) + root = id2logisticnode(ln.node_id)::Logistic⋁Node + root.thetas[1, :] .= ln.weights + end + + function compile(ln::DecisionLine{<:LCElement}) + root = id2logisticnode(ln.node_id)::Logistic⋁Node + for (ind, elem) in enumerate(ln.elements) + root.thetas[ind, :] .= elem.weights + end + end + + function compile(ln::BiasLine) + root = id2logisticnode(ln.node_id)::Logistic⋁Node + # @assert length(root.thetas) == 1 + root.thetas[1,:] .= ln.weights + end + + foreach(compile, lines) + + root +end \ No newline at end of file diff --git a/src/IO/CircuitParser.jl b/src/LoadSave/circuit_loaders.jl similarity index 64% rename from src/IO/CircuitParser.jl rename to src/LoadSave/circuit_loaders.jl index e76e28f5..b8f4955a 100644 --- 
a/src/IO/CircuitParser.jl +++ b/src/LoadSave/circuit_loaders.jl @@ -1,5 +1,25 @@ +export zoo_clt, zoo_clt_file, zoo_psdd, zoo_lc, load_prob_circuit, +load_struct_prob_circuit, load_logistic_circuit -using MetaGraphs: MetaDiGraph, set_prop!, props +using LogicCircuits +using Pkg.Artifacts +using LogicCircuits.LoadSave: parse_psdd_file, parse_circuit_file, parse_vtree_file + +##################### +# circuit loaders from module zoo +##################### + +zoo_lc(name, num_classes) = + load_logistic_circuit(zoo_lc_file(name), num_classes) + +zoo_clt_file(name) = + artifact"circuit_model_zoo" * "/Circuit-Model-Zoo-0.1.2/clts/$name" + +zoo_clt(name) = + parse_clt(zoo_clt_file(name)) + +zoo_psdd(name) = + load_prob_circuit(zoo_psdd_file(name)) ##################### # general parser infrastructure for circuits @@ -11,9 +31,9 @@ using MetaGraphs: MetaDiGraph, set_prop!, props """ Load a probabilistic circuit from file. Support circuit file formats: - * ".psdd" for PSDD files + * ".psdd" for PSDD files """ -function load_prob_circuit(file::String)::ProbΔ +function load_prob_circuit(file::String)::ProbCircuit @assert endswith(file,".psdd") compile_prob(parse_psdd_file(file)) end @@ -21,40 +41,35 @@ end """ Load a structured probabilistic circuit from file. Support circuit file formats: - * ".psdd" for PSDD files + * ".psdd" for PSDD files Supported vtree file formats: - * ".vtree" for VTree files + * ".vtree" for Vtree files """ -function load_struct_prob_circuit(circuit_file::String, vtree_file::String)::Tuple{ProbΔ,PlainVtree} +function load_struct_prob_circuit(circuit_file::String, vtree_file::String)::Tuple{StructProbCircuit,PlainVtree} @assert endswith(circuit_file,".psdd") circuit_lines = parse_circuit_file(circuit_file) vtree_lines = parse_vtree_file(vtree_file) compile_struct_prob(circuit_lines, vtree_lines) end - -function load_logistic_circuit(circuit_file::String, classes::Int)::LogisticΔ +""" +Load a logistic circuit from file. 
+Support circuit file formats: + * ".circuit" for logistic files +Supported vtree file formats: + * ".vtree" for Vtree files +""" +function load_logistic_circuit(circuit_file::String, classes::Int)::LogisticCircuit @assert endswith(circuit_file,".circuit") circuit_lines = parse_circuit_file(circuit_file) compile_logistic(circuit_lines, classes) end - ##################### # parse based on file extension ##################### -function parse_circuit_file(file::String)::CircuitFormatLines - if endswith(file,".circuit") - parse_lc_file(file) - elseif endswith(file,".psdd") - parse_psdd_file(file) - elseif endswith(file,".sdd") - parse_sdd_file(file) - else - throw("Cannot parse this file type as a circuit: $file") - end -end +using MetaGraphs: MetaDiGraph, set_prop!, props, add_edge! "Parse a clt from given file" function parse_clt(filename::String)::MetaDiGraph @@ -78,3 +93,4 @@ function parse_clt(filename::String)::MetaDiGraph end return clt end + diff --git a/src/IO/CircuitSaver.jl b/src/LoadSave/circuit_savers.jl similarity index 51% rename from src/IO/CircuitSaver.jl rename to src/LoadSave/circuit_savers.jl index 9bba0d78..fd22fc52 100644 --- a/src/IO/CircuitSaver.jl +++ b/src/LoadSave/circuit_savers.jl @@ -1,74 +1,43 @@ -using Printf: @sprintf +export save_circuit, save_as_dot, save_as_psdd, save_as_logistic -import Base.copy -import LogicCircuits.IO: SDDElement, +using LogicCircuits.LoadSave: SDDElement, PSDDElement, - save_lines, + save_lines, + get_vtree2id, + get_node2id, parse_psdd_file, PsddHeaderLine, LcHeaderLine, - save_sdd_file, - save_as_dot, get_nodes_level -# Saving psdd - ##################### -# decompile for nodes +# decompile for probabilistic nodes ##################### -# decompile for psdd -decompile(n::ProbLiteral, node2id, vtree2id)::UnweightedLiteralLine = - UnweightedLiteralLine(node2id[n], vtree2id[n.origin.vtree], literal(n), true) +"Decompile for psdd circuit, used during saving of circuits to file" +decompile(n::StructProbLiteralNode, 
node2id, vtree2id)::UnweightedLiteralLine = + UnweightedLiteralLine(node2id[n], vtree2id[n.vtree], literal(n), true) -make_element(n::Prob⋀, w::AbstractFloat, node2id) = - PSDDElement(node2id[n.children[1]], node2id[n.children[2]], w) +make_element(n::StructMulNode, w::AbstractFloat, node2id) = + PSDDElement(node2id[children(n)[1]], node2id[children(n)[2]], w) -is_true_node(n)::Bool = - GateType(n) isa ⋁Gate && num_children(n) == 2 && GateType(children(n)[1]) isa LiteralGate && GateType(children(n)[2]) isa LiteralGate && - positive(children(n)[1]) && negative(children(n)[2]) +istrue_node(n)::Bool = + is⋁gate(n) && num_children(n) == 2 && GateType(children(n)[1]) isa LiteralGate && GateType(children(n)[2]) isa LiteralGate && + ispositive(children(n)[1]) && isnegative(children(n)[2]) -function decompile(n::Prob⋁, node2id, vtree2id)::Union{WeightedNamedConstantLine, DecisionLine{PSDDElement}} - if is_true_node(n) - WeightedNamedConstantLine(node2id[n], vtree2id[n.origin.vtree], lit2var(n.children[1].origin.literal), n.log_thetas[1]) # TODO +function decompile(n::StructSumNode, node2id, vtree2id)::Union{WeightedNamedConstantLine, DecisionLine{PSDDElement}} + if istrue_node(n) + WeightedNamedConstantLine(node2id[n], vtree2id[n.vtree], lit2var(children(n)[1].literal), n.log_probs[1]) # TODO else - DecisionLine(node2id[n], vtree2id[n.origin.vtree], UInt32(num_children(n)), map(x -> make_element(x[1], x[2], node2id), zip(children(n), n.log_thetas))) - end -end - -##################### -# build maping -##################### - -function get_node2id(ln::AbstractVector{X}, T::Type)where X #<: T#::Dict{T, ID} - node2id = Dict{T, ID}() - outnodes = filter(n -> !(GateType(n) isa ⋀Gate), ln) - sizehint!(node2id, length(outnodes)) - index = ID(0) # node id start from 0 - for n in outnodes - node2id[n] = index - index += ID(1) - end - node2id -end - -function get_vtree2id(ln::PlainVtree):: Dict{PlainVtreeNode, ID} - vtree2id = Dict{PlainVtreeNode, ID}() - sizehint!(vtree2id, 
length(ln)) - index = ID(0) # vtree id start from 0 - - for n in ln - vtree2id[n] = index - index += ID(1) + DecisionLine(node2id[n], vtree2id[n.vtree], UInt32(num_children(n)), map(x -> make_element(x[1], x[2], node2id), zip(children(n), n.log_probs))) end - vtree2id end ##################### # saver for circuits ##################### - +"Returns header for PSDD file format" function psdd_header() """ c ids of psdd nodes start at 0 @@ -84,16 +53,15 @@ function psdd_header() c""" end -function save_psdd_file(name::String, ln::ProbΔ, vtree::PlainVtree) +function save_as_psdd(name::String, circuit::ProbCircuit, vtree::PlainVtree) # TODO add method isstructured - @assert ln[end].origin isa StructLogicalΔNode "PSDD should decorate on StructLogicalΔ" @assert endswith(name, ".psdd") - node2id = get_node2id(ln, ProbΔNode) + node2id = get_node2id(circuit) vtree2id = get_vtree2id(vtree) formatlines = Vector{CircuitFormatLine}() append!(formatlines, parse_psdd_file(IOBuffer(psdd_header()))) - push!(formatlines, PsddHeaderLine(num_nodes(ln))) - for n in filter(n -> !(GateType(n) isa ⋀Gate), ln) + push!(formatlines, PsddHeaderLine(num_nodes(circuit))) + for n in filter(n -> !is⋀gate(n), circuit) push!(formatlines, decompile(n, node2id, vtree2id)) end save_lines(name, formatlines) @@ -121,43 +89,35 @@ function lc_header() c""" end -function save_lc_file(name::String, ln::LogisticΔ, vtree) - @assert ln[end].origin isa StructLogicalΔNode "LC should decorate on StructLogicalΔ" +function save_as_logistic(name::String, circuit::LogisticCircuit, vtree) @assert endswith(name, ".circuit") - node2id = get_node2id(ln, ProbΔNode) + node2id = get_node2id(circuit) vtree2id = get_vtree2id(vtree) formatlines = Vector{CircuitFormatLine}() append!(formatlines, parse_lc_file(IOBuffer(lc_header()))) push!(formatlines, LcHeaderLine()) - for n in filter(n -> !(GateType(n) isa ⋀Gate), ln) + for n in filter(n -> !is⋀gate(n), circuit) push!(formatlines, decompile(n, node2id, vtree2id)) end 
save_lines(name, formatlines) end -import LogicCircuits.save_circuit # make available for extension +# TODO add Decompile for logistic circuit -function save_circuit(name::String, circuit, vtree=nothing) - if endswith(name, ".circuit") - save_lc_file(name, circuit, vtree) - elseif endswith(name, ".psdd") - save_psdd_file(name, circuit, vtree) - elseif endswith(name, ".sdd") - save_sdd_file(name, circuit, vtree) - else - error("Cannot save circuit to file with this extensions: $name") - end -end +import LogicCircuits.LoadSave: save_circuit, save_as_dot # make available for extension -"Save prob circuit to .dot file" -function save_as_dot(root::ProbΔNode, file::String) - return save_as_dot(node2dag(root), file) -end +"Save a circuit to file" +save_circuit(name::String, circuit::StructProbCircuit, vtree::PlainVtree) = + save_as_psdd(name, circuit, vtree) + +save_circuit(name::String, circuit::LogisticCircuit, vtree::PlainVtree) = + save_as_logistic(name, circuit, vtree) +using Printf: @sprintf "Save prob circuits to .dot file" -function save_as_dot(circuit::ProbΔ, file::String) +function save_as_dot(circuit::ProbCircuit, file::String) # TODO (https://github.com/Juice-jl/LogicCircuits.jl/issues/7) - node_cache = Dict{ProbΔNode, Int64}() + node_cache = Dict{ProbCircuit, Int64}() for (i, n) in enumerate(circuit) node_cache[n] = i end @@ -179,26 +139,26 @@ function save_as_dot(circuit::ProbΔ, file::String) end for n in reverse(circuit) - if n isa Prob⋀ + if n isa PlainMulNode write(f, "$(node_cache[n]) [label=\"*$(node_cache[n])\"]\n") elseif n isa Prob⋁ write(f, "$(node_cache[n]) [label=\"+$(node_cache[n])\"]\n") - elseif n isa ProbLiteral && positive(n) + elseif n isa PlainProbLiteralNode && ispositive(n) write(f, "$(node_cache[n]) [label=\"+$(variable(n.origin))\"]\n") - elseif n isa ProbLiteral && negative(n) + elseif n isa PlainProbLiteralNode && isnegative(n) write(f, "$(node_cache[n]) [label=\"-$(variable(n.origin))\"]\n") else - throw("unknown ProbNode type") + 
throw("unknown ProbCircuit type") end end for n in reverse(circuit) if n isa Prob⋀ - for c in n.children + for c in children(n) write(f, "$(node_cache[n]) -> $(node_cache[c])\n") end elseif n isa Prob⋁ - for (c, p) in zip(n.children, exp.(n.log_thetas)) + for (c, p) in zip(children(n), exp.(n.log_probs)) prob = @sprintf "%0.1f" p write(f, "$(node_cache[n]) -> $(node_cache[c]) [label=\"$prob\"]\n") end diff --git a/src/LoadSave/plot.jl b/src/LoadSave/plot.jl new file mode 100644 index 00000000..e294e835 --- /dev/null +++ b/src/LoadSave/plot.jl @@ -0,0 +1,34 @@ +export DiGraph, plot +using LightGraphs +using TikzGraphs + +import LightGraphs: DiGraph + +function DiGraph(pc::ProbCircuit) + edge_labels = Dict() + label = label = Vector{String}(undef, num_nodes(pc)) + + add_label!(g, dict, n::ProbCircuit) = begin + label[dict[n]] = + if isliteralgate(n) "$(literal(n))" + elseif ismul(n) "*" + else "+" + end + end + + on_edge(g, id_dict, n, c, n_id, c_id) = noop + on_edge(g, id_dict, n::Union{PlainSumNode, StructSumNode}, c, n_id, c_id) = begin + edge_labels[(n_id, c_id)] = begin + i = findall(x -> x === c, children(n))[1] + "$(round(exp(n.log_probs[i]), digits=3))" + end + end + g, _ = LogicCircuits.LoadSave.DiGraph(pc;on_edge=on_edge, on_var=add_label!) 
+ g, label, edge_labels +end + +import TikzGraphs: plot +plot(pc::ProbCircuit) = begin + g, label, edge_labels = DiGraph(pc) + TikzGraphs.plot(g, label, edge_labels=edge_labels, edge_style="font=\\tiny") +end \ No newline at end of file diff --git a/src/Logistic/Logistic.jl b/src/Logistic/Logistic.jl index 433c9ad5..e36fcb59 100644 --- a/src/Logistic/Logistic.jl +++ b/src/Logistic/Logistic.jl @@ -1,25 +1,11 @@ module Logistic using LogicCircuits -using ..Utils +using ...ProbabilisticCircuits -export - LogisticΔNode, - LogisticLeafNode, - LogisticInnerNode, - LogisticLiteral, - Logistic⋀, - Logistic⋁, - LogisticΔ, - LogisticΔ, - LogisticCache, - num_parameters_perclass, - logistic_origin, - class_conditional_likelihood_per_instance, - classes +include("queries.jl") +include("parameters.jl") - - -include("LogisticCircuits.jl") +# TODO structure learning end \ No newline at end of file diff --git a/src/Logistic/LogisticCircuits.jl b/src/Logistic/LogisticCircuits.jl deleted file mode 100644 index 4e40f70b..00000000 --- a/src/Logistic/LogisticCircuits.jl +++ /dev/null @@ -1,139 +0,0 @@ -####################### -## Logistic Circuits -####################### - - -abstract type LogisticΔNode{O} <: DecoratorΔNode{O} end -abstract type LogisticLeafNode{O} <: LogisticΔNode{O} end -abstract type LogisticInnerNode{O} <: LogisticΔNode{O} end - -struct LogisticLiteral{O} <: LogisticLeafNode{O} - origin::O -end - -struct Logistic⋀{O} <: LogisticInnerNode{O} - origin::O - children::Vector{<:LogisticΔNode{<:O}} -end - -mutable struct Logistic⋁{O} <: LogisticInnerNode{O} - origin::O - children::Vector{<:LogisticΔNode{<:O}} - thetas::Array{Float64, 2} -end - - - -const LogisticΔ{O} = AbstractVector{<:LogisticΔNode{O}} - -##################### -# traits -##################### - -import LogicCircuits.GateType # make available for extension - -@inline GateType(::Type{<:LogisticLiteral}) = LiteralGate() -@inline GateType(::Type{<:Logistic⋀}) = ⋀Gate() -@inline 
GateType(::Type{<:Logistic⋁}) = ⋁Gate() - - - -##################### -# constructors and conversions -##################### - -function Logistic⋁(::Type{O}, origin, children, classes::Int) where {O} - Logistic⋁{O}(origin, children, Array{Float64, 2}(undef, (length(children), classes))) -end - - -const LogisticCache = Dict{ΔNode, LogisticΔNode} - -function LogisticΔ(circuit::Δ, classes::Int, cache::LogisticCache = LogisticCache()) - - sizehint!(cache, length(circuit)*4÷3) - - O = grapheltype(circuit) # type of node in the origin - - pc_node(::LiteralGate, n::ΔNode) = LogisticLiteral{O}(n) - pc_node(::ConstantGate, n::ΔNode) = error("Cannot construct a logistic circuit from constant leafs: first smooth and remove unsatisfiable branches.") - - pc_node(::⋀Gate, n::ΔNode) = begin - children = map(c -> cache[c], n.children) - Logistic⋀{O}(n, children) - end - - pc_node(::⋁Gate, n::ΔNode) = begin - children = map(c -> cache[c], n.children) - Logistic⋁(O, n, children, classes) - end - - map(circuit) do node - pcn = pc_node(GateType(node), node) - cache[node] = pcn - pcn - end -end - - -##################### -# methods -##################### - -import LogicCircuits: literal, children # make available for extension - -@inline literal(n::LogisticLiteral)::Lit = literal(n.origin) -@inline children(n::LogisticInnerNode) = n.children -@inline classes(n::Logistic⋁) = size(n.thetas)[2] - -num_parameters(n::Logistic⋁) = num_children(n) * classes(n) -num_parameters(c::LogisticΔ) = sum(n -> num_parameters(n), ⋁_nodes(c)) - -num_parameters_perclass(n::Logistic⋁) = num_children(n) -num_parameters_perclass(c::LogisticΔ) = sum(n -> num_parameters_perclass(n), ⋁_nodes(c)) - -"Return the first origin that is a Logistic circuit node" -logistic_origin(n::DecoratorΔNode)::LogisticΔNode = origin(n,LogisticΔNode) - -"Return the first origin that is a Logistic circuit" -logistic_origin(c::DecoratorΔ)::LogisticΔ = origin(c, LogisticΔNode) - - -# TODO Learning - - - -# Class Conditional 
Probability -function class_conditional_likelihood_per_instance(fc::FlowΔ, - classes::Int, - batch::PlainXData{Bool}) - lc = origin(origin(fc)) - @assert(lc isa LogisticΔ) - pass_up_down(fc, batch) - likelihoods = zeros(num_examples(batch), classes) - for n in fc - orig = logistic_origin(n) - if orig isa Logistic⋁ - # For each class. orig.thetas is 2D so used eachcol - for (idx, thetaC) in enumerate(eachcol(orig.thetas)) - foreach(n.children, thetaC) do c, theta - likelihoods[:, idx] .+= prod_fast(downflow(n), pr_factors(origin(c))) .* theta - end - end - end - end - likelihoods -end - -""" -Calculate conditional log likelihood for a batch of samples with evidence P(c | x). -(Also returns the generated FlowΔ) -""" -function class_conditional_likelihood_per_instance(lc::LogisticΔ, - classes::Int, - batch::PlainXData{Bool}) - opts = (max_factors = 2, compact⋀=false, compact⋁=false) - fc = FlowΔ(lc, num_examples(batch), Float64, opts) - (fc, class_conditional_likelihood_per_instance(fc, classes, batch)) -end - diff --git a/src/Logistic/parameter_circuit.jl b/src/Logistic/parameter_circuit.jl new file mode 100644 index 00000000..ee008dd7 --- /dev/null +++ b/src/Logistic/parameter_circuit.jl @@ -0,0 +1,231 @@ +using CUDA +using LogicCircuits + +export LayeredParameterCircuit, CuLayeredParameterCircuit +export class_likelihood, class_weights +export one_hot, learn_parameters, update_parameters + +############################################################# +############## This is the old implementation ############### +#### Not intended to be used under the current framework #### +############################################################# + + +# in a parameter circuit +# 1 is true, 2 is false +const TRUE_ID = Int32(1) +const FALSE_ID = Int32(2) + +struct LayeredParameterCircuit + layered_circuit::LayeredBitCircuit + layered_parameters::Vector{Matrix{Float32}} +end + +LayeredParameterCircuit(circuit::LogisticCircuit, nc::Integer, num_features::Integer) = begin + 
@assert is⋁gate(circuit) + decisions::Vector{Vector{Int32}} = Vector{Vector{Int32}}() + elements::Vector{Vector{Int32}} = Vector{Vector{Int32}}() + parameters::Vector{Vector{Float32}} = Vector{Vector{Float32}}() + num_decisions::Int32 = 2 * num_features + 2 + num_elements::Vector{Int32} = Vector{Int32}() + # num_parameters always equals num_elements + + ensure_layer(i) = begin + if length(decisions) < i + # add a new layer + push!(decisions, Int32[]) + push!(elements, Int32[]) + push!(parameters, Float32[]) + push!(num_elements, 0) + end + end + + f_con(n) = LayeredDecisionId(0, istrue(n) ? TRUE_ID : FALSE_ID) + f_lit(n) = LayeredDecisionId(0, + ispositive(n) ? Int32(2 + variable(n)) : Int32(2 + num_features + variable(n))) + + f_and(n, cs) = begin + @assert length(cs) == 2 + LayeredDecisionId[cs[1], cs[2]] + end + f_or(n, cs) = begin + num_decisions += 1 + # determine layer + layer_id = zero(Int32) + for c in cs + if c isa Vector{LayeredDecisionId} + @assert length(c) == 2 + layer_id = max(layer_id, c[1].layer_id, c[2].layer_id) + else + @assert c isa LayeredDecisionId + layer_id = max(layer_id, c.layer_id) + end + end + layer_id += 1 + ensure_layer(layer_id) + first_element = num_elements[layer_id] + 1 + foreach(cs, eachrow(n.thetas)) do c, theta + @assert size(theta)[1] == nc + append!(parameters[layer_id], theta) + num_elements[layer_id] += 1 + if c isa Vector{LayeredDecisionId} + push!(elements[layer_id], c[1].decision_id, c[2].decision_id) + else + push!(elements[layer_id], c.decision_id, TRUE_ID) + end + end + push!(decisions[layer_id], num_decisions, first_element, num_elements[layer_id]) + LayeredDecisionId(layer_id, num_decisions) + end + + foldup_aggregate(circuit, f_con, f_lit, f_and, f_or, + Union{LayeredDecisionId,Vector{LayeredDecisionId}}) + + circuit_layers = map(decisions, elements) do d, e + Layer(reshape(d, 3, :), reshape(e, 2, :)) + end + parameter_layers = map(parameters) do p + reshape(p, nc, :) + end + return 
LayeredParameterCircuit(LayeredBitCircuit(circuit_layers), parameter_layers) +end + +struct CuLayeredParameterCircuit + layered_circuit::CuLayeredBitCircuit + layered_parameters::Vector{CuMatrix{Float32}} + CuLayeredParameterCircuit(l::LayeredParameterCircuit) = new(CuLayeredBitCircuit(l.layered_circuit), map(CuMatrix, l.layered_parameters)) +end + + + +function class_likelihood(circuit::CuLayeredParameterCircuit, nc::Integer, data::CuMatrix{Float32}, reuse_up=nothing, reuse_down=nothing, reuse_cp=nothing) + cw, flow, v = class_weights(circuit, nc, data, reuse_up, reuse_down, reuse_cp) + one = Float32(1.0) + return @. one / (one + exp(-cw)), flow, v +end + +function class_weights(circuit::CuLayeredParameterCircuit, nc::Integer, data::CuMatrix{Float32}, reuse_up=nothing, reuse_down=nothing, reuse_cw=nothing) + flow, v = compute_flows2(circuit.layered_circuit, data, reuse_up, reuse_down) + cw = calculate_class_weights(circuit, nc, data, v, flow, reuse_cw) + return cw, flow, v +end + +function calculate_class_weights(circuit::CuLayeredParameterCircuit, nc::Integer, data::CuMatrix{Float32}, v, flow, reuse_cw=nothing) + ne = num_examples(data) + cw = if reuse_cw isa CuMatrix{Float32} && size(reuse_cw) == (ne, nc) + reuse_cw .= zero(Float32) + reuse_cw + else + CUDA.zeros(Float32, ne, nc) + end + + dec_per_thread = 4 + CUDA.@sync for i = 1:length(circuit.layered_circuit.layers) + circuit_layer = circuit.layered_circuit.layers[i] + parameter_layer = circuit.layered_parameters[i] + ndl = num_decisions(circuit_layer) + num_threads = balance_threads(ne, ndl / dec_per_thread, 8) + num_blocks = ceil(Int, ne / num_threads[1]), ceil(Int, ndl / num_threads[2] / dec_per_thread) + @cuda threads=num_threads blocks=num_blocks calculate_class_weights_layer_kernel_cuda(cw, v, flow, circuit_layer.decisions, circuit_layer.elements, parameter_layer) + end + + return cw +end + +function calculate_class_weights_layer_kernel_cuda(cw, v, flow, decisions, elements, parameters) + index_x = 
(blockIdx().x - 1) * blockDim().x + threadIdx().x + index_y = (blockIdx().y - 1) * blockDim().y + threadIdx().y + stride_x = blockDim().x * gridDim().x + stride_y = blockDim().y * gridDim().y + ne, nc = size(cw) + _, num_decisions = size(decisions) + + for j = index_x:stride_x:ne + for i = index_y:stride_y:num_decisions + decision_id = @inbounds decisions[1, i] + n_up = @inbounds v[j, decision_id] + if n_up > zero(Float32) + first_elem = @inbounds decisions[2, i] + last_elem = @inbounds decisions[3, i] + n_down = @inbounds flow[j, decision_id] + for e = first_elem:last_elem + e1 = @inbounds elements[1, first_elem] + e2 = @inbounds elements[2, first_elem] + e_up = @inbounds (v[j, e1] * v[j, e2]) + edge_flow = e_up / n_up * n_down + # following needs to be memory safe + for class=1:nc + @CUDA.atomic cw[j, class] += edge_flow * parameters[class, e] # atomic is automatically inbounds + end + end + end + end + end + + return nothing +end + + + +function one_hot(labels::Vector, nc::Integer) + ne = length(labels) + one_hot_labels = zeros(Float32, ne, nc) + for (i, j) in enumerate(labels) + one_hot_labels[i, j + 1] = 1.0 + end + one_hot_labels +end + +function learn_parameters(circuit::CuLayeredParameterCircuit, nc::Integer, data::CuMatrix{Float32}, labels::CuMatrix{Float32}, reuse_up=nothing, reuse_down=nothing, reuse_cp=nothing, num_epochs=20, step_size=0.0001) + cp, flow, v = class_likelihood(circuit, nc, data, reuse_up, reuse_down, reuse_cp) + update_parameters(circuit, labels, cp, flow, step_size) + for _ = 2:num_epochs + cp, flow, v = class_likelihood(circuit, nc, data, v, flow, cp) + update_parameters(circuit, labels, cp, v, flow, step_size) + end + return nothing +end + +function update_parameters(circuit::CuLayeredParameterCircuit, labels, cp, v, flow, step_size=0.0001) + _, nc = size(labels) + step_size = Float32(step_size) + CUDA.@sync for i = 1:length(circuit.layered_circuit.layers) + circuit_layer = circuit.layered_circuit.layers[i] + flow_layer = flow[i] + 
parameter_layer = circuit.layered_parameters[i] + ndl = num_decisions(circuit_layer) + num_threads = balance_threads(ndl, nc, 6) + num_threads = num_threads[1], num_threads[2], + num_blocks = ceil(Int, ndl / num_threads[1]), ceil(Int, nc / num_threads[2]), 4 + @cuda threads=num_threads blocks=num_blocks update_parameters_layer_kernel_cuda(labels, cp, flow_layer, circuit_layer.decisions, parameter_layer, step_size) + end + return nothing +end + +function update_parameters_layer_kernel_cuda(labels, cp, flow, decisions, parameters, step_size) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + index_y = (blockIdx().y - 1) * blockDim().y + threadIdx().y + index_z = (blockIdx().z - 1) * blockDim().z + threadIdx().z + stride_x = blockDim().x * gridDim().x + stride_y = blockDim().y * gridDim().y + stride_z = blockDim().z * gridDim().z + ne, nc = size(labels) + _, num_decisions = size(decisions) + + for class = index_y:stride_y:nc + for i = index_x:stride_x:num_decisions + first_elem = @inbounds decisions[2, i] + last_elem = @inbounds decisions[3, i] + for e = first_elem:last_elem + for j = index_z:stride_z:ne + edge_flow = e_up / n_up * n_down + u = @inbounds edge_flow * (cp[j, class] - labels[j, class]) * step_size + # following needs to be memory safe + @inbounds parameters[class, e] -= u + end + end + end + end + + return nothing +end \ No newline at end of file diff --git a/src/Logistic/parameters.jl b/src/Logistic/parameters.jl new file mode 100644 index 00000000..fdc4e9f2 --- /dev/null +++ b/src/Logistic/parameters.jl @@ -0,0 +1,123 @@ +export learn_parameters, to_onehot + +using CUDA +using LoopVectorization: @avx, vifelse + +""" +Parameter learning through gradient descents +Note: data need to be DataFrame and Labels need to be in one-hot form. 
+""" +function learn_parameters(lc::LogisticCircuit, nc::Int, data, labels; num_epochs=25, step_size=0.01) + bc = ParamBitCircuit(lc, nc, data) + if isgpu(data) + @assert isgpu(labels) "Data and labels must be both stored in either GPU or CPU." + for _ = 1:num_epochs + cl = class_likelihood_per_instance(bc, data) + update_parameters_gpu(to_gpu(bc), data, labels, cl, step_size) + end + else + @assert !isgpu(labels) "Data and labels must be both stored in either GPU or CPU." + for _ = 1:num_epochs + cl = class_likelihood_per_instance(bc, data) + update_parameters_cpu(bc, data, labels, cl, step_size) + end + end +end + +function update_parameters_cpu(bc, data, labels, cl, step_size) + ne::Int = num_examples(data) + nc::Int = size(bc.params, 2) + params_lock::Threads.ReentrantLock = Threads.ReentrantLock() + + @inline function on_edge_binary(flows, values, prime, sub, element, grandpa, single_child) + lock(params_lock) do # TODO: move lock to inner loop? + for i = 1:size(flows, 1) + @inbounds edge_flow = values[i, prime] & values[i, sub] & flows[i, grandpa] + first_true_bit = trailing_zeros(edge_flow) + 1 + last_true_bit = 64 - leading_zeros(edge_flow) + @simd for j = first_true_bit:last_true_bit + ex_id = ((i - 1) << 6) + j + if get_bit(edge_flow, j) + for class = 1:nc + @inbounds bc.params[element, class] -= (cl[ex_id, class] - labels[ex_id, class]) * step_size + end + end + end + end + end + end + + @inline function on_edge_float(flows, values, prime, sub, element, grandpa, single_child) + lock(params_lock) do # TODO: move lock to inner loop? 
+ @avx for i = 1:size(flows, 1) + @inbounds edge_flow = values[i, prime] * values[i, sub] / values[i, grandpa] * flows[i, grandpa] + edge_flow = vifelse(isfinite(edge_flow), edge_flow, zero(eltype(flows))) + for class = 1:nc + @inbounds bc.parames[element, class] -= (cl[i, class] - labels[i, class]) * edge_flow * step_size + end + end + end + nothing + end + + if isbinarydata(data) + v,f = satisfies_flows(bc.bitcircuit, data; on_edge = on_edge_binary) + else + @assert isfpdata(data) "Only floating point and binary data are supported" + v,f = satisfies_flows(bc.bitcircuit, data; on_edge = on_edge_float) + end + + nothing +end + + +function update_parameters_gpu(bc, data, labels, cl, step_size) + ne::Int = num_examples(data) + nc::Int = size(bc.params, 2) + cl_device = CUDA.cudaconvert(cl) + label_device = CUDA.cudaconvert(labels) + params_device = CUDA.cudaconvert(bc.params) + + @inline function on_edge_binary(flows, values, prime, sub, element, grandpa, chunk_id, edge_flow, single_child) + first_true_bit = 1 + trailing_zeros(edge_flow) + last_true_bit = 64 - leading_zeros(edge_flow) + for j = first_true_bit:last_true_bit + if get_bit(edge_flow, j) + ex_id = ((chunk_id - 1) << 6) + j + for class = 1:nc + CUDA.@atomic params_device[element, class] -= (cl_device[ex_id, class] - label_device[ex_id, class]) * step_size + end + end + end + nothing + end + + @inline function on_edge_float(flows, values, prime, sub, element, grandpa, ex_id, edge_flow, single_child) + for class = 1:nc + CUDA.@atomic params_device[element, class] -= (cl_device[ex_id, class] - label_device[ex_id, class]) * edge_flow * step_size + end + nothing + end + + if isbinarydata(data) + v,f = satisfies_flows(bc.bitcircuit, data; on_edge = on_edge_binary) + else + @assert isfpdata(data) "Only floating point and binary data are supported" + v,f = satisfies_flows(bc.bitcircuit, data; on_edge = on_edge_float) + end + CUDA.unsafe_free!(v) # save the GC some effort + CUDA.unsafe_free!(f) # save the GC some 
effort + + nothing +end + + + +function to_onehot(labels::Vector, nc::Integer) + ne = length(labels) + one_hot_labels = zeros(Float32, ne, nc) + for (i, j) in enumerate(labels) + one_hot_labels[i, j + 1] = 1.0 + end + one_hot_labels +end \ No newline at end of file diff --git a/src/Logistic/queries.jl b/src/Logistic/queries.jl new file mode 100644 index 00000000..9931d766 --- /dev/null +++ b/src/Logistic/queries.jl @@ -0,0 +1,148 @@ +export class_likelihood_per_instance, class_weights_per_instance + +using CUDA +using LoopVectorization: @avx, vifelse + + +""" +Class Conditional Probability +""" +function class_likelihood_per_instance(lc::LogicCircuit, nc::Int, data) + cw = class_weights_per_instance(lc, nc, data) + one = Float32(1.0) + isgpu(data) ? (@. one / (one + exp(-cw))) : (@. @avx one / (one + exp(-cw))) +end + +function class_likelihood_per_instance(bc, data) + cw = class_weights_per_instance(bc, data) + isgpu(data) ? (@. one / (one + exp(-cw))) : (@. @avx one / (one + exp(-cw))) +end + +function class_weights_per_instance(lc::LogisticCircuit, nc::Int, data) + bc = ParamBitCircuit(lc, nc, data) + class_weights_per_instance(bc, data) +end + +function class_weights_per_instance(bc, data) + if isgpu(data) + class_weights_per_instance_gpu(to_gpu(bc), data) + else + class_weights_per_instance_cpu(bc, data) + end +end + +function class_weights_per_instance_cpu(bc, data) + ne::Int = num_examples(data) + nc::Int = size(bc.params, 2) + cw::Matrix{Float32} = zeros(Float32, ne, nc) + cw_lock::Threads.ReentrantLock = Threads.ReentrantLock() + + @inline function on_edge_binary(flows, values, prime, sub, element, grandpa, single_child) + lock(cw_lock) do # TODO: move lock to inner loop? 
+ for i = 1:size(flows, 1) + @inbounds edge_flow = values[i, prime] & values[i, sub] & flows[i, grandpa] + first_true_bit = trailing_zeros(edge_flow) + 1 + last_true_bit = 64 - leading_zeros(edge_flow) + @simd for j = first_true_bit:last_true_bit + ex_id = ((i - 1) << 6) + j + if get_bit(edge_flow, j) + for class = 1:nc + @inbounds cw[ex_id, class] += bc.params[element, class] + end + end + end + end + end + nothing + end + + @inline function on_edge_float(flows, values, prime, sub, element, grandpa, single_child) + lock(cw_lock) do # TODO: move lock to inner loop? + @avx for i = 1:size(flows, 1) + @inbounds edge_flow = values[i, prime] * values[i, sub] / values[i, grandpa] * flows[i, grandpa] + edge_flow = vifelse(isfinite(edge_flow), edge_flow, zero(eltype(flows))) + for class = 1:nc + @inbounds cw[i, class] += edge_flow * bc.params[element, class] + end + end + end + nothing + end + + if isbinarydata(data) + satisfies_flows(bc.bitcircuit, data; on_edge = on_edge_binary) + else + satisfies_flows(bc.bitcircuit, data; on_edge = on_edge_float) + end + + return cw +end + +function class_weights_per_instance_gpu(bc, data) + ne::Int = num_examples(data) + nc::Int = size(bc.params, 2) + cw::CuMatrix{Float32} = CUDA.zeros(Float32, num_examples(data), nc) + cw_device = CUDA.cudaconvert(cw) + params_device = CUDA.cudaconvert(bc.params) + + @inline function on_edge_binary(flows, values, prime, sub, element, grandpa, chunk_id, edge_flow, single_child) + first_true_bit = 1 + trailing_zeros(edge_flow) + last_true_bit = 64 - leading_zeros(edge_flow) + for j = first_true_bit:last_true_bit + ex_id = ((chunk_id - 1) << 6) + j + if get_bit(edge_flow, j) + for class = 1:nc + CUDA.@atomic cw_device[ex_id, class] += params_device[element, class] + end + end + end + nothing + end + + @inline function on_edge_float(flows, values, prime, sub, element, grandpa, ex_id, edge_flow, single_child) + for class = 1:nc + CUDA.@atomic cw_device[ex_id, class] += edge_flow * params_device[element, 
class] + end + nothing + end + + if isbinarydata(data) + v,f = satisfies_flows(bc.bitcircuit, data; on_edge = on_edge_binary) + else + @assert isfpdata(data) "Only floating point and binary data are supported" + v,f = satisfies_flows(bc.bitcircuit, data; on_edge = on_edge_float) + end + CUDA.unsafe_free!(v) # save the GC some effort + CUDA.unsafe_free!(f) # save the GC some effort + + return cw +end + + + +""" +Class Predictions +""" +function predict_class(lc::LogisticCircuit, nc::Int, data) + class_likelihoods = class_likelihood_per_instance(lc, nc, data) + predict_class(class_likelihoods) +end + +function predict_class(class_likelihoods) + _, mxindex = findmax(class_likelihoods; dims=2) + dropdims(getindex.(mxindex, 2); dims=2) +end + + + +""" +Prediction accuracy +""" +accuracy(lc::LogisticCircuit, nc::Int, data, labels::Vector) = + accuracy(predict_class(lc, nc, data), labels) + +accuracy(predicted_class::Vector, labels::Vector) = + Float64(sum(@. predicted_class == labels)) / length(labels) + +accuracy(class_likelihoods::Matrix, labels::Vector) = + accuracy(predict_class(class_likelihoods), labels) diff --git a/src/Probabilistic/Bagging.jl b/src/Probabilistic/Bagging.jl deleted file mode 100644 index 42e60515..00000000 --- a/src/Probabilistic/Bagging.jl +++ /dev/null @@ -1,61 +0,0 @@ -import StatsBase - -function bootstrap_samples_ids(train_x::PlainXData, n_samples::Int - #, rand_gen::AbstractRNG - ) - n_instances = num_examples(train_x) - ids = 1:n_instances - return [StatsBase.sample( - #rand_gen, - ids, n_instances, replace=true) for i in 1:n_samples] -end - -function train_bagging(# pcs::Vector{<:ProbΔ}, - train_x::XBatches{Bool}, - n_components::Int64; - init_models=nothing, - mixture_weights, - learn_base_estimator, - base_estimator_params, - logs) - - @assert length(logs) == n_components "Dimension not match in train bagging." 
- # bootstrapping samples - bagging_samples = init_bagging_samples(train_x, n_components) - - # weights - weights = nothing - if mixture_weights == "uniform" - weights = ones(Float64, n_components) ./ n_components - else - throw(DomainError(mixture_weights, "Unrecognized mixture weight mode")) - end - - if issomething(init_models) - @assert length(init_models) == n_components "Dimension not match in train bagging." - for i in 1 : n_components - learn_base_estimator(bagging_samples[i], init_models[i]; log=logs[i], base_estimator_params...) - end - - else - for i in 1 : n_components - learn_base_estimator(bagging_samples[i]; log=logs[i], base_estimator_params...) - end - end -end - -function init_bagging_samples(train_x::XBatches{Bool}, num_bags::Int64)::Vector{XBatches{Bool}} - batch_size = max_batch_size(train_x) - - unbatched = unbatch(train_x) - m = feature_matrix(unbatched) - bagging_samples = Vector{XBatches{Bool}}() - - bootstrapped_ids = bootstrap_samples_ids(unbatched, num_bags) - - for i in 1 : num_bags - new_examples = PlainXData(m[bootstrapped_ids[i], :]) - push!(bagging_samples, batch(new_examples, batch_size)) - end - bagging_samples -end diff --git a/src/Probabilistic/Clustering.jl b/src/Probabilistic/Clustering.jl deleted file mode 100644 index 47c30403..00000000 --- a/src/Probabilistic/Clustering.jl +++ /dev/null @@ -1,21 +0,0 @@ -using Clustering - -function clustering(train_x::XData, mix_num::Int64; maxiter=200)::XBatches{<:Bool} - if mix_num == 1 - return convert(XBatches, train_x) - end - - n = num_examples(train_x) - X = feature_matrix(train_x)' - - println("Running K-means clustering algorithm with num of components $mix_num, maximum iterations $maxiter") - R = kmeans(X, mix_num; maxiter=maxiter) - @assert nclusters(R) == mix_num - a = assignments(R) - - clustered_train_x = Vector{PlainXData{Bool,BitMatrix}}() - for k in 1 : mix_num - push!(clustered_train_x, XData(convert(BitMatrix, X[:, findall(x -> x == k, a)]'))) - end - return 
clustered_train_x -end \ No newline at end of file diff --git a/src/Probabilistic/EMLearner.jl b/src/Probabilistic/EMLearner.jl deleted file mode 100644 index e16b6f7f..00000000 --- a/src/Probabilistic/EMLearner.jl +++ /dev/null @@ -1,160 +0,0 @@ -""" -Train a mixture of probabilistic circuits from data, starting with random example weights. -""" -function train_mixture( pcs::Vector{<:ProbΔ}, - train_x::XBatches{Bool}, - pseudocount, num_iters; - structure_learner=nothing, learnstruct_step = num_iters + 1, # structure learning - logger=nothing, logging_step = 1 # logging or saving results - )::AbstractFlatMixture - - - # create mixture model with uniform component weights - mixture_flow = init_mixture_with_flows(FlatMixture(pcs), train_x) - - # reset aggregate statistics - reset_mixture_aggregate_flows(mixture_flow) - - # do a quick maximization step - for batch in train_x - example_weights = random_example_weights(num_examples(batch), num_components(mixture_flow)) - aggregate_flows(mixture_flow, batch, example_weights) - end - estimate_parameters(mixture_flow, component_weights(mixture_flow); pseudocount=pseudocount) - - train_mixture(mixture_flow, train_x, pseudocount, num_iters; - structure_learner=structure_learner, learnstruct_step=learnstruct_step, - logger=logger, logging_step=logging_step) -end - -""" -Train a mixture model from data. -Learning is initialized from the parameters stored in the given mixture. -When a `structure_learner` is given, it will be called between EM steps to update circuit structures. 
-""" -function train_mixture( mixture::AbstractFlatMixture, # we start from component weights that are already given - train_x::XBatches{Bool}, - pseudocount, num_iters; - structure_learner=nothing, learnstruct_step = num_iters + 1, # structure learning - logger=nothing, logging_step = 1 # logging or saving results - )::AbstractFlatMixture - - @assert feature_type(train_x) == Bool "Can only learn probabilistic circuits on Bool data" - - # initialize data structures - mixture_flow = init_mixture_with_flows(mixture, train_x) - - if issomething(logger) - logger(mixture_flow) - end - - for i in 1:num_iters - - # reset aggregate statistics - total_component_probability = ones(Float64, num_components(mixture_flow)) .* pseudocount ./ num_components(mixture_flow) - reset_mixture_aggregate_flows(mixture_flow) - - # are we doing structure learning at the end of this iteration? - is_learnstruct_iter = issomething(structure_learner) && i % learnstruct_step == 0 - - all_example_weights = Vector{Matrix{Float64}}() - - # Expectation step (update example weights given mixture parameters) - # + collecting aggregate statistics for subsequent maximization step - for batch in train_x - log_p_of_x_and_c = log_likelihood_per_instance_component(mixture_flow, batch) - example_weights = component_weights_per_example(log_p_of_x_and_c) - - # copy the flows already computed by `log_likelihood_per_instance_component` into the underlying aggregate flow circuit - # this way the maximization step can use them to estimate new parameters - aggregate_flows_cached(mixture_flow, batch, example_weights) - - # store the aggregated component probabilities such that the maximization step can re-estimate the component weights - total_component_probability .+= dropdims(sum(example_weights, dims=1), dims=1) - - # cache the example weights for the structure learner at the end of this EM iteration - is_learnstruct_iter && push!(all_example_weights, example_weights) - end - - # Maximization step (update mixture 
parameters given example weights (as stored in aggregate circuits)) - estimate_parameters(mixture_flow, total_component_probability; pseudocount=pseudocount) - - # Structural EM step - if is_learnstruct_iter - new_mixture_flow = structure_learner(mixture_flow, train_x, all_example_weights) - # mixture = replace_prob_circuits(mixture, new_pcs) - # re-initialize data structures - mixture_flow = init_mixture_with_flows(new_mixture_flow, train_x) - end - - if i % logging_step == 0 && issomething(logger) - logger(mixture_flow) - end - end - - return mixture_flow -end - -"Ensure we have a FlatMixtureWithFlow where the flow circuits have aggregate flow circuits as origin" -function init_mixture_with_flows(mixture::FlatMixtureWithFlow, ::XBatches{Bool})::FlatMixtureWithFlow - if ! all(fc -> grand_origin(fc) isa AggregateFlowΔ, mixture.flowcircuits) - init_mixture_with_flows(origin(mixture)) - else - mixture - end -end -function init_mixture_with_flows(mixture::FlatMixture, train_x::XBatches{Bool})::FlatMixtureWithFlow - aggr_circuits = [AggregateFlowΔ(pc, Float64) for pc in components(mixture)] - flow_circuits = [FlowΔ(afc, max_batch_size(train_x), Bool, opts_accumulate_flows) for afc in aggr_circuits] - FlatMixtureWithFlow(mixture, flow_circuits) -end - -function reset_mixture_aggregate_flows(mixture_flow::FlatMixtureWithFlow) - for fc in mixture_flow.flowcircuits - reset_aggregate_flows(grand_origin(fc)) - end -end - -"Compute the component weights for each example from likelihoods" -function component_weights_per_example(log_p_of_x_and_c) - log_p_of_x = logsumexp(log_p_of_x_and_c, 2) # marginalize out components - log_p_of_given_x_query_c = mapslices(col -> col .- log_p_of_x, log_p_of_x_and_c, dims=[1]) - p_of_given_x_query_c = exp.(log_p_of_given_x_query_c) # no more risk of underflow, so go to linear space - @assert sum(p_of_given_x_query_c) ≈ size(log_p_of_x_and_c, 1) # each row has proability 1 - p_of_given_x_query_c -end - -"Compute and aggregate flows for mixture 
components" -function aggregate_flows(mixture_flow, batch, example_weights) - for i in 1:num_components(mixture_flow) - fc = mixture_flow.flowcircuits[i] - wbatch = weighted_batch_for_component(batch, example_weights,i) - accumulate_aggr_flows_batch(fc, wbatch) - end -end - -"Aggregate already-computed flows for mixture components" -function aggregate_flows_cached(mixture_flow, batch, example_weights) - for i in 1:num_components(mixture_flow) - fc = mixture_flow.flowcircuits[i] - wbatch = weighted_batch_for_component(batch, example_weights,i) - accumulate_aggr_flows_cached(fc, wbatch) - end -end - -function estimate_parameters(mixture_flow, total_component_probability; pseudocount) - component_weights(mixture_flow) .= total_component_probability ./ sum(total_component_probability) - for fc in mixture_flow.flowcircuits - estimate_parameters_cached(grand_origin(fc); pseudocount=pseudocount) - end -end - -"Get a new weighted batch for this component" -weighted_batch_for_component(batch::PlainXData, example_weights, component_i)::WXData = - WXData(batch, example_weights[:,component_i]) - -"Create random example weights that sum to one overall components" -function random_example_weights(num_examples::Int, num_components::Int)::Matrix{Float64} - w = rand(Float64, num_examples, num_components) - w ./ sum(w;dims=2) -end \ No newline at end of file diff --git a/src/Probabilistic/Mixtures.jl b/src/Probabilistic/Mixtures.jl deleted file mode 100644 index 68041988..00000000 --- a/src/Probabilistic/Mixtures.jl +++ /dev/null @@ -1,169 +0,0 @@ -##################### -# Probabilistic circuit mixtures -##################### - -"A probabilistic mixture model" -abstract type AbstractMixture end - -"A probabilistic mixture model whose components are not themselves mixtures" -abstract type AbstractFlatMixture <: AbstractMixture end - -"A probabilistic mixture model whose components are themselves mixtures" -abstract type AbstractMetaMixture <: AbstractMixture end - -"A probabilistic 
mixture model of probabilistic circuits" -struct FlatMixture <: AbstractFlatMixture - weights::Vector{Float64} - components::Vector{<:ProbΔ} - FlatMixture(w,c) = begin - @assert length(w) == length(c) - @assert sum(w) ≈ 1.0 - new(w,c) - end -end - -FlatMixture(c) = FlatMixture(uniform(length(c)),c) - -"A mixture with cached flow circuits for each component (which are assumed to be ProbΔs)" -struct FlatMixtureWithFlow <: AbstractFlatMixture - origin::FlatMixture - flowcircuits::Vector{<:FlowΔ} - FlatMixtureWithFlow(origin,fcs) = begin - @assert num_components(origin) == length(fcs) - foreach(components(origin), fcs) do or, fc - @assert or[end] === prob_origin(fc)[end] - end - new(origin,fcs) - end -end - -FlatMixtureWithFlow(w,c,f) = FlatMixtureWithFlow(FlatMixture(w,c),f) - -"A probabilistic mixture model of mixture models" -struct MetaMixture <: AbstractMetaMixture - weights::Vector{Float64} - components::Vector{<:AbstractMixture} - MetaMixture(w,c) = begin - @assert length(w) == length(c) - @assert sum(w) ≈ 1.0 - new(w,c) - end -end - -MetaMixture(c) = MetaMixture(uniform(length(c)),c) - -Mixture(w, c::Vector{<:AbstractMixture}) = MetaMixture(w, c) -Mixture(w, c::Vector{<:ProbΔ}) = FlatMixture(w, c) - -##################### -# Functions -##################### - -"Get the components in this mixture" -@inline components(m::FlatMixture) = m.components -@inline components(m::FlatMixtureWithFlow) = components(m.origin) -@inline components(m::MetaMixture) = m.components - -"Get the component weights in this mixture" -@inline component_weights(m::FlatMixture) = m.weights -@inline component_weights(m::FlatMixtureWithFlow) = component_weights(m.origin) -@inline component_weights(m::MetaMixture) = m.weights - -"Number of components in a mixture" -@inline num_components(m::AbstractMixture)::Int = length(components(m)) - -"Convert a given flat mixture into one with cached flows" -ensure_with_flows(m::FlatMixture, size_hint::Int)::FlatMixtureWithFlow = begin - flowcircuits = 
[FlowΔ(pc, size_hint, Bool, opts_accumulate_flows) for pc in components(m)] - FlatMixtureWithFlow(m,flowcircuits) -end -ensure_with_flows(m::FlatMixtureWithFlow, ::Int)::FlatMixtureWithFlow = m - -replace_prob_circuits(m::FlatMixture, pcs::Vector{ProbΔ}) = - FlatMixture(component_weights(m), pcs) - -# log_likelihood - -function log_likelihood(mixture::FlatMixture, batches::XBatches{Bool})::Float64 - mwf = ensure_with_flows(mixture, max_batch_size(batches)) - log_likelihood(mwf, batches) -end - -function log_likelihood(mixture::FlatMixtureWithFlow, batches::XBatches{Bool})::Float64 - # assume the per-batch call will compute a weighted sum over examples - sum(batch -> log_likelihood(mixture, batch), batches) -end - -function log_likelihood(mixture::FlatMixtureWithFlow, batch::PlainXData{Bool})::Float64 - sum(log_likelihood_per_instance(mixture, batch)) -end - -function log_likelihood(mixture::MetaMixture, batches::XBatches{Bool})::Float64 - sum(batch -> log_likelihood(mixture, batch), batches) -end - -function log_likelihood(mixture::MetaMixture, batch::PlainXData{Bool})::Float64 - sum(log_likelihood_per_instance(mixture, batch)) -end - -# log_likelihood_per_instance (including mixture weight likelihood) - -function log_likelihood_per_instance(mixture::FlatMixture, batches::XBatches{Bool})::Vector{Float64} - mwf = ensure_with_flows(mixture, max_batch_size(batches)) - log_likelihood_per_instance(mwf, batches) -end - -function log_likelihood_per_instance(mixture::FlatMixtureWithFlow, batches::XBatches{Bool})::Vector{Float64} - mapreduce(b -> log_likelihood_per_instance(mixture, b), vcat, batches) -end - -function log_likelihood_per_instance(mixture::FlatMixtureWithFlow, batch::PlainXData{Bool})::Vector{Float64} - log_p_of_x_and_c = log_likelihood_per_instance_component(mixture, batch) - logsumexp(log_p_of_x_and_c, 2) -end - -function log_likelihood_per_instance(mixture::MetaMixture, batches::XBatches{Bool})::Vector{Float64} - mapreduce(b -> 
log_likelihood_per_instance(mixture, b), vcat, batches) -end - -function log_likelihood_per_instance(mixture::MetaMixture, batches::PlainXData{Bool})::Vector{Float64} - log_p_of_x_and_c = log_likelihood_per_instance_component(mixture, batch) - logsumexp(log_p_of_x_and_c, 2) -end - -# Log likelihoods per instance and component (including mixture weight likelihood) - - -"Log likelihood per instance and component. A vector of matrices per batch where the first dimension is instance, second is components." -function log_likelihood_per_instance_component(mixture::FlatMixtureWithFlow, batches::XBatches{Bool})::Vector{Matrix{Float64}} - [log_likelihood_per_instance_component(mixture, batch) for batch in batches] -end - -"Log likelihood per instance and component. First dimension is instance, second is components." -function log_likelihood_per_instance_component(mixture::FlatMixtureWithFlow, batch::PlainXData{Bool})::Matrix{Float64} - hcat(log_likelihood_per_component_instance(mixture, batch)...) -end - -"Log likelihood per component and instance. Outer vector is components, inner vector is instances" -function log_likelihood_per_component_instance(mixture::FlatMixtureWithFlow, batch::PlainXData{Bool})::Vector{Vector{Float64}} - map(mixture.flowcircuits, component_weights(mixture)) do fc, component_weight - log_likelihood_per_instance(fc, batch) .+ log(component_weight) - end -end - -"Log likelihood per instance and component. A vector of matrices per batch where the first dimension is instance, second is components." -function log_likelihood_per_instance_component(mixture::MetaMixture, batches::XBatches{Bool})::Vector{Matrix{Float64}} - [log_likelihood_per_instance_component(mixture, batch) for batch in batches] -end - -"Log likelihood per instance and component. First dimension is instance, second is components." 
-function log_likelihood_per_instance_component(mixture::MetaMixture, batch::PlainXData{Bool})::Matrix{Float64} - hcat(log_likelihood_per_component_instance(mixture, batch)...) -end - -"Log likelihood per component and instance. Outer vector is components, inner vector is instances" -function log_likelihood_per_component_instance(mixture::MetaMixture, batch::PlainXData{Bool})::Vector{Vector{Float64}} - map(mixture.components, component_weights(mixture)) do c, component_weight - log_likelihood_per_instance(c, batch) .+ log(component_weight) - end -end \ No newline at end of file diff --git a/src/Probabilistic/ProbCircuits.jl b/src/Probabilistic/ProbCircuits.jl deleted file mode 100644 index c6156eb6..00000000 --- a/src/Probabilistic/ProbCircuits.jl +++ /dev/null @@ -1,515 +0,0 @@ -##################### -# Probabilistic circuits -##################### -abstract type ProbΔNode{O} <: DecoratorΔNode{O} end -abstract type ProbLeafNode{O} <: ProbΔNode{O} end -abstract type ProbInnerNode{O} <: ProbΔNode{O} end - -mutable struct ProbLiteral{O} <: ProbLeafNode{O} - origin::O - data - bit::Bool - ProbLiteral(n) = new{node_type(n)}(n, nothing, false) -end - -mutable struct Prob⋀{O} <: ProbInnerNode{O} - origin::O - children::Vector{<:ProbΔNode{<:O}} - data - bit::Bool - Prob⋀(n, children) = begin - new{node_type(n)}(n, convert(Vector{ProbΔNode{node_type(n)}},children), nothing, false) - end -end - -mutable struct Prob⋁{O} <: ProbInnerNode{O} - origin::O - children::Vector{<:ProbΔNode{<:O}} - log_thetas::Vector{Float64} - data - bit::Bool - Prob⋁(n, children) = new{node_type(n)}(n, convert(Vector{ProbΔNode{node_type(n)}},children), some_vector(Float64, length(children)), nothing, false) -end - -const ProbΔ{O} = AbstractVector{<:ProbΔNode{<:O}} - -Base.eltype(::Type{ProbΔ{O}}) where {O} = ProbΔNode{<:O} - -##################### -# traits -##################### - -import LogicCircuits.GateType # make available for extension -import LogicCircuits.node_type - -@inline 
GateType(::Type{<:ProbLiteral}) = LiteralGate() -@inline GateType(::Type{<:Prob⋀}) = ⋀Gate() -@inline GateType(::Type{<:Prob⋁}) = ⋁Gate() - -@inline node_type(::ProbΔNode) = ProbΔNode - -##################### -# constructors and conversions -##################### - -const ProbCache = Dict{ΔNode, ProbΔNode} - -function ProbΔ2(circuit::Δ)::ProbΔ - node2dag(ProbΔ2(circuit[end])) -end - -function ProbΔ2(circuit::ΔNode)::ProbΔNode - f_con(n) = error("Cannot construct a probabilistic circuit from constant leafs: first smooth and remove unsatisfiable branches.") - f_lit(n) = ProbLiteral(n) - f_a(n, cn) = Prob⋀(n, cn) - f_o(n, cn) = Prob⋁(n, cn) - foldup_aggregate(circuit, f_con, f_lit, f_a, f_o, ProbΔNode{node_type(circuit)}) -end - -function ProbΔ(circuit::Δ, cache::ProbCache = ProbCache()) - - sizehint!(cache, length(circuit)*4÷3) - - pc_node(::LiteralGate, n::ΔNode) = ProbLiteral(n) - pc_node(::ConstantGate, n::ΔNode) = error("Cannot construct a probabilistic circuit from constant leafs: first smooth and remove unsatisfiable branches.") - - pc_node(::⋀Gate, n::ΔNode) = begin - children = map(c -> cache[c], n.children) - Prob⋀(n, children) - end - - pc_node(::⋁Gate, n::ΔNode) = begin - children = map(c -> cache[c], n.children) - Prob⋁(n, children) - end - - map(circuit) do node - pcn = pc_node(GateType(node), node) - cache[node] = pcn - pcn - end -end - -##################### -# methods -##################### - -import LogicCircuits: literal, children # make available for extension - -@inline literal(n::ProbLiteral)::Lit = literal(n.origin) -@inline children(n::ProbInnerNode) = n.children - -num_parameters(n::Prob⋁) = num_children(n) -num_parameters(c::ProbΔ) = sum(n -> num_parameters(n), ⋁_nodes(c)) - -"Return the first origin that is a probabilistic circuit node" -prob_origin(n::DecoratorΔNode)::ProbΔNode = origin(n, ProbΔNode) - -"Return the first origin that is a probabilistic circuit" -prob_origin(c::DecoratorΔ)::ProbΔ = origin(c, ProbΔNode) - -function 
estimate_parameters2(pc::ProbΔ, data::XData{Bool}; pseudocount::Float64) - Logical.pass_up_down2(pc, data) - w = (data isa PlainXData) ? nothing : weights(data) - estimate_parameters_cached2(pc, w; pseudocount=pseudocount) -end - -function estimate_parameters_cached2(pc::ProbΔ, w; pseudocount::Float64) - flow(n) = Float64(sum(sum(n.data))) - children_flows(n) = sum.(map(c -> c.data[1] .& n.data[1], children(n))) - - if issomething(w) - flow_w(n) = sum(Float64.(n.data[1]) .* w) - children_flows_w(n) = sum.(map(c -> Float64.(c.data[1] .& n.data[1]) .* w, children(n))) - flow = flow_w - children_flows = children_flows_w - end - - estimate_parameters_node2(n::ProbΔNode) = () - function estimate_parameters_node2(n::Prob⋁) - if num_children(n) == 1 - n.log_thetas .= 0.0 - else - smoothed_flow = flow(n) + pseudocount - uniform_pseudocount = pseudocount / num_children(n) - n.log_thetas .= log.((children_flows(n) .+ uniform_pseudocount) ./ smoothed_flow) - @assert isapprox(sum(exp.(n.log_thetas)), 1.0, atol=1e-6) "Parameters do not sum to one locally" - # normalize away any leftover error - n.log_thetas .- logsumexp(n.log_thetas) - end - end - - foreach(estimate_parameters_node2, pc) -end - -function log_likelihood_per_instance2(pc::ProbΔ, data::XData{Bool}) - Logical.pass_up_down2(pc, data) - log_likelihood_per_instance_cached(pc, data) -end - -function log_likelihood_per_instance_cached(pc::ProbΔ, data::XData{Bool}) - log_likelihoods = zeros(num_examples(data)) - indices = some_vector(Bool, num_examples(data))::BitVector - for n in pc - if n isa Prob⋁ && num_children(n) != 1 # other nodes have no effect on likelihood - foreach(n.children, n.log_thetas) do c, log_theta - indices = n.data[1] .& c.data[1] - view(log_likelihoods, indices::BitVector) .+= log_theta # see MixedProductKernelBenchmark.jl - end - end - end - log_likelihoods -end - -import LogicCircuits: conjoin_like, disjoin_like, literal_like, copy_node, normalize, replace_node # make available for extension - 
-"Conjoin nodes in the same way as the example" -@inline function conjoin_like(example::ProbΔNode, arguments::Vector) - if isempty(arguments) - # @assert false "Probabilistic circuit does not have anonymous true node" - nothing - elseif example isa Prob⋀ && children(example) == arguments - example - else - n = conjoin_like(origin(example), origin.(arguments)) - Prob⋀(n, arguments) - end -end - -"Disjoin nodes in the same way as the example" -@inline function disjoin_like(example::ProbΔNode, arguments::Vector) - if isempty(arguments) - # @assert false "Probabilistic circuit does not have false node" - nothing - elseif example isa Prob⋁ && children(example) == arguments - example - else - n = disjoin_like(origin(example), origin.(arguments)) - # normalize parameters - thetas = zeros(Float64, length(arguments)) - flag = falses(length(arguments)) - for (i, c) in enumerate(arguments) - ind = findfirst(x -> x == c, children(example)) - if issomething(ind) - thetas[i] = exp(example.log_thetas[ind]) - flag[i] = true - end - end - if all(flag) - thetas = thetas / sum(thetas) - end - p = Prob⋁(n, arguments) - p.log_thetas .= log.(thetas) - p - end -end - -"Construct a new literal node like the given node's type" -@inline literal_like(::ProbΔNode, lit::Lit) = ProbLiteral(lit) - -@inline copy_node(n::Prob⋁, cns) = begin - orig = copy_node(origin(n), origin.(cns)) - p = Prob⋁(orig, cns) - p.log_thetas .= copy(n.log_thetas) - p -end - -@inline copy_node(n::Prob⋀, cns) = begin - orig = copy_node(origin(n), origin.(cns)) - Prob⋀(orig, cns) -end - -import LogicCircuits.normalize - -@inline normalize(n::Prob⋁, old_n::Prob⋁, kept::Union{Vector{Bool}, BitArray}) = begin - thetas = exp.(old_n.log_thetas[kept]) - n.log_thetas .= log.(thetas / sum(thetas)) -end - -function estimate_parameters(pc::ProbΔ, data::XBatches{Bool}; pseudocount::Float64) - estimate_parameters(AggregateFlowΔ(pc, aggr_weight_type(data)), data; pseudocount=pseudocount) -end - -function 
estimate_parameters(afc::AggregateFlowΔ, data::XBatches{Bool}; pseudocount::Float64) - @assert feature_type(data) == Bool "Can only learn probabilistic circuits on Bool data" - @assert (afc[end].origin isa ProbΔNode) "AggregateFlowΔ must originate in a ProbΔ" - collect_aggr_flows(afc, data) - estimate_parameters_cached(afc; pseudocount=pseudocount) - afc -end - -function estimate_parameters(fc::FlowΔ, data::XBatches{Bool}; pseudocount::Float64) - @assert feature_type(data) == Bool "Can only learn probabilistic circuits on Bool data" - @assert (prob_origin(afc[end]) isa ProbΔNode) "FlowΔ must originate in a ProbΔ" - collect_aggr_flows(fc, data) - estimate_parameters_cached(origin(fc); pseudocount=pseudocount) -end - - # turns aggregate statistics into theta parameters -function estimate_parameters_cached(afc::AggregateFlowΔ; pseudocount::Float64) - foreach(n -> estimate_parameters_node(n; pseudocount=pseudocount), afc) -end - -estimate_parameters_node(::AggregateFlowΔNode; pseudocount::Float64) = () # do nothing -function estimate_parameters_node(n::AggregateFlow⋁; pseudocount) - origin = n.origin::Prob⋁ - if num_children(n) == 1 - origin.log_thetas .= 0.0 - else - smoothed_aggr_flow = (n.aggr_flow + pseudocount) - uniform_pseudocount = pseudocount / num_children(n) - origin.log_thetas .= log.( (n.aggr_flow_children .+ uniform_pseudocount) ./ smoothed_aggr_flow ) - @assert isapprox(sum(exp.(origin.log_thetas)), 1.0, atol=1e-6) "Parameters do not sum to one locally: $(exp.(origin.log_thetas)), estimated from $(n.aggr_flow) and $(n.aggr_flow_children). Did you actually compute the aggregate flows?" 
- #normalize away any leftover error - origin.log_thetas .- logsumexp(origin.log_thetas) - end -end - -# compute log likelihood -function compute_log_likelihood(pc::ProbΔ, data::XBatches{Bool}) - compute_log_likelihood(AggregateFlowΔ(pc, aggr_weight_type(data))) -end - -# compute log likelihood, reusing AggregateFlowΔ but ignoring its current aggregate values -function compute_log_likelihood(afc::AggregateFlowΔ, data::XBatches{Bool}) - @assert feature_type(data) == Bool "Can only test probabilistic circuits on Bool data" - collect_aggr_flows(afc, data) - ll = log_likelihood(afc) - (afc, ll) -end - -# return likelihoods given current aggregate flows. -function log_likelihood(afc::AggregateFlowΔ) - sum(n -> log_likelihood(n), afc) -end - -log_likelihood(::AggregateFlowΔNode) = 0.0 -log_likelihood(n::AggregateFlow⋁) = sum(n.origin.log_thetas .* n.aggr_flow_children) - -""" -Calculates log likelihood for a batch of fully observed samples. -(Also retures the generated FlowΔ) -""" -function log_likelihood_per_instance(pc::ProbΔ, batch::PlainXData{Bool}) - fc = FlowΔ(pc, num_examples(batch), Bool) - (fc, log_likelihood_per_instance(fc, batch)) -end - -function log_proba(pc::ProbΔ, batch::PlainXData{Bool}) - log_likelihood_per_instance(pc, batch)[2] -end - -function log_proba(pc::ProbΔ, batch::PlainXData{Int8}) - marginal_log_likelihood_per_instance(pc, batch)[2] -end - -""" -Calculate log likelihood per instance for batches of samples. -""" -function log_likelihood_per_instance(pc::ProbΔ, batches::XBatches{Bool})::Vector{Float64} - mapreduce(b -> log_likelihood_per_instance(pc, b)[2], vcat, batches) -end - -""" -Calculate log likelihood for a batch of fully observed samples. 
-(This is for when you already have a FlowΔ) -""" -function log_likelihood_per_instance(fc::FlowΔ, batch::PlainXData{Bool}) - @assert (prob_origin(fc[end]) isa ProbΔNode) "FlowΔ must originate in a ProbΔ" - pass_up_down(fc, batch) - log_likelihoods = zeros(num_examples(batch)) - indices = some_vector(Bool, flow_length(fc))::BitVector - for n in fc - if n isa DownFlow⋁ && num_children(n) != 1 # other nodes have no effect on likelihood - origin = prob_origin(n)::Prob⋁ - foreach(n.children, origin.log_thetas) do c, log_theta - # be careful here to allow for the Boolean multiplication to be done using & before switching to float arithmetic, or risk losing a lot of runtime! - # log_likelihoods .+= prod_fast(downflow(n), pr_factors(c)) .* log_theta - assign_prod(indices, downflow(n), pr_factors(c)) - view(log_likelihoods, indices::BitVector) .+= log_theta # see MixedProductKernelBenchmark.jl - # TODO put the lines above in Utils in order to ensure we have specialized types - end - end - end - log_likelihoods -end - -""" -Calculate log likelihood for a batch of samples with partial evidence P(e). -(Also returns the generated FlowΔ) - -To indicate a variable is not observed, pass -1 for that variable. -""" -function marginal_log_likelihood_per_instance(pc::ProbΔ, batch::PlainXData{Int8}) - opts = (flow_opts★..., el_type=Float64, compact⋀=false, compact⋁=false) - fc = UpFlowΔ(pc, num_examples(batch), Float64, opts) - (fc, marginal_log_likelihood_per_instance(fc, batch)) -end - -""" -Calculate log likelihood for a batch of samples with partial evidence P(e). -(If you already have a FlowΔ) - -To indicate a variable is not observed, pass -1 for that variable. 
-""" -function marginal_log_likelihood_per_instance(fc::UpFlowΔ, batch::PlainXData{Int8}) - @assert (prob_origin(fc[end]) isa ProbΔNode) "FlowΔ must originate in a ProbΔ" - marginal_pass_up(fc, batch) - pr(fc[end]) -end - -function check_parameter_integrity(circuit::ProbΔ) - for node in filter(n -> GateType(n) isa Prob⋁, circuit) - @assert all(θ -> !isnan(θ), node.log_thetas) "There is a NaN in one of the log_thetas" - end - true -end - -################## -# Sampling from a psdd -################## - -""" -Sample from a PSDD without any evidence -""" -function sample(circuit::ProbΔ)::AbstractVector{Bool} - inst = Dict{Var,Int64}() - simulate(circuit[end], inst) - len = length(keys(inst)) - ans = Vector{Bool}() - for i = 1:len - push!(ans, inst[i]) - end - ans -end - -# Uniformly sample based on the probability of the items -# and return the selected index -function sample(probs::AbstractVector{<:Number})::Int32 - z = sum(probs) - q = rand() * z - cur = 0.0 - for i = 1:length(probs) - cur += probs[i] - if q <= cur - return i - end - end - return length(probs) -end - -function simulate(node::ProbLiteral, inst::Dict{Var,Int64}) - if positive(node) - inst[variable(node.origin)] = 1 - else - inst[variable(node.origin)] = 0 - end -end - -function simulate(node::Prob⋁, inst::Dict{Var,Int64}) - idx = sample(exp.(node.log_thetas)) - simulate(node.children[idx], inst) -end -function simulate(node::Prob⋀, inst::Dict{Var,Int64}) - for child in node.children - simulate(child, inst) - end -end - -""" -Sampling with Evidence from a psdd. -Internally would call marginal pass up on a newly generated flow circuit. -""" -function sample(circuit::ProbΔ, evidence::PlainXData{Int8})::AbstractVector{Bool} - opts= (compact⋀=false, compact⋁=false) - flow_circuit = UpFlowΔ(circuit, 1, Float64, opts) - marginal_pass_up(flow_circuit, evidence) - sample(flow_circuit) -end - -""" -Sampling with Evidence from a psdd. -Assuming already marginal pass up has been done on the flow circuit. 
-""" -function sample(circuit::UpFlowΔ)::AbstractVector{Bool} - inst = Dict{Var,Int64}() - simulate2(circuit[end], inst) - len = length(keys(inst)) - ans = Vector{Bool}() - for i = 1:len - push!(ans, inst[i]) - end - ans -end - -function simulate2(node::UpFlowLiteral, inst::Dict{Var,Int64}) - if positive(node) - #TODO I don't think we need these 'grand_origin' parts below - inst[variable(grand_origin(node))] = 1 - else - inst[variable(grand_origin(node))] = 0 - end -end - -function simulate2(node::UpFlow⋁, inst::Dict{Var,Int64}) - prs = [ pr(ch)[1] for ch in children(node) ] - idx = sample(exp.(node.origin.log_thetas .+ prs)) - simulate2(children(node)[idx], inst) -end - -function simulate2(node::UpFlow⋀, inst::Dict{Var,Int64}) - for child in children(node) - simulate2(child, inst) - end -end - - - -################## -# Most Probable Explanation MPE of a psdd -# aka MAP -################## - -@inline function MAP(circuit::ProbΔ, evidence::PlainXData{Int8})::Matrix{Bool} - MPE(circuit, evidence) -end - -function MPE(circuit::ProbΔ, evidence::PlainXData{Int8})::Matrix{Bool} - # Computing Marginal Likelihood for each node - fc, lls = marginal_log_likelihood_per_instance(circuit, evidence) - - ans = Matrix{Bool}(zeros(size(evidence.x))) - active_samples = Array{Bool}(ones( num_examples(evidence) )) - - mpe_simulate(fc[end], active_samples, ans) - ans -end - -""" -active_samples: bool vector indicating which samples are active for this node during mpe -result: Matrix (num_samples, num_variables) indicating the final result of mpe -""" -function mpe_simulate(node::UpFlowLiteral, active_samples::Vector{Bool}, result::Matrix{Bool}) - if positive(node) - result[active_samples, variable(node)] .= 1 - else - result[active_samples, variable(node)] .= 0 - end -end -function mpe_simulate(node::UpFlow⋁, active_samples::Vector{Bool}, result::Matrix{Bool}) - prs = zeros( length(node.children), size(active_samples)[1] ) - @simd for i=1:length(node.children) - prs[i,:] .= 
pr(node.children[i]) .+ (node.origin.log_thetas[i]) - end - - max_child_ids = [a[1] for a in argmax(prs, dims = 1) ] - @simd for i=1:length(node.children) - ids = Vector{Bool}( active_samples .* (max_child_ids .== i)[1,:] ) # Only active for this child if it was the max for that sample - mpe_simulate(node.children[i], ids, result) - end -end -function mpe_simulate(node::UpFlow⋀, active_samples::Vector{Bool}, result::Matrix{Bool}) - for child in node.children - mpe_simulate(child, active_samples, result) - end -end diff --git a/src/Probabilistic/ProbFlowCircuits.jl b/src/Probabilistic/ProbFlowCircuits.jl deleted file mode 100644 index dc145e35..00000000 --- a/src/Probabilistic/ProbFlowCircuits.jl +++ /dev/null @@ -1,105 +0,0 @@ -##################### - -#TODO This code seems to assume logspace flows as floating point numbers. if so, enforca that on type F -function marginal_pass_up(circuit::UpFlowΔ{O,F}, data::XData{E}) where {E <: eltype(F)} where {O,F} - resize_flows(circuit, num_examples(data)) - cache = zeros(Float64, num_examples(data)) #TODO: fix type later - marginal_pass_up_node(n::UpFlowΔNode, ::PlainXData) = () - - function marginal_pass_up_node(n::UpFlowLiteral{O,F}, cache::Array{Float64}, data::PlainXData{E}) where {E <: eltype(F)} where {O,F} - pass_up_node(n, data) - # now override missing values by 1 - npr = pr(n) - npr[feature_matrix(data)[:,variable(n)] .< zero(eltype(F))] .= 1 - npr .= log.( npr .+ 1e-300 ) - return nothing - end - - function marginal_pass_up_node(n::UpFlow⋀Cached, cache::Array{Float64}, ::PlainXData) - pr(n) .= 0 - for i=1:length(n.children) - # pr(n) .+= pr(n.children[i]) - broadcast!(+, pr(n), pr(n), pr(n.children[i])) - end - return nothing - end - - function marginal_pass_up_node(n::UpFlow⋁Cached, cache::Array{Float64}, ::PlainXData) - pr(n) .= 1e-300 - for i=1:length(n.children) - cache .= 0 - # broadcast reduced memory allocation, though accessing prob_origin(n).log_thetas[i] still allocates lots of extra memory, - # it is 
proabably due to derefrencing the pointer - broadcast!(+, cache, pr(n.children[i]), prob_origin(n).log_thetas[i]) - broadcast!(exp, cache, cache) - broadcast!(+, pr(n), pr(n), cache) - end - broadcast!(log, pr(n), pr(n)); - return nothing - end - - ## Pass Up on every node in order - for n in circuit - marginal_pass_up_node(n, cache, data) - end - return nothing -end - - - -##### marginal_pass_down - -function marginal_pass_down(circuit::DownFlowΔ{O,F}) where {O,F} - resize_flows(circuit, flow_length(origin(circuit))) - for n in circuit - reset_downflow_in_progress(n) - end - for downflow in downflow_sinks(circuit[end]) - # initialize root flows to 1 - downflow.downflow .= one(eltype(F)) - end - for n in Iterators.reverse(circuit) - marginal_pass_down_node(n) - end -end - -marginal_pass_down_node(n::DownFlowΔNode) = () # do nothing -marginal_pass_down_node(n::DownFlowLeaf) = () - -function marginal_pass_down_node(n::DownFlow⋀Cached) - # todo(pashak) might need some changes, not tested, also to convert to logexpsum later - # downflow(n) = EF_n(e), the EF for edges or leaves are note stored - for c in n.children - for sink in downflow_sinks(c) - if !sink.in_progress - sink.downflow .= downflow(n) - sink.in_progress = true - else - sink.downflow .+= downflow(n) - end - end - end -end - -function marginal_pass_down_node(n::DownFlow⋁Cached) - # todo(pashak) might need some changes, not tested, also to convert to logexpsum later - # downflow(n) = EF_n(e), the EF for edges or leaves are note stored - for (ind, c) in enumerate(n.children) - for sink in downflow_sinks(c) - if !sink.in_progress - sink.downflow .= downflow(n) .* exp.(prob_origin(n).log_thetas[ind] .+ pr(origin(c)) .- pr(origin(n)) ) - sink.in_progress = true - else - sink.downflow .+= downflow(n) .* exp.(prob_origin(n).log_thetas[ind] .+ pr(origin(c)) .- pr(origin(n))) - end - end - end -end - -#### marginal_pass_up_down - -function marginal_pass_up_down(circuit::DownFlowΔ{O,F}, data::XData{E}) where {E <: 
eltype(F)} where {O,F} - @assert !(E isa Bool) - marginal_pass_up(origin(circuit), data) - marginal_pass_down(circuit) -end diff --git a/src/Probabilistic/Probabilistic.jl b/src/Probabilistic/Probabilistic.jl deleted file mode 100644 index 376467f2..00000000 --- a/src/Probabilistic/Probabilistic.jl +++ /dev/null @@ -1,58 +0,0 @@ -module Probabilistic - -using LogicCircuits -using ..Utils - -export - -# ProbCircuits -ProbΔNode, ProbΔ, ProbΔ, ProbLeafNode, ProbInnerNode, -ProbLiteral, Prob⋀, Prob⋁, ProbCache, variable, num_parameters, compute_log_likelihood, -log_proba, -log_likelihood, estimate_parameters, log_likelihood_per_instance, marginal_log_likelihood_per_instance, -initial_mixture_model, estimate_parameters_from_aggregates, compute_ensemble_log_likelihood, -expectation_step, maximization_step, expectation_step_batch, train_mixture_with_structure, check_parameter_integrity, -ll_per_instance_per_component, ll_per_instance_for_ensemble,estimate_parameters_cached, -sample, -MPE, MAP,prob_origin, copy_node, conjoin_like, disjoin_like, literal_like, normalize, replace_node, - -# ProbFlowCircuits -marginal_pass_up, marginal_pass_down, marginal_pass_up_down, - -# Mixtures -Mixture, AbstractFlatMixture, FlatMixture, FlatMixtureWithFlow,component_weights,FlatMixtureWithFlows, -log_likelihood, log_likelihood_per_instance, log_likelihood_per_instance_component, -init_mixture_with_flows, reset_mixture_aggregate_flows, aggregate_flows, estimate_parameters, -AbstractMetaMixture, MetaMixture,AbstractFlatMixture,AbstractMixture, components, num_components, - -# EM Learner -train_mixture, - -# Bagging -bootstrap_samples_ids, learn_mixture_bagging, learn_mixture_bagging2, -init_bagging_samples, train_bagging, - -# VtreeLearner -MetisContext, metis_top_down, BlossomContext, blossom_bottom_up!, -test_top_down, test_bottom_up!,learn_vtree_bottom_up, - -# MutualInformation -mutual_information, DisCache, conditional_entropy, sum_entropy_given_x, - -# Clustering -clustering, - -# 
Queries -pr_constraint, psdd_entropy, psdd_kl_divergence - -include("Clustering.jl") -include("ProbCircuits.jl") -include("ProbFlowCircuits.jl") -include("MutualInformation.jl") -include("Mixtures.jl") -include("Bagging.jl") -include("EMLearner.jl") -include("VtreeLearner.jl") -include("Queries.jl") - -end diff --git a/src/Probabilistic/Queries.jl b/src/Probabilistic/Queries.jl deleted file mode 100644 index c960ed6c..00000000 --- a/src/Probabilistic/Queries.jl +++ /dev/null @@ -1,230 +0,0 @@ -using DataStructures - -# Arthur Choi, Guy Van den Broeck, and Adnan Darwiche. Tractable learning for structured probability -# spaces: A case study in learning preference distributions. In Proceedings of IJCAI, 2015. -"Calculate the probability of the logic formula given by sdd for the psdd" -function pr_constraint(psdd_node::ProbΔNode, sdd_node::Union{ProbΔNode, StructLogicalΔNode}) - cache = Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}() - - return pr_constraint(psdd_node, sdd_node, cache) -end -function pr_constraint(psdd_node::ProbΔNode, sdd_node::Union{ProbΔNode, StructLogicalΔNode}, - cache::Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64})::Float64 - if (psdd_node, sdd_node) in keys(cache) # Cache hit - return cache[psdd_node, sdd_node] - elseif psdd_node isa ProbLiteral # Boundary cases - if sdd_node isa Union{ProbLiteral, StructLiteralNode} # Both are literals, just check whether they agrees with each other - if literal(psdd_node) == literal(sdd_node) - return get!(cache, (psdd_node, sdd_node), 1.0) - else - return get!(cache, (psdd_node, sdd_node), 0.0) - end - else - pr_constraint(psdd_node, sdd_node.children[1], cache) - if length(sdd_node.children) > 1 - pr_constraint(psdd_node, sdd_node.children[2], cache) - return get!(cache, (psdd_node, sdd_node), 1.0) - else - return get!(cache, (psdd_node, sdd_node), - literal(sdd_node.children[1]) == literal(psdd_node) ? 
1.0 : 0.0 - ) - end - end - elseif psdd_node.children[1] isa ProbLiteral # The psdd is true - theta = exp(psdd_node.log_thetas[1]) - return get!(cache, (psdd_node, sdd_node), - theta * pr_constraint(psdd_node.children[1], sdd_node, cache) + - (1.0 - theta) * pr_constraint(psdd_node.children[2], sdd_node, cache) - ) - else # Both psdds are not trivial - prob = 0.0 - for (prob⋀_node, log_theta) in zip(psdd_node.children, psdd_node.log_thetas) - p = prob⋀_node.children[1] - s = prob⋀_node.children[2] - - theta = exp(log_theta) - for sdd⋀_node in sdd_node.children - r = sdd⋀_node.children[1] - t = sdd⋀_node.children[2] - prob += theta * pr_constraint(p, r, cache) * pr_constraint(s, t, cache) - end - end - return get!(cache, (psdd_node, sdd_node), prob) - end -end - - -"Entropy of the distribution of the input psdd." -function psdd_entropy(psdd_node::ProbΔNode)::Float64 - psdd_entropy_cache = Dict{ProbΔNode, Float64}() - - return psdd_entropy(psdd_node, psdd_entropy_cache) -end -function psdd_entropy(psdd_node::Prob⋁, psdd_entropy_cache::Dict{ProbΔNode, Float64})::Float64 - if psdd_node in keys(psdd_entropy_cache) - return psdd_entropy_cache[psdd_node] - elseif psdd_node.children[1] isa ProbLiteral - return get!(psdd_entropy_cache, psdd_node, - - exp(psdd_node.log_thetas[1]) * psdd_node.log_thetas[1] - - exp(psdd_node.log_thetas[2]) * psdd_node.log_thetas[2] - ) - else - local_entropy = 0.0 - for (prob⋀_node, log_prob) in zip(psdd_node.children, psdd_node.log_thetas) - p = prob⋀_node.children[1] - s = prob⋀_node.children[2] - - local_entropy += exp(log_prob) * (psdd_entropy(p, psdd_entropy_cache) + - psdd_entropy(s, psdd_entropy_cache) - log_prob) - end - return get!(psdd_entropy_cache, psdd_node, local_entropy) - end -end -function psdd_entropy(psdd_node::Prob⋀, psdd_entropy_cache::Dict{ProbΔNode, Float64})::Float64 - return get!(psdd_entropy_cache, psdd_node.children[1], psdd_entropy(psdd_node.children[1], psdd_entropy_cache)) + - get!(psdd_entropy_cache, 
psdd_node.children[2], psdd_entropy(psdd_node.children[2], psdd_entropy_cache)) -end -function psdd_entropy(psdd_node::ProbLiteral, psdd_entropy_cache::Dict{ProbΔNode, Float64})::Float64 - return get!(psdd_entropy_cache, psdd_node, 0.0) -end - - -"KL divergence calculation for psdds that are not necessarily identical" -function psdd_kl_divergence(psdd_node1::ProbΔNode, psdd_node2::ProbΔNode)::Float64 - kl_divergence_cache = Dict{Tuple{ProbΔNode, ProbΔNode}, Float64}() - pr_constraint_cache = Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}() - - return psdd_kl_divergence(psdd_node1, psdd_node2, kl_divergence_cache, pr_constraint_cache) -end -function psdd_kl_divergence(psdd_node1::ProbΔNode, psdd_node2::ProbΔNode, - kl_divergence_cache::Dict{Tuple{ProbΔNode, ProbΔNode}, Float64})::Float64 - pr_constraint_cache = Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}() - - return psdd_kl_divergence(psdd_node1, psdd_node2, kl_divergence_cache, pr_constraint_cache) -end -function psdd_kl_divergence(psdd_node1::ProbΔNode, psdd_node2::ProbΔNode, - kl_divergence_cache::Dict{Tuple{ProbΔNode, ProbΔNode}, Float64}, - pr_constraint_cache::Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}) - @assert !(psdd_node1 isa Prob⋀ || psdd_node2 isa Prob⋀) "Prob⋀ not a valid PSDD node for KL-Divergence" - - # Check if both nodes are normalized for same vtree node - @assert variables(psdd_node1.origin.vtree) == variables(psdd_node2.origin.vtree) "Both nodes not normalized for same vtree node" - - if (psdd_node1, psdd_node2) in keys(kl_divergence_cache) # Cache hit - return kl_divergence_cache[(psdd_node1, psdd_node2)] - elseif psdd_node1.children[1] isa ProbLiteral - if psdd_node2 isa ProbLiteral - psdd_kl_divergence(psdd_node1.children[1], psdd_node2, kl_divergence_cache, pr_constraint_cache) - psdd_kl_divergence(psdd_node1.children[2], psdd_node2, kl_divergence_cache, pr_constraint_cache) - if literal(psdd_node1.children[1]) == 
literal(psdd_node2) - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), - psdd_node1.log_thetas[1] * exp(psdd_node1.log_thetas[1]) - ) - else - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), - psdd_node1.log_thetas[2] * exp(psdd_node1.log_thetas[2]) - ) - end - else - # The below four lines actually assign zero, but still we need to - # call it. - psdd_kl_divergence(psdd_node1.children[1], psdd_node2.children[1], kl_divergence_cache, pr_constraint_cache) - psdd_kl_divergence(psdd_node1.children[1], psdd_node2.children[2], kl_divergence_cache, pr_constraint_cache) - psdd_kl_divergence(psdd_node1.children[2], psdd_node2.children[1], kl_divergence_cache, pr_constraint_cache) - psdd_kl_divergence(psdd_node1.children[2], psdd_node2.children[2], kl_divergence_cache, pr_constraint_cache) - # There are two possible matches - if literal(psdd_node1.children[1]) == literal(psdd_node2.children[1]) - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), - exp(psdd_node1.log_thetas[1]) * (psdd_node1.log_thetas[1] - psdd_node2.log_thetas[1]) + - exp(psdd_node1.log_thetas[2]) * (psdd_node1.log_thetas[2] - psdd_node2.log_thetas[2]) - ) - else - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), - exp(psdd_node1.log_thetas[1]) * (psdd_node1.log_thetas[1] - psdd_node2.log_thetas[2]) + - exp(psdd_node1.log_thetas[2]) * (psdd_node1.log_thetas[2] - psdd_node2.log_thetas[1]) - ) - end - end - else # the normal case - kl_divergence = 0.0 - - # loop through every combination of prim and sub - for (prob⋀_node1, log_theta1) in zip(psdd_node1.children, psdd_node1.log_thetas) - for (prob⋀_node2, log_theta2) in zip(psdd_node2.children, psdd_node2.log_thetas) - p = prob⋀_node1.children[1] - s = prob⋀_node1.children[2] - - r = prob⋀_node2.children[1] - t = prob⋀_node2.children[2] - - theta1 = exp(log_theta1) - - p11 = pr_constraint(s, t, pr_constraint_cache) - p12 = pr_constraint(p, r, pr_constraint_cache) - - p13 = theta1 * (log_theta1 - log_theta2) - - p21 = 
psdd_kl_divergence(p, r, kl_divergence_cache, pr_constraint_cache) - p31 = psdd_kl_divergence(s, t, kl_divergence_cache, pr_constraint_cache) - - kl_divergence += p11 * p12 * p13 + theta1 * (p11 * p21 + p12 * p31) - end - end - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), kl_divergence) - end -end -function psdd_kl_divergence(psdd_node1::ProbLiteral, psdd_node2::ProbLiteral, - kl_divergence_cache::Dict{Tuple{ProbΔNode, ProbΔNode}, Float64}, - pr_constraint_cache::Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}) - # Check if literals are over same variables in vtree - @assert variables(psdd_node1.origin.vtree) == variables(psdd_node2.origin.vtree) "Both nodes not normalized for same vtree node" - - if (psdd_node1, psdd_node2) in keys(kl_divergence_cache) # Cache hit - return kl_divergence_cache[psdd_node1, psdd_node2] - else - # In this case probability is 1, kl divergence is 0 - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), 0.0) - end -end -function psdd_kl_divergence(psdd_node1::Prob⋁, psdd_node2::ProbLiteral, - kl_divergence_cache::Dict{Tuple{ProbΔNode, ProbΔNode}, Float64}, - pr_constraint_cache::Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}) - @assert variables(psdd_node1.origin.vtree) == variables(psdd_node2.origin.vtree) "Both nodes not normalized for same vtree node" - - if (psdd_node1, psdd_node2) in keys(kl_divergence_cache) # Cache hit - return kl_divergence_cache[psdd_node1, psdd_node2] - else - psdd_kl_divergence(psdd_node1.children[1], psdd_node2, kl_divergence_cache, pr_constraint_cache) - psdd_kl_divergence(psdd_node1.children[2], psdd_node2, kl_divergence_cache, pr_constraint_cache) - if literal(psdd_node1.children[1]) == literal(psdd_node2) - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), - psdd_node1.log_thetas[1] * exp(psdd_node1.log_thetas[1]) - ) - else - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), - psdd_node1.log_thetas[2] * 
exp(psdd_node1.log_thetas[2]) - ) - end - end -end -function psdd_kl_divergence(psdd_node1::ProbLiteral, psdd_node2::Prob⋁, - kl_divergence_cache::Dict{Tuple{ProbΔNode, ProbΔNode}, Float64}, - pr_constraint_cache::Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}) - @assert variables(psdd_node1.origin.vtree) == variables(psdd_node2.origin.vtree) "Both nodes not normalized for same vtree node" - - if (psdd_node1, psdd_node2) in keys(kl_divergence_cache) # Cache hit - return kl_divergence_cache[psdd_node1, psdd_node2] - else - psdd_kl_divergence(psdd_node1, psdd_node2.children[1], kl_divergence_cache, pr_constraint_cache) - psdd_kl_divergence(psdd_node1, psdd_node2.children[2], kl_divergence_cache, pr_constraint_cache) - if literal(psdd_node1) == literal(psdd_node2.children[1]) - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), - -psdd_node2.log_thetas[1] - ) - else - return get!(kl_divergence_cache, (psdd_node1, psdd_node2), - -psdd_node2.log_thetas[2] - ) - end - end -end diff --git a/src/ProbabilisticCircuits.jl b/src/ProbabilisticCircuits.jl index 6ed05250..8ec65ed9 100644 --- a/src/ProbabilisticCircuits.jl +++ b/src/ProbabilisticCircuits.jl @@ -5,24 +5,39 @@ module ProbabilisticCircuits # USE EXTERNAL MODULES using Reexport +@reexport using LogicCircuits include("Utils/Utils.jl") - @reexport using .Utils -# INCLUDE CHILD MODULES -include("Probabilistic/Probabilistic.jl") +include("abstract_prob_nodes.jl") +include("plain_prob_nodes.jl") +include("structured_prob_nodes.jl") +include("logistic_nodes.jl") +include("param_bit_circuit.jl") +include("parameters.jl") + +include("queries/likelihood.jl") +include("queries/marginal_flow.jl") +include("queries/map.jl") +include("queries/sample.jl") +include("queries/pr_constraint.jl") +include("queries/information.jl") +include("queries/expectation_rec.jl") +include("queries/expectation_graph.jl") + include("Logistic/Logistic.jl") -include("IO/IO.jl") 
-include("StructureLearner/StructureLearner.jl") -include("Reasoning/Reasoning.jl") +@reexport using .Logistic +include("mixtures/shared_prob_nodes.jl") +# include("mixtures/em.jl") -# USE CHILD MODULES (in order to re-export some functions) -@reexport using .Probabilistic -@reexport using .Logistic -@reexport using .IO -@reexport using .StructureLearner -@reexport using .Reasoning +include("structurelearner/chow_liu_tree.jl") +include("structurelearner/init.jl") +include("structurelearner/heuristics.jl") +include("structurelearner/learner.jl") + +include("LoadSave/LoadSave.jl") +@reexport using .LoadSave end diff --git a/src/Reasoning/ExpFlowCircuits.jl b/src/Reasoning/ExpFlowCircuits.jl deleted file mode 100644 index 869cd966..00000000 --- a/src/Reasoning/ExpFlowCircuits.jl +++ /dev/null @@ -1,152 +0,0 @@ -######################## -# Do not use for now -###################### -##################### -# Expectation Flow circuits -# For use of algorithms depending on pairs of nodes of two circuits -##################### - -"A expectation circuit node that has pair of origins of type PC and type LC" -abstract type DecoratorΔNodePair{PC<:ΔNode, LC<:ΔNode} <: ΔNode end - -abstract type ExpFlowΔNode{PC, LC, F} <: DecoratorΔNodePair{PC, LC} end - -const ExpFlowΔ{O} = AbstractVector{<:ExpFlowΔNode{<:O}} - -struct UpExpFlow{PC, LC, F} <: ExpFlowΔNode{PC, LC, F} - p_origin::PC - f_origin::LC - children::Vector{<:ExpFlowΔNode{<:PC, <:LC, <:F}} - f::F - fg::F -end - - -""" -Construct a upward expectation flow circuit from a given pair of PC and LC circuits -Note that its assuming the two circuits share the same vtree -""" -function ExpFlowΔ(pc::ProbΔ, lc::LogisticΔ, batch_size::Int, ::Type{El}) where El - pc_type = grapheltype(pc) - lc_type = grapheltype(lc) - - F = Array{El, 2} - fmem = () -> zeros(1, batch_size) #Vector{El}(undef, batch_size) #some_vector(El, batch_size) # note: fmem's return type will determine type of all UpFlows in the circuit (should be El) - fgmem = () 
-> zeros(classes(lc[end]), batch_size) - - root_pc = pc[end] - root_lc = lc[end- 1] - - cache = Dict{Pair{ΔNode, ΔNode}, ExpFlowΔNode}() - sizehint!(cache, (length(pc) + length(lc))*4÷3) - expFlowCircuit = Vector{ExpFlowΔNode}() - - function ExpflowTraverse(n::Prob⋁, m::Logistic⋁) - get!(cache, Pair(n, m)) do - children = [ ExpflowTraverse(i, j) for i in n.children for j in m.children] - node = UpExpFlow{pc_type,lc_type, F}(n, m, children, fmem(), fgmem()) - push!(expFlowCircuit, node) - return node - end - end - function ExpflowTraverse(n::Prob⋀, m::Logistic⋀) - get!(cache, Pair(n, m)) do - children = [ ExpflowTraverse(z[1], z[2]) for z in zip(n.children, m.children) ] - node = UpExpFlow{pc_type,lc_type, F}(n, m, children, fmem(), fgmem()) - push!(expFlowCircuit, node) - return node - end - end - function ExpflowTraverse(n::ProbLiteral, m::Logistic⋁) - get!(cache, Pair(n, m)) do - children = Vector{ExpFlowΔNode{pc_type,lc_type, F}}() # TODO - node = UpExpFlow{pc_type,lc_type, F}(n, m, children, fmem(), fgmem()) - push!(expFlowCircuit, node) - return node - end - end - function ExpflowTraverse(n::ProbLiteral, m::LogisticLiteral) - get!(cache, Pair(n, m)) do - children = Vector{ExpFlowΔNode{pc_type,lc_type, F}}() # TODO - node = UpExpFlow{pc_type,lc_type, F}(n, m, children, fmem(), fgmem()) - push!(expFlowCircuit, node) - return node - end - end - - ExpflowTraverse(root_pc, root_lc) - expFlowCircuit -end - -function exp_pass_up(pc::ProbΔ, lc::LogisticΔ, data::XData{E}) where{E <: eltype(F)} where{PC, LC, F} - expFlowCircuit = ExpFlowΔ(pc, lc, num_examples(data), Float64); - for n in expFlowCircuit - exp_pass_up_node(n, data) - end - expFlowCircuit -end - -function exp_pass_up(fc::ExpFlowΔ, data::XData{E}) where{E <: eltype(F)} where{PC, LC, F} - #TODO write resize_flows similar to flow_circuits - # and give as input the expFlowCircuit instead - #expFlowCircuit = ExpFlowΔ(pc, lc, num_examples(data), Float64); - for n in fc - exp_pass_up_node(n, data) - end -end - 
-function exp_pass_up_node(node::ExpFlowΔNode{PC,LC,F}, data::XData{E}) where{E <: eltype(F)} where{PC, LC, F} - pType = typeof(node.p_origin) - fType = typeof(node.f_origin) - - if node.p_origin isa Prob⋁ && node.f_origin isa Logistic⋁ - #todo this ordering might be different than the ExpFlowΔNode children - pthetas = [exp(node.p_origin.log_thetas[i]) - for i in 1:length(node.p_origin.children) for j in 1:length(node.f_origin.children)] - fthetas = [node.f_origin.thetas[j,:] # only taking the first class for now - for i in 1:length(node.p_origin.children) for j in 1:length(node.f_origin.children)] - - node.f .= 0.0 - node.fg .= 0.0 - for z = 1:length(node.children) - node.f .+= pthetas[z] .* node.children[z].f - node.fg .+= (pthetas[z] .* fthetas[z]) .* node.children[z].f - node.fg .+= pthetas[z] .* node.children[z].fg - end - elseif node.p_origin isa Prob⋀ && node.f_origin isa Logistic⋀ - node.f .= node.children[1].f .* node.children[2].f # assume 2 children - node.fg .= (node.children[1].f .* node.children[2].fg) .+ - (node.children[2].f .* node.children[1].fg) - - elseif node.p_origin isa ProbLiteral - if node.f_origin isa Logistic⋁ - m = node.f_origin.children[1] - elseif node.f_origin isa LogisticLiteral - m = node.f_origin - else - error("Invalid Types of pairs {$pType} - {$fType}") - end - - var = lit2var(literal(m)) - X = feature_matrix(data) - if positive(node.p_origin) && positive(m) - node.f[:, X[:, var] .!= 0 ] .= 1.0 # positive and missing observations - node.f[:, X[:, var] .== 0 ] .= 0.0 - elseif negative(node.p_origin) && negative(m) - node.f[:, X[:, var] .!= 1 ] .= 1.0 # negative and missing observations - node.f[:, X[:, var] .== 1 ] .= 0.0 - else - node.f .= 0.0 - end - - if node.f_origin isa Logistic⋁ - node.fg .= node.f .* transpose(node.f_origin.thetas) - else - node.fg .= 0.0 - end - - else - error("Invalid Types of pairs {$pType} - {$fType}") - end - -end \ No newline at end of file diff --git a/src/Reasoning/Expectation.jl 
b/src/Reasoning/Expectation.jl deleted file mode 100644 index 783dc923..00000000 --- a/src/Reasoning/Expectation.jl +++ /dev/null @@ -1,262 +0,0 @@ -ExpCacheDict = Dict{Pair{ProbΔNode, LogisticΔNode}, Array{Float64, 2}} -MomentCacheDict = Dict{Tuple{ProbΔNode, LogisticΔNode, Int64}, Array{Float64, 2}} - -struct ExpectationCache - f::ExpCacheDict - fg::ExpCacheDict -end -ExpectationCache() = ExpectationCache(ExpCacheDict(), ExpCacheDict()) - -struct MomentCache - f::ExpCacheDict - fg::MomentCacheDict -end -MomentCache() = MomentCache( ExpCacheDict(), MomentCacheDict()) - - -# Find a better way to cache n_choose_k values -max_k = 31 -choose_cache = [ 1.0 * binomial(i,j) for i=0:max_k+1, j=0:max_k+1 ] -@inline function choose(n::Int, m::Int) - return choose_cache[n+1, m+1] -end - - -# On Tractable Computation of Expected Predictions (https://arxiv.org/abs/1910.02182) -""" -Missing values should be denoted by -1 -""" -function Expectation(pc::ProbΔ, lc::LogisticΔ, data::XData{Int8}) - # 1. Get probability of each observation - fc, log_likelihoods = marginal_log_likelihood_per_instance(pc, data) - p_observed = exp.( log_likelihoods ) - - # 2. Expectation w.r.t. P(x_m, x_o) - cache = ExpectationCache() - results_unnormalized = exp_g(pc[end], lc[end-1], data, cache) # skipping the bias node of lc - - # 3. Expectation w.r.t P(x_m | x_o) - results = transpose(results_unnormalized) ./ p_observed - - # 4. Add Bias terms - biases = lc[end].thetas - results .+= biases - - results, cache -end - -function Moment(pc::ProbΔ, lc::LogisticΔ, data::XData{Int8}, moment::Int) - # 1. Get probability of each observation - fc, log_likelihoods = marginal_log_likelihood_per_instance(pc, data) - p_observed = exp.( log_likelihoods ) - - # 2. Moment w.r.t. 
P(x_m, x_o) - cache = MomentCache() - biases = lc[end].thetas - results_unnormalized = zeros(num_examples(data), classes(lc[end])) - - for z = 0:moment-1 - results_unnormalized .+= choose(moment, z) .* (biases .^ (z)) .* transpose(moment_g(pc[end], lc[end-1], data, moment - z, cache)) - end - - # 3. Moment w.r.t P(x_m | x_o) - results = results_unnormalized ./ p_observed - - # 4. Add Bias^moment terms - results .+= biases .^ (moment) - - results, cache -end - - -function ExpectationUpward(pc::ProbΔ, lc::LogisticΔ, data::XData{Int8}) - # 1. Get probability of each observation - fc, log_likelihoods = marginal_log_likelihood_per_instance(pc, data) - p_observed = exp.( log_likelihoods ) - - # 2. Expectation w.r.t. P(x_m, x_o) - exps_flow = exp_pass_up(pc, lc, data) - results_unnormalized = exps_flow[end].fg - - # 3. Expectation w.r.t P(x_m | x_o) - results = transpose(results_unnormalized) ./ p_observed - - # 4. Add Bias terms - biases = lc[end].thetas - results .+= biases - - results, exps_flow -end - - -# exp_f (pr-constraint) is originally from: -# Arthur Choi, Guy Van den Broeck, and Adnan Darwiche. Tractable learning for structured probability spaces: A case study in learning preference distributions. In Proceedings of IJCAI, 2015. 
- -function exp_f(n::Prob⋁, m::Logistic⋁, data::XData{Int8}, cache::Union{ExpectationCache, MomentCache}) - @inbounds get!(cache.f, Pair(n, m)) do - value = zeros(1 , num_examples(data) ) - pthetas = [exp(n.log_thetas[i]) for i in 1:length(n.children)] - @fastmath @simd for i in 1:length(n.children) - @simd for j in 1:length(m.children) - value .+= (pthetas[i] .* exp_f(n.children[i], m.children[j], data, cache)) - end - end - return value - end -end - -function exp_f(n::Prob⋀, m::Logistic⋀, data::XData{Int8}, cache::Union{ExpectationCache, MomentCache}) - @inbounds get!(cache.f, Pair(n, m)) do - value = ones(1 , num_examples(data) ) - @fastmath for (i,j) in zip(n.children, m.children) - value .*= exp_f(i, j, data, cache) - end - return value - # exp_f(n.children[1], m.children[1], data, cache) .* exp_f(n.children[2], m.children[2], data, cache) - end -end - - -@inline function exp_f(n::ProbLiteral, m::LogisticLiteral, data::XData{Int8}, cache::Union{ExpectationCache, MomentCache}) - @inbounds get!(cache.f, Pair(n, m)) do - value = zeros(1 , num_examples(data) ) - var = lit2var(literal(m)) - X = feature_matrix(data) - if positive(n) && positive(m) - # value[1, X[:, var] .== -1 ] .= 1.0 # missing observation always agrees - # value[1, X[:, var] .== 1 ] .= 1.0 # positive observations - value[1, X[:, var] .!= 0 ] .= 1.0 # positive or missing observations - elseif negative(n) && negative(m) - # value[1, X[:, var] .== -1 ] .= 1.0 # missing observation always agrees - # value[1, X[:, var] .== 0 ] .= 1.0 # negative observations - value[1, X[:, var] .!= 1 ] .= 1.0 # negative or missing observations - end - return value - end -end - -""" -Has to be a Logistic⋁ with only one child, which is a leaf node -""" -@inline function exp_f(n::ProbLiteral, m::Logistic⋁, data::XData{Int8}, cache::Union{ExpectationCache, MomentCache}) - @inbounds get!(cache.f, Pair(n, m)) do - exp_f(n, m.children[1], data, cache) - end -end - 
-####################################################################### -######## exp_g, exp_fg -######################################################################## - -@inline function exp_g(n::Prob⋁, m::Logistic⋁, data::XData{Int8}, cache::ExpectationCache) - exp_fg(n, m, data, cache) # exp_fg and exp_g are the same for OR nodes -end - -# function exp_g(n::Prob⋀, m::Logistic⋀, data::XData{Int8}, cache::ExpectationCache) -# value = zeros(classes(m) , num_examples(data)) -# @fastmath for (i,j) in zip(n.children, m.children) -# value .+= exp_fg(i, j, data, cache) -# end -# return value -# # exp_fg(n.children[1], m.children[1], data, cache) .+ exp_fg(n.children[2], m.children[2], data, cache) -# end - - -function exp_fg(n::Prob⋁, m::Logistic⋁, data::XData{Int8}, cache::ExpectationCache) - @inbounds get!(cache.fg, Pair(n, m)) do - value = zeros(classes(m) , num_examples(data) ) - pthetas = [exp(n.log_thetas[i]) for i in 1:length(n.children)] - @fastmath @simd for i in 1:length(n.children) - for j in 1:length(m.children) - value .+= (pthetas[i] .* m.thetas[j,:]) .* exp_f(n.children[i], m.children[j], data, cache) - value .+= pthetas[i] .* exp_fg(n.children[i], m.children[j], data, cache) - end - end - return value - end -end - -function exp_fg(n::Prob⋀, m::Logistic⋀, data::XData{Int8}, cache::ExpectationCache) - @inbounds get!(cache.fg, Pair(n, m)) do - # Assuming 2 children - value = exp_f(n.children[1], m.children[1], data, cache) .* exp_fg(n.children[2], m.children[2], data, cache) - value .+= exp_f(n.children[2], m.children[2], data, cache) .* exp_fg(n.children[1], m.children[1], data, cache) - return value - end -end - - -""" -Has to be a Logistic⋁ with only one child, which is a leaf node -""" -@inline function exp_fg(n::ProbLiteral, m::Logistic⋁, data::XData{Int8}, cache::ExpectationCache) - @inbounds get!(cache.fg, Pair(n, m)) do - m.thetas[1,:] .* exp_f(n, m, data, cache) - end -end - -@inline function exp_fg(n::ProbLiteral, m::LogisticLiteral, 
data::XData{Int8}, cache::ExpectationCache) - #dont know how many classes, boradcasting does the job - zeros(1 , num_examples(data)) -end - -####################################################################### -######## moment_g, moment_fg -######################################################################## - -@inline function moment_g(n::Prob⋁, m::Logistic⋁, data::XData{Int8}, moment::Int, cache::MomentCache) - get!(cache.fg, (n, m, moment)) do - moment_fg(n, m, data, moment, cache) - end -end - -""" -Calculating E[g^k * f] -""" -function moment_fg(n::Prob⋁, m::Logistic⋁, data::XData{Int8}, moment::Int, cache::MomentCache) - if moment == 0 - return exp_f(n, m, data, cache) - end - - get!(cache.fg, (n, m, moment)) do - value = zeros(classes(m) , num_examples(data) ) - pthetas = [exp(n.log_thetas[i]) for i in 1:length(n.children)] - @fastmath @simd for i in 1:length(n.children) - for j in 1:length(m.children) - for z in 0:moment - value .+= pthetas[i] .* choose(moment, z) .* m.thetas[j,:].^(moment - z) .* moment_fg(n.children[i], m.children[j], data, z, cache) - end - end - end - return value - end -end - -@inline function moment_fg(n::ProbLiteral, m::Logistic⋁, data::XData{Int8}, moment::Int, cache::MomentCache) - get!(cache.fg, (n, m, moment)) do - m.thetas[1,:].^(moment) .* exp_f(n, m, data, cache) - end -end - -@inline function moment_fg(n::ProbLiteral, m::LogisticLiteral, data::XData{Int8}, moment::Int, cache::MomentCache) - #dont know how many classes, boradcasting does the job - if moment == 0 - exp_f(n, m, data, cache) - else - zeros(1, num_examples(data)) - end -end - -function moment_fg(n::Prob⋀, m::Logistic⋀, data::XData{Int8}, moment::Int, cache::MomentCache) - if moment == 0 - return exp_f(n, m, data, cache) - end - get!(cache.fg, (n, m, moment)) do - value = moment_fg(n.children[1], m.children[1], data, 0, cache) .* moment_fg(n.children[2], m.children[2], data, moment, cache) - - for z in 1:moment - value .+= choose(moment, z) .* 
moment_fg(n.children[1], m.children[1], data, z, cache) .* moment_fg(n.children[2], m.children[2], data, moment - z, cache) - end - return value - end -end \ No newline at end of file diff --git a/src/Reasoning/Reasoning.jl b/src/Reasoning/Reasoning.jl deleted file mode 100644 index ce486ff4..00000000 --- a/src/Reasoning/Reasoning.jl +++ /dev/null @@ -1,20 +0,0 @@ -module Reasoning - -using LogicCircuits -using ..Probabilistic -using ..Logistic -using ..Utils - -export - UpExpFlow, - ExpFlowΔ, - exp_pass_up, - Expectation, - ExpectationUpward, - Moment - -include("Expectation.jl") -include("ExpFlowCircuits.jl") - - -end \ No newline at end of file diff --git a/src/StructureLearner/CircuitBuilder.jl b/src/StructureLearner/CircuitBuilder.jl deleted file mode 100644 index 8d3c8c96..00000000 --- a/src/StructureLearner/CircuitBuilder.jl +++ /dev/null @@ -1,133 +0,0 @@ -using LightGraphs: topological_sort_by_dfs, outneighbors -using MetaGraphs: get_prop - - -"convert literal+/- to probability value 0/1" -@inline lit2value(l::Lit)::Int = (l > 0 ? 1 : 0) - -""" -Learning from data a circuit with several structure learning algorithms -""" -function learn_probabilistic_circuit(data::Union{XData, WXData}; - pseudocount = 1.0, algo = "chow-liu", algo_kwargs=(α=1.0, clt_root="graph_center"))::ProbΔ - if algo == "chow-liu" - clt = learn_chow_liu_tree(data; algo_kwargs...) 
- pc = compile_prob_circuit_from_clt(clt) - estimate_parameters(pc, convert(XBatches,data); pseudocount = pseudocount) - pc - else - error("Cannot learn a probabilistic circuit with algorithm $algo") - end -end - -"Build decomposable probability circuits from Chow-Liu tree" -function compile_prob_circuit_from_clt(clt::CLT)::ProbΔ - topo_order = Var.(reverse(topological_sort_by_dfs(clt::CLT))) #order to parse the node - lin = Vector{ProbΔNode}() - node_cache = Dict{Lit, LogicalΔNode}() - prob_cache = ProbCache() - parent = parent_vector(clt) - - prob_children(n)::Vector{<:ProbΔNode{<:node_type(n)}} = - copy_with_eltype(map(c -> prob_cache[c], n.children), ProbΔNode{<:node_type(n)}) - - "default order of circuit node, from left to right: +/1 -/0" - - "compile leaf node into circuits" - function compile_leaf(ln::Var) - pos = LiteralNode( var2lit(ln)) - neg = LiteralNode(-var2lit(ln)) - node_cache[var2lit(ln)] = pos - node_cache[-var2lit(ln)] = neg - pos2 = ProbLiteral(pos) - neg2 = ProbLiteral(neg) - push!(lin, pos2) - push!(lin, neg2) - prob_cache[pos] = pos2 - prob_cache[neg] = neg2 - end - - "compile inner disjunction node" - function compile_⋁inner(ln::Lit, children::Vector{Var})::Vector{⋁Node} - logical_nodes = Vector{⋁Node}() - v = lit2value(ln) - - for c in children - #build logical ciruits - temp = ⋁Node([node_cache[lit] for lit in [var2lit(c), - var2lit(c)]]) - push!(logical_nodes, temp) - n = Prob⋁(temp, prob_children(temp)) - prob_cache[temp] = n - n.log_thetas = zeros(Float64, 2) - cpt = get_prop(clt, c, :cpt) - weights = [cpt[(1, v)], cpt[(0, v)]] - n.log_thetas = log.(weights) - push!(lin, n) - end - - return logical_nodes - end - - "compile inner conjunction node into circuits, left node is indicator, rest nodes are disjunction children nodes" - function compile_⋀inner(indicator::Lit, children::Vector{⋁Node}) - leaf = node_cache[indicator] - temp = ⋀Node(vcat([leaf], children)) - node_cache[indicator] = temp - n = Prob⋀(temp, prob_children(temp)) - 
prob_cache[temp] = n - push!(lin, n) - end - - "compile inner node, 1 inner variable to 2 leaf nodes, 2 * num_children disjunction nodes and 2 conjunction nodes" - function compile_inner(ln::Var, children::Vector{Var}) - compile_leaf(ln) - pos⋁ = compile_⋁inner(var2lit(ln), children) - neg⋁ = compile_⋁inner(-var2lit(ln), children) - compile_⋀inner(var2lit(ln), pos⋁) - compile_⋀inner(-var2lit(ln), neg⋁) - end - - "compile root, add another disjunction node" - function compile_root(root::Var) - temp = ⋁Node([node_cache[s] for s in [var2lit(root), -var2lit(root)]]) - n = Prob⋁(temp, prob_children(temp)) - prob_cache[temp] = n - n.log_thetas = zeros(Float64, 2) - cpt = get_prop(clt, root, :cpt) - weights = [cpt[1], cpt[0]] - n.log_thetas = log.(weights) - push!(lin, n) - return n - end - - function compile_independent_roots(roots::Vector{ProbΔNode}) - temp = ⋀Node([c.origin for c in roots]) - n = Prob⋀(temp, prob_children(temp)) - prob_cache[temp] = n - push!(lin, n) - temp = ⋁Node([temp]) - n = Prob⋁{LogicalΔNode}(temp, prob_children(temp)) - prob_cache[temp] = n - n.log_thetas = [0.0] - push!(lin, n) - end - - roots = Vector{ProbΔNode}() - for id in topo_order - children = Var.(outneighbors(clt, id)) - if isequal(children, []) - compile_leaf(id) - else - compile_inner(id, children) - end - if 0 == parent[id] - push!(roots, compile_root(id)) - end - end - - if length(roots) > 1 - compile_independent_roots(roots) - end - - return lin -end diff --git a/src/StructureLearner/PSDDInitializer.jl b/src/StructureLearner/PSDDInitializer.jl deleted file mode 100644 index 439b691a..00000000 --- a/src/StructureLearner/PSDDInitializer.jl +++ /dev/null @@ -1,275 +0,0 @@ -using ..Utils - -"Map from literal to LogicalΔNode" -const LitCache = Dict{Lit, LogicalΔNode} - -"Use literal to represent constraint (1 to X, -1 to not X), 0 to represent true" -const ⊤ = convert(Lit, 0) - -""" -Learning from data a structured-decomposable circuit with several structure learning algorithms -""" 
-function learn_struct_prob_circuit(data::Union{XData, WXData}; - pseudocount = 1.0, algo = "chow-liu", algo_kwargs=(α=1.0, clt_root="graph_center"), vtree = "chow-liu", vtree_kwargs=(vtree_mode="balanced",)) - if algo == "chow-liu" - clt = learn_chow_liu_tree(data; algo_kwargs...) - vtree = learn_vtree_from_clt(clt; vtree_kwargs...); - pc = compile_psdd_from_clt(clt, vtree); - estimate_parameters(pc, convert(XBatches,data); pseudocount = pseudocount) - pc, vtree - else - error("Cannot learn a structured-decomposable circuit with algorithm $algo") - end -end - -############# -# Learn PlainVtree from CLT -############# - -" -Learn a vtree from clt, -with strategy (close to) `linear` or `balanced` -" -function learn_vtree_from_clt(clt::CLT; vtree_mode::String)::PlainVtree - roots = [i for (i, x) in enumerate(parent_vector(clt)) if x == 0] - rootnode = construct_children(Var.(roots), clt, vtree_mode) - - return node2dag(rootnode) -end - -function construct_node(v::Var, clt::CLT, strategy::String)::PlainVtreeNode - children = Var.(outneighbors(clt, v)) - if isempty(children) # leaf node - return PlainVtreeLeafNode(v) - else - right = construct_children(children, clt, strategy) - return add_parent(v, right) - end -end - -function construct_children(children::Vector{Var}, clt::CLT, strategy::String)::PlainVtreeNode - sorted_vars = sort(collect(children)) - children_nodes = Vector{PlainVtreeNode}() - foreach(x -> push!(children_nodes, construct_node(x, clt, strategy)), sorted_vars) - - if strategy == "linear" - construct_children_linear(children_nodes, clt) - elseif strategy == "balanced" - construct_children_balanced(children_nodes, clt) - else - throw("Unknown type of strategy") - end -end - -function construct_children_linear(children_nodes::Vector{PlainVtreeNode}, clt::CLT)::PlainVtreeNode - children_nodes = Iterators.Stateful(reverse(children_nodes)) - - right = popfirst!(children_nodes) - for left in children_nodes - right = PlainVtreeInnerNode(left, right) - end - 
return right -end - -function construct_children_balanced(children_nodes::Vector{PlainVtreeNode}, clt::CLT)::PlainVtreeNode - if length(children_nodes) == 1 - return children_nodes[1] - elseif length(children_nodes) == 2 - return PlainVtreeInnerNode(children_nodes[1], children_nodes[2]) - else - len = trunc(Int64, length(children_nodes) / 2) - left = construct_children_balanced(children_nodes[1 : len], clt) - right = construct_children_balanced(children_nodes[len + 1 : end], clt) - return PlainVtreeInnerNode(left, right) - end -end - -function add_parent(parent::Var, children::PlainVtreeNode) - return PlainVtreeInnerNode(PlainVtreeLeafNode(parent), children) -end - -##################### -# Compile PSDD from CLT and vtree -##################### - -"Compile a psdd circuit from clt and vtree" -function compile_psdd_from_clt(clt::MetaDiGraph, vtree::PlainVtree) - order = node2dag(vtree[end]) - parent_clt = Var.(parent_vector(clt)) - - lin = Vector{ProbΔNode}() - prob_cache = ProbCache() - lit_cache = LitCache() - v2p = Dict{PlainVtreeNode, ProbΔ}() - - get_params(cpt::Dict) = length(cpt) == 2 ? 
[cpt[1], cpt[0]] : [cpt[(1,1)], cpt[(0,1)], cpt[(1,0)], cpt[(0,0)]] - function add_mapping!(v::PlainVtreeNode, circuits::ProbΔ) - if !haskey(v2p, v); v2p[v] = Vector{ProbΔNode}(); end - foreach(c -> if !(c in v2p[v]) push!(v2p[v], c);end, circuits) - end - - # compile vtree leaf node to terminal/true node - function compile_from_vtree_node(v::PlainVtreeLeafNode) - var = v.var - children = Var.(outneighbors(clt, var)) - cpt = get_prop(clt, var, :cpt) - parent = parent_clt[var] - if isequal(children, []) - circuit = compile_true_nodes(var, v, get_params(cpt), lit_cache, prob_cache, lin) - else - circuit = compile_literal_nodes(var, v, get_params(cpt), lit_cache, prob_cache, lin) - end - add_mapping!(v, circuit) - end - - # compile to decision node - function compile_from_vtree_node(v::PlainVtreeInnerNode) - left_var = left_most_child(v.left).var - right_var = left_most_child(v.right).var - left_circuit = v2p[v.left] - right_circuit = v2p[v.right] - - if parent_clt[left_var] == parent_clt[right_var] # two nodes are independent, compile to seperate decision nodes - circuit = [compile_decision_node([l], [r], v, [1.0], prob_cache, lin) for (l, r) in zip(left_circuit, right_circuit)] - elseif left_var == parent_clt[right_var] # conditioned on left - cpt = get_prop(clt, left_var, :cpt) - circuit = compile_decision_nodes(left_circuit, right_circuit, v, get_params(cpt), prob_cache, lin) - else - throw("PlainVtree are not learned from the same CLT") - end - add_mapping!(v, circuit) - end - - foreach(compile_from_vtree_node, vtree) - return lin -end - -##################### -# Construct probabilistic circuit node -##################### - -prob_children(n, prob_cache) = - copy_with_eltype(map(c -> prob_cache[c], n.children), ProbΔNode{<:StructLogicalΔNode}) - -"Add leaf nodes to circuit `lin`" -function add_prob_leaf_node(var::Var, vtree::PlainVtreeLeafNode, lit_cache::LitCache, prob_cache::ProbCache, lin) - pos = StructLiteralNode{PlainVtreeNode}( var2lit(var), vtree) - neg = 
StructLiteralNode{PlainVtreeNode}(-var2lit(var), vtree) - lit_cache[var2lit(var)] = pos - lit_cache[-var2lit(var)] = neg - pos2 = ProbLiteral(pos) - neg2 = ProbLiteral(neg) - prob_cache[pos] = pos2 - prob_cache[neg] = neg2 - push!(lin, pos2) - push!(lin, neg2) - return (pos2, neg2) -end - -"Add prob⋀ node to circuit `lin`" -function add_prob⋀_node(children::ProbΔ, vtree::PlainVtreeInnerNode, prob_cache::ProbCache, lin)::Prob⋀ - logic = Struct⋀Node{PlainVtreeNode}([c.origin for c in children], vtree) - prob = Prob⋀(logic, prob_children(logic, prob_cache)) - prob_cache[logic] = prob - push!(lin, prob) - return prob -end - -"Add prob⋁ node to circuit `lin`" -function add_prob⋁_node(children::ProbΔ, vtree::PlainVtreeNode, thetas::Vector{Float64}, prob_cache::ProbCache, lin)::Prob⋁ - logic = Struct⋁Node{PlainVtreeNode}([c.origin for c in children], vtree) - prob = Prob⋁(logic, prob_children(logic, prob_cache)) - prob.log_thetas = log.(thetas) - prob_cache[logic] = prob - push!(lin, prob) - return prob -end - -"Construct decision nodes given `primes` and `subs`" -function compile_decision_node(primes::ProbΔ, subs::ProbΔ, vtree::PlainVtreeInnerNode, params::Vector{Float64}, prob_cache::ProbCache, lin) - elements = [add_prob⋀_node([prime, sub], vtree, prob_cache, lin) for (prime, sub) in zip(primes, subs)] - return add_prob⋁_node(elements, vtree, params, prob_cache, lin) -end - -"Construct literal nodes given variable `var`" -function compile_literal_nodes(var::Var, vtree::PlainVtreeLeafNode, probs::Vector{Float64}, lit_cache::LitCache, prob_cache::ProbCache, lin) - (pos, neg) = add_prob_leaf_node(var, vtree, lit_cache, prob_cache, lin) - return [pos, neg] -end - -"Construct true nodes given variable `var`" -function compile_true_nodes(var::Var, vtree::PlainVtreeLeafNode, probs::Vector{Float64}, lit_cache::LitCache, prob_cache::ProbCache, lin) - (pos, neg) = add_prob_leaf_node(var, vtree, lit_cache, prob_cache, lin) - return [add_prob⋁_node([pos, neg], vtree, probs[i:i+1], 
prob_cache, lin) for i in 1:2:length(probs)] -end - -"Construct decision nodes conditiond on different distribution" -function compile_decision_nodes(primes::ProbΔ, subs::ProbΔ, vtree::PlainVtreeInnerNode, params::Vector{Float64}, prob_cache::ProbCache, lin) - return [compile_decision_node(primes, subs, vtree, params[i:i+1], prob_cache, lin) for i in 1:2:length(params)] -end - -##################### -# Map and cache constraints -##################### - -function set_base(index, n::StructLiteralNode, bases) - if positive(n) - bases[n][variable(n)] = 1 - else - bases[n][variable(n)] = -1 - end -end - -function set_base(index, n::Struct⋁Node, bases) - len = num_children(n) - temp = sum([bases[c] for c in n.children]) - bases[n] = map(x-> if x == len 1; elseif -x == len; -1; else 0; end, temp) -end - -function set_base(index, n::Struct⋀Node, bases) - bases[n] = sum([bases[c] for c in n.children]) -end - -function calculate_all_bases(circuit::ProbΔ)::BaseCache - num_var = num_variables(circuit[end].origin.vtree) - bases = BaseCache() - foreach(n -> bases[n.origin] = fill(⊤, num_var), circuit) - foreach(n -> set_base(n[1], n[2].origin, bases), enumerate(circuit)) - @assert all(bases[circuit[end].origin] .== ⊤) "Base of root node should be true" - return bases -end - -##################### -# Compile fully factorized PSDD from vtree, all variables are independent initially -##################### - -function compile_fully_factorized_psdd_from_vtree(vtree::PlainVtree)::ProbΔ - - function ful_factor_node(v::PlainVtreeLeafNode, lit_cache::LitCache, prob_cache::ProbCache, v2n, lin) - var = variables(v)[1] - pos, neg = add_prob_leaf_node(var, v, lit_cache, prob_cache, lin) - prob_or = add_prob⋁_node([pos, neg], v, [0.5, 0.5], prob_cache, lin) - v2n[v] = prob_or - nothing - end - - function ful_factor_node(v::PlainVtreeInnerNode, lit_cache::LitCache, prob_cache::ProbCache, v2n, lin) - left = v2n[v.left] - right = v2n[v.right] - prob_and = add_prob⋀_node([left, right], v, 
prob_cache, lin) - prob_or = add_prob⋁_node([prob_and], v, [1.0], prob_cache, lin) - v2n[v] = prob_or - nothing - end - - lin = Vector{ProbΔNode}() - prob_cache = ProbCache() - lit_cache = LitCache() - v2n = Dict{PlainVtreeNode, ProbΔNode}() - - for v in vtree - ful_factor_node(v, lit_cache, prob_cache, v2n, lin) - end - - lin -end diff --git a/src/StructureLearner/StructureLearner.jl b/src/StructureLearner/StructureLearner.jl deleted file mode 100644 index b21c0ad7..00000000 --- a/src/StructureLearner/StructureLearner.jl +++ /dev/null @@ -1,24 +0,0 @@ -module StructureLearner - -using LogicCircuits -using ..Utils - -using ..Probabilistic -using ..IO - -export -# ChowLiuTree -learn_chow_liu_tree, parent_vector, print_tree, CLT, - -# CircuitBuilder -compile_prob_circuit_from_clt, learn_probabilistic_circuit, BaseCache, ⊤, LitCache, - -# PSDDInitializer -learn_struct_prob_circuit, -learn_vtree_from_clt, compile_psdd_from_clt,compile_fully_factorized_psdd_from_vtree - -include("ChowLiuTree.jl") -include("CircuitBuilder.jl") -include("PSDDInitializer.jl") - -end diff --git a/src/Utils/Utils.jl b/src/Utils/Utils.jl index d863c9bc..6e12f494 100644 --- a/src/Utils/Utils.jl +++ b/src/Utils/Utils.jl @@ -1,74 +1,10 @@ """ -Module with general utilities and missing standard library features that could be useful in any Julia project +Module with general utilities and missing standard library features +that could be useful in any Julia project """ module Utils -export to_long_mi, - generate_all, generate_data_all - - -################### -# Misc. -#################### - - -function to_long_mi(m::Matrix{Float64}, min_int, max_int)::Matrix{Int64} - δmi = maximum(m) - minimum(m) - δint = max_int - min_int - return @. 
round(Int64, m * δint / δmi + min_int) -end - -################### -# One-Hot Encoding -#################### -""" -One-hot encode data (2-D Array) based on categories (1-D Array) -Each row of the return value is a concatenation of one-hot encoding of elements of the same row in data -Assumption: both input arrays have elements of same type -""" -function one_hot_encode(X::Array{T, 2}, categories::Array{T,1}) where {T<:Any} - X_dash = zeros(Bool, size(X)[1], length(categories)*size(X)[2]) - for i = 1:size(X)[1], j = 1:size(X)[2] - X_dash[i, (j-1)*length(categories) + findfirst(==(X[i,j]), categories)] = 1 - end - X_dash -end - -################### -# Testing Utils -#################### - -""" -Given some missing values generates all possible fillings -""" -function generate_all(row::Array{Int8}) - miss_count = count(row .== -1) - lits = length(row) - result = Bool.(zeros(1 << miss_count, lits)) - - if miss_count == 0 - result[1, :] = copy(row) - else - for mask = 0: (1< iszero(x), stack[:,1]) + +all_empty(stack::AbstractArray{T,3}) where T = + all(x -> iszero(x), stack[:,:,1]) + + +length_cuda(stack, i...) 
= stack[i...,1] + + +################### +# One-Hot Encoding +#################### + +""" +One-hot encode data (2-D Array) based on categories (1-D Array) +Each row of the return value is a concatenation of one-hot encoding of elements of the same row in data +Assumption: both input arrays have elements of same type +""" +function one_hot_encode(X::Array{T, 2}, categories::Array{T,1}) where {T<:Any} + X_dash = zeros(Bool, size(X)[1], length(categories)*size(X)[2]) + for i = 1:size(X)[1], j = 1:size(X)[2] + X_dash[i, (j-1)*length(categories) + findfirst(==(X[i,j]), categories)] = 1 + end + X_dash +end + +################### +# Testing Utils +#################### + +""" +Given some missing values generates all possible fillings +""" +function generate_all(row::Vector) + miss_count = count(ismissing, row) + lits = length(row) + result = Bool.(zeros(1 << miss_count, lits)) + + if miss_count == 0 + result[1, :] = copy(row) + else + for mask = 0: (1< num_parameters_node(n), sum_nodes(c)) + +##################### +# methods to easily construct circuits +##################### + +@inline multiply(xs::ProbCircuit...) = multiply(collect(xs)) +@inline summate(xs::ProbCircuit...) = summate(collect(xs)) + +import LogicCircuits: conjoin, disjoin # make available for extension + +# alias conjoin/disjoin using mul/sum terminology +@inline conjoin(args::Vector{<:ProbCircuit}; reuse=nothing) = + multiply(args; reuse) +@inline disjoin(args::Vector{<:ProbCircuit}; reuse=nothing) = + summate(args; reuse) + +@inline Base.:*(x::ProbCircuit, y::ProbCircuit) = multiply(x,y) +@inline Base.:*(xs::ProbCircuit...) = multiply(xs...) +@inline Base.:+(x::ProbCircuit, y::ProbCircuit) = summate(x,y) +@inline Base.:+(xs::ProbCircuit...) = summate(xs...) 
+ +compile(::Type{<:ProbCircuit}, ::Bool) = + error("Probabilistic circuits do not have constant leafs.") + +struct WeightProbCircuit + tmp_weight :: Float64 + circuit :: ProbCircuit +end + +@inline Base.:*(w::Real, x::ProbCircuit) = WeightProbCircuit(w, x) +@inline Base.:*(x::ProbCircuit, w::Real) = w * x +@inline Base.:+(x::WeightProbCircuit...) = begin + ch = collect(x) + c = map(x -> x.circuit, ch) + w = map(x -> x.tmp_weight, ch) + pc = summate(c) + pc.log_probs .= log.(w) + pc +end + +##################### +# circuit inspection +##################### + +"Get the list of multiplication nodes in a given circuit" +mul_nodes(c::ProbCircuit) = ⋀_nodes(c) + +"Get the list of summation nodes in a given circuit" +sum_nodes(c::ProbCircuit) = ⋁_nodes(c) + +function check_parameter_integrity(circuit::ProbCircuit) + for node in sum_nodes(circuit) + @assert all(θ -> !isnan(θ), node.log_probs) "There is a NaN in one of the log_probs" + end + true +end \ No newline at end of file diff --git a/src/logistic_nodes.jl b/src/logistic_nodes.jl new file mode 100644 index 00000000..78f0d66f --- /dev/null +++ b/src/logistic_nodes.jl @@ -0,0 +1,93 @@ +export + LogisticCircuit, + LogisticLeafNode, LogisticInnerNode, + LogisticLiteral, Logistic⋀Node, Logistic⋁Node, + num_classes, num_parameters_per_class + +##################### +# Infrastructure for logistic circuit nodes +##################### + +"Root of the logistic circuit node hierarchy" +abstract type LogisticCircuit <: LogicCircuit end + +""" +A logistic leaf node +""" +abstract type LogisticLeafNode <: LogisticCircuit end + +""" +A logistic inner node +""" +abstract type LogisticInnerNode <: LogisticCircuit end + +""" +A logistic literal node +""" +mutable struct LogisticLiteral <: LogisticLeafNode + literal::Lit + data + counter::UInt32 + LogisticLiteral(l) = begin + new(l, nothing, 0) + end +end + +""" +A logistic conjunction node (And node) +""" +mutable struct Logistic⋀Node <: LogisticInnerNode + 
children::Vector{<:LogisticCircuit} + data + counter::UInt32 + Logistic⋀Node(children) = begin + new(convert(Vector{LogisticCircuit}, children), nothing, 0) + end +end + +""" +A logistic disjunction node (Or node) +""" +mutable struct Logistic⋁Node <: LogisticInnerNode + children::Vector{<:LogisticCircuit} + thetas::Matrix{Float32} + data + counter::UInt32 + Logistic⋁Node(children, class::Int) = begin + new(convert(Vector{LogisticCircuit}, children), init_array(Float32, length(children), class), nothing, 0) + end +end + +##################### +# traits +##################### + +import LogicCircuits.GateType # make available for extension +@inline GateType(::Type{<:LogisticLiteral}) = LiteralGate() +@inline GateType(::Type{<:Logistic⋀Node}) = ⋀Gate() +@inline GateType(::Type{<:Logistic⋁Node}) = ⋁Gate() + +##################### +# methods +##################### + +import LogicCircuits: children # make available for extension +@inline children(n::LogisticInnerNode) = n.children +@inline num_classes(n::Logistic⋁Node) = size(n.thetas)[2] + +@inline num_parameters(c::LogisticCircuit) = sum(n -> num_children(n) * classes(n), ⋁_nodes(c)) +@inline num_parameters_per_class(c::LogisticCircuit) = sum(n -> num_children(n), ⋁_nodes(c)) + + + +##################### +# constructors and conversions +##################### + +function LogisticCircuit(circuit::LogicCircuit, classes::Int) + f_con(n) = error("Cannot construct a logistic circuit from constant leafs: first smooth and remove unsatisfiable branches.") + f_lit(n) = LogisticLiteral(literal(n)) + f_a(n, cn) = Logistic⋀Node(cn) + f_o(n, cn) = Logistic⋁Node(cn, classes) + foldup_aggregate(circuit, f_con, f_lit, f_a, f_o, LogisticCircuit) +end \ No newline at end of file diff --git a/src/mixtures/em.jl b/src/mixtures/em.jl new file mode 100644 index 00000000..6554bda0 --- /dev/null +++ b/src/mixtures/em.jl @@ -0,0 +1,119 @@ +export one_step_em, component_weights_per_example, initial_weights, clustering, 
+log_likelihood_per_instance_per_component, estimate_parameters_cached, learn_em_model + +using Statistics: mean +using LinearAlgebra: normalize! +using Clustering: kmeans, nclusters, assignments + +function one_step_em(spc, train_x, component_weights; pseudocount) + # E step + lls = log_likelihood_per_instance_per_component(spc, train_x) + lls .+= log.(component_weights) + + example_weights = component_weights_per_example(lls) + component_weights .= sum(example_weights, dims=1) + normalize!(component_weights, 1.0) + + # M step + estimate_parameters_cached(spc, example_weights; pseudocount=pseudocount) + logsumexp(lls, 2), component_weights +end + +function component_weights_per_example(log_p_of_x_and_c) + log_p_of_x = logsumexp(log_p_of_x_and_c, 2) # marginalize out components + log_p_of_given_x_query_c = mapslices(col -> col .- log_p_of_x, log_p_of_x_and_c, dims=[1]) + p_of_given_x_query_c = exp.(log_p_of_given_x_query_c) # no more risk of underflow, so go to linear space + @assert sum(p_of_given_x_query_c) ≈ size(log_p_of_x_and_c, 1) "$(sum(p_of_given_x_query_c)) != $(size(log_p_of_x_and_c))"# each row has proability 1 + Matrix(p_of_given_x_query_c) +end + +function initial_weights(train_x, mix_num::Int64; alg="cluster")::Vector{Float64} + if alg == "cluster" + clustered = clustering(train_x, mix_num) + counting = Float64.(num_examples.(clustered)) + return normalize!(counting, 1) + elseif alg == "random" + return normalize!(rand(Float64, mix_num), 1) + else + error("Initialize weights algorithm is $undefined") + end +end + +function clustering(train_x, mix_num::Int64; maxiter=200)::Vector + train_x = Matrix(train_x) + if mix_num == 1 + return [train_x] + end + + n = num_examples(train_x) + + R = kmeans(train_x, mix_num; maxiter=maxiter) + @assert nclusters(R) == mix_num + a = assignments(R) + + clustered_train_x = Vector() + for k in 1 : mix_num + push!(clustered_train_x, train_x[:, findall(x -> x == k, a)]') + end + + return clustered_train_x +end + +function 
log_likelihood_per_instance_per_component(pc::SharedProbCircuit, data) + @assert isbinarydata(data) "Can only calculate EVI on Bool data" + + compute_flows(pc, data) + num_mix = num_components(pc) + log_likelihoods = zeros(Float64, num_examples(data), num_mix) + indices = init_array(Bool, num_examples(data))::BitVector + + + ll(n::SharedProbCircuit) = () + ll(n::SharedPlainSumNode) = begin + if num_children(n) != 1 # other nodes have no effect on likelihood + for i in 1 : num_children(n) + c = children(n)[i] + log_theta = reshape(n.log_probs[i, :], 1, num_mix) + indices = get_downflow(n, c) + view(log_likelihoods, indices::BitVector, :) .+= log_theta # see MixedProductKernelBenchmark.jl + end + end + end + + foreach(ll, pc) + log_likelihoods +end + +function estimate_parameters_cached(pc::SharedProbCircuit, example_weights; pseudocount::Float64) + foreach(pc) do pn + if is⋁gate(pn) + if num_children(pn) == 1 + pn.log_probs .= 0.0 + else + smoothed_flow = Float64.(sum(example_weights[get_downflow(pn), :], dims=1)) .+ pseudocount + uniform_pseudocount = pseudocount / num_children(pn) + children_flows = vcat(map(c -> sum(example_weights[get_downflow(pn, c), :], dims=1), children(pn))...) + @. 
pn.log_probs = log((children_flows + uniform_pseudocount) / smoothed_flow) + @assert all(sum(exp.(pn.log_probs), dims=1) .≈ 1.0) "Parameters do not sum to one locally" + # normalize away any leftover error + pn.log_probs .-= logsumexp(pn.log_probs, dims=1) + end + end + end +end + +function learn_em_model(pc, train_x; + num_mix=5, + pseudocount=1.0, + maxiter=typemax(Int)) + spc = SharedProbCircuit(pc, num_mix) + compute_flows(spc, train_x) + estimate_parameters_cached(spc, ones(Float64, num_examples(train_x), num_mix) ./ num_mix; pseudocount=pseudocount) + component_weights = reshape(initial_weights(train_x, num_mix), 1, num_mix) + + for iter in 1 : maxiter + @assert isapprox(sum(component_weights), 1.0; atol=1e-10) + lls, component_weights = one_step_em(spc, train_x, component_weights; pseudocount=pseudocount) + println("Log likelihood per instance is $(mean(lls))") + end +end \ No newline at end of file diff --git a/src/mixtures/shared_prob_nodes.jl b/src/mixtures/shared_prob_nodes.jl new file mode 100644 index 00000000..c2c3a651 --- /dev/null +++ b/src/mixtures/shared_prob_nodes.jl @@ -0,0 +1,111 @@ +export SharedProbCircuit, SharedProbLeafNode, SharedProbInnerNode, SharedProbLiteralNode, +SharedMulNode, SharedSumNode, num_components + +##################### +# Probabilistic circuits which share the same structure +##################### + +""" +Root of the shared probabilistic circuit node hierarchy +""" +abstract type SharedProbCircuit <: ProbCircuit end + +""" +A shared probabilistic leaf node +""" +abstract type SharedProbLeafNode <: SharedProbCircuit end + +""" +A shared probabilistic inner node +""" +abstract type SharedProbInnerNode <: SharedProbCircuit end + +""" +A shared probabilistic literal node +""" +mutable struct SharedProbLiteralNode <: SharedProbLeafNode + literal::Lit + data + counter::UInt32 + SharedProbLiteralNode(l) = new(l, nothing, 0) +end + +""" +A shared probabilistic multiplcation node +""" +mutable struct SharedMulNode <: 
SharedProbInnerNode + children::Vector{<:SharedProbCircuit} + data + counter::UInt32 + SharedMulNode(children) = new(children, nothing, 0) +end + +""" +A shared probabilistic summation node +""" +mutable struct SharedSumNode <: SharedProbInnerNode + children::Vector{<:SharedProbCircuit} + log_probs::Matrix{Float64} + data + counter::UInt32 + SharedSumNode(children, n_mixture) = begin + new(children, init_array(Float64, length(children), n_mixture), nothing, 0) + end +end + +##################### +# traits +##################### + +import LogicCircuits.GateType # make available for extension +@inline GateType(::Type{<:SharedProbLiteralNode}) = LiteralGate() +@inline GateType(::Type{<:SharedMulNode}) = ⋀Gate() +@inline GateType(::Type{<:SharedSumNode}) = ⋁Gate() + +##################### +# methods +##################### + +import LogicCircuits: children # make available for extension +@inline children(n::SharedProbInnerNode) = n.children + +@inline num_parameters_node(n::SharedSumNode) = length(n.log_probs) + +"How many components are mixed together in this shared circuit?" 
+@inline num_components(n::SharedSumNode) = size(n.log_probs,2) + +##################### +# constructors and conversions +##################### + +function multiply(arguments::Vector{<:SharedProbCircuit}; + reuse=nothing) + @assert length(arguments) > 0 + reuse isa SharedMulNode && children(reuse) == arguments && return reuse + return SharedMulNode(arguments) +end + +function summate(arguments::Vector{<:SharedProbCircuit}, num_components=0; + reuse=nothing) + @assert length(arguments) > 0 + reuse isa SharedSumNode && children(reuse) == arguments && return reuse + return SharedSumNode(arguments, num_components) # unknwown number of components; resize later +end + +compile(::Type{<:SharedProbCircuit}, l::Lit) = + SharedProbLiteralNode(l) + +function compile(::Type{<:SharedProbCircuit}, circuit::LogicCircuit, num_components::Int) + f_con(n) = error("Cannot construct a probabilistic circuit from constant leafs: first smooth and remove unsatisfiable branches.") + f_lit(n) = compile(SharedProbCircuit, literal(n)) + f_a(_, cns) = multiply(cns) + f_o(_, cns) = summate(cns, num_components) + foldup_aggregate(circuit, f_con, f_lit, f_a, f_o, SharedProbCircuit) +end + +import LogicCircuits: fully_factorized_circuit #extend + +function fully_factorized_circuit(::Type{<:SharedProbCircuit}, n::Int) + ff_logic_circuit = fully_factorized_circuit(PlainLogicCircuit, n) + compile(SharedProbCircuit, ff_logic_circuit) +end diff --git a/src/param_bit_circuit.jl b/src/param_bit_circuit.jl new file mode 100644 index 00000000..20599b22 --- /dev/null +++ b/src/param_bit_circuit.jl @@ -0,0 +1,72 @@ +export ParamBitCircuit + +"A `BitCircuit` with parameters attached to the elements" +struct ParamBitCircuit{V,M,W} + bitcircuit::BitCircuit{V,M} + params::W +end + +function ParamBitCircuit(pc::ProbCircuit, data; reset=true) + logprobs::Vector{Float64} = Vector{Float64}() + on_decision(n, cs, layer_id, decision_id, first_element, last_element) = begin + if isnothing(n) # this decision node is not 
part of the PC + # @assert first_element == last_element + push!(logprobs, 0.0) + else + # @assert last_element-first_element+1 == length(n.log_probs) + append!(logprobs, n.log_probs) + end + end + bc = BitCircuit(pc, data; reset=reset, on_decision) + ParamBitCircuit(bc, logprobs) +end + +function ParamBitCircuit(lc::LogisticCircuit, nc, data; reset=true) + thetas::Vector{Vector{Float32}} = Vector{Vector{Float32}}() + on_decision(n, cs, layer_id, decision_id, first_element, last_element) = begin + if isnothing(n) + # @assert first_element == last_element + push!(thetas, zeros(Float32, nc)) + println("here, some node is not part of the logistic circuit") + else + # @assert last_element - first_element + 1 == size(n.thetas, 1) + # @assert size(n.thetas, 2) == nc + for theta in eachrow(n.thetas) + push!(thetas, theta) + end + end + end + bc = BitCircuit(lc, data; reset=reset, on_decision) + thetas_matrix = permutedims(hcat(thetas...), (2, 1)) + ParamBitCircuit(bc, permutedims(hcat(thetas...), (2, 1))) +end + + + +####################### +## Helper functions ### +####################### + +params(c::ParamBitCircuit) = c.params + +import LogicCircuits: num_nodes, num_elements, num_features, num_leafs, nodes, elements + +num_nodes(c::ParamBitCircuit) = num_nodes(c.bitcircuit) +num_elements(c::ParamBitCircuit) = num_elements(c.bitcircuit) +num_features(c::ParamBitCircuit) = num_features(c.bitcircuit) +num_leafs(c::ParamBitCircuit) = num_leafs(c.bitcircuit) + +nodes(c::ParamBitCircuit) = nodes(c.bitcircuit) +elements(c::ParamBitCircuit) = elements(c.bitcircuit) + +import LogicCircuits: to_gpu, to_cpu, isgpu #extend + +to_gpu(c::ParamBitCircuit) = + ParamBitCircuit(to_gpu(c.bitcircuit), to_gpu(c.params)) + +to_cpu(c::ParamBitCircuit) = + ParamBitCircuit(to_cpu(c.bitcircuit), to_cpu(c.params)) + + +isgpu(c::ParamBitCircuit) = + isgpu(c.bitcircuit) && isgpu(c.params) diff --git a/src/parameters.jl b/src/parameters.jl new file mode 100644 index 00000000..6a96968a --- /dev/null 
+++ b/src/parameters.jl @@ -0,0 +1,167 @@ +export estimate_parameters, uniform_parameters, estimate_parameters_em, test + +using StatsFuns: logsumexp +using CUDA +using LoopVectorization + +""" +Maximum likilihood estimation of parameters given data +""" +function estimate_parameters(pc::ProbCircuit, data; pseudocount::Float64) + @assert isbinarydata(data) "Probabilistic circuit parameter estimation for binary data only" + bc = BitCircuit(pc, data; reset=false) + params = if isgpu(data) + estimate_parameters_gpu(to_gpu(bc), data, pseudocount) + else + estimate_parameters_cpu(bc, data, pseudocount) + end + estimate_parameters_cached!(pc, bc, params) + params +end + +function estimate_parameters_cached!(pc, bc, params) + foreach_reset(pc) do pn + if is⋁gate(pn) + if num_children(pn) == 1 + pn.log_probs .= zero(Float64) + else + id = (pn.data::⋁NodeIds).node_id + @inbounds els_start = bc.nodes[1,id] + @inbounds els_end = bc.nodes[2,id] + @inbounds @views pn.log_probs .= params[els_start:els_end] + @assert isapprox(sum(exp.(pn.log_probs)), 1.0, atol=1e-6) "Parameters do not sum to one locally: $(sum(exp.(pn.log_probs))); $(pn.log_probs)" + pn.log_probs .-= logsumexp(pn.log_probs) # normalize away any leftover error + end + end + end + nothing +end + +function estimate_parameters_cpu(bc::BitCircuit, data, pseudocount) + # no need to synchronize, since each computation is unique to a decision node + node_counts::Vector{UInt} = Vector{UInt}(undef, num_nodes(bc)) + log_params::Vector{Float64} = Vector{Float64}(undef, num_elements(bc)) + + @inline function on_node(flows, values, dec_id) + node_counts[dec_id] = sum(1:size(flows,1)) do i + count_ones(flows[i, dec_id]) + end + end + + @inline function estimate(element, decision, edge_count) + num_els = num_elements(bc.nodes, decision) + log_params[element] = + log((edge_count+pseudocount/num_els) + /(node_counts[decision]+pseudocount)) + end + + @inline function on_edge(flows, values, prime, sub, element, grandpa, 
single_child) + if !single_child + edge_count = sum(1:size(flows,1)) do i + count_ones(values[i, prime] & values[i, sub] & flows[i, grandpa]) + end + estimate(element, grandpa, edge_count) + end # no need to estimate single child params, they are always prob 1 + end + + v, f = satisfies_flows(bc, data; on_node, on_edge) + + return log_params +end + +function estimate_parameters_gpu(bc::BitCircuit, data, pseudocount) + node_counts::CuVector{Int32} = CUDA.zeros(Int32, num_nodes(bc)) + edge_counts::CuVector{Int32} = CUDA.zeros(Int32, num_elements(bc)) + # need to manually cudaconvert closure variables + node_counts_device = CUDA.cudaconvert(node_counts) + edge_counts_device = CUDA.cudaconvert(edge_counts) + + @inline function on_node(flows, values, dec_id, chunk_id, flow) + c::Int32 = CUDA.count_ones(flow) # cast for @atomic to be happy + CUDA.@atomic node_counts_device[dec_id] += c + end + + @inline function on_edge(flows, values, prime, sub, element, grandpa, chunk_id, edge_flow, single_child) + if !single_child + c::Int32 = CUDA.count_ones(edge_flow) # cast for @atomic to be happy + CUDA.@atomic edge_counts_device[element] += c + end + end + + v, f = satisfies_flows(bc, data; on_node, on_edge) + + CUDA.unsafe_free!(v) # save the GC some effort + CUDA.unsafe_free!(f) # save the GC some effort + + # TODO: reinstate simpler implementation once https://github.com/JuliaGPU/GPUArrays.jl/issues/313 is fixed and released + @inbounds parents = bc.elements[1,:] + @inbounds parent_counts = node_counts[parents] + @inbounds parent_elcount = bc.nodes[2,parents] .- bc.nodes[1,parents] .+ 1 + params = log.((edge_counts .+ (pseudocount ./ parent_elcount)) + ./ (parent_counts .+ pseudocount)) + return to_cpu(params) +end + +""" +Uniform distribution +""" +function uniform_parameters(pc::ProbCircuit) + foreach(pc) do pn + if is⋁gate(pn) + if num_children(pn) == 1 + pn.log_probs .= 0.0 + else + pn.log_probs .= log.(ones(Float64, num_children(pn)) ./ num_children(pn)) + end + end + end 
+end + +""" +Expectation maximization parameter learning given missing data +""" +function estimate_parameters_em(pc::ProbCircuit, data; pseudocount::Float64) + pbc = ParamBitCircuit(pc, data; reset=false) + params = if isgpu(data) + estimate_parameters_gpu(to_gpu(pbc), data, pseudocount) + else + estimate_parameters_cpu(pbc, data, pseudocount) + end + estimate_parameters_cached!(pc, pbc.bitcircuit, params) + params +end + +function estimate_parameters_cpu(pbc::ParamBitCircuit, data, pseudocount) + # no need to synchronize, since each computation is unique to a decision node + bc = pbc.bitcircuit + node_counts::Vector{Float64} = Vector{Float64}(undef, num_nodes(bc)) + log_params::Vector{Float64} = Vector{Float64}(undef, num_elements(bc)) + + @inline function on_node(flows, values, dec_id) + sum_flows = map(1:size(flows,1)) do i + flows[i, dec_id] + end + node_counts[dec_id] = logsumexp(sum_flows) + end + + @inline function estimate(element, decision, edge_count) + num_els = num_elements(bc.nodes, decision) + log_params[element] = + log((exp(edge_count)+pseudocount/num_els) / (exp(node_counts[decision])+pseudocount)) + end + + @inline function on_edge(flows, values, prime, sub, element, grandpa, single_child) + θ = eltype(flows)(pbc.params[element]) + if !single_child + edge_flows = map(1:size(flows,1)) do i + values[i, prime] + values[i, sub] - values[i, grandpa] + flows[i, grandpa] + θ + end + edge_count = logsumexp(edge_flows) + estimate(element, grandpa, edge_count) + end # no need to estimate single child params, they are always prob 1 + end + + v, f = marginal_flows(pbc, data; on_node, on_edge) + + return log_params +end diff --git a/src/plain_prob_nodes.jl b/src/plain_prob_nodes.jl new file mode 100644 index 00000000..c37cf49e --- /dev/null +++ b/src/plain_prob_nodes.jl @@ -0,0 +1,108 @@ +export PlainProbCircuit, + PlainProbLeafNode, PlainProbInnerNode, + PlainProbLiteralNode, PlainMulNode, PlainSumNode + +##################### +# Plain probabilistic circuit 
nodes +##################### + +"Root of the plain probabilistic circuit node hierarchy" +abstract type PlainProbCircuit <: ProbCircuit end + +"A probabilistic leaf node" +abstract type PlainProbLeafNode <: PlainProbCircuit end + +"A probabilistic inner node" +abstract type PlainProbInnerNode <: PlainProbCircuit end + +"A probabilistic literal node" +mutable struct PlainProbLiteralNode <: PlainProbLeafNode + literal::Lit + data + counter::UInt32 + PlainProbLiteralNode(l) = new(l, nothing, 0) +end + +"A probabilistic conjunction node (multiplication node)" +mutable struct PlainMulNode <: PlainProbInnerNode + children::Vector{PlainProbCircuit} + data + counter::UInt32 + PlainMulNode(children) = begin + new(convert(Vector{PlainProbCircuit}, children), nothing, 0) + end +end + +"A probabilistic disjunction node (summation node)" +mutable struct PlainSumNode <: PlainProbInnerNode + children::Vector{PlainProbCircuit} + log_probs::Vector{Float64} + data + counter::UInt32 + PlainSumNode(c) = begin + new(c, init_array(Float64, length(c)), nothing, 0) + end +end + +##################### +# traits +##################### + +import LogicCircuits.GateType # make available for extension + +@inline GateType(::Type{<:PlainProbLiteralNode}) = LiteralGate() +@inline GateType(::Type{<:PlainMulNode}) = ⋀Gate() +@inline GateType(::Type{<:PlainSumNode}) = ⋁Gate() + +##################### +# methods +##################### + +import LogicCircuits: children # make available for extension +@inline children(n::PlainProbInnerNode) = n.children + +"Count the number of parameters in the node" +@inline num_parameters_node(n::PlainSumNode) = num_children(n) + +##################### +# constructors and conversions +##################### + +function multiply(arguments::Vector{<:PlainProbCircuit}; + reuse=nothing) + @assert length(arguments) > 0 + reuse isa PlainMulNode && children(reuse) == arguments && return reuse + return PlainMulNode(arguments) +end + +function 
summate(arguments::Vector{<:PlainProbCircuit}; + reuse=nothing) + @assert length(arguments) > 0 + reuse isa PlainSumNode && children(reuse) == arguments && return reuse + return PlainSumNode(arguments) +end + +# claim `PlainProbCircuit` as the default `ProbCircuit` implementation +compile(::Type{ProbCircuit}, args...) = + compile(PlainProbCircuit, args...) + +compile(::Type{<:PlainProbCircuit}, l::Lit) = + PlainProbLiteralNode(l) + +function compile(::Type{<:PlainProbCircuit}, circuit::LogicCircuit) + f_con(n) = error("Cannot construct a probabilistic circuit from constant leafs: first smooth and remove unsatisfiable branches.") + f_lit(n) = compile(PlainProbCircuit, literal(n)) + f_a(_, cns) = multiply(cns) + f_o(_, cns) = summate(cns) + foldup_aggregate(circuit, f_con, f_lit, f_a, f_o, PlainProbCircuit) +end + +import LogicCircuits: fully_factorized_circuit #extend + +fully_factorized_circuit(::Type{ProbCircuit}, n::Int) = + fully_factorized_circuit(PlainProbCircuit, n) + +function fully_factorized_circuit(::Type{<:PlainProbCircuit}, n::Int) + ff_logic_circuit = fully_factorized_circuit(PlainLogicCircuit, n) + compile(PlainProbCircuit, ff_logic_circuit) +end \ No newline at end of file diff --git a/src/queries/expectation_graph.jl b/src/queries/expectation_graph.jl new file mode 100644 index 00000000..48e9a182 --- /dev/null +++ b/src/queries/expectation_graph.jl @@ -0,0 +1,172 @@ +export UpExpFlow, ExpFlowCircuit, exp_pass_up, ExpectationUpward + +##################### +# Expectation Flow circuits +# For use of algorithms depending on pairs of nodes of two circuits +##################### + +"A expectation circuit node that has pair of origins of type PC and type LC" +abstract type ExpFlowNode{F} end + +const ExpFlowCircuit{O} = Vector{<:ExpFlowNode{<:O}} + +struct UpExpFlow{F} <: ExpFlowNode{F} + p_origin::ProbCircuit + f_origin::LogisticCircuit + children::Vector{<:ExpFlowNode{<:F}} + f::F + fg::F +end + +import LogicCircuits: children +children(x::UpExpFlow) = 
x.children + +""" +Expected Prediction of LC w.r.t PC. +This implementation uses the computation graph approach. +""" +function ExpectationUpward(pc::ProbCircuit, lc::LogisticCircuit, data) + # 1. Get probability of each observation + log_likelihoods = marginal(pc, data) + p_observed = exp.( log_likelihoods ) + + # 2. Expectation w.r.t. P(x_m, x_o) + exps_flow = exp_pass_up(pc, lc, data) + results_unnormalized = exps_flow[end].fg + + # 3. Expectation w.r.t P(x_m | x_o) + results = transpose(results_unnormalized) ./ p_observed + + # 4. Add Bias terms + biases = lc.thetas + results .+= biases + + results, exps_flow +end + + +""" +Construct a upward expectation flow circuit from a given pair of PC and LC circuits +Note that its assuming the two circuits share the same vtree +""" +function ExpFlowCircuit(pc::ProbCircuit, lc::LogisticCircuit, batch_size::Int, ::Type{El}) where El + F = Array{El, 2} + fmem = () -> zeros(1, batch_size) #Vector{El}(undef, batch_size) #init_array(El, batch_size) # note: fmem's return type will determine type of all UpFlows in the circuit (should be El) + fgmem = () -> zeros(num_classes(lc), batch_size) + + root_pc = pc + root_lc = children(lc)[1] + + cache = Dict{Pair{Node, Node}, ExpFlowNode}() + sizehint!(cache, (num_nodes(pc) + num_nodes(lc))*4÷3) + expFlowCircuit = Vector{ExpFlowNode}() + + function ExpflowTraverse(n::PlainSumNode, m::Logistic⋁Node) + get!(cache, Pair(n, m)) do + ch = [ ExpflowTraverse(i, j) for i in children(n) for j in children(m)] + node = UpExpFlow{F}(n, m, ch, fmem(), fgmem()) + push!(expFlowCircuit, node) + return node + end + end + function ExpflowTraverse(n::PlainMulNode, m::Logistic⋀Node) + get!(cache, Pair(n, m)) do + ch = [ ExpflowTraverse(z[1], z[2]) for z in zip(children(n), children(m)) ] + node = UpExpFlow{F}(n, m, ch, fmem(), fgmem()) + push!(expFlowCircuit, node) + return node + end + end + function ExpflowTraverse(n::PlainProbLiteralNode, m::Logistic⋁Node) + get!(cache, Pair(n, m)) do + ch = 
Vector{ExpFlowNode{F}}() # TODO + node = UpExpFlow{F}(n, m, ch, fmem(), fgmem()) + push!(expFlowCircuit, node) + return node + end + end + function ExpflowTraverse(n::PlainProbLiteralNode, m::LogisticLiteral) + get!(cache, Pair(n, m)) do + ch = Vector{ExpFlowNode{F}}() # TODO + node = UpExpFlow{F}(n, m, ch, fmem(), fgmem()) + push!(expFlowCircuit, node) + return node + end + end + + ExpflowTraverse(root_pc, root_lc) + expFlowCircuit +end + +function exp_pass_up(pc::ProbCircuit, lc::LogisticCircuit, data) + expFlowCircuit = ExpFlowCircuit(pc, lc, num_examples(data), Float64); + for n in expFlowCircuit + exp_pass_up_node(n, data) + end + expFlowCircuit +end + +function exp_pass_up(fc::ExpFlowCircuit, data) + #TODO write resize_flows similar to flow_circuits + # and give as input the expFlowCircuit instead + #expFlowCircuit = ExpFlowCircuit(pc, lc, num_examples(data), Float64); + for n in fc + exp_pass_up_node(n, data) + end +end + +function exp_pass_up_node(node::ExpFlowNode{E}, data) where E + pType = typeof(node.p_origin) + fType = typeof(node.f_origin) + + if node.p_origin isa PlainSumNode && node.f_origin isa Logistic⋁Node + #todo this ordering might be different than the ExpFlowNode children + pthetas = [exp(node.p_origin.log_probs[i]) + for i in 1:length(children(node.p_origin)) for j in 1:length(children(node.f_origin))] + fthetas = [node.f_origin.thetas[j,:] # only taking the first class for now + for i in 1:length(node.p_origin.children) for j in 1:length(node.f_origin.children)] + + node.f .= 0.0 + node.fg .= 0.0 + for z = 1:length(children(node)) + node.f .+= pthetas[z] .* children(node)[z].f + node.fg .+= (pthetas[z] .* fthetas[z]) .* children(node)[z].f + node.fg .+= pthetas[z] .* children(node)[z].fg + end + elseif node.p_origin isa PlainMulNode && node.f_origin isa Logistic⋀Node + node.f .= children(node)[1].f .* children(node)[2].f # assume 2 children + node.fg .= (children(node)[1].f .* children(node)[2].fg) .+ + (children(node)[2].f .* 
children(node)[1].fg) + + elseif node.p_origin isa PlainProbLiteralNode + if node.f_origin isa Logistic⋁Node + m = children(node.f_origin)[1] + elseif node.f_origin isa LogisticLiteral + m = node.f_origin + else + error("Invalid Types of pairs {$pType} - {$fType}") + end + + var = variable(m) + X = data + if ispositive(node.p_origin) && ispositive(m) + node.f[:, X[:, var] .!= 0 ] .= 1.0 # positive and missing observations + node.f[:, X[:, var] .== 0 ] .= 0.0 + elseif isnegative(node.p_origin) && isnegative(m) + node.f[:, X[:, var] .!= 1 ] .= 1.0 # negative and missing observations + node.f[:, X[:, var] .== 1 ] .= 0.0 + else + node.f .= 0.0 + end + + if node.f_origin isa Logistic⋁Node + node.fg .= node.f .* transpose(node.f_origin.thetas) + else + node.fg .= 0.0 + end + + else + error("Invalid Types of pairs {$pType} - {$fType}") + end + +end \ No newline at end of file diff --git a/src/queries/expectation_rec.jl b/src/queries/expectation_rec.jl new file mode 100644 index 00000000..93e90e1d --- /dev/null +++ b/src/queries/expectation_rec.jl @@ -0,0 +1,247 @@ +export Expectation, Moment + + +ExpCacheDict = Dict{Pair{ProbCircuit, LogisticCircuit}, Array{Float64, 2}} +MomentCacheDict = Dict{Tuple{ProbCircuit, LogisticCircuit, Int64}, Array{Float64, 2}} + +struct ExpectationCache + f::ExpCacheDict + fg::ExpCacheDict +end + +ExpectationCache() = ExpectationCache(ExpCacheDict(), ExpCacheDict()) + +struct MomentCache + f::ExpCacheDict + fg::MomentCacheDict +end +MomentCache() = MomentCache( ExpCacheDict(), MomentCacheDict()) + + +# Find a better way to cache n_choose_k values +max_k = 31 +choose_cache = [ 1.0 * binomial(i,j) for i=0:max_k+1, j=0:max_k+1 ] +@inline function choose(n::Int, m::Int) + return choose_cache[n+1, m+1] +end + + +# On Tractable Computation of Expected Predictions (https://arxiv.org/abs/1910.02182) +""" +Missing values should be denoted by -1 +""" +function Expectation(pc::ProbCircuit, lc::LogisticCircuit, data) + # 1. 
Get probability of each observation + log_likelihoods = marginal(pc, data) + p_observed = exp.( log_likelihoods ) + + # 2. Expectation w.r.t. P(x_m, x_o) + cache = ExpectationCache() + results_unnormalized = exp_g(pc, children(lc)[1], data, cache) # skipping the bias node of lc + + # 3. Expectation w.r.t P(x_m | x_o) + results = transpose(results_unnormalized) ./ p_observed + + # 4. Add Bias terms + biases = lc.thetas + results .+= biases + + results, cache +end + +function Moment(pc::ProbCircuit, lc::LogisticCircuit, data, moment::Int) + # 1. Get probability of each observation + log_likelihoods = marginal(pc, data) + p_observed = exp.( log_likelihoods ) + + # 2. Moment w.r.t. P(x_m, x_o) + cache = MomentCache() + biases = lc.thetas + results_unnormalized = zeros(num_examples(data), num_classes(lc)) + + for z = 0:moment-1 + results_unnormalized .+= choose(moment, z) .* (biases .^ (z)) .* transpose(moment_g(pc, children(lc)[1], data, moment - z, cache)) + end + + # 3. Moment w.r.t P(x_m | x_o) + results = results_unnormalized ./ p_observed + + # 4. Add Bias^moment terms + results .+= biases .^ (moment) + + results, cache +end + + + +# exp_f (pr-constraint) is originally from: +# Arthur Choi, Guy Van den Broeck, and Adnan Darwiche. Tractable learning for structured probability spaces: A case study in learning preference distributions. In Proceedings of IJCAI, 2015. 
+ +function exp_f(n::Union{PlainSumNode, StructSumNode}, m::Logistic⋁Node, data, cache::Union{ExpectationCache, MomentCache}) + @inbounds get!(cache.f, Pair(n, m)) do + value = zeros(1 , num_examples(data) ) + pthetas = [exp(n.log_probs[i]) for i in 1:num_children(n)] + @fastmath @simd for i in 1:num_children(n) + @simd for j in 1:num_children(m) + value .+= (pthetas[i] .* exp_f(children(n)[i], children(m)[j], data, cache)) + end + end + return value + end +end + +function exp_f(n::Union{PlainMulNode, StructMulNode}, m::Logistic⋀Node, data, cache::Union{ExpectationCache, MomentCache}) + @inbounds get!(cache.f, Pair(n, m)) do + value = ones(1 , num_examples(data) ) + @fastmath for (i,j) in zip(children(n), children(m)) + value .*= exp_f(i, j, data, cache) + end + return value + # exp_f(children(n)[1], children(m)[1], data, cache) .* exp_f(children(n)[2], children(m)[2], data, cache) + end +end + + +@inline function exp_f(n::Union{PlainProbLiteralNode, StructProbLiteralNode}, m::LogisticLiteral, data, cache::Union{ExpectationCache, MomentCache}) + @inbounds get!(cache.f, Pair(n, m)) do + value = zeros(1 , num_examples(data) ) + var = lit2var(literal(m)) + X = data + if ispositive(n) && ispositive(m) + # value[1, X[:, var] .== -1 ] .= 1.0 # missing observation always agrees + # value[1, X[:, var] .== 1 ] .= 1.0 # positive observations + value[1, .!isequal.(X[:, var], 0)] .= 1.0 # positive or missing observations + elseif isnegative(n) && isnegative(m) + # value[1, X[:, var] .== -1 ] .= 1.0 # missing observation always agrees + # value[1, X[:, var] .== 0 ] .= 1.0 # negative observations + value[1, .!isequal.(X[:, var], 1)] .= 1.0 # negative or missing observations + end + return value + end +end + +""" +Has to be a Logistic⋁Node with only one child, which is a leaf node +""" +@inline function exp_f(n::Union{PlainProbLiteralNode, StructProbLiteralNode}, m::Logistic⋁Node, data, cache::Union{ExpectationCache, MomentCache}) + @inbounds get!(cache.f, Pair(n, m)) do + 
exp_f(n, children(m)[1], data, cache) + end +end + +####################################################################### +######## exp_g, exp_fg +######################################################################## + +@inline function exp_g(n::Union{PlainSumNode, StructSumNode}, m::Logistic⋁Node, data, cache::ExpectationCache) + exp_fg(n, m, data, cache) # exp_fg and exp_g are the same for OR nodes +end + +# function exp_g(n::Prob⋀, m::Logistic⋀Node, data, cache::ExpectationCache) +# value = zeros(classes(m) , num_examples(data)) +# @fastmath for (i,j) in zip(children(n), children(m)) +# value .+= exp_fg(i, j, data, cache) +# end +# return value +# # exp_fg(children(n)[1], children(m)[1], data, cache) .+ exp_fg(children(n)[2], children(m)[2], data, cache) +# end + + +function exp_fg(n::Union{PlainSumNode, StructSumNode}, m::Logistic⋁Node, data, cache::ExpectationCache) + @inbounds get!(cache.fg, Pair(n, m)) do + value = zeros(num_classes(m) , num_examples(data) ) + pthetas = [exp(n.log_probs[i]) for i in 1:num_children(n)] + @fastmath @simd for i in 1:num_children(n) + for j in 1:num_children(m) + value .+= (pthetas[i] .* m.thetas[j,:]) .* exp_f(children(n)[i], children(m)[j], data, cache) + value .+= pthetas[i] .* exp_fg(children(n)[i], children(m)[j], data, cache) + end + end + return value + end +end + +function exp_fg(n::Union{PlainMulNode, StructMulNode}, m::Logistic⋀Node, data, cache::ExpectationCache) + @inbounds get!(cache.fg, Pair(n, m)) do + # Assuming 2 children + value = exp_f(children(n)[1], children(m)[1], data, cache) .* exp_fg(children(n)[2], children(m)[2], data, cache) + value .+= exp_f(children(n)[2], children(m)[2], data, cache) .* exp_fg(children(n)[1], children(m)[1], data, cache) + return value + end +end + + +""" +Has to be a Logistic⋁Node with only one child, which is a leaf node +""" +@inline function exp_fg(n::Union{PlainProbLiteralNode, StructProbLiteralNode}, m::Logistic⋁Node, data, cache::ExpectationCache) + @inbounds 
get!(cache.fg, Pair(n, m)) do + m.thetas[1,:] .* exp_f(n, m, data, cache) + end +end + +@inline function exp_fg(n::Union{PlainProbLiteralNode, StructProbLiteralNode}, m::LogisticLiteral, data, cache::ExpectationCache) + #dont know how many classes, boradcasting does the job + zeros(1 , num_examples(data)) +end + +####################################################################### +######## moment_g, moment_fg +######################################################################## + +@inline function moment_g(n::Union{PlainSumNode, StructSumNode}, m::Logistic⋁Node, data, moment::Int, cache::MomentCache) + get!(cache.fg, (n, m, moment)) do + moment_fg(n, m, data, moment, cache) + end +end + +""" +Calculating E[g^k * f] +""" +function moment_fg(n::Union{PlainSumNode, StructSumNode}, m::Logistic⋁Node, data, moment::Int, cache::MomentCache) + if moment == 0 + return exp_f(n, m, data, cache) + end + + get!(cache.fg, (n, m, moment)) do + value = zeros(num_classes(m) , num_examples(data) ) + pthetas = [exp(n.log_probs[i]) for i in 1:num_children(n)] + @fastmath @simd for i in 1:num_children(n) + for j in 1:num_children(m) + for z in 0:moment + value .+= pthetas[i] .* choose(moment, z) .* m.thetas[j,:].^(moment - z) .* moment_fg(children(n)[i], children(m)[j], data, z, cache) + end + end + end + return value + end +end + +@inline function moment_fg(n::Union{PlainProbLiteralNode, StructProbLiteralNode}, m::Logistic⋁Node, data, moment::Int, cache::MomentCache) + get!(cache.fg, (n, m, moment)) do + m.thetas[1,:].^(moment) .* exp_f(n, m, data, cache) + end +end + +@inline function moment_fg(n::Union{PlainProbLiteralNode, StructProbLiteralNode}, m::LogisticLiteral, data, moment::Int, cache::MomentCache) + #dont know how many classes, boradcasting does the job + if moment == 0 + exp_f(n, m, data, cache) + else + zeros(1, num_examples(data)) + end +end + +function moment_fg(n::Union{PlainMulNode, StructMulNode}, m::Logistic⋀Node, data, moment::Int, cache::MomentCache) + if 
moment == 0 + return exp_f(n, m, data, cache) + end + get!(cache.fg, (n, m, moment)) do + value = moment_fg(children(n)[1], children(m)[1], data, 0, cache) .* moment_fg(children(n)[2], children(m)[2], data, moment, cache) + + for z in 1:moment + value .+= choose(moment, z) .* moment_fg(children(n)[1], children(m)[1], data, z, cache) .* moment_fg(children(n)[2], children(m)[2], data, moment - z, cache) + end + return value + end +end diff --git a/src/queries/information.jl b/src/queries/information.jl new file mode 100644 index 00000000..127ea0a5 --- /dev/null +++ b/src/queries/information.jl @@ -0,0 +1,164 @@ +export kl_divergence + +const KLDCache = Dict{Tuple{ProbCircuit,ProbCircuit}, Float64} + +"""" +Calculate entropy of the distribution of the input pc." +""" + +import ..Utils: entropy +function entropy(pc_node::StructSumNode, pc_entropy_cache::Dict{ProbCircuit, Float64}=Dict{ProbCircuit, Float64}())::Float64 + if pc_node in keys(pc_entropy_cache) + return pc_entropy_cache[pc_node] + elseif children(pc_node)[1] isa StructProbLiteralNode + return get!(pc_entropy_cache, pc_node, + - exp(pc_node.log_probs[1]) * pc_node.log_probs[1] - + exp(pc_node.log_probs[2]) * pc_node.log_probs[2]) + else + local_entropy = 0.0 + for (prob⋀_node, log_prob) in zip(children(pc_node), pc_node.log_probs) + p = children(prob⋀_node)[1] + s = children(prob⋀_node)[2] + + local_entropy += exp(log_prob) * (entropy(p, pc_entropy_cache) + + entropy(s, pc_entropy_cache) - log_prob) + end + return get!(pc_entropy_cache, pc_node, local_entropy) + end +end + +function entropy(pc_node::StructMulNode, pc_entropy_cache::Dict{ProbCircuit, Float64})::Float64 + return get!(pc_entropy_cache, children(pc_node)[1], entropy(children(pc_node)[1], pc_entropy_cache)) + + get!(pc_entropy_cache, children(pc_node)[2], entropy(children(pc_node)[2], pc_entropy_cache)) +end + +function entropy(pc_node::StructProbLiteralNode, pc_entropy_cache::Dict{ProbCircuit, Float64})::Float64 + return get!(pc_entropy_cache, 
pc_node, 0.0) +end + +"Calculate KL divergence calculation for pcs that are not necessarily identical" +function kl_divergence(pc_node1::StructSumNode, pc_node2::StructSumNode, + kl_divergence_cache::KLDCache=KLDCache(), pr_constraint_cache::PRCache=PRCache()) + @assert !(pc_node1 isa StructMulNode || pc_node2 isa StructMulNode) "Prob⋀ not a valid pc node for KL-Divergence" + + # Check if both nodes are normalized for same vtree node + @assert variables(pc_node1) == variables(pc_node2) "Both nodes not normalized for same vtree node" + + if (pc_node1, pc_node2) in keys(kl_divergence_cache) # Cache hit + return kl_divergence_cache[(pc_node1, pc_node2)] + elseif children(pc_node1)[1] isa StructProbLiteralNode + if pc_node2 isa StructProbLiteralNode + kl_divergence(children(pc_node1)[1], pc_node2, kl_divergence_cache, pr_constraint_cache) + kl_divergence(children(pc_node1)[2], pc_node2, kl_divergence_cache, pr_constraint_cache) + if literal(children(pc_node1)[1]) == literal(pc_node2) + return get!(kl_divergence_cache, (pc_node1, pc_node2), + pc_node1.log_probs[1] * exp(pc_node1.log_probs[1]) + ) + else + return get!(kl_divergence_cache, (pc_node1, pc_node2), + pc_node1.log_probs[2] * exp(pc_node1.log_probs[2]) + ) + end + else + # The below four lines actually assign zero, but still we need to + # call it. 
+ kl_divergence(children(pc_node1)[1], children(pc_node2)[1], kl_divergence_cache, pr_constraint_cache) + kl_divergence(children(pc_node1)[1], children(pc_node2)[2], kl_divergence_cache, pr_constraint_cache) + kl_divergence(children(pc_node1)[2], children(pc_node2)[1], kl_divergence_cache, pr_constraint_cache) + kl_divergence(children(pc_node1)[2], children(pc_node2)[2], kl_divergence_cache, pr_constraint_cache) + # There are two possible matches + if literal(children(pc_node1)[1]) == literal(children(pc_node2)[1]) + return get!(kl_divergence_cache, (pc_node1, pc_node2), + exp(pc_node1.log_probs[1]) * (pc_node1.log_probs[1] - pc_node2.log_probs[1]) + + exp(pc_node1.log_probs[2]) * (pc_node1.log_probs[2] - pc_node2.log_probs[2]) + ) + else + return get!(kl_divergence_cache, (pc_node1, pc_node2), + exp(pc_node1.log_probs[1]) * (pc_node1.log_probs[1] - pc_node2.log_probs[2]) + + exp(pc_node1.log_probs[2]) * (pc_node1.log_probs[2] - pc_node2.log_probs[1]) + ) + end + end + else # the normal case + kld = 0.0 + + # loop through every combination of prim and sub + for (prob⋀_node1, log_theta1) in zip(children(pc_node1), pc_node1.log_probs) + for (prob⋀_node2, log_theta2) in zip(children(pc_node2), pc_node2.log_probs) + p = children(prob⋀_node1)[1] + s = children(prob⋀_node1)[2] + + r = children(prob⋀_node2)[1] + t = children(prob⋀_node2)[2] + + theta1 = exp(log_theta1) + + p11 = pr_constraint(s, t, pr_constraint_cache) + p12 = pr_constraint(p, r, pr_constraint_cache) + + p13 = theta1 * (log_theta1 - log_theta2) + + p21 = kl_divergence(p, r, kl_divergence_cache, pr_constraint_cache) + p31 = kl_divergence(s, t, kl_divergence_cache, pr_constraint_cache) + + kld += p11 * p12 * p13 + theta1 * (p11 * p21 + p12 * p31) + end + end + return get!(kl_divergence_cache, (pc_node1, pc_node2), kld) + end +end + +function kl_divergence(pc_node1::StructProbLiteralNode, pc_node2::StructProbLiteralNode, + kl_divergence_cache::KLDCache, pr_constraint_cache::PRCache) + # Check if literals are 
over same variables in vtree + @assert variables(pc_node1) == variables(pc_node2) "Both nodes not normalized for same vtree node" + + if (pc_node1, pc_node2) in keys(kl_divergence_cache) # Cache hit + return kl_divergence_cache[pc_node1, pc_node2] + else + # In this case probability is 1, kl divergence is 0 + return get!(kl_divergence_cache, (pc_node1, pc_node2), 0.0) + end +end + +function kl_divergence(pc_node1::StructSumNode, pc_node2::StructProbLiteralNode, + kl_divergence_cache::KLDCache, pr_constraint_cache::PRCache) + @assert variables(pc_node1) == variables(pc_node2) "Both nodes not normalized for same vtree node" + + if (pc_node1, pc_node2) in keys(kl_divergence_cache) # Cache hit + return kl_divergence_cache[pc_node1, pc_node2] + else + kl_divergence(children(pc_node1)[1], pc_node2, kl_divergence_cache, pr_constraint_cache) + kl_divergence(children(pc_node1)[2], pc_node2, kl_divergence_cache, pr_constraint_cache) + if literal(children(pc_node1)[1]) == literal(pc_node2) + return get!(kl_divergence_cache, (pc_node1, pc_node2), + pc_node1.log_probs[1] * exp(pc_node1.log_probs[1]) + ) + else + return get!(kl_divergence_cache, (pc_node1, pc_node2), + pc_node1.log_probs[2] * exp(pc_node1.log_probs[2]) + ) + end + end +end + +function kl_divergence(pc_node1::StructProbLiteralNode, pc_node2::StructSumNode, + kl_divergence_cache::KLDCache, pr_constraint_cache::PRCache) + @assert variables(pc_node1) == variables(pc_node2) "Both nodes not normalized for same vtree node" + + if (pc_node1, pc_node2) in keys(kl_divergence_cache) # Cache hit + return kl_divergence_cache[pc_node1, pc_node2] + else + kl_divergence(pc_node1, children(pc_node2)[1], kl_divergence_cache, pr_constraint_cache) + kl_divergence(pc_node1, children(pc_node2)[2], kl_divergence_cache, pr_constraint_cache) + if literal(pc_node1) == literal(children(pc_node2)[1]) + return get!(kl_divergence_cache, (pc_node1, pc_node2), + -pc_node2.log_probs[1] + ) + else + return get!(kl_divergence_cache, (pc_node1, 
pc_node2), + -pc_node2.log_probs[2] + ) + end + end +end diff --git a/src/queries/likelihood.jl b/src/queries/likelihood.jl new file mode 100644 index 00000000..aeb8a174 --- /dev/null +++ b/src/queries/likelihood.jl @@ -0,0 +1,82 @@ +export EVI, log_likelihood_per_instance, log_likelihood, log_likelihood_avg + +""" +Compute the likelihood of the PC given each individual instance in the data +""" +function log_likelihood_per_instance(pc::ProbCircuit, data) + @assert isbinarydata(data) "Probabilistic circuit likelihoods are for binary data only" + bc = ParamBitCircuit(pc, data) + if isgpu(data) + log_likelihood_per_instance_gpu(to_gpu(bc), data) + else + log_likelihood_per_instance_cpu(bc, data) + end +end + +function log_likelihood_per_instance_cpu(bc, data) + ll::Vector{Float64} = zeros(Float64, num_examples(data)) + ll_lock::Threads.ReentrantLock = Threads.ReentrantLock() + + @inline function on_edge(flows, values, prime, sub, element, grandpa, single_child) + if !single_child + lock(ll_lock) do # TODO: move lock to inner loop? change to atomic float? 
+ for i = 1:size(flows,1) + @inbounds edge_flow = values[i, prime] & values[i, sub] & flows[i, grandpa] + first_true_bit = trailing_zeros(edge_flow)+1 + last_true_bit = 64-leading_zeros(edge_flow) + @simd for j = first_true_bit:last_true_bit + ex_id = ((i-1) << 6) + j + if get_bit(edge_flow, j) + @inbounds ll[ex_id] += bc.params[element] + end + end + end + end + end + nothing + end + + satisfies_flows(bc.bitcircuit, data; on_edge) + return ll +end + +function log_likelihood_per_instance_gpu(bc, data) + params_device = CUDA.cudaconvert(bc.params) + ll::CuVector{Float64} = CUDA.zeros(Float64, num_examples(data)) + ll_device = CUDA.cudaconvert(ll) + + @inline function on_edge(flows, values, prime, sub, element, grandpa, chunk_id, edge_flow, single_child) + if !single_child + first_true_bit = 1+trailing_zeros(edge_flow) + last_true_bit = 64-leading_zeros(edge_flow) + for j = first_true_bit:last_true_bit + ex_id = ((chunk_id-1) << 6) + j + if get_bit(edge_flow, j) + CUDA.@atomic ll_device[ex_id] += params_device[element] + end + end + end + nothing + end + + v, f = satisfies_flows(bc.bitcircuit, data; on_edge) + CUDA.unsafe_free!(v) # save the GC some effort + CUDA.unsafe_free!(f) # save the GC some effort + + return ll +end + +""" +Complete evidence queries +""" +const EVI = log_likelihood_per_instance + +""" +Compute the likelihood of the PC given the data +""" +log_likelihood(pc, data) = sum(log_likelihood_per_instance(pc, data)) + +""" +Compute the likelihood of the PC given the data, averaged over all instances in the data +""" +log_likelihood_avg(pc, data) = log_likelihood(pc, data)/num_examples(data) \ No newline at end of file diff --git a/src/queries/map.jl b/src/queries/map.jl new file mode 100644 index 00000000..cda5d052 --- /dev/null +++ b/src/queries/map.jl @@ -0,0 +1,122 @@ +export max_a_posteriori, MAP + +import DataFrames: DataFrame, mapcols! 
+ +##################### +# Circuit MAP/MPE evaluation +##################### + +"Evaluate maximum a-posteriori state of the circuit for a given input" +max_a_posteriori(root::ProbCircuit, data::Union{Bool,Missing}...) = + max_a_posteriori(root, collect(Union{Bool,Missing}, data)) + +max_a_posteriori(root::ProbCircuit, data::Union{Vector{Union{Bool,Missing}},CuVector{UInt8}}) = + example(max_a_posteriori(root, DataFrame(reshape(data, 1, :))), 1) + +max_a_posteriori(circuit::ProbCircuit, data::DataFrame) = + max_a_posteriori(same_device(ParamBitCircuit(circuit, data), data), data) + +function max_a_posteriori(pbc::ParamBitCircuit, data; Float=Float32) + @assert isgpu(data) == isgpu(pbc) "ParamBitCircuit and data need to be on the same device" + values = marginal_all(pbc, data) + return map_down(pbc, data, values; Float) +end + +""" +Maximum a-posteriori queries +""" +const MAP = max_a_posteriori + +""" +Mode of the distribution +""" +const mode = max_a_posteriori + +"Find the MAP child value and node id of a given decision node" +function map_child(params, nodes, elements, ex_id, dec_id, values) + @inbounds els_start = nodes[1,dec_id] + @inbounds els_end = nodes[2,dec_id] + pr_opt = typemin(eltype(values)) + j_opt = 1 + for j = els_start:els_end + @inbounds prime = elements[2,j] + @inbounds sub = elements[3,j] + @inbounds pr = values[ex_id, prime] + values[ex_id, sub] + params[j] + if pr > pr_opt + pr_opt = pr + j_opt = j + end + end + @inbounds return params[j_opt], elements[2,j_opt], elements[3,j_opt] +end + +# CPU code + +function map_down(pbc, data, values::Array; Float=Float32) + state = zeros(Bool, num_examples(data), num_features(data)) + logprob = zeros(Float, num_examples(data)) + Threads.@threads for ex_id = 1:size(state,1) + map_rec(num_leafs(pbc), params(pbc), nodes(pbc), elements(pbc), ex_id, num_nodes(pbc), values, state, logprob) + end + df = DataFrame(state) + mapcols!(c -> BitVector(c), df) + return df, logprob +end + +function map_rec(nl, params, 
nodes, elements, ex_id, dec_id, values, state, logprob) + if isleafgate(nl, dec_id) + if isliteralgate(nl, dec_id) + l = literal(nl, dec_id) + @inbounds state[ex_id, lit2var(l)] = (l > 0) + end + else + edge_log_pr, prime, sub = map_child(params, nodes, elements, ex_id, dec_id, values) + @inbounds logprob[ex_id] += edge_log_pr + map_rec(nl, params, nodes, elements, ex_id, prime, values, state, logprob) + map_rec(nl, params, nodes, elements, ex_id, sub, values, state, logprob) + end +end + +# GPU code + +function map_down(pbc, data, values::CuArray; Float=Float32) + state = CUDA.zeros(Bool, num_examples(data), num_features(data)) + logprob = CUDA.zeros(Float, num_examples(data)) + stack = CUDA.zeros(Int32, num_examples(data), num_features(data)+3) + @inbounds stack[:,1] .= 1 # start with 1 dec_id in the stack + @inbounds stack[:,2] .= num_nodes(pbc) # start with the root in the stack + num_threads = 256 + num_blocks = ceil(Int, size(state,1)/num_threads) + CUDA.@sync begin + @cuda threads=num_threads blocks=num_blocks map_cuda_kernel(num_leafs(pbc), params(pbc), nodes(pbc), elements(pbc), values, state, logprob, stack) + end + CUDA.unsafe_free!(values) # save the GC some effort + # do the conversion to a CuBitVector on the CPU... 
+ df = DataFrame(to_cpu(state)) + mapcols!(c -> to_gpu(BitVector(c)), df) + return df, logprob +end + +function map_cuda_kernel(nl, params, nodes, elements, values, state, logprob, stack) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for ex_id = index_x:stride_x:size(state,1) + dec_id = pop_cuda!(stack, ex_id) + while dec_id > zero(eltype(stack)) + if isleafgate(nl, dec_id) + if isliteralgate(nl, dec_id) + l = literal(nl, dec_id) + var = lit2var(l) + @inbounds state[ex_id, var] = (l > 0) + end + else + edge_log_pr, prime, sub = map_child(params, nodes, elements, ex_id, dec_id, values) + @inbounds logprob[ex_id] += edge_log_pr + push_cuda!(stack, prime, ex_id) + push_cuda!(stack, sub, ex_id) + end + dec_id = pop_cuda!(stack, ex_id) + end + end + return nothing +end \ No newline at end of file diff --git a/src/queries/marginal_flow.jl b/src/queries/marginal_flow.jl new file mode 100644 index 00000000..1a6b55df --- /dev/null +++ b/src/queries/marginal_flow.jl @@ -0,0 +1,344 @@ +using StatsFuns: logsumexp, log1pexp + +using CUDA: CUDA, @cuda +using DataFrames: DataFrame +using LoopVectorization: @avx +using LogicCircuits: balance_threads + +export marginal, MAR, marginal_all, marginal_log_likelihood, +marginal_log_likelihood_avg, marginal_flows, marginal_flows_down + +##################### +# Circuit marginal evaluation +##################### + +# evaluate a probabilistic circuit as a function +function (root::ProbCircuit)(data...) + marginal(root, data...) +end + +"Evaluate marginals of the circuit bottom-up for a given input" +marginal(root::ProbCircuit, data::Union{Real,Missing}...) 
= + marginal(root, collect(Union{Bool,Missing}, data)) + +marginal(root::ProbCircuit, data::Union{Vector{Union{Bool,Missing}},CuVector{UInt8}}) = + marginal(root, DataFrame(reshape(data, 1, :)))[1] + +marginal(circuit::ProbCircuit, data::DataFrame) = + marginal(same_device(ParamBitCircuit(circuit, data), data) , data) + +function marginal(circuit::ParamBitCircuit, data::DataFrame)::AbstractVector + marginal_all(circuit,data)[:,end] +end + +""" +Marginal queries +""" +const MAR = marginal + +""" +Compute the marginal likelihood of the PC given the data +""" +marginal_log_likelihood(pc, data) = sum(marginal(pc, data)) + +""" +Compute the marginal likelihood of the PC given the data, averaged over all instances in the data +""" +marginal_log_likelihood_avg(pc, data) = marginal_log_likelihood(pc, data)/num_examples(data) + +##################### +# Circuit evaluation of *all* nodes in circuit +##################### + +"Evaluate the probabilistic circuit bottom-up for a given input and return the marginal probability value of all nodes" +marginal_all(circuit::ProbCircuit, data::DataFrame) = + marginal_all(same_device(ParamBitCircuit(circuit, data), data) , data) + +function marginal_all(circuit::ParamBitCircuit, data, reuse=nothing) + @assert num_features(data) == num_features(circuit) + @assert isbinarydata(data) + values = init_marginal(data, reuse, num_nodes(circuit)) + marginal_layers(circuit, values) + return values +end + +"Initialize values from the data (data frames)" +function init_marginal(data, reuse, num_nodes; Float=Float32) + flowtype = isgpu(data) ? CuMatrix{Float} : Matrix{Float} + values = similar!(reuse, flowtype, num_examples(data), num_nodes) + @views values[:,LogicCircuits.TRUE_BITS] .= log(one(Float)) + @views values[:,LogicCircuits.FALSE_BITS] .= log(zero(Float)) + # here we should use a custom CUDA kernel to extract Float marginals from bit vectors + # for now the lazy solution is to move everything to the CPU and do the work there... 
+ data_cpu = to_cpu(data) + for i=1:num_features(data) + marg_pos::Vector{Float} = log.(coalesce.(data_cpu[:,i], one(Float))) + marg_neg::Vector{Float} = log.(coalesce.(1.0 .- data_cpu[:,i], one(Float))) + values[:,2+i] .= same_device(marg_pos, values) + values[:,2+num_features(data)+i] .= same_device(marg_neg, values) + end + return values +end + +# upward pass helpers on CPU + +"Compute marginals on the CPU (SIMD & multi-threaded)" +function marginal_layers(circuit::ParamBitCircuit, values::Matrix) + bc = circuit.bitcircuit + els = bc.elements + pars = circuit.params + for layer in bc.layers[2:end] + Threads.@threads for dec_id in layer + j = @inbounds bc.nodes[1,dec_id] + els_end = @inbounds bc.nodes[2,dec_id] + if j == els_end + assign_marginal(values, dec_id, els[2,j], els[3,j], pars[j]) + j += 1 + else + assign_marginal(values, dec_id, els[2,j], els[3,j], els[2,j+1], els[3,j+1], pars[j], pars[j+1]) + j += 2 + end + while j <= els_end + accum_marginal(values, dec_id, els[2,j], els[3,j], pars[j]) + j += 1 + end + end + end +end + +assign_marginal(v::Matrix{<:AbstractFloat}, i, e1p, e1s, p1) = + @views @. 
@avx v[:,i] = v[:,e1p] + v[:,e1s] + p1 + +accum_marginal(v::Matrix{<:AbstractFloat}, i, e1p, e1s, p1) = begin + @avx for j=1:size(v,1) + @inbounds x = v[j,i] + @inbounds y = v[j,e1p] + v[j,e1s] + p1 + Δ = ifelse(x == y, zero(eltype(v)), abs(x - y)) + @inbounds v[j,i] = max(x, y) + log1p(exp(-Δ)) + end +end + +assign_marginal(v::Matrix{<:AbstractFloat}, i, e1p, e1s, e2p, e2s, p1, p2) = begin + @avx for j=1:size(v,1) + @inbounds x = v[j,e1p] + v[j,e1s] + p1 + @inbounds y = v[j,e2p] + v[j,e2s] + p2 + Δ = ifelse(x == y, zero(eltype(v)), abs(x - y)) + @inbounds v[j,i] = max(x, y) + log1p(exp(-Δ)) + end +end + +# upward pass helpers on GPU + +"Compute marginals on the GPU" +function marginal_layers(circuit::ParamBitCircuit, values::CuMatrix; dec_per_thread = 8, log2_threads_per_block = 8) + bc = circuit.bitcircuit + CUDA.@sync for layer in bc.layers[2:end] + num_examples = size(values, 1) + num_decision_sets = length(layer)/dec_per_thread + num_threads = balance_threads(num_examples, num_decision_sets, log2_threads_per_block) + num_blocks = (ceil(Int, num_examples/num_threads[1]), + ceil(Int, num_decision_sets/num_threads[2])) + @cuda threads=num_threads blocks=num_blocks marginal_layers_cuda(layer, bc.nodes, bc.elements, circuit.params, values) + end +end + +"CUDA kernel for circuit evaluation" +function marginal_layers_cuda(layer, nodes, elements, params, values) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + index_y = (blockIdx().y - 1) * blockDim().y + threadIdx().y + stride_x = blockDim().x * gridDim().x + stride_y = blockDim().y * gridDim().y + for j = index_x:stride_x:size(values,1) + for i = index_y:stride_y:length(layer) + decision_id = @inbounds layer[i] + k = @inbounds nodes[1,decision_id] + els_end = @inbounds nodes[2,decision_id] + @inbounds x = values[j, elements[2,k]] + values[j, elements[3,k]] + params[k] + while k < els_end + k += 1 + @inbounds y = values[j, elements[2,k]] + values[j, elements[3,k]] + params[k] + Δ = ifelse(x == y, 
zero(eltype(values)), CUDA.abs(x - y)) + x = max(x, y) + CUDA.log1p(CUDA.exp(-Δ)) + end + values[j, decision_id] = x + end + end + return nothing +end + + +##################### +# Bit circuit marginals and flows (up and downward pass) +##################### + +"Compute the marginal and flow of each node" +function marginal_flows(circuit::ProbCircuit, data, + reuse_values=nothing, reuse_flows=nothing; on_node=noop, on_edge=noop) + bc = same_device(ParamBitCircuit(circuit, data), data) + marginal_flows(bc, data, reuse_values, reuse_flows; on_node, on_edge) +end + +function marginal_flows(circuit::ParamBitCircuit, data, + reuse_values=nothing, reuse_flows=nothing; on_node=noop, on_edge=noop) + @assert isgpu(data) == isgpu(circuit) "ParamBitCircuit and data need to be on the same device" + values = marginal_all(circuit, data, reuse_values) + flows = marginal_flows_down(circuit, values, reuse_flows; on_node, on_edge) + return values, flows +end + +##################### +# Bit circuit marginal flows downward pass +##################### + +"When marginals of nodes have already been computed, do a downward pass computing the marginal flows at each node" +function marginal_flows_down(circuit::ParamBitCircuit, values, reuse=nothing; on_node=noop, on_edge=noop) + flows = similar!(reuse, typeof(values), size(values)...) 
+ marginal_flows_down_layers(circuit, flows, values, on_node, on_edge) + return flows +end + +# downward pass helpers on CPU + +"Evaluate marginals of the layers of a bit circuit on the CPU (SIMD & multi-threaded)" +function marginal_flows_down_layers(pbc::ParamBitCircuit, flows::Matrix, values::Matrix, on_node, on_edge) + @assert flows !== values + circuit = pbc.bitcircuit + els = circuit.elements + for layer in Iterators.reverse(circuit.layers) + Threads.@threads for dec_id in layer + par_start = @inbounds circuit.nodes[3,dec_id] + if iszero(par_start) + if dec_id == num_nodes(circuit) + # marginal flow start from 0.0 + @inbounds @views flows[:, dec_id] .= zero(eltype(flows)) + end + # no parents, ignore (can happen for false/true node and root) + else + par_end = @inbounds circuit.nodes[4,dec_id] + for j = par_start:par_end + par = @inbounds circuit.parents[j] + grandpa = @inbounds els[1,par] + sib_id = sibling(els, par, dec_id) + single_child = has_single_child(circuit.nodes, grandpa) + if single_child + if j == par_start + @inbounds @views @. 
flows[:, dec_id] = flows[:, grandpa] + else + accum_marg_flow(flows, dec_id, grandpa) + end + else + θ = eltype(flows)(pbc.params[par]) + if j == par_start + assign_marg_flow(flows, values, dec_id, grandpa, sib_id, θ) + else + accum_marg_flow(flows, values, dec_id, grandpa, sib_id, θ) + end + end + # report edge flow only once: + sib_id > dec_id && on_edge(flows, values, dec_id, sib_id, par, grandpa, single_child) + end + end + on_node(flows, values, dec_id) + end + end +end + +function assign_marg_flow(f::Matrix{<:AbstractFloat}, v, d, g, s, θ) + @inbounds @simd for j in 1:size(f,1) #@avx gives incorrect results + edge_flow = v[j, s] + v[j, d] - v[j, g] + f[j, g] + θ + edge_flow = ifelse(isnan(edge_flow), typemin(eltype(f)), edge_flow) + f[j, d] = edge_flow + end + # @assert !any(isnan, f[:,d]) +end + +function accum_marg_flow(f::Matrix{<:AbstractFloat}, d, g) + @avx for j=1:size(f,1) #@avx gives incorrect results + x = f[j, d] + y = f[j, g] + Δ = ifelse(x == y, zero(eltype(f)), abs(x - y)) + f[j, d] = max(x, y) + log1p(exp(-Δ)) + end + # @assert !any(isnan, f[:,d]) +end + +function accum_marg_flow(f::Matrix{<:AbstractFloat}, v, d, g, s, θ) + @inbounds @simd for j=1:size(f,1) #@avx gives incorrect results + x = f[j, d] + y = v[j, s] + v[j, d] - v[j, g] + f[j, g] + θ + y = ifelse(isnan(y), typemin(eltype(f)), y) + Δ = ifelse(x == y, zero(eltype(f)), abs(x - y)) + f[j, d] = max(x, y) + log1p(exp(-Δ)) + end + # @assert !any(isnan, f[:,d]) +end + +# downward pass helpers on GPU + +"Pass marginal flows down the layers of a bit circuit on the GPU" +function marginal_flows_down_layers(pbc::ParamBitCircuit, flows::CuMatrix, values::CuMatrix, + on_node, on_edge; + dec_per_thread = 8, log2_threads_per_block = 7) + bc = pbc.bitcircuit + CUDA.@sync for layer in Iterators.reverse(bc.layers) + num_examples = size(values, 1) + num_decision_sets = length(layer)/dec_per_thread + num_threads = balance_threads(num_examples, num_decision_sets, log2_threads_per_block) + num_blocks = 
(ceil(Int, num_examples/num_threads[1]), + ceil(Int, num_decision_sets/num_threads[2])) + @cuda threads=num_threads blocks=num_blocks marginal_flows_down_layers_cuda(layer, bc.nodes, bc.elements, bc.parents, pbc.params, flows, values, on_node, on_edge) + end +end + +"CUDA kernel for passing marginal flows down circuit" +function marginal_flows_down_layers_cuda(layer, nodes, elements, parents, params, flows, values, on_node, on_edge) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + index_y = (blockIdx().y - 1) * blockDim().y + threadIdx().y + stride_x = blockDim().x * gridDim().x + stride_y = blockDim().y * gridDim().y + for k = index_x:stride_x:size(values,1) + for i = index_y:stride_y:length(layer) + dec_id = @inbounds layer[i] + if dec_id == size(nodes,2) + # populate root flows + flow = zero(eltype(flows)) + else + par_start = @inbounds nodes[3,dec_id] + flow = typemin(eltype(flows)) # log(0) + if !iszero(par_start) + par_end = @inbounds nodes[4,dec_id] + for j = par_start:par_end + par = @inbounds parents[j] + grandpa = @inbounds elements[1,par] + v_gp = @inbounds values[k, grandpa] + prime = elements[2,par] + sub = elements[3,par] + θ = eltype(flows)(params[par]) + if !iszero(v_gp) # edge flow only gets reported when non-zero + f_gp = @inbounds flows[k, grandpa] + single_child = has_single_child(nodes, grandpa) + if single_child + edge_flow = f_gp + else + v_prime = @inbounds values[k, prime] + v_sub = @inbounds values[k, sub] + edge_flow = compute_marg_edge_flow(v_prime, v_sub, v_gp, f_gp, θ) + end + flow = logsumexp_cuda(flow, edge_flow) + # report edge flow only once: + dec_id == prime && on_edge(flows, values, prime, sub, par, grandpa, k, edge_flow, single_child) + end + end + end + end + @inbounds flows[k, dec_id] = flow + on_node(flows, values, dec_id, k, flow) + end + end + return nothing +end + +@inline function compute_marg_edge_flow(p_up, s_up, n_up, n_down, θ) + x = p_up + s_up - n_up + n_down + θ + ifelse(isnan(x), typemin(n_down), 
x) +end diff --git a/src/queries/pr_constraint.jl b/src/queries/pr_constraint.jl new file mode 100644 index 00000000..4594a5c5 --- /dev/null +++ b/src/queries/pr_constraint.jl @@ -0,0 +1,63 @@ +export pr_constraint + +const PRCache = Dict{Tuple{ProbCircuit, LogicCircuit}, Float64} + +# Arthur Choi, Guy Van den Broeck, and Adnan Darwiche. Tractable learning for structured probability +# spaces: A case study in learning preference distributions. In Proceedings of IJCAI, 2015. + +""" +Calculate the probability of the logic formula given by LC for the PC +""" +function pr_constraint(pc_node::StructProbCircuit, lc_node, cache::PRCache=PRCache())::Float64 + + # TODO require that both circuits have an equal vtree for safety. If they don't, then first convert them to have a vtree + @assert respects_vtree(lc_node, vtree(pc_node)) "Both circuits do not have an equal vtree" + + # Cache hit + if (pc_node, lc_node) in keys(cache) + return cache[pc_node, lc_node] + + # Boundary cases + elseif isliteralgate(pc_node) + # Both are literals, just check whether they agrees with each other + if isliteralgate(lc_node) + if literal(pc_node) == literal(lc_node) + return get!(cache, (pc_node, lc_node), 1.0) + else + return get!(cache, (pc_node, lc_node), 0.0) + end + else + pr_constraint(pc_node, children(lc_node)[1], cache) + if length(children(lc_node)) > 1 + pr_constraint(pc_node, children(lc_node)[2], cache) + return get!(cache, (pc_node, lc_node), 1.0) + else + return get!(cache, (pc_node, lc_node), + literal(children(lc_node)[1]) == literal(pc_node) ? 
1.0 : 0.0) + end + end + + # The pc is true + elseif isliteralgate(children(pc_node)[1]) + theta = exp(pc_node.log_probs[1]) + return get!(cache, (pc_node, lc_node), + theta * pr_constraint(children(pc_node)[1], lc_node, cache) + + (1.0 - theta) * pr_constraint(children(pc_node)[2], lc_node, cache)) + + # Both pcs are not trivial + else + prob = 0.0 + for (prob⋀_node, log_theta) in zip(children(pc_node), pc_node.log_probs) + p = children(prob⋀_node)[1] + s = children(prob⋀_node)[2] + + theta = exp(log_theta) + for lc⋀_node in children(lc_node) + r = children(lc⋀_node)[1] + t = children(lc⋀_node)[2] + prob += theta * pr_constraint(p, r, cache) * pr_constraint(s, t, cache) + end + end + return get!(cache, (pc_node, lc_node), prob) + end +end \ No newline at end of file diff --git a/src/queries/sample.jl b/src/queries/sample.jl new file mode 100644 index 00000000..72aa5c88 --- /dev/null +++ b/src/queries/sample.jl @@ -0,0 +1,160 @@ +export sample, to_sampled_dataframes + +import DataFrames: DataFrame, mapcols! +import Random: default_rng + +##################### +# Circuit sampling +##################### + +"Sample states from the circuit distribution." +function sample(pc::ProbCircuit; rng = default_rng()) + states, prs = sample(pc, 1, [missing for i=1:num_variables(pc)]...; rng) + return states[1,:], prs[1] +end + +sample(pc::ProbCircuit, num_samples; rng = default_rng(), gpu=false) = + sample(pc, num_samples, [missing for i=1:num_variables(pc)]...; rng, gpu) + +sample(pc::ProbCircuit, num_samples, inputs::Union{Bool,Missing}...; + rng = default_rng(), gpu=false) = + sample(pc, num_samples, collect(Union{Bool,Missing}, inputs); rng, gpu) + +function sample(pc::ProbCircuit, num_samples, inputs::AbstractVector{Union{Bool,Missing}}; + rng = default_rng(), gpu=false) + data = DataFrame(reshape(inputs, 1, :)) + data = gpu ? 
to_gpu(data) : data + states, prs = sample(pc, num_samples, data; rng) + return states[:,1,:], prs[:,1] +end + +sample(circuit::ProbCircuit, num_samples, data::DataFrame; rng = default_rng()) = + sample(same_device(ParamBitCircuit(circuit, data), data), num_samples, data; rng) + +function sample(pbc::ParamBitCircuit, num_samples, data; Float = Float32, rng = default_rng()) + @assert isgpu(data) == isgpu(pbc) "ParamBitCircuit and data need to be on the same device" + values = marginal_all(pbc, data) + return sample_down(pbc, num_samples, data, values, rng, Float) +end + +"Convert an array of samples into a vector of dataframes" +function to_sampled_dataframes(states) + dfs = mapslices(DataFrame, states, dims = [2,3]) + map(dfs) do df + mapcols!(c -> BitVector(c), df) + end + return dfs +end + +# CPU code + +function sample_down(pbc, num_samples, data, values::Array, rng, ::Type{Float}) where Float + state = zeros(Bool, num_samples, num_examples(data), num_features(data)) + logprob = zeros(Float, num_samples, num_examples(data)) + Threads.@threads for (s_id, ex_id) = collect(Iterators.product(1:size(state,1), 1:size(state,2))) + sample_rec(num_leafs(pbc), params(pbc), nodes(pbc), elements(pbc), ex_id, s_id, num_nodes(pbc), values, state, logprob, rng) + end + return state, logprob +end + +function sample_rec(nl, params, nodes, elements, ex_id, s_id, dec_id, values, state, logprob, rng) + if isleafgate(nl, dec_id) + if isliteralgate(nl, dec_id) + l = literal(nl, dec_id) + @inbounds state[s_id, ex_id, lit2var(l)] = (l > 0) + end + else + edge_log_pr, prime, sub = sample_child_cpu(params, nodes, elements, ex_id, dec_id, values, rng) + @inbounds logprob[s_id, ex_id] += edge_log_pr + sample_rec(nl, params, nodes, elements, ex_id, s_id, prime, values, state, logprob, rng) + sample_rec(nl, params, nodes, elements, ex_id, s_id, sub, values, state, logprob, rng) + end +end + +function sample_child_cpu(params, nodes, elements, ex_id, dec_id, values, rng) + @inbounds els_start 
= nodes[1,dec_id] + @inbounds els_end = nodes[2,dec_id] + threshold = log(rand(rng)) + values[ex_id, dec_id] + cumul_prob = -Inf + j_sampled = els_end - els_start + 1 # give all numerical error probability to the last node + for j = els_start:els_end + @inbounds prime = elements[2,j] + @inbounds sub = elements[3,j] + @inbounds pr = values[ex_id, prime] + values[ex_id, sub] + params[j] + Δ = ifelse(cumul_prob == pr, zero(cumul_prob), abs(cumul_prob - pr)) + cumul_prob = max(cumul_prob, pr) + log1p(exp(-Δ)) + if cumul_prob > threshold + j_sampled = j + break + end + end + @inbounds return params[j_sampled], elements[2,j_sampled], elements[3,j_sampled] +end + + +# GPU code + +function sample_down(pbc, num_samples, data, values::CuArray, rng, ::Type{Float}) where Float + CUDA.seed!(rand(rng, UInt)) + state = CUDA.zeros(Bool, num_samples, num_examples(data), num_features(data)) + logprob = CUDA.zeros(Float, num_samples, num_examples(data)) + stack = CUDA.zeros(Int32, num_samples, num_examples(data), num_features(data)+3) + @inbounds stack[:,:,1] .= 1 # start with 1 dec_id in the stack + @inbounds stack[:,:,2] .= num_nodes(pbc) # start with the pc in the stack + num_threads = balance_threads(num_samples, num_examples(data), 8) + num_blocks = (ceil(Int, num_samples/num_threads[1]), + ceil(Int, num_examples(data)/num_threads[2])) + CUDA.@sync while true + r = CUDA.rand(num_samples, num_examples(data)) + @cuda threads=num_threads blocks=num_blocks sample_cuda_kernel(num_leafs(pbc), params(pbc), nodes(pbc), elements(pbc), values, state, logprob, stack, r, Float) + all_empty(stack) && break + end + CUDA.unsafe_free!(values) # save the GC some effort + return state, logprob +end + +function sample_cuda_kernel(nl, params, nodes, elements, values, state, logprob, stack, r, ::Type{Float}) where Float + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + index_y = (blockIdx().y - 1) * blockDim().y + threadIdx().y + stride_x = blockDim().x * gridDim().x + stride_y = 
blockDim().y * gridDim().y + for s_id = index_x:stride_x:size(state,1) + for ex_id = index_y:stride_y:size(state,2) + dec_id = pop_cuda!(stack, s_id, ex_id) + if dec_id > zero(eltype(stack)) + if isleafgate(nl, dec_id) + if isliteralgate(nl, dec_id) + l = literal(nl, dec_id) + var = lit2var(l) + @inbounds state[s_id, ex_id, var] = (l > 0) + end + else + edge_log_pr, prime, sub = sample_child_cuda(params, nodes, elements, s_id, ex_id, dec_id, values, r, Float) + @inbounds logprob[s_id, ex_id] += edge_log_pr + push_cuda!(stack, prime, s_id, ex_id) + push_cuda!(stack, sub, s_id, ex_id) + end + end + end + end + return nothing +end + +function sample_child_cuda(params, nodes, elements, s_id, ex_id, dec_id, values, r, ::Type{Float}) where Float + @inbounds els_start = nodes[1,dec_id] + @inbounds els_end = nodes[2,dec_id] + @inbounds threshold = CUDA.log(r[s_id, ex_id]) + values[ex_id, dec_id] + cumul_prob::Float = -Inf + j_sampled = els_end - els_start + 1 # give all numerical error probability to the last node + for j = els_start:els_end + @inbounds prime = elements[2,j] + @inbounds sub = elements[3,j] + @inbounds pr::Float = values[ex_id, prime] + values[ex_id, sub] + params[j] + cumul_prob = logsumexp_cuda(cumul_prob, pr) + if cumul_prob > threshold + j_sampled = j + break + end + end + @inbounds return params[j_sampled], elements[2,j_sampled], elements[3,j_sampled] +end \ No newline at end of file diff --git a/src/structured_prob_nodes.jl b/src/structured_prob_nodes.jl new file mode 100644 index 00000000..6fbfca38 --- /dev/null +++ b/src/structured_prob_nodes.jl @@ -0,0 +1,141 @@ +export ProbCircuit, StructProbCircuit, StructProbLeafNode, StructProbInnerNode, + StructProbLiteralNode, StructMulNode, StructSumNode, check_parameter_integrity + +##################### +# Prob circuits that are structured, +# meaning that each conjunction is associated with a vtree node. 
+##################### + +"Root of the plain structure probabilistic circuit node hierarchy" +abstract type StructProbCircuit <: ProbCircuit end + +"A plain structured probabilistic leaf node" +abstract type StructProbLeafNode <: StructProbCircuit end + +"A plain structured probabilistic inner node" +abstract type StructProbInnerNode <: StructProbCircuit end + +"A plain structured probabilistic literal leaf node, representing the positive or negative literal of its variable" +mutable struct StructProbLiteralNode <: StructProbLeafNode + literal::Lit + vtree::Vtree + data + counter::UInt32 + StructProbLiteralNode(l,v) = begin + @assert lit2var(l) ∈ v + new(l, v, nothing, 0) + end +end + +"A plain structured probabilistic conjunction node" +mutable struct StructMulNode <: StructProbInnerNode + prime::StructProbCircuit + sub::StructProbCircuit + vtree::Vtree + data + counter::UInt32 + StructMulNode(p,s,v) = begin + @assert isinner(v) "Structured conjunctions must respect inner vtree node" + @assert varsubset_left(vtree(p),v) "$p does not go left in $v" + @assert varsubset_right(vtree(s),v) "$s does not go right in $v" + new(p,s, v, nothing, 0) + end +end + +"A plain structured probabilistic disjunction node" +mutable struct StructSumNode <: StructProbInnerNode + children::Vector{StructProbCircuit} + log_probs::Vector{Float64} + vtree::Vtree # could be leaf or inner + data + counter::UInt32 + StructSumNode(c, v) = + new(c, init_array(Float64, length(c)), v, nothing, 0) +end + +##################### +# traits +##################### + +import LogicCircuits.GateType # make available for extension +@inline GateType(::Type{<:StructProbLiteralNode}) = LiteralGate() +@inline GateType(::Type{<:StructMulNode}) = ⋀Gate() +@inline GateType(::Type{<:StructSumNode}) = ⋁Gate() + +##################### +# methods +##################### + +import LogicCircuits: children, vtree, vtree_safe, respects_vtree # make available for extension +@inline children(n::StructSumNode) = n.children 
+@inline children(n::StructMulNode) = [n.prime,n.sub] + +"Get the vtree corresponding to the argument, or nothing if the node has no vtree" +@inline vtree(n::StructProbCircuit) = n.vtree +@inline vtree_safe(n::StructProbInnerNode) = vtree(n) +@inline vtree_safe(n::StructProbLiteralNode) = vtree(n) + +# ProbCircuit has a default argument for respects: its root's vtree +respects_vtree(circuit::StructProbCircuit) = + respects_vtree(circuit, vtree(circuit)) + +@inline num_parameters_node(n::StructSumNode) = num_children(n) + +##################### +# constructors and compilation +##################### + +multiply(arguments::Vector{<:StructProbCircuit}; + reuse=nothing, use_vtree=nothing) = + multiply(arguments...; reuse, use_vtree) + +function multiply(a1::StructProbCircuit, + a2::StructProbCircuit; + reuse=nothing, use_vtree=nothing) + reuse isa StructMulNode && reuse.prime == a1 && reuse.sub == a2 && return reuse + !(use_vtree isa Vtree) && (reuse isa StructProbCircuit) && (use_vtree = reuse.vtree) + !(use_vtree isa Vtree) && (use_vtree = find_inode(vtree_safe(a1), vtree_safe(a2))) + return StructMulNode(a1, a2, use_vtree) +end + +function summate(arguments::Vector{<:StructProbCircuit}; + reuse=nothing, use_vtree=nothing) + @assert length(arguments) > 0 + reuse isa StructSumNode && reuse.children == arguments && return reuse + !(use_vtree isa Vtree) && (reuse isa StructProbCircuit) && (use_vtree = reuse.vtree) + !(use_vtree isa Vtree) && (use_vtree = mapreduce(vtree_safe, lca, arguments)) + return StructSumNode(arguments, use_vtree) +end + +# claim `StructProbCircuit` as the default `ProbCircuit` implementation that has a vtree + +compile(::Type{ProbCircuit}, a1::Union{Vtree, StructLogicCircuit}, args...) = + compile(StructProbCircuit, a1, args...) + +compile(n::StructProbCircuit, args...) = + compile(typeof(n), root(vtree(n)), args...) 
+ +compile(::Type{<:StructProbCircuit}, c::StructLogicCircuit) = + compile(StructProbCircuit, root(vtree(c)), c) + +compile(::Type{<:StructLogicCircuit}, c::StructProbCircuit) = + compile(StructLogicCircuit, root(vtree(c)), c) + +compile(::Type{<:StructProbCircuit}, ::Vtree, ::Bool) = + error("Probabilistic circuits do not have constant leafs.") + +compile(::Type{<:StructProbCircuit}, vtree::Vtree, l::Lit) = + StructProbLiteralNode(l,find_leaf(lit2var(l),vtree)) + +function compile(::Type{<:StructProbCircuit}, vtree::Vtree, circuit::LogicCircuit) + f_con(n) = error("Cannot construct a probabilistic circuit from constant leafs: first smooth and remove unsatisfiable branches.") + f_lit(n) = compile(StructProbCircuit, vtree, literal(n)) + f_a(n, cns) = multiply(cns...) # note: this will use the LCA as vtree node + f_o(n, cns) = summate(cns) # note: this will use the LCA as vtree node + foldup_aggregate(circuit, f_con, f_lit, f_a, f_o, StructProbCircuit) +end + +function fully_factorized_circuit(::Type{<:StructProbCircuit}, vtree::Vtree) + ff_logic_circuit = fully_factorized_circuit(PlainStructLogicCircuit, vtree) + compile(StructProbCircuit, vtree, ff_logic_circuit) +end diff --git a/src/Probabilistic/VtreeLearner.jl b/src/structurelearner/VtreeLearner.jl similarity index 98% rename from src/Probabilistic/VtreeLearner.jl rename to src/structurelearner/VtreeLearner.jl index 3fd4f719..676f911c 100644 --- a/src/Probabilistic/VtreeLearner.jl +++ b/src/structurelearner/VtreeLearner.jl @@ -224,7 +224,7 @@ function learn_vtree_bottom_up(train_x::PlainXData; α) (_, mi) = mutual_information(feature_matrix(train_x), Data.weights(train_x); α = α) vars = Var.(collect(1:num_features(train_x))) context = BlossomContext(vars, mi) - vtree = bottom_up_vtree(PlainVtreeNode, vars, blossom_bottom_up_curry(context)) + vtree = bottom_up_vtree(PlainVtree, vars, blossom_bottom_up_curry(context)) end ############# diff --git a/src/StructureLearner/ChowLiuTree.jl 
b/src/structurelearner/chow_liu_tree.jl similarity index 84% rename from src/StructureLearner/ChowLiuTree.jl rename to src/structurelearner/chow_liu_tree.jl index b2581869..ff9ca1ab 100644 --- a/src/StructureLearner/ChowLiuTree.jl +++ b/src/structurelearner/chow_liu_tree.jl @@ -1,5 +1,6 @@ -using LightGraphs: SimpleGraph, SimpleDiGraph, complete_graph, add_edge!, kruskal_mst, bfs_tree, center, - connected_components, induced_subgraph, nv, ne, edges, vertices, src, dst +export CLT, learn_chow_liu_tree, parent_vector +using LightGraphs: SimpleGraph, SimpleDiGraph, complete_graph, add_edge!, kruskal_mst, + bfs_tree, center, connected_components, induced_subgraph, nv, ne, edges, vertices, src, dst using SimpleWeightedGraphs: SimpleWeightedGraph using MetaGraphs: MetaDiGraph, set_prop!, props @@ -16,15 +17,12 @@ const CLT = MetaDiGraph learn a Chow-Liu tree from training set `train_x`, with Laplace smoothing factor `α`, specifying the tree root by `clt_root` return a `CLT` """ -function learn_chow_liu_tree(train_x::XData; α = 1.0, clt_root="graph_center")::CLT - learn_chow_liu_tree(WXData(train_x);α=α, clt_root=clt_root) -end - -function learn_chow_liu_tree(train_x::WXData; α = 1.0, clt_root="graph_center")::CLT +function learn_chow_liu_tree(train_x; α = 1.0, clt_root="graph_center", + weight=ones(Float64, num_examples(train_x)))::CLT features_num = num_features(train_x) # calculate mutual information - (dis_cache, MI) = mutual_information(feature_matrix(train_x), Data.weights(train_x); α = α) + (dis_cache, MI) = mutual_information(train_x, weight; α = α) # maximum spanning tree/ forest g = SimpleWeightedGraph(complete_graph(features_num)) @@ -91,9 +89,7 @@ function parent_vector(tree::CLT)::Vector{Int64} return v end -##################### -# Methods for test -##################### +import LogicCircuits: print_tree "Print edges and vertices of a ChowLiu tree" function print_tree(clt::CLT) for e in edges(clt) print(e); print(" ");end diff --git 
a/src/structurelearner/heuristics.jl b/src/structurelearner/heuristics.jl new file mode 100644 index 00000000..7312aa27 --- /dev/null +++ b/src/structurelearner/heuristics.jl @@ -0,0 +1,95 @@ + +using LinearAlgebra: diagind +""" +Pick the edge with maximum flow +""" +function count_downflow(values::Matrix{UInt64}, flows::Matrix{UInt64}, n::LogicCircuit) + dec_id = n.data.node_id + sum(1:size(flows,1)) do i + count_ones(flows[i, dec_id]) + end +end + +function count_downflow(values::Matrix{UInt64}, flows::Matrix{UInt64}, n::LogicCircuit, c::LogicCircuit) + grandpa = n.data.node_id + prime = c.prime.data.node_id + sub = c.sub.data.node_id + edge_count = sum(1:size(flows,1)) do i + count_ones(values[i, prime] & values[i, sub] & flows[i, grandpa]) + end +end + +function downflow_all(values::Matrix{UInt64}, flows::Matrix{UInt64}, n::LogicCircuit, c::LogicCircuit) + grandpa = n.data.node_id + prime = c.prime.data.node_id + sub = c.sub.data.node_id + edge = map(1:size(flows,1)) do i + digits(Bool, values[i, prime] & values[i, sub] & flows[i, grandpa], base=2, pad=64) + end + vcat(edge...) 
+end + +function eFlow(values, flows, candidates::Vector{Tuple{Node, Node}}) + edge2flows = map(candidates) do (or, and) + count_downflow(values, flows, or, and) + end + (max_flow, max_edge_id) = findmax(edge2flows) + candidates[max_edge_id], max_flow +end + +""" +Pick the variable with maximum sum of mutual information +""" +function vMI(values, flows, edge, vars::Vector{Var}, train_x) + examples_id = downflow_all(values, flows, edge...)[1:num_examples(train_x)] + sub_matrix = train_x[examples_id, vars] + (_, mi) = mutual_information(sub_matrix; α=1.0) + mi[diagind(mi)] .= 0 + scores = dropdims(sum(mi, dims = 1), dims = 1) + var = vars[argmax(scores)] + score = maximum(scores) + var, score +end + +""" +Pick the edge randomly +""" +function eRand(candidates::Vector{Tuple{Node, Node}}) + return rand(candidates) +end + +""" +Pick the variable randomly +""" +function vRand(vars::Vector{Var}) + lits = collect(Set{Lit}(scope[and])) + vars = Var.(intersect(filter(l -> l > 0, lits), - filter(l -> l < 0, lits))) + return Var(rand(vars)) +end + +function heuristic_loss(circuit::LogicCircuit, train_x; pick_edge="eFlow", pick_var="vMI") + candidates, scope = split_candidates(circuit) + values, flows = satisfies_flows(circuit, train_x) + if pick_edge == "eFlow" + edge, flow = eFlow(values, flows, candidates) + elseif pick_edge == "eRand" + edge = eRand(candidates) + else + error("Heuristics $pick_edge to pick edge is undefined.") + end + + or, and = edge + lits = collect(Set{Lit}(scope[and])) + vars = Var.(intersect(filter(l -> l > 0, lits), - filter(l -> l < 0, lits))) + + if pick_var == "vMI" + var, score = vMI(values, flows, edge, vars, train_x) + elseif pick_var == "vRand" + var = vRand(vars) + else + error("Heuristics $pick_var to pick variable is undefined.") + end + + return (or, and), var +end + diff --git a/src/structurelearner/init.jl b/src/structurelearner/init.jl new file mode 100644 index 00000000..5c9155d7 --- /dev/null +++ b/src/structurelearner/init.jl @@ -0,0 
+1,167 @@ +export learn_chow_liu_tree_circuit, learn_vtree_from_clt, compile_sdd_from_clt +using LightGraphs: outneighbors +using MetaGraphs: get_prop + +""" +Learning from data a structured-decomposable circuit with several structure learning algorithms +""" +function learn_chow_liu_tree_circuit(data; + pseudocount = 1.0, + algo = "chow-liu", algo_kwargs=(α=1.0, clt_root="graph_center"), + vtree = "chow-liu", vtree_kwargs=(vtree_mode="balanced",)) + if algo == "chow-liu" + clt = learn_chow_liu_tree(data; algo_kwargs...) + vtree = learn_vtree_from_clt(clt; vtree_kwargs...) + lc = compile_sdd_from_clt(clt, vtree) + pc = ProbCircuit(lc) + estimate_parameters(pc, data; pseudocount=pseudocount) + pc, vtree + else + error("Cannot learn a structured-decomposable circuit with algorithm $algo") + end +end + +############# +# Learn PlainVtree from CLT +############# + +" +Learn a vtree from clt, +with strategy (close to) `linear` or `balanced` +" +function learn_vtree_from_clt(clt::CLT; vtree_mode::String)::PlainVtree + roots = [i for (i, x) in enumerate(parent_vector(clt)) if x == 0] + rootnode = construct_children(Var.(roots), clt, vtree_mode) + + return rootnode +end + +function construct_node(v::Var, clt::CLT, strategy::String)::PlainVtree + children = Var.(outneighbors(clt, v)) + if isempty(children) # leaf node + return PlainVtreeLeafNode(v) + else + right = construct_children(children, clt, strategy) + return add_parent(v, right) + end +end + +function construct_children(children::Vector{Var}, clt::CLT, strategy::String)::PlainVtree + sorted_vars = sort(collect(children)) + children_nodes = Vector{PlainVtree}() + foreach(x -> push!(children_nodes, construct_node(x, clt, strategy)), sorted_vars) + + if strategy == "linear" + construct_children_linear(children_nodes, clt) + elseif strategy == "balanced" + construct_children_balanced(children_nodes, clt) + else + throw("Unknown type of strategy") + end +end + +function 
construct_children_linear(children_nodes::Vector{PlainVtree}, clt::CLT)::PlainVtree + children_nodes = Iterators.Stateful(reverse(children_nodes)) + + right = popfirst!(children_nodes) + for left in children_nodes + right = PlainVtreeInnerNode(left, right) + end + return right +end + +function construct_children_balanced(children_nodes::Vector{PlainVtree}, clt::CLT)::PlainVtree + if length(children_nodes) == 1 + return children_nodes[1] + elseif length(children_nodes) == 2 + return PlainVtreeInnerNode(children_nodes[1], children_nodes[2]) + else + len = trunc(Int64, length(children_nodes) / 2) + left = construct_children_balanced(children_nodes[1 : len], clt) + right = construct_children_balanced(children_nodes[len + 1 : end], clt) + return PlainVtreeInnerNode(left, right) + end +end + +function add_parent(parent::Var, children::PlainVtree) + return PlainVtreeInnerNode(PlainVtreeLeafNode(parent), children) +end + +##################### +# Compile PSDD from CLT and vtree +##################### + +"Compile a psdd circuit from clt and vtree" +function compile_sdd_from_clt(clt::CLT, vtree::PlainVtree)::PlainStructLogicCircuit + + parent_clt = Var.(parent_vector(clt)) + v2p = Dict{PlainVtree, Vector{PlainStructLogicCircuit}}() + + function add_mapping!(v::PlainVtree, circuits) + if !haskey(v2p, v); v2p[v] = Vector{PlainStructLogicCircuit}(); end + foreach(c -> if !(c in v2p[v]) push!(v2p[v], c);end, circuits) + end + + # compile vtree leaf node to terminal/true node + function compile_from_vtree_node(v::PlainVtreeLeafNode) + var = v.var + children = Var.(outneighbors(clt, var)) + cpt = get_prop(clt, var, :cpt) + parent = parent_clt[var] + if isequal(children, []) + circuit = compile_true_nodes(var, v; num=length(cpt) ÷ 2) + else + circuit = compile_canonical_literals(var, v) + end + add_mapping!(v, circuit) + nothing + end + + # compile to decision node + function compile_from_vtree_node(v::PlainVtreeInnerNode) + left_var = left_most_descendent(v.left).var + right_var = 
left_most_descendent(v.right).var + left_circuit = v2p[v.left] + right_circuit = v2p[v.right] + + if parent_clt[left_var] == parent_clt[right_var] # two nodes are independent, compile to seperate decision nodes + circuit = [compile_decision_node([l], [r], v) for (l, r) in zip(left_circuit, right_circuit)] + elseif left_var == parent_clt[right_var] # conditioned on left + cpt = get_prop(clt, left_var, :cpt) + circuit = compile_decision_nodes(left_circuit, right_circuit, v; num=length(cpt) ÷ 2) + else + throw("PlainVtree are not learned from the same CLT") + end + add_mapping!(v, circuit) + nothing + end + + foreach(compile_from_vtree_node, vtree) + + v2p[vtree][end] +end + +##################### +# Construct circuit node +##################### +"Construct decision nodes given `primes` and `subs`" +function compile_decision_node(primes::Vector{<:PlainStructLogicCircuit}, subs::Vector{<:PlainStructLogicCircuit}, vtree::PlainVtreeInnerNode) + elements = [conjoin(prime, sub; use_vtree=vtree) for (prime, sub) in zip(primes, subs)] + return disjoin(elements; use_vtree=vtree) +end + +"Construct literal nodes given variable `var`" +function compile_canonical_literals(var::Var, vtree::PlainVtreeLeafNode) + return [PlainStructLiteralNode( var2lit(var), vtree), PlainStructLiteralNode(-var2lit(var), vtree)] +end + +"Construct true nodes given variable `var`" +function compile_true_nodes(var::Var, vtree::PlainVtreeLeafNode; num) + pos, neg = compile_canonical_literals(var, vtree) + return [disjoin([pos, neg]; use_vtree = vtree) for _ in 1 : num] +end + +"Construct decision nodes conditiond on different distribution" +function compile_decision_nodes(primes::Vector{<:PlainStructLogicCircuit}, subs::Vector{<:PlainStructLogicCircuit}, vtree::PlainVtreeInnerNode; num) + return [compile_decision_node(primes, subs, vtree) for _ in 1 : num] +end diff --git a/src/structurelearner/learner.jl b/src/structurelearner/learner.jl new file mode 100644 index 00000000..4ac904a9 --- /dev/null +++ 
b/src/structurelearner/learner.jl @@ -0,0 +1,39 @@ +export learn_single_model +using LogicCircuits: split_step, struct_learn +using Statistics: mean +using Random +""" +Learn structure decomposable circuits +""" +function learn_single_model(train_x; + pick_edge="eFlow", pick_var="vMI", depth=1, + pseudocount=1.0, + sanity_check=true, + maxiter=typemax(Int), + seed=1337) + + # init + Random.seed!(seed) + pc, vtree = learn_struct_prob_circuit(train_x) + + # structure_update + loss(circuit) = heuristic_loss(circuit, train_x; pick_edge=pick_edge, pick_var=pick_var) + pc_split_step(circuit) = begin + c::ProbCircuit, = split_step(circuit; loss=loss, depth=depth, sanity_check=sanity_check) + estimate_parameters(c, train_x; pseudocount=pseudocount) + return c, missing + end + iter = 0 + log_per_iter(circuit) = begin + ll = EVI(circuit, train_x) + println("Log likelihood of iteration $iter is $(mean(ll))") + println() + iter += 1 + false + end + log_per_iter(pc) + pc = struct_learn(pc; + primitives=[pc_split_step], kwargs=Dict(pc_split_step=>()), + maxiter=maxiter, stop=log_per_iter) +end + diff --git a/test/IO/PSDDParserTest.jl b/test/LoadSave/circuit_loaders_tests.jl similarity index 61% rename from test/IO/PSDDParserTest.jl rename to test/LoadSave/circuit_loaders_tests.jl index d880024a..1829c16b 100644 --- a/test/IO/PSDDParserTest.jl +++ b/test/LoadSave/circuit_loaders_tests.jl @@ -5,25 +5,27 @@ using ProbabilisticCircuits @testset "Load a small PSDD and test methods" begin file = zoo_psdd_file("little_4var.psdd") prob_circuit = load_prob_circuit(file); - @test prob_circuit isa ProbΔ + @test prob_circuit isa ProbCircuit # Testing number of nodes and parameters @test 9 == num_parameters(prob_circuit) - @test 20 == size(prob_circuit)[1] + @test 20 == num_nodes(prob_circuit) # Testing Read Parameters EPS = 1e-7 - @test abs(prob_circuit[13].log_thetas[1] - (-1.6094379124341003)) < EPS - @test abs(prob_circuit[13].log_thetas[2] - (-1.2039728043259361)) < EPS - @test 
abs(prob_circuit[13].log_thetas[3] - (-0.916290731874155)) < EPS - @test abs(prob_circuit[13].log_thetas[4] - (-2.3025850929940455)) < EPS + or1 = children(children(prob_circuit)[1])[2] + @test abs(or1.log_probs[1] - (-1.6094379124341003)) < EPS + @test abs(or1.log_probs[2] - (-1.2039728043259361)) < EPS + @test abs(or1.log_probs[3] - (-0.916290731874155)) < EPS + @test abs(or1.log_probs[4] - (-2.3025850929940455)) < EPS - @test abs(prob_circuit[18].log_thetas[1] - (-2.3025850929940455)) < EPS - @test abs(prob_circuit[18].log_thetas[2] - (-2.3025850929940455)) < EPS - @test abs(prob_circuit[18].log_thetas[3] - (-2.3025850929940455)) < EPS - @test abs(prob_circuit[18].log_thetas[4] - (-0.35667494393873245)) < EPS + or2 = children(children(prob_circuit)[1])[1] + @test abs(or2.log_probs[1] - (-2.3025850929940455)) < EPS + @test abs(or2.log_probs[2] - (-2.3025850929940455)) < EPS + @test abs(or2.log_probs[3] - (-2.3025850929940455)) < EPS + @test abs(or2.log_probs[4] - (-0.35667494393873245)) < EPS - @test abs(prob_circuit[20].log_thetas[1] - (0.0)) < EPS + @test abs(prob_circuit.log_probs[1] - (0.0)) < EPS end psdd_files = ["little_4var.psdd", "msnbc-yitao-a.psdd", "msnbc-yitao-b.psdd", "msnbc-yitao-c.psdd", "msnbc-yitao-d.psdd", "msnbc-yitao-e.psdd", "mnist-antonio.psdd"] diff --git a/test/IO/CircuitSaverTest.jl b/test/LoadSave/circuit_savers_tests.jl similarity index 71% rename from test/IO/CircuitSaverTest.jl rename to test/LoadSave/circuit_savers_tests.jl index 7ce513e5..d8696f07 100644 --- a/test/IO/CircuitSaverTest.jl +++ b/test/LoadSave/circuit_savers_tests.jl @@ -9,13 +9,15 @@ using ProbabilisticCircuits zoo_psdd_file("little_4var.psdd"), zoo_vtree_file("little_4var.vtree")) # load, save, and load as .psdd + # TODO reinstate after fix Struct Prob Circuit save_circuit("$tmp/temp.psdd", circuit, vtree) - save(vtree, "$tmp/temp.vtree"); + save_vtree(vtree, "$tmp/temp.vtree"); + load_struct_prob_circuit("$tmp/temp.psdd", "$tmp/temp.vtree") # save and load as .sdd 
- save_circuit("$tmp/temp.sdd", circuit, vtree) - save(vtree, "$tmp/temp.vtree") + save_circuit("$tmp/temp.sdd", PlainStructLogicCircuit(circuit), vtree) + save_vtree(vtree, "$tmp/temp.vtree") end diff --git a/test/Logistic/LogisticCircuitTest.jl b/test/Logistic/LogisticCircuitTest.jl deleted file mode 100644 index 5ca7ab24..00000000 --- a/test/Logistic/LogisticCircuitTest.jl +++ /dev/null @@ -1,46 +0,0 @@ -using Test -using LogicCircuits -using ProbabilisticCircuits - -# This tests are supposed to test queries on the circuits -@testset "Logistic Circuit Class Conditional" begin - # Uses a Logistic Circuit with 4 variables, and tests 3 of the configurations to - # match with python version. - - EPS = 1e-7; - my_opts = (max_factors= 2, - compact⋀=false, - compact⋁=false) - - logistic_circuit = zoo_lc("little_4var.circuit", 2); - @test logistic_circuit isa Vector{<:LogisticΔNode}; - - flow_circuit = FlowΔ(logistic_circuit, 16, Float64, my_opts) - @test flow_circuit isa Vector{<:FlowΔNode}; - - # Step 1. Check Probabilities for 3 samples - data = XData(Bool.([0 0 0 0; 0 1 1 0; 0 0 1 1])); - - true_prob = [3.43147972 4.66740416; - 4.27595352 2.83503504; - 3.67415087 4.93793472] - - CLASSES = 2 - calc_prob = class_conditional_likelihood_per_instance(flow_circuit, CLASSES, data) - - for i = 1:3 - for j = 1:2 - @test true_prob[i,j] ≈ calc_prob[i,j] atol= EPS; - end - end - - # 2. 
Testing different API - fc2, calc_prob2 = class_conditional_likelihood_per_instance(logistic_circuit, CLASSES, data) - for i = 1:3 - for j = 1:2 - @test true_prob[i,j] ≈ calc_prob2[i,j] atol= EPS; - end - end - - -end \ No newline at end of file diff --git a/test/Logistic/logistic_tests.jl b/test/Logistic/logistic_tests.jl new file mode 100644 index 00000000..579176b3 --- /dev/null +++ b/test/Logistic/logistic_tests.jl @@ -0,0 +1,82 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits + +# This tests are supposed to test queries on the circuits +@testset "Logistic Circuit Query and Parameter Tests" begin + # Uses a Logistic Circuit with 4 variables, and tests 3 of the configurations to + # match with python version. + + # CLASSES = 2 + + # logistic_circuit = zoo_lc("little_4var.circuit", CLASSES) + # @test logistic_circuit isa LogisticCircuit + + # # check probabilities for binary samples + # data = @. Bool([0 0 0 0; 0 1 1 0; 0 0 1 1]) + # # true_weight_func = [3.43147972 4.66740416; + # # 4.27595352 2.83503504; + # # 3.67415087 4.93793472] + # true_prob = [0.9686740008311808 0.9906908445371728; + # 0.9862917392724188 0.9445399509069984; + # 0.9752568185086389 0.9928816444223209] + + # class_prob = class_likelihood_per_instance(logistic_circuit, CLASSES, data) + # for i = 1:size(true_prob)[1] + # for j = 1:CLASSES + # @test true_prob[i,j] ≈ class_prob[i,j] + # end + # end + + # # check probabilities for float samples + # data = Float32.(data) + # class_prob = class_likelihood_per_instance(logistic_circuit, CLASSES, data) + # for i = 1:size(true_prob)[1] + # for j = 1:CLASSES + # @test true_prob[i,j] ≈ class_prob[i,j] + # end + # end + + # # check predicted_classes + # true_labels = [2, 1, 2] + # predicted_classes = predict_class(logistic_circuit, CLASSES, data) + # @test all(predicted_classes .== true_labels) + + # # check accuracy + # @test accuracy(logistic_circuit, CLASSES, data, true_labels) == 1.0 + + # # check parameter updates + # 
original_literal_parameters = Dict{Int, Vector{Float64}}() + # foreach(logistic_circuit) do ln + # if ln isa Logistic⋁Node + # foreach(ln.children, eachrow(ln.thetas)) do c, theta + # if c isa LogisticLiteral + # original_literal_parameters[c.literal] = copy(theta) + # end + # end + # end + # end + + # one_hot_labels = [0.0 1.0; + # 1.0 0.0; + # 0.0 1.0] + # one_hot_labels = Float32.(one_hot_labels) + # true_error = true_prob .- one_hot_labels + # step_size = 0.1 + # learn_parameters(logistic_circuit, CLASSES, data, true_labels; num_epochs=1, step_size=step_size, flows_computed=true) + + # foreach(logistic_circuit) do ln + # if ln isa Logistic⋁Node + # foreach(ln.children, eachrow(ln.thetas)) do c, theta + # if c isa LogisticLiteral + # for class = 1:CLASSES + # true_update_amount = -step_size * sum(c.data.upflow .* true_error[:, class]) / size(true_error)[1] + # updated_amount = theta[class] - original_literal_parameters[c.literal][class] + # @test updated_amount ≈ true_update_amount atol=1e-7 + # end + # end + # end + # end + # end + +end \ No newline at end of file diff --git a/test/Probabilistic/CircuitQueriesTest.jl b/test/Probabilistic/CircuitQueriesTest.jl deleted file mode 100644 index 26a31784..00000000 --- a/test/Probabilistic/CircuitQueriesTest.jl +++ /dev/null @@ -1,162 +0,0 @@ -using Test -using LogicCircuits -using ProbabilisticCircuits - -# This tests are supposed to test queries on the circuits -@testset "Probability of Full Evidence" begin - # Uses a PSDD with 4 variables, and tests 3 of the configurations to - # match with python. Also tests all probabilities sum up to 1. - - EPS = 1e-7; - prob_circuit = zoo_psdd("little_4var.psdd"); - @test prob_circuit isa Vector{<:ProbΔNode}; - - flow_circuit = FlowΔ(prob_circuit, 16, Bool) - @test flow_circuit isa Vector{<:FlowΔNode}; - - - # Step 1. 
Check Probabilities for 3 samples - data = XData(Bool.([0 0 0 0; 0 1 1 0; 0 0 1 1])); - true_prob = [0.07; 0.03; 0.13999999999999999] - - calc_prob = log_likelihood_per_instance(flow_circuit, data) - calc_prob = exp.(calc_prob) - - for i = 1:3 - @test true_prob[i] ≈ calc_prob[i] atol= EPS; - end - - # Step 2. Add up all probabilities and see if they add up to one - N = 4; - data_all = XData(generate_data_all(N)) - - calc_prob_all = log_likelihood_per_instance(flow_circuit, data_all) - calc_prob_all = exp.(calc_prob_all) - sum_prob_all = sum(calc_prob_all) - - @test 1 ≈ sum_prob_all atol = EPS; -end - -@testset "Probability of partial Evidence (marginals)" begin - EPS = 1e-7; - prob_circuit = zoo_psdd("little_4var.psdd"); - - data = XData( - Int8.([0 0 0 0; 0 1 1 0; 0 0 1 1; - 0 0 0 -1; -1 1 0 -1; -1 -1 -1 -1; 0 -1 -1 -1]) - ); - true_prob = [0.07; 0.03; 0.13999999999999999; - 0.3499999999999; 0.1; 1.0; 0.8] - - opts = (compact⋀=false, compact⋁=false) - flow_circuit = UpFlowΔ(prob_circuit, 16, Float64, opts) - calc_prob = marginal_log_likelihood_per_instance(flow_circuit, data) - calc_prob = exp.(calc_prob) - - for i = 1:length(true_prob) - @test true_prob[i] ≈ calc_prob[i] atol= EPS; - end - - # Now trying the other api without instantiating a flow circuit - fc2, calc_prob2 = marginal_log_likelihood_per_instance(prob_circuit, data) - calc_prob2 = exp.(calc_prob2) - for i = 1:length(true_prob) - @test true_prob[i] ≈ calc_prob2[i] atol= EPS; - end - -end - -@testset "Marginal Pass Down" begin - EPS = 1e-7; - prob_circuit = zoo_psdd("little_4var.psdd"); - - N = 4 - data_full = XData(Int8.(generate_data_all(N))) - opts= (compact⋀=false, compact⋁=false) - - flow_circuit = FlowΔ(prob_circuit, 16, Float64, opts) - flow_circuit_marg = FlowΔ(prob_circuit, 16, Float64, opts) - - - # Comparing with down pass with fully obeserved data - pass_up_down(flow_circuit, data_full) - marginal_pass_up_down(flow_circuit_marg, data_full) - - for (ind, node) in enumerate(flow_circuit) - 
if node isa HasDownFlow - @test all( isapprox.(downflow(flow_circuit[ind]), downflow(flow_circuit_marg[ind]), atol = EPS) ) - end - end - - - # Validating one example with missing features done by hand - data_partial = XData(Int8.([-1 1 -1 1])) - flow_circuit_part = FlowΔ(prob_circuit, 16, Float64, opts) - ProbabilisticCircuits.marginal_pass_up_down(flow_circuit_part, data_partial) - - # (node index, correct down_flow_value) - true_vals = [(1, 0.5), - (2, 1.0), - (3, 1/3), - (4, 1.0), - (5, 0.5), - (6, 0.0), - (7, 2/3), - (8, 0.0), - (9, 0.3333333333333), - (10, 0.0), - (11, 0.6666666666666), - (12, 0.0), - (13, 1.0), - (14, 0.5), - (15, 0.0), - (16, 0.5), - (17, 0.0), - (18, 1.0), - (19, 1.0), - (20, 1.0)] - - for ind_val in true_vals - @test downflow(flow_circuit_part[ind_val[1]])[1] ≈ ind_val[2] atol= EPS - end - -end - -function test_mpe_brute_force(prob_circuit, evidence) - EPS = 1e-9; - result = MPE(prob_circuit, evidence); - for idx = 1 : num_examples(evidence) - marg = XData(generate_all(evidence.x[idx,:])); - fc, lls = log_likelihood_per_instance(prob_circuit, marg); - brute_mpe = marg.x[argmax(lls), :] - - # Compare and validate p(result[idx]) == p(brute_mpe) - comp_data = XData(vcat(result[idx,:]', brute_mpe')) - fc2, lls2 = log_likelihood_per_instance(prob_circuit, comp_data); - - @test lls2[1] ≈ lls2[2] atol= EPS - end -end - -@testset "MPE Brute Force Test Small (4 var)" begin - prob_circuit = zoo_psdd("little_4var.psdd"); - evidence = XData( Int8.( [-1 0 0 0; - 0 -1 -1 0; - 1 1 1 -1; - 1 0 1 0; - -1 -1 -1 1; - -1 -1 -1 -1] )) - - test_mpe_brute_force(prob_circuit, evidence) - -end - -@testset "MPE Brute Force Test Big (15 var)" begin - N = 15 - COUNT = 10 - - prob_circuit = zoo_psdd("exp-D15-N1000-C4.psdd"); - evidence = XData(Int8.(rand( (-1,0,1), (COUNT, N) ))) - - test_mpe_brute_force(prob_circuit, evidence) -end diff --git a/test/Probabilistic/EntropyKLDTest.jl b/test/Probabilistic/EntropyKLDTest.jl deleted file mode 100644 index 
ad459fcf..00000000 --- a/test/Probabilistic/EntropyKLDTest.jl +++ /dev/null @@ -1,48 +0,0 @@ -using Test -using LogicCircuits -using ProbabilisticCircuits - -@testset "Entropy and KLD" begin - pc1, vtree = load_struct_prob_circuit( - zoo_psdd_file("simple2.1.psdd"), zoo_vtree_file("simple2.vtree")) - pc2, vtree = load_struct_prob_circuit( - zoo_psdd_file("simple2.2.psdd"), zoo_vtree_file("simple2.vtree")) - pc3, vtree = load_struct_prob_circuit( - zoo_psdd_file("simple2.3.psdd"), zoo_vtree_file("simple2.vtree")) - - # Entropy calculation test - @test abs(psdd_entropy(pc1[end]) - 1.2899219826090118) < 1e-8 - @test abs(psdd_entropy(pc2[end]) - 0.9359472745536583) < 1e-8 - - # KLD Tests # - # KLD base tests - pr_constraint_cache = Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}() - kl_divergence_cache = Dict{Tuple{ProbΔNode, ProbΔNode}, Float64}() - - @test_throws AssertionError("Both nodes not normalized for same vtree node") psdd_kl_divergence(pc1[1], pc1[3], kl_divergence_cache, pr_constraint_cache) - @test_throws AssertionError("Both nodes not normalized for same vtree node") psdd_kl_divergence(pc1[2], pc1[3], kl_divergence_cache, pr_constraint_cache) - @test_throws AssertionError("Both nodes not normalized for same vtree node") psdd_kl_divergence(pc1[1], pc1[4], kl_divergence_cache, pr_constraint_cache) - @test_throws AssertionError("Both nodes not normalized for same vtree node") psdd_kl_divergence(pc1[1], pc1[5], kl_divergence_cache, pr_constraint_cache) - @test_throws AssertionError("Both nodes not normalized for same vtree node") psdd_kl_divergence(pc1[2], pc1[5], kl_divergence_cache, pr_constraint_cache) - - @test_throws AssertionError("Prob⋀ not a valid PSDD node for KL-Divergence") psdd_kl_divergence(pc1[1], pc1[6], kl_divergence_cache, pr_constraint_cache) - @test_throws AssertionError("Prob⋀ not a valid PSDD node for KL-Divergence") psdd_kl_divergence(pc1[7], pc1[2], kl_divergence_cache, pr_constraint_cache) - @test_throws 
AssertionError("Prob⋀ not a valid PSDD node for KL-Divergence") psdd_kl_divergence(pc1[6], pc2[7], kl_divergence_cache, pr_constraint_cache) - - # KLD calculation test - @test abs(psdd_kl_divergence(pc1[1], pc2[1], kl_divergence_cache, pr_constraint_cache) - 0.0) < 1e-8 - @test abs(psdd_kl_divergence(pc1[1], pc1[2], kl_divergence_cache, pr_constraint_cache) - 0.0) < 1e-8 - @test abs(psdd_kl_divergence(pc1[1], pc2[3], kl_divergence_cache, pr_constraint_cache) + log(0.9)) < 1e-8 - @test abs(psdd_kl_divergence(pc1[2], pc2[3], kl_divergence_cache, pr_constraint_cache) + log(0.1)) < 1e-8 - @test abs(psdd_kl_divergence(pc1[5], pc2[4], kl_divergence_cache, pr_constraint_cache) - 0.2 * log(0.2)) < 1e-8 - @test abs(psdd_kl_divergence(pc1[5], pc2[5], kl_divergence_cache, pr_constraint_cache) - 0.8 * log(0.8)) < 1e-8 - @test abs(psdd_kl_divergence(pc1[5], pc2[5], kl_divergence_cache, pr_constraint_cache) - 0.8 * log(0.8)) < 1e-8 - @test abs(psdd_kl_divergence(pc1[end], pc2[end]) - 0.5672800167911778) < 1e-8 - - kl_divergence_cache = Dict{Tuple{ProbΔNode, ProbΔNode}, Float64}() - @test abs(psdd_kl_divergence(pc2[4], pc3[5], kl_divergence_cache, pr_constraint_cache) - 0.0) < 1e-8 - @test abs(psdd_kl_divergence(pc2[4], pc3[4], kl_divergence_cache, pr_constraint_cache) - 0.0) < 1e-8 - @test abs(psdd_kl_divergence(pc2[3], pc3[3], kl_divergence_cache, pr_constraint_cache) - 0.9 * log(0.9 / 0.5) - 0.1 * log(0.1 / 0.5)) < 1e-8 - @test abs(psdd_kl_divergence(pc2[end], pc3[end]) - 0.38966506) < 1e-8 - -end diff --git a/test/Probabilistic/PrConstraintTest.jl b/test/Probabilistic/PrConstraintTest.jl deleted file mode 100644 index b7ea73fb..00000000 --- a/test/Probabilistic/PrConstraintTest.jl +++ /dev/null @@ -1,41 +0,0 @@ -using Test -using LogicCircuits -using ProbabilisticCircuits - - -@testset "pr_constraint Query" begin - # two nodes - simplevtree = zoo_vtree_file("simple2.vtree") - pc, vtree = load_struct_prob_circuit( - zoo_psdd_file("simple2.4.psdd"), simplevtree) - - cache = 
Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}() - - @test abs(pr_constraint(pc[end], pc[end], cache) - 1.0) < 1e-8 - @test abs(pr_constraint(pc[5], pc[3], cache) - 0.2) < 1e-8 - @test abs(pr_constraint(pc[5], pc[4], cache) - 0.8) < 1e-8 - - file_circuit = "little_4var.circuit" - file_vtree = "little_4var.vtree" - logical_circuit, vtree = load_struct_smooth_logical_circuit( - zoo_lc_file(file_circuit), zoo_vtree_file(file_vtree)) - - pc = zoo_psdd("little_4var.psdd") - - @test abs(pr_constraint(pc[end], logical_circuit[end - 1], cache) - 1.0) < 1e-8 - - # Test with two psdds - pc1, vtree = load_struct_prob_circuit(zoo_psdd_file("simple2.5.psdd"), simplevtree) - pc2, vtree = load_struct_prob_circuit(zoo_psdd_file("simple2.6.psdd"), simplevtree) - - pr_constraint_cache = Dict{Tuple{ProbΔNode, Union{ProbΔNode, StructLogicalΔNode}}, Float64}() - pr_constraint(pc1[end], pc2[end], pr_constraint_cache) - @test abs(pr_constraint_cache[pc1[1], pc2[1]] - 1.0) < 1e-8 - @test abs(pr_constraint_cache[pc1[1], pc2[2]] - 0.0) < 1e-8 - @test abs(pr_constraint_cache[pc1[3], pc2[4]] - 1.0) < 1e-8 - @test abs(pr_constraint_cache[pc1[3], pc2[5]] - 0.0) < 1e-8 - @test abs(pr_constraint_cache[pc1[9], pc2[8]] - 1.0) < 1e-8 - @test abs(pr_constraint_cache[pc1[5], pc2[4]] - 0.2) < 1e-8 - @test abs(pr_constraint_cache[pc1[5], pc2[5]] - 0.8) < 1e-8 - @test abs(pr_constraint_cache[pc1[2], pc2[3]] - 1.0) < 1e-8 -end \ No newline at end of file diff --git a/test/Probabilistic/SamplingTest.jl b/test/Probabilistic/SamplingTest.jl deleted file mode 100644 index 0e1afd5c..00000000 --- a/test/Probabilistic/SamplingTest.jl +++ /dev/null @@ -1,79 +0,0 @@ -using Test -using LogicCircuits -using ProbabilisticCircuits -using DataStructures - -@testset "Sampling Test" begin - EPS = 1e-2; - prob_circuit = zoo_psdd("little_4var.psdd"); - flow_circuit = FlowΔ(prob_circuit, 16, Bool); - - N = 4; - data_all = XData(generate_data_all(N)); - - calc_prob_all = 
log_likelihood_per_instance(flow_circuit, data_all); - calc_prob_all = exp.(calc_prob_all); - - using DataStructures - hist = DefaultDict{AbstractString,Float64}(0.0) - - Nsamples = 1000 * 1000 - for i = 1:Nsamples - cur = join(Int.(sample(prob_circuit))) - hist[cur] += 1 - end - - for k in keys(hist) - hist[k] /= Nsamples - end - - for k in keys(hist) - cur = parse(Int32, k, base=2) + 1 # cause Julia arrays start at 1 :( - @test calc_prob_all[cur] ≈ hist[k] atol= EPS; - end - - -end - -@testset "Sampling With Evidence" begin - # TODO (pashak) this test should be improved by adding few more cases - EPS = 1e-3; - prob_circuit = zoo_psdd("little_4var.psdd"); - - opts= (compact⋀=false, compact⋁=false) - flow_circuit = UpFlowΔ(prob_circuit, 1, Float64, opts); - - N = 4; - data = XData(Int8.([0 -1 0 -1])); - calc_prob = marginal_log_likelihood_per_instance(flow_circuit, data); - calc_prob = exp.(calc_prob); - - flow_circuit_all = UpFlowΔ(prob_circuit, 4, Float64, opts); - data_all = XData(Int8.([ - 0 0 0 0; - 0 0 0 1; - 0 1 0 0; - 0 1 0 1; - ])); - calc_prob_all = marginal_log_likelihood_per_instance(flow_circuit_all, data_all); - calc_prob_all = exp.(calc_prob_all); - - calc_prob_all ./= calc_prob[1] - - hist = DefaultDict{AbstractString,Float64}(0.0) - - Nsamples = 1000 * 1000 - for i = 1:Nsamples - cur = join(Int.(sample(flow_circuit))) - hist[cur] += 1 - end - - for k in keys(hist) - hist[k] /= Nsamples - end - - for ind = 1:4 - cur = join(data_all.x[ind, :]) - @test calc_prob_all[ind] ≈ hist[cur] atol= EPS; - end -end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index 77513980..8f317fa7 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,10 +1,14 @@ [deps] -DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Jive = "ba5e3d4b-8524-549f-bc71-e76ad9e9deed" -LightGraphs = 
"093fc24a-ae57-5d10-9952-331d41423f4d" LogicCircuits = "a7847b3b-b7f1-4dd5-83c3-60e0aa0f8599" -MetaGraphs = "626554b9-1ddb-594c-aa3c-2596fe9399a5" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[compat] +DataFrames = "0.21" +Jive = "0.2" +julia = "1.5" diff --git a/test/StructureLearner/CircuitBuilderTest.jl b/test/StructureLearner/CircuitBuilderTest.jl deleted file mode 100644 index 48415a53..00000000 --- a/test/StructureLearner/CircuitBuilderTest.jl +++ /dev/null @@ -1,24 +0,0 @@ -using Test: @test, @testset -using LogicCircuits -using ProbabilisticCircuits - -@testset "Probabilistic circuits learner tests" begin - data = dataset(twenty_datasets("nltcs"); do_shuffle=false, batch_size=-1) - train_x = train(data) - pc = learn_probabilistic_circuit(train_x; pseudocount = 1.0, algo = "chow-liu", algo_kwargs=(α=1.0, clt_root="graph_center")) - - # simple test - @test pc isa ProbΔ - @test check_parameter_integrity(pc) - @test num_parameters(pc) == 62 - @test pc[26].log_thetas[1] ≈ -0.023528423773273476 atol=1.0e-7 - - # all evidence sums to 1 - N = num_features(train_x); - data_all = XData(generate_data_all(N)) - fc = FlowΔ(pc, max_batch_size(train_x), Bool, opts_accumulate_flows) - calc_prob_all = log_likelihood_per_instance(fc, data_all) - calc_prob_all = exp.(calc_prob_all) - sum_prob_all = sum(calc_prob_all) - @test sum_prob_all ≈ 1 atol = 1.0e-7; -end \ No newline at end of file diff --git a/test/StructureLearner/PSDDInitializerTest.jl b/test/StructureLearner/PSDDInitializerTest.jl deleted file mode 100644 index 25a8690c..00000000 --- a/test/StructureLearner/PSDDInitializerTest.jl +++ /dev/null @@ -1,34 +0,0 @@ -using Test: @test, @testset -using LogicCircuits -using ProbabilisticCircuits - -@testset "Probabilistic circuits learner tests" begin - data = dataset(twenty_datasets("nltcs"); do_shuffle=false, batch_size=-1) - train_x = train(data) - (pc, vtree) = 
learn_struct_prob_circuit(train_x; pseudocount = 1.0, algo = "chow-liu", algo_kwargs=(α=1.0, clt_root="graph_center"), - vtree = "chow-liu", vtree_kwargs=(vtree_mode="balanced",)) - - # simple test - @test pc isa ProbΔ - @test vtree isa PlainVtree - @test num_variables(vtree) == num_features(data) - @test check_parameter_integrity(pc) - @test num_parameters(pc) == 74 - - # test below has started to fail -- unclear whether that is a bug or randomness...? - # @test pc[28].log_thetas[1] ≈ -1.1870882896239272 atol=1.0e-7 - - # is structured decomposable - for (n, vars) in variable_scopes(pc) - @test vars == BitSet(variables(origin(n).vtree)) - end - - # all evidence sums to 1 - N = num_features(train_x); - data_all = XData(generate_data_all(N)) - fc = FlowΔ(pc, max_batch_size(train_x), Bool, opts_accumulate_flows) - calc_prob_all = log_likelihood_per_instance(fc, data_all) - calc_prob_all = exp.(calc_prob_all) - sum_prob_all = sum(calc_prob_all) - @test sum_prob_all ≈ 1 atol = 1.0e-7; -end \ No newline at end of file diff --git a/test/broken/Logistic/logistic_tests.jl b/test/broken/Logistic/logistic_tests.jl new file mode 100644 index 00000000..db741936 --- /dev/null +++ b/test/broken/Logistic/logistic_tests.jl @@ -0,0 +1,31 @@ +#TODO: reinstate + +# using Test +# using LogicCircuits +# using ProbabilisticCircuits + +# # This tests are supposed to test queries on the circuits +# @testset "Logistic Circuit Class Conditional" begin +# # Uses a Logistic Circuit with 4 variables, and tests 3 of the configurations to +# # match with python version. + +# EPS = 1e-7; +# logistic_circuit = zoo_lc("little_4var.circuit", 2); +# @test logistic_circuit isa LogisticCircuit; + +# # Step 1. 
Check Probabilities for 3 samples +# data = Bool.([0 0 0 0; 0 1 1 0; 0 0 1 1]); + +# true_prob = [3.43147972 4.66740416; +# 4.27595352 2.83503504; +# 3.67415087 4.93793472] + +# CLASSES = 2 +# calc_prob = class_conditional_likelihood_per_instance(logistic_circuit, CLASSES, data) + +# for i = 1:3 +# for j = 1:2 +# @test true_prob[i,j] ≈ calc_prob[i,j] atol= EPS; +# end +# end +# end \ No newline at end of file diff --git a/test/Probabilistic/EMLearnerTest.jl b/test/broken/Mixtures/EMLearnerTest.jl similarity index 100% rename from test/Probabilistic/EMLearnerTest.jl rename to test/broken/Mixtures/EMLearnerTest.jl diff --git a/test/Probabilistic/VtreeLearnerTest.jl b/test/broken/StructureLearner/VtreeLearnerTest.jl similarity index 94% rename from test/Probabilistic/VtreeLearnerTest.jl rename to test/broken/StructureLearner/VtreeLearnerTest.jl index 146929fb..a9dac98a 100644 --- a/test/Probabilistic/VtreeLearnerTest.jl +++ b/test/broken/StructureLearner/VtreeLearnerTest.jl @@ -13,7 +13,7 @@ using ProbabilisticCircuits mktempdir() do tmp save(vtree, "$tmp/test.vtree.dot") psdd = compile_psdd_from_clt(clt, vtree); - @test psdd isa ProbΔ + @test psdd isa ProbCircuit save_as_dot(psdd, "$tmp/test.psdd.dot") end diff --git a/test/StructureLearner/ChowLiuTreeTest.jl b/test/broken/StructureLearner/chow_liu_tree_tests.jl similarity index 87% rename from test/StructureLearner/ChowLiuTreeTest.jl rename to test/broken/StructureLearner/chow_liu_tree_tests.jl index ff624ef0..24b2fb8c 100644 --- a/test/StructureLearner/ChowLiuTreeTest.jl +++ b/test/broken/StructureLearner/chow_liu_tree_tests.jl @@ -5,8 +5,7 @@ using LogicCircuits using ProbabilisticCircuits @testset "Chow-Liu Tree learner tests" begin - data = dataset(twenty_datasets("nltcs"); do_shuffle=false, batch_size=-1) - train_x = train(data) + train_x, _, _ = twenty_datasets("nltcs") clt = learn_chow_liu_tree(train_x; α=1.0, clt_root="graph_center") pv = parent_vector(clt) diff --git 
a/test/broken/StructureLearner/init_tests.jl b/test/broken/StructureLearner/init_tests.jl new file mode 100644 index 00000000..eb38727a --- /dev/null +++ b/test/broken/StructureLearner/init_tests.jl @@ -0,0 +1,40 @@ +# TODO: reinstate + +# using Test: @test, @testset +# using LogicCircuits +# using ProbabilisticCircuits + +# @testset "Probabilistic circuits learner tests" begin +# train_x, _, _ = twenty_datasets("nltcs") + +# @assert train_x isa DataFrame +# @assert isbinarydata(train_x) + +# (pc, vtree) = learn_struct_prob_circuit(train_x) + +# # simple test +# @test pc isa ProbCircuit +# @test vtree isa PlainVtree +# @test num_variables(vtree) == num_features(train_x) +# @test check_parameter_integrity(pc) +# @test num_parameters(pc) == 74 + +# # test below has started to fail -- unclear whether that is a bug or randomness...? +# # @test pc[28].log_probs[1] ≈ -1.1870882896239272 atol=1.0e-7 + +# # is structured decomposable +# for (n, vars) in variables_by_node(pc) +# @test vars == BitSet(variables(n.vtree)) +# end + +# # all evidence sums to 1 +# N = num_features(train_x) +# data_all = generate_data_all(N) +# @assert data_all isa DataFrame +# @assert isbinarydata(data_all) + +# calc_prob_all = log_likelihood_per_instance(pc, data_all) +# calc_prob_all = exp.(calc_prob_all) +# sum_prob_all = sum(calc_prob_all) +# @test sum_prob_all ≈ 1 atol = 1.0e-7; +# end \ No newline at end of file diff --git a/test/Probabilistic/MutualInformationTest.jl b/test/broken/Utils/informations_tests.jl similarity index 100% rename from test/Probabilistic/MutualInformationTest.jl rename to test/broken/Utils/informations_tests.jl diff --git a/test/Reasoning/ExpectationTest.jl b/test/broken/expectation_tests.jl similarity index 78% rename from test/Reasoning/ExpectationTest.jl rename to test/broken/expectation_tests.jl index b0b59a71..c8aafc50 100644 --- a/test/Reasoning/ExpectationTest.jl +++ b/test/broken/expectation_tests.jl @@ -2,19 +2,19 @@ using Test using LogicCircuits using 
ProbabilisticCircuits -function test_expectation_brute_force(pc::ProbΔ, lc::LogisticΔ, data::XData, CLASSES::Int) +function test_expectation_brute_force(pc::ProbCircuit, lc::LogisticCircuit, data, CLASSES::Int) EPS = 1e-7; - COUNT = size(data.x)[1] + COUNT = size(data)[1] # Compute True expectation brute force true_exp = zeros(COUNT, CLASSES) for i in 1:COUNT - row = data.x[i, :] - cur_data_all = XData(generate_all(row)) + row = data[i, :] + cur_data_all = generate_all(row) - fc1, calc_p = log_likelihood_per_instance(pc, cur_data_all) + calc_p = log_likelihood_per_instance(pc, cur_data_all) calc_p = exp.(calc_p) - fc2, calc_f = class_conditional_likelihood_per_instance(lc, CLASSES, cur_data_all) + calc_f = class_conditional_likelihood_per_instance(lc, CLASSES, cur_data_all) true_exp[i, :] = sum(calc_p .* calc_f, dims=1) true_exp[i, :] ./= sum(calc_p) #p_observed end @@ -35,19 +35,19 @@ function test_expectation_brute_force(pc::ProbΔ, lc::LogisticΔ, data::XData, C end end -function test_moment_brute_force(pc::ProbΔ, lc::LogisticΔ, data::XData, CLASSES::Int, moment::Int) +function test_moment_brute_force(pc::ProbCircuit, lc::LogisticCircuit, data, CLASSES::Int, moment::Int) EPS = 1e-7; - COUNT = size(data.x)[1] + COUNT = size(data)[1] # Compute True moment brute force true_mom = zeros(COUNT, CLASSES) for i in 1:COUNT - row = data.x[i, :] - cur_data_all = XData(generate_all(row)) + row = data[i, :] + cur_data_all = generate_all(row) - fc1, calc_p = log_likelihood_per_instance(pc, cur_data_all) + calc_p = log_likelihood_per_instance(pc, cur_data_all) calc_p = exp.(calc_p) - fc2, calc_f = class_conditional_likelihood_per_instance(lc, CLASSES, cur_data_all) + calc_f = class_conditional_likelihood_per_instance(lc, CLASSES, cur_data_all) true_mom[i, :] = sum(calc_p .* (calc_f .^ moment), dims=1) true_mom[i, :] ./= sum(calc_p) #p_observed end @@ -70,7 +70,7 @@ end pc = zoo_psdd(psdd_file); lc = zoo_lc(logistic_file, CLASSES); - data = XData(Int8.([ + data = Int8.([ 0 0 0 0; 
0 1 1 0; 0 0 1 1; @@ -82,7 +82,7 @@ end -1 -1 0 1; -1 -1 -1 1; -1 -1 -1 0; - ])); + ]); test_expectation_brute_force(pc, lc, data, CLASSES) end @@ -97,7 +97,7 @@ end pc = zoo_psdd(psdd_file); lc = zoo_lc(logistic_file, CLASSES); - data = XData(Int8.(rand( (-1,0,1), (COUNT, N) ))) + data = Int8.(rand( (-1,0,1), (COUNT, N) )) test_expectation_brute_force(pc, lc, data, CLASSES) end @@ -112,7 +112,7 @@ end pc = zoo_psdd(psdd_file); lc = zoo_lc(logistic_file, CLASSES); - data = XData(Int8.(rand( (-1,0,1), (COUNT, N) ))) + data = Int8.(rand( (-1,0,1), (COUNT, N) )) test_moment_brute_force(pc, lc, data, CLASSES, 1) test_moment_brute_force(pc, lc, data, CLASSES, 2) @@ -131,7 +131,7 @@ end pc = zoo_psdd(psdd_file); lc = zoo_lc(logistic_file, CLASSES); - data = XData(Int8.(rand( (-1,0,1), (COUNT, N) ))) + data = Int8.(rand( (-1,0,1), (COUNT, N) )) test_moment_brute_force(pc, lc, data, CLASSES, 1) test_moment_brute_force(pc, lc, data, CLASSES, 2) diff --git a/test/helper/gpu.jl b/test/helper/gpu.jl new file mode 100644 index 00000000..95e03927 --- /dev/null +++ b/test/helper/gpu.jl @@ -0,0 +1,9 @@ +using CUDA: CUDA + +function cpu_gpu_agree(f, data; atol=1e-7) + CUDA.functional() && @test f(data) == to_cpu(f(to_gpu(data))) +end + +function cpu_gpu_agree_approx(f, data; atol=1e-7) + CUDA.functional() && @test f(data) ≈ to_cpu(f(to_gpu(data))) atol=atol +end \ No newline at end of file diff --git a/test/helper/plain_logic_circuits.jl b/test/helper/plain_logic_circuits.jl new file mode 100644 index 00000000..f2a66f95 --- /dev/null +++ b/test/helper/plain_logic_circuits.jl @@ -0,0 +1,68 @@ +function little_2var() + v = Var(2) + pos = compile(PlainLogicCircuit, var2lit(v)) + neg = compile(PlainLogicCircuit, -var2lit(v)) + or1 = pos | neg + or2 = pos | neg + + v = Var(1) + pos = compile(PlainLogicCircuit, var2lit(v)) + neg = compile(PlainLogicCircuit, -var2lit(v)) + + and1 = pos & or1 + and2 = neg & or2 + and1 | and2 +end + +function little_3var() + or1 = little_2var() + v = Var(3) 
+ + pos = compile(PlainLogicCircuit, var2lit(v)) + neg = compile(PlainLogicCircuit, -var2lit(v)) + + or2 = disjoin(children(or1)) + + and1 = pos & or1 + and2 = neg & or2 + and1 | and2 +end + +function little_3var_constants() + or1 = little_2var() + v = Var(3) + + t = compile(PlainLogicCircuit, true) + f = compile(PlainLogicCircuit, false) + + pos = compile(PlainLogicCircuit, var2lit(v)) & t + neg = compile(PlainLogicCircuit, -var2lit(v)) & f + + or2 = disjoin(children(or1)) + + and1 = pos & or1 + and2 = neg & or2 + and1 | and2 +end + +function little_4var() + ors = map(1:4) do v + v = Var(v) + pos = compile(PlainLogicCircuit, var2lit(v)) + neg = compile(PlainLogicCircuit, - var2lit(v)) + or = pos | neg + end + and1 = ors[1] & ors[2] + and2 = ors[3] & ors[4] + or = and1 | and2 +end + +function little_5var() + c_4var = little_4var() + v = Var(5) + pos = compile(PlainLogicCircuit, var2lit(v)) + neg = compile(PlainLogicCircuit, - var2lit(v)) + or = pos | neg + and = c_4var & or + Plain⋁Node([and]) +end diff --git a/test/parameters_tests.jl b/test/parameters_tests.jl new file mode 100644 index 00000000..23a93078 --- /dev/null +++ b/test/parameters_tests.jl @@ -0,0 +1,43 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits +using DataFrames: DataFrame +using CUDA: CUDA + +@testset "MLE tests" begin + + dfb = DataFrame(BitMatrix([true false; true true; false true])) + r = fully_factorized_circuit(ProbCircuit,num_features(dfb)) + + estimate_parameters(r,dfb; pseudocount=1.0) + @test log_likelihood_avg(r,dfb) ≈ LogicCircuits.Utils.fully_factorized_log_likelihood(dfb; pseudocount=1.0) + + estimate_parameters(r,dfb; pseudocount=0.0) + @test log_likelihood_avg(r,dfb) ≈ LogicCircuits.Utils.fully_factorized_log_likelihood(dfb; pseudocount=0.0) + + if CUDA.functional() + + dfb_gpu = to_gpu(dfb) + + estimate_parameters(r,dfb_gpu; pseudocount=1.0) + @test log_likelihood_avg(r,dfb_gpu) ≈ LogicCircuits.Utils.fully_factorized_log_likelihood(dfb; pseudocount=1.0) + + 
estimate_parameters(r,dfb_gpu; pseudocount=0.0) + @test log_likelihood_avg(r,dfb_gpu) ≈ LogicCircuits.Utils.fully_factorized_log_likelihood(dfb; pseudocount=0.0) + + end + +end + +@testset "EM tests" begin + data = DataFrame([true missing]) + vtree2 = PlainVtree(2, :balanced) + pc = fully_factorized_circuit(StructProbCircuit, vtree2).children[1] + uniform_parameters(pc) + pc.children[1].prime.log_probs .= log.([0.3, 0.7]) + pc.children[1].sub.log_probs .= log.([0.4, 0.6]) + pbc = ParamBitCircuit(pc, data) + estimate_parameters_em(pc, data; pseudocount=0.0) + @test all(pc.children[1].prime.log_probs .== log.([1.0, 0.0])) + @test pc.children[1].sub.log_probs[1] .≈ log.([0.4, 0.6])[1] atol=1e-6 +end \ No newline at end of file diff --git a/test/plain_prob_nodes_tests.jl b/test/plain_prob_nodes_tests.jl new file mode 100644 index 00000000..592c52a7 --- /dev/null +++ b/test/plain_prob_nodes_tests.jl @@ -0,0 +1,49 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits + +include("helper/plain_logic_circuits.jl") + +@testset "probabilistic circuit nodes" begin + + c1 = little_3var() + + @test isdisjoint(linearize(ProbCircuit(c1)), linearize(ProbCircuit(c1))) + + p1 = ProbCircuit(c1) + lit3 = children(children(p1)[1])[1] + + # traits + @test p1 isa ProbCircuit + @test p1 isa PlainSumNode + @test children(p1)[1] isa PlainMulNode + @test lit3 isa PlainProbLiteralNode + @test GateType(p1) isa ⋁Gate + @test GateType(children(p1)[1]) isa ⋀Gate + @test GateType(lit3) isa LiteralGate + @test length(mul_nodes(p1)) == 4 + + # methods + @test num_parameters(p1) == 10 + + # extension methods + @test literal(lit3) === literal(children(children(c1)[1])[1]) + @test variable(left_most_descendent(p1)) == Var(3) + @test ispositive(left_most_descendent(p1)) + @test !isnegative(left_most_descendent(p1)) + @test num_nodes(p1) == 15 + @test num_edges(p1) == 18 + + r1 = fully_factorized_circuit(ProbCircuit,10) + @test num_parameters(r1) == 2*10+1 + + @test length(mul_nodes(r1)) == 1 + 
+ # compilation tests + lit1 = compile(PlainProbCircuit, Lit(1)) + litn1 = compile(PlainProbCircuit, Lit(-1)) + r = lit1 * 0.3 + 0.7 * litn1 + @test r isa PlainSumNode + @test all(children(r) .== [lit1, litn1]) + @test all(r.log_probs .≈ log.([0.3, 0.7])) +end \ No newline at end of file diff --git a/test/queries/informations_tests.jl b/test/queries/informations_tests.jl new file mode 100644 index 00000000..efc60de4 --- /dev/null +++ b/test/queries/informations_tests.jl @@ -0,0 +1,20 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits + +@testset "Entropy and KLD" begin + + pc1, vtree = load_struct_prob_circuit( + zoo_psdd_file("simple2.1.psdd"), zoo_vtree_file("simple2.vtree")) + pc2, vtree = load_struct_prob_circuit( + zoo_psdd_file("simple2.2.psdd"), zoo_vtree_file("simple2.vtree")) + pc3, vtree = load_struct_prob_circuit( + zoo_psdd_file("simple2.3.psdd"), zoo_vtree_file("simple2.vtree")) + + @test entropy(pc1) ≈ 1.2899219826090118 + @test entropy(pc2) ≈ 0.9359472745536583 + + @test kl_divergence(pc1, pc2) ≈ 0.5672800167911778 + @test kl_divergence(pc2, pc3) ≈ 0.38966506 + +end diff --git a/test/queries/likelihood_tests.jl b/test/queries/likelihood_tests.jl new file mode 100644 index 00000000..9f32e388 --- /dev/null +++ b/test/queries/likelihood_tests.jl @@ -0,0 +1,38 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits +using DataFrames: DataFrame + +include("../helper/gpu.jl") + +@testset "Likelihood" begin + # Uses a PC with 4 variables, and tests 3 of the configurations to + # match with python. Also tests all probabilities sum up to 1. + + prob_circuit = zoo_psdd("little_4var.psdd"); + @test prob_circuit isa ProbCircuit; + + # Step 1. Check Probabilities for 3 samples + data = DataFrame(BitArray([0 0 0 0; 0 1 1 0; 0 0 1 1])); + true_prob = [0.07; 0.03; 0.13999999999999999] + + calc_prob = EVI(prob_circuit, data) + calc_prob = exp.(calc_prob) + + @test true_prob ≈ calc_prob atol=1e-7; + + # Step 2. 
Add up all probabilities and see if they add up to one + N = 4; + data_all = generate_data_all(N) + + calc_prob_all = EVI(prob_circuit, data_all) + calc_prob_all = exp.(calc_prob_all) + sum_prob_all = sum(calc_prob_all) + + @test 1 ≈ sum_prob_all atol = 1e-7; + + cpu_gpu_agree_approx(data_all) do d + EVI(prob_circuit, d) + end + +end \ No newline at end of file diff --git a/test/queries/map_tests.jl b/test/queries/map_tests.jl new file mode 100644 index 00000000..1b0b1f8c --- /dev/null +++ b/test/queries/map_tests.jl @@ -0,0 +1,61 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits +using DataFrames: DataFrame +using CUDA + +include("../helper/gpu.jl") + +@testset "MAP" begin + prob_circuit = zoo_psdd("little_4var.psdd"); + + data_full = generate_data_all(num_variables(prob_circuit)) + + map, mappr = MAP(prob_circuit, data_full) + + @test map == data_full + + evipr = EVI(prob_circuit, data_full) + @test mappr ≈ evipr atol=1e-6 + + data_marg = DataFrame([false false false false; + false true true false; + false false true true; + false false false missing; + missing true false missing; + missing missing missing missing; + false missing missing missing]) + + map, mappr = MAP(prob_circuit, data_marg) + + @test all(zip(eachcol(map), eachcol(data_marg))) do (cf,cm) + all(zip(cf, cm)) do (f,m) + ismissing(m) || f == m + end + end + + mar = MAR(prob_circuit, data_marg) + + @test all(mar .> mappr .- 1e-6) + + # same MAP states on CPU and GPU + cpu_gpu_agree(data_full) do d + MAP(prob_circuit, d)[1] + end + + # same MAP probabilities on CPU and GPU + cpu_gpu_agree_approx(data_full) do d + MAP(prob_circuit, d)[2] + end + + # same MAP states on CPU and GPU + cpu_gpu_agree(data_marg) do d + MAP(prob_circuit, d)[1] + end + + # same MAP probabilities on CPU and GPU + cpu_gpu_agree_approx(data_marg) do d + MAP(prob_circuit, d)[2] + end + +end \ No newline at end of file diff --git a/test/queries/marginal_flow_tests.jl b/test/queries/marginal_flow_tests.jl new file 
mode 100644 index 00000000..235809c5 --- /dev/null +++ b/test/queries/marginal_flow_tests.jl @@ -0,0 +1,97 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits +using DataFrames: DataFrame +using CUDA + +include("../helper/gpu.jl") + +@testset "Marginals" begin + prob_circuit = zoo_psdd("little_4var.psdd"); + + data_marg = DataFrame([false false false false; + false true true false; + false false true true; + false false false missing; + missing true false missing; + missing missing missing missing; + false missing missing missing]) + true_prob = [0.07; 0.03; 0.13999999999999999; + 0.3499999999999; 0.1; 1.0; 0.8] + + calc_prob = exp.(MAR(prob_circuit, data_marg)) + @test true_prob ≈ calc_prob atol=1e-7 + + cpu_gpu_agree_approx(data_marg) do d + marginal_all(prob_circuit, d) + end + + function test_complete_mar(data) + r1 = EVI(prob_circuit, data) + r2 = MAR(prob_circuit, data) + @test r1 ≈ r2 atol=1e-6 + end + + data_full = generate_data_all(num_variables(prob_circuit)) + + test_complete_mar(data_full) + CUDA.functional() && test_complete_mar(to_gpu(data_full)) + + cpu_gpu_agree_approx(data_full) do d + marginal_all(prob_circuit, d) + end + +end + +@testset "Marginal flows" begin + + prob_circuit = zoo_psdd("little_4var.psdd"); + + function test_flows(data) + # Comparing with down pass with fully observed data + + data_f = CUDA.@allowscalar Float64.(data) + + _, f1 = satisfies_flows(prob_circuit, data_f) + _, f2 = marginal_flows(prob_circuit, data) + + # note: while downward pass flows should be the same, + # the upward pass is *not* supposed to be the same (parameters used vs not) + + f1 = to_cpu(f1[:,3:end]) # ignore true and false leaf + f2 = to_cpu(f2[:,3:end]) # ignore true and false leaf + + @test f1 ≈ exp.(f2) atol=1e-6 + end + + data_full = generate_data_all(num_variables(prob_circuit)) + + test_flows(data_full) + CUDA.functional() && test_flows(to_gpu(data_full)) + + cpu_gpu_agree_approx(data_full) do d + _, f = marginal_flows(prob_circuit, d) 
+ f[:,3:end] # ignore true and false leaf + end + + # Validating one example with missing features done by hand + data_partial = DataFrame([missing true missing true;]) + prob_circuit = zoo_psdd("little_4var.psdd"); + _, f = marginal_flows(prob_circuit, data_partial) + f = exp.(f) + + @test f[end] ≈ 1.0 + @test f[end-1] ≈ 1.0 + @test f[end-2] ≈ 1.0 + @test f[end-4] ≈ 2/3 + @test f[end-5] ≈ 0.0 atol=1e-7 + @test f[end-6] ≈ 1/2 + @test f[end-7] ≈ 1.0 + @test f[end-8] ≈ 1/3 + @test f[end-9] ≈ 1 + @test f[end-10] ≈ 1/2 + + # correctness on gpu by transitivity with above test + +end + diff --git a/test/queries/pr_constraint_tests.jl b/test/queries/pr_constraint_tests.jl new file mode 100644 index 00000000..d3eb6cb8 --- /dev/null +++ b/test/queries/pr_constraint_tests.jl @@ -0,0 +1,30 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits + +@testset "Probability of constraint" begin + + # two nodes + simplevtree = zoo_vtree_file("simple2.vtree") + pc, vtree = load_struct_prob_circuit( + zoo_psdd_file("simple2.4.psdd"), simplevtree) + + + @test pr_constraint(pc, pc) ≈ 1.0 + + file_circuit = "little_4var.circuit" + file_vtree = "little_4var.vtree" + logic_circuit, vtree = load_struct_smooth_logic_circuit( + zoo_lc_file(file_circuit), zoo_vtree_file(file_vtree)) + + pc, _ = load_struct_prob_circuit(zoo_psdd_file("little_4var.psdd"), zoo_vtree_file("little_4var.vtree")) + + @test pr_constraint(pc, children(logic_circuit)[1]) ≈ 1.0 + + # Test with two psdds + pc1, vtree = load_struct_prob_circuit(zoo_psdd_file("simple2.5.psdd"), simplevtree) + pc2, vtree = load_struct_prob_circuit(zoo_psdd_file("simple2.6.psdd"), simplevtree) + + @test pr_constraint(pc1, pc2) ≈ 1 + +end diff --git a/test/queries/sample_test.jl b/test/queries/sample_test.jl new file mode 100644 index 00000000..46143add --- /dev/null +++ b/test/queries/sample_test.jl @@ -0,0 +1,101 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits +using Random: MersenneTwister +using CUDA + 
+function histogram_matches_likelihood(samples::Matrix{Bool}, worlds, loglikelihoods) + hist = Dict{BitVector,Int}() + for i = 1:size(samples,1) + sample = BitVector(samples[i,:]) + hist[sample] = get(hist, sample, 0) + 1 + end + for i = 1:size(worlds,1) + exact_prob = exp(loglikelihoods[i]) + ex = BitVector(example(worlds,i)) + estim_prob = get(hist, ex, 0) / size(samples,1) + @test exact_prob ≈ estim_prob atol=1e-2; + end + +end + +@testset "Unconditional Sampling Test" begin + + rng = MersenneTwister(42) + + pc = zoo_psdd("little_4var.psdd"); + worlds = generate_data_all(num_variables(pc)); + + loglikelihoods = EVI(pc, worlds) + + Nsamples = 2_0000 + + samples, _ = sample(pc, Nsamples; rng) + histogram_matches_likelihood(samples, worlds, loglikelihoods) + + if CUDA.functional() + samples, _ = sample(pc, Nsamples; rng, gpu = true) + samples_cpu = to_cpu(samples) + histogram_matches_likelihood(samples_cpu, worlds, loglikelihoods) + end + +end + +@testset "Conditional Sampling Test" begin + + rng = MersenneTwister(42) + num_samples = 10 + + pc = zoo_psdd("little_4var.psdd"); + data_all = generate_data_all(num_variables(pc)); + + # sampling given complete data should return same data with its log likelihood + + + loglikelihoods = MAR(pc, data_all) + sample_states, sample_prs = sample(pc, num_samples, data_all; rng) + + for i in 1:num_samples + @test sample_states[i,:,:] == convert(Matrix,data_all) + @test sample_prs[i,:] ≈ loglikelihoods atol=1e-6 + end + + # same states on CPU and GPU + cpu_gpu_agree(data_all) do d + sample(pc, num_samples, d)[1] + end + + # same probabilities on CPU and GPU + cpu_gpu_agree_approx(data_all) do d + sample(pc, num_samples, d)[2] + end + + + # sampling given partial data invariants + + data_marg = DataFrame([false false false false; + false true true false; + false false true true; + false false false missing; + missing true false missing; + missing missing missing missing; + false missing missing missing]) + + _, map_pr = MAP(pc, 
data_marg) + + sample_states, sample_prs = sample(pc, num_samples, data_marg; rng) + + for i in 1:num_samples + + # samples keep the partial evidence values + pairs = collect(zip(sample_states[i,:,:], convert(Matrix,data_marg))) + @test all(pairs) do (f,m) + ismissing(m) || f == m + end + + # probability does not exceed MAP probability + @test all(sample_prs[i,:] .<= map_pr .+ 1e-6) + end + + +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index b9e58d89..6c0ce54c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,4 +11,5 @@ end using Jive -runtests(@__DIR__, skip=["runtests.jl", "helper"]) +# TODO reinstate after refactoring all modules +runtests(@__DIR__, skip=["runtests.jl", "helper", "broken"]) diff --git a/test/structured_prob_nodes_tests.jl b/test/structured_prob_nodes_tests.jl new file mode 100644 index 00000000..bf31a710 --- /dev/null +++ b/test/structured_prob_nodes_tests.jl @@ -0,0 +1,112 @@ +using Test +using LogicCircuits +using ProbabilisticCircuits +using DataFrames: DataFrame + + +@testset "structured probabilistic circuit nodes" begin + + vtree = PlainVtree(10, :balanced) + f = fully_factorized_circuit(StructProbCircuit, vtree) + @test f isa StructProbCircuit + @test num_nodes(f) == 20+10+9*2+1 + @test num_edges(f) == 20+18+9+1 + @test length(mul_nodes(f)) == 9 + @test length(sum_nodes(f)) == 10+9+1 + + @test respects_vtree(f) + @test respects_vtree(f, PlainVtree(10, :balanced)) + @test !respects_vtree(f, PlainVtree(5, :balanced)) + @test !respects_vtree(f, PlainVtree(10, :rightlinear)) + @test !respects_vtree(f, PlainVtree(10, :leftlinear)) + + @test variable(left_most_descendent(f)) == Var(1) + @test variable(right_most_descendent(f)) == Var(10) + @test ispositive(left_most_descendent(f)) + @test isnegative(right_most_descendent(f)) + + @test literal((StructProbCircuit,vtree)(Lit(-5))) == Lit(-5) + + @test_throws Exception multiply(StructProbCircuit[]) + @test_throws Exception summate(StructProbCircuit[]) 
+ + @test isdecomposable(f) + + @test variables(f) == BitSet(1:10) + @test num_variables(f) == 10 + @test issmooth(f) + + input = DataFrame(BitArray([1 0 1 0 1 0 1 0 1 0; + 1 1 1 1 1 1 1 1 1 1; + 0 0 0 0 0 0 0 0 0 0; + 0 1 1 0 1 0 0 1 0 1])) + @test satisfies(f,input) == BitVector([1,1,1,1]) + + plainf = PlainLogicCircuit(f) + foreach(plainf) do n + @test n isa PlainLogicCircuit + end + @test plainf !== f + @test num_edges(plainf) == num_edges(f) + @test num_nodes(plainf) == num_nodes(f) + @test length(and_nodes(plainf)) == 9 + @test length(or_nodes(plainf)) == 10+9+1 + @test model_count(plainf) == BigInt(2)^10 + @test isempty(intersect(linearize(f),linearize(plainf))) + + ref = StructProbCircuit(vtree,plainf) + foreach(ref) do n + @test n isa StructProbCircuit + end + @test plainf !== ref + @test f !== ref + @test f.vtree === ref.vtree + @test num_edges(ref) == num_edges(f) + @test num_nodes(ref) == num_nodes(f) + @test length(and_nodes(ref)) == 9 + @test length(or_nodes(ref)) == 10+9+1 + @test model_count(ref) == BigInt(2)^10 + @test isempty(intersect(linearize(f),linearize(ref))) + + ref = StructProbCircuit(vtree,f) + foreach(ref) do n + @test n isa StructProbCircuit + end + @test plainf !== ref + @test f !== ref + @test f.vtree === ref.vtree + @test num_edges(ref) == num_edges(f) + @test num_nodes(ref) == num_nodes(f) + @test length(and_nodes(ref)) == 9 + @test length(or_nodes(ref)) == 10+9+1 + @test model_count(ref) == BigInt(2)^10 + @test isempty(intersect(linearize(f),linearize(ref))) + + mgr = SddMgr(7, :balanced) + v = Dict([(i => compile(mgr, Lit(i))) for i=1:7]) + c = (v[1] | !v[2] | v[3]) & + (v[2] | !v[7] | v[6]) & + (v[3] | !v[4] | v[5]) & + (v[1] | !v[4] | v[6]) + + c2 = StructLogicCircuit(mgr, c) + c2 = propagate_constants(c2; remove_unary=true) + + c3 = StructProbCircuit(mgr, c2) + foreach(c3) do n + @test n isa StructProbCircuit + end + @test num_edges(c3) == 69 + @test num_variables(c3) == 7 + + # compilation tests + v = Vtree(Var(1)) + lit1 = 
compile(StructProbCircuit, v, Lit(1)) + litn1 = compile(StructProbCircuit, v, Lit(-1)) + r = lit1 * 0.3 + 0.7 * litn1 + @test r isa StructSumNode + @test all(children(r) .== [lit1, litn1]) + @test r.vtree === lit1.vtree + @test all(r.log_probs .≈ log.([0.3, 0.7])) + +end \ No newline at end of file