From 4bf2fce48f6aec9834446232b6b2e561181bb534 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 10:31:35 -0500 Subject: [PATCH 01/26] remove the bijectors extension and relevant tests, benchmarks --- HISTORY.md | 11 ++ Project.toml | 6 +- bench/Project.toml | 2 - bench/benchmarks.jl | 1 - bench/normallognormal.jl | 44 ------ ext/AdvancedVIBijectorsExt.jl | 129 ------------------ test/Project.toml | 2 - .../klminrepgraddescent_bijectors.jl | 94 ------------- .../klminrepgradproxdescent_bijectors.jl | 89 ------------ .../klminscoregraddescent_bijectors.jl | 86 ------------ test/models/normallognormal.jl | 62 --------- test/runtests.jl | 3 - 12 files changed, 12 insertions(+), 517 deletions(-) delete mode 100644 bench/normallognormal.jl delete mode 100644 ext/AdvancedVIBijectorsExt.jl delete mode 100644 test/algorithms/klminrepgraddescent_bijectors.jl delete mode 100644 test/algorithms/klminrepgradproxdescent_bijectors.jl delete mode 100644 test/algorithms/klminscoregraddescent_bijectors.jl delete mode 100644 test/models/normallognormal.jl diff --git a/HISTORY.md b/HISTORY.md index 32a58da21..9172492f5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,14 @@ +# Release 0.7 + +## Removal of special treatment to `Bijectors.TransformedDistribution` + +Previously, `KLMinRepGradDescent`, `KLMinRepGradProxDescent`, `KLMinScoreGradDescent` only required the support of the target log-density problem to match that of `q`. +This was implemented by giving a special treatment to `q <: Bijectors.TransformedDistribution` through the `Bijectors` extension. +This, however, resulted in a multiplicative complexity to maintaining the relevant bits. +Since this is not the only way to deal with constrained supports, `Bijectors` extension is now removed. +In addition, `KLMinRepGradDescent`, `KLMinRepGradProxDescent`, `KLMinScoreGradDescent` now expect an unconstrained target log-density problem. +Instead, a tutorial has been added to the documentation on how to deal with a target log-density problem with constrained support. 
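As a rough sketch of the pattern the new tutorial describes (all names below, such as `ToyTarget` and `UnconstrainedTarget`, are illustrative and not part of the AdvancedVI API): the constrained target is wrapped in a `LogDensityProblem` that evaluates it on the unconstrained space and adds the Jacobian adjustment.

```julia
using Bijectors, Distributions, LogDensityProblems

# Toy target: first coordinate unconstrained, second coordinate positive.
struct ToyTarget end
function LogDensityProblems.logdensity(::ToyTarget, θ)
    return logpdf(Normal(0.0, 1.0), θ[1]) + logpdf(LogNormal(0.0, 1.0), θ[2])
end
LogDensityProblems.dimension(::ToyTarget) = 2
LogDensityProblems.capabilities(::Type{<:ToyTarget}) = LogDensityProblems.LogDensityOrder{0}()

# Wrapper that evaluates the target on the unconstrained space with a Jacobian adjustment.
struct UnconstrainedTarget{P,T}
    prob::P
    transform::T  # maps unconstrained values to the constrained support
end
function LogDensityProblems.logdensity(u::UnconstrainedTarget, η)
    θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(u.transform, η)
    return LogDensityProblems.logdensity(u.prob, θ) + logabsdetjac
end
LogDensityProblems.dimension(u::UnconstrainedTarget) = LogDensityProblems.dimension(u.prob)
LogDensityProblems.capabilities(::Type{<:UnconstrainedTarget}) = LogDensityProblems.LogDensityOrder{0}()

# Match the support of ToyTarget coordinate-by-coordinate.
b = Bijectors.Stacked(
    Bijectors.bijector.([Normal(0.0, 1.0), LogNormal(0.0, 1.0)]), [1:1, 2:2]
)
prob_unconstrained = UnconstrainedTarget(ToyTarget(), Bijectors.inverse(b))
# `prob_unconstrained` can then be passed to the algorithms above (after adding
# gradient support, e.g. via LogDensityProblemsAD).
```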
+ # Release 0.6 ## New Algorithms diff --git a/Project.toml b/Project.toml index 437f5216d..92589cb86 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "AdvancedVI" uuid = "b5ca4192-6429-45e5-a2d9-87aec30a685c" -version = "0.6" +version = "0.7" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" @@ -20,13 +20,11 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [weakdeps] -Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" [extensions] -AdvancedVIBijectorsExt = ["Bijectors", "Optimisers"] AdvancedVIEnzymeExt = ["Enzyme", "ChainRulesCore"] AdvancedVIMooncakeExt = ["Mooncake", "ChainRulesCore"] AdvancedVIReverseDiffExt = ["ReverseDiff", "ChainRulesCore"] @@ -34,7 +32,6 @@ AdvancedVIReverseDiffExt = ["ReverseDiff", "ChainRulesCore"] [compat] ADTypes = "1" Accessors = "0.1" -Bijectors = "0.13, 0.14, 0.15" ChainRulesCore = "1" DiffResults = "1" DifferentiationInterface = "0.6, 0.7" @@ -54,7 +51,6 @@ StatsBase = "0.32, 0.33, 0.34" julia = "1.10, 1.11.2" [extras] -Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" diff --git a/bench/Project.toml b/bench/Project.toml index 0ca4fc686..a63fcfde5 100644 --- a/bench/Project.toml +++ b/bench/Project.toml @@ -2,7 +2,6 @@ ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" AdvancedVI = "b5ca4192-6429-45e5-a2d9-87aec30a685c" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DistributionsAD = "ced4e74d-a319-5a8a-b0ac-84af2272839c" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" @@ -22,7 +21,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" ADTypes = "1" AdvancedVI = "0.6" BenchmarkTools = "1" -Bijectors = "0.13, 0.14, 0.15" Distributions = "0.25.111" DistributionsAD = "0.6" Enzyme = "0.13.7" diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index ad40b3534..08f688be4 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -34,7 +34,6 @@ begin T = Float64 for (probname, prob) in [ - ("normal + bijector", normallognormal(; n_dims=10, realtype=T)) ("normal", normal(; n_dims=10, realtype=T)) ] max_iter = 10^4 diff --git a/bench/normallognormal.jl b/bench/normallognormal.jl deleted file mode 100644 index 4cfc9af1a..000000000 --- a/bench/normallognormal.jl +++ /dev/null @@ -1,44 +0,0 @@ -struct NormalLogNormal{MX,SX,MY,SY} - μ_x::MX - σ_x::SX - μ_y::MY - Σ_y::SY -end - -function LogDensityProblems.logdensity(model::NormalLogNormal, θ) - (; μ_x, σ_x, μ_y, Σ_y) = model - log_density_x = logpdf(LogNormal(μ_x, σ_x), θ[1]) - log_density_y = logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) - return log_density_x + log_density_y -end - -function LogDensityProblems.logdensity_and_gradient(model::NormalLogNormal, θ) - return ( - LogDensityProblems.logdensity(model, θ), - ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), - ) -end - -function LogDensityProblems.dimension(model::NormalLogNormal) - return length(model.μ_y) + 1 -end - -function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) - return LogDensityProblems.LogDensityOrder{1}() -end - -function Bijectors.bijector(model::NormalLogNormal) - (; μ_x, σ_x, μ_y, Σ_y) = model - return Bijectors.Stacked( - 
Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), - [1:1, 2:(1 + length(μ_y))], - ) -end - -function normallognormal(; n_dims=10, realtype=Float64) - μ_x = realtype(5.0) - σ_x = realtype(0.3) - μ_y = Fill(realtype(5.0), n_dims) - σ_y = Fill(realtype(0.3), n_dims) - return NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y .^ 2)) -end diff --git a/ext/AdvancedVIBijectorsExt.jl b/ext/AdvancedVIBijectorsExt.jl deleted file mode 100644 index 0e85d4d43..000000000 --- a/ext/AdvancedVIBijectorsExt.jl +++ /dev/null @@ -1,129 +0,0 @@ -module AdvancedVIBijectorsExt - -using AdvancedVI -using DiffResults: DiffResults -using Bijectors -using LinearAlgebra -using Optimisers -using Random - -function AdvancedVI.init( - rng::Random.AbstractRNG, - alg::AdvancedVI.ParamSpaceSGD, - q_init::Bijectors.TransformedDistribution, - prob, -) - (; adtype, optimizer, averager, objective, operator) = alg - if q_init.dist isa AdvancedVI.MvLocationScale && - operator isa AdvancedVI.IdentityOperator - @warn( - "IdentityOperator is used with a variational family <:MvLocationScale. Optimization can easily fail under this combination due to singular scale matrices. Consider using the operator `ClipScale` in the algorithm instead.", - ) - end - params, re = Optimisers.destructure(q_init) - opt_st = Optimisers.setup(optimizer, params) - obj_st = AdvancedVI.init(rng, objective, adtype, q_init, prob, params, re) - avg_st = AdvancedVI.init(averager, params) - grad_buf = DiffResults.DiffResult(zero(eltype(params)), similar(params)) - return ( - prob=prob, - q=q_init, - iteration=0, - grad_buf=grad_buf, - opt_st=opt_st, - obj_st=obj_st, - avg_st=avg_st, - ) -end - -function AdvancedVI.apply( - op::ClipScale, - ::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScale}}, - state, - params, - restructure, -) - q = restructure(params) - ϵ = convert(eltype(params), op.epsilon) - - # Project the scale matrix to the set of positive definite triangular matrices - diag_idx = diagind(q.dist.scale) - @. q.dist.scale[diag_idx] = max(q.dist.scale[diag_idx], ϵ) - - params, _ = Optimisers.destructure(q) - - return params -end - -function AdvancedVI.apply( - op::ClipScale, - ::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScaleLowRank}}, - state, - params, - restructure, -) - q = restructure(params) - ϵ = convert(eltype(params), op.epsilon) - - @. q.dist.scale_diag = max(q.dist.scale_diag, ϵ) - - params, _ = Optimisers.destructure(q) - - return params -end - -function AdvancedVI.apply( - ::AdvancedVI.ProximalLocationScaleEntropy, - ::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScale}}, - leaf::Optimisers.Leaf{<:Union{<:DoG,<:DoWG,<:Descent},S}, - params, - restructure, -) where {S} - q = restructure(params) - - stepsize = AdvancedVI.stepsize_from_optimizer_state(leaf.rule, leaf.state) - diag_idx = diagind(q.dist.scale) - scale_diag = q.dist.scale[diag_idx] - @. 
q.dist.scale[diag_idx] = - scale_diag + 1 / 2 * (sqrt(scale_diag^2 + 4 * stepsize) - scale_diag) - - params, _ = Optimisers.destructure(q) - - return params -end - -function AdvancedVI.reparam_with_entropy( - rng::Random.AbstractRNG, - q::Bijectors.TransformedDistribution, - q_stop::Bijectors.TransformedDistribution, - n_samples::Int, - ent_est::AdvancedVI.AbstractEntropyEstimator, -) - transform = q.transform - q_unconst = q.dist - q_unconst_stop = q_stop.dist - - # Draw samples and compute entropy of the unconstrained distribution - unconstr_samples, unconst_entropy = AdvancedVI.reparam_with_entropy( - rng, q_unconst, q_unconst_stop, n_samples, ent_est - ) - - # Apply bijector to samples while estimating its jacobian - unconstr_iter = AdvancedVI.eachsample(unconstr_samples) - unconstr_init = first(unconstr_iter) - samples_init, logjac_init = with_logabsdet_jacobian(transform, unconstr_init) - samples_and_logjac = mapreduce( - AdvancedVI.catsamples_and_acc, - Iterators.drop(unconstr_iter, 1); - init=(reshape(samples_init, (:, 1)), logjac_init), - ) do sample - with_logabsdet_jacobian(transform, sample) - end - samples = first(samples_and_logjac) - logjac = last(samples_and_logjac) / n_samples - - entropy = unconst_entropy + logjac - return samples, entropy -end - -end diff --git a/test/Project.toml b/test/Project.toml index db7d9068a..f657e28c1 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,6 +1,5 @@ [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" -Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" @@ -25,7 +24,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] ADTypes = "0.2.1, 1" -Bijectors = "0.13, 0.14, 0.15" DiffResults = "1" DifferentiationInterface = "0.6, 0.7" Distributions = "0.25.111" diff --git a/test/algorithms/klminrepgraddescent_bijectors.jl b/test/algorithms/klminrepgraddescent_bijectors.jl deleted file mode 100644 index 0d26a309a..000000000 --- a/test/algorithms/klminrepgraddescent_bijectors.jl +++ /dev/null @@ -1,94 +0,0 @@ - -@testset "KLMinRepGradDescent with Bijectors" begin - begin - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, n_dims, μ_true, L_true) = modelstats - - b = Bijectors.bijector(model) - binv = inverse(b) - - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - @testset "estimate_objective" begin - alg = KLMinRepGradDescent(AD; operator=ClipScale()) - q_true_unconstr = MeanFieldGaussian(Vector(μ_true), Diagonal(L_true)) - q_true = Bijectors.transformed(q_true_unconstr, binv) - - obj_est = estimate_objective(alg, q_true, model) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=1) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=3) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=10^5) - @test obj_est ≈ 0 atol=1e-2 - end - - @testset "determinism" begin - alg = KLMinRepGradDescent(AD; operator=ClipScale()) - - seed = (0x38bef07cf9cc549d) - rng = StableRNG(seed) - T = 10 - - q_out, _, _ = optimize(rng, alg, T, model, q0; show_progress=PROGRESS) - μ = q_out.dist.location - L = q_out.dist.scale - - rng_repl = StableRNG(seed) - q_out, _, _ = optimize(rng_repl, alg, T, model, q0; show_progress=PROGRESS) - μ_repl = q_out.dist.location - 
L_repl = q_out.dist.scale - @test μ == μ_repl - @test L == L_repl - end - - @testset "warn MvLocationScale with IdentityOperator" begin - @test_warn "IdentityOperator" begin - alg′ = KLMinRepGradDescent(AD; operator=IdentityOperator()) - optimize(alg′, 1, model, q0; show_progress=false) - end - end - end - - @testset "type stability realtype=$(realtype)" for realtype in [Float32, Float64] - modelstats = normallognormal_meanfield(Random.default_rng(), realtype) - (; model, n_dims, μ_true, L_true) = modelstats - - T = 1 - alg = KLMinRepGradDescent(AD; n_samples=10, operator=ClipScale()) - q0_unconstr = MeanFieldGaussian( - zeros(realtype, n_dims), Diagonal(ones(realtype, n_dims)) - ) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, info, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - @test eltype(q_out.dist.location) == realtype - @test eltype(q_out.dist.scale) == realtype - @test typeof(first(info).elbo) == realtype - end - - @testset "convergence $(entropy)" for entropy in - [ClosedFormEntropy(), StickingTheLandingEntropy()] - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, μ_true, L_true, is_meanfield) = modelstats - - T = 1000 - optimizer = Descent(1e-3) - alg = KLMinRepGradDescent(AD; entropy, optimizer, operator=ClipScale()) - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, _, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - Δλ0 = sum(abs2, q0.dist.location - μ_true) + sum(abs2, q0.dist.scale - L_true) - Δλ = sum(abs2, q_out.dist.location - μ_true) + sum(abs2, q_out.dist.scale - L_true) - - @test Δλ ≤ Δλ0/2 - end -end diff --git a/test/algorithms/klminrepgradproxdescent_bijectors.jl b/test/algorithms/klminrepgradproxdescent_bijectors.jl deleted file mode 100644 index d2714b5aa..000000000 --- a/test/algorithms/klminrepgradproxdescent_bijectors.jl +++ /dev/null @@ -1,89 +0,0 @@ - -@testset "KLMinRepGradProxDescent with Bijectors" begin - begin - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, n_dims, μ_true, L_true) = modelstats - - b = Bijectors.bijector(model) - binv = inverse(b) - - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - @testset "estimate_objective" begin - alg = KLMinRepGradProxDescent(AD) - q_true_unconstr = MeanFieldGaussian(Vector(μ_true), Diagonal(L_true)) - q_true = Bijectors.transformed(q_true_unconstr, binv) - - obj_est = estimate_objective(alg, q_true, model) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=1) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=3) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=10^5) - @test obj_est ≈ 0 atol=1e-3 - end - - @testset "determinism" begin - alg = KLMinRepGradProxDescent(AD) - - seed = (0x38bef07cf9cc549d) - rng = StableRNG(seed) - T = 10 - - q_out, _, _ = optimize(rng, alg, T, model, q0; show_progress=PROGRESS) - μ = q_out.dist.location - L = q_out.dist.scale - - rng_repl = StableRNG(seed) - q_out, _, _ = optimize(rng_repl, alg, T, model, q0; show_progress=PROGRESS) - μ_repl = q_out.dist.location - L_repl = q_out.dist.scale - @test μ == μ_repl - @test L == L_repl - end - end - - @testset "type stability realtype=$(realtype)" for realtype in [Float32, Float64] - modelstats = normallognormal_meanfield(Random.default_rng(), realtype) - (; model, n_dims, 
μ_true, L_true) = modelstats - - T = 1 - alg = KLMinRepGradProxDescent(AD; n_samples=10) - q0_unconstr = MeanFieldGaussian( - zeros(realtype, n_dims), Diagonal(ones(realtype, n_dims)) - ) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, info, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - @test eltype(q_out.dist.location) == realtype - @test eltype(q_out.dist.scale) == realtype - @test typeof(first(info).elbo) == realtype - end - - @testset "convergence $(entropy)" for entropy_zerograd in [ - ClosedFormEntropyZeroGradient(), StickingTheLandingEntropyZeroGradient() - ] - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, μ_true, L_true, is_meanfield) = modelstats - - T = 1000 - optimizer = Descent(1e-3) - alg = KLMinRepGradProxDescent(AD; n_samples=10, optimizer, entropy_zerograd) - - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, _, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - Δλ0 = sum(abs2, q0.dist.location - μ_true) + sum(abs2, q0.dist.scale - L_true) - Δλ = sum(abs2, q_out.dist.location - μ_true) + sum(abs2, q_out.dist.scale - L_true) - - @test Δλ ≤ Δλ0/2 - end -end diff --git a/test/algorithms/klminscoregraddescent_bijectors.jl b/test/algorithms/klminscoregraddescent_bijectors.jl deleted file mode 100644 index 782d60616..000000000 --- a/test/algorithms/klminscoregraddescent_bijectors.jl +++ /dev/null @@ -1,86 +0,0 @@ - -@testset "KLMinScoreGradDescent with Bijectors" begin - begin - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, n_dims, μ_true, L_true) = modelstats - - b = Bijectors.bijector(model) - binv = inverse(b) - - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - @testset "estimate_objective" begin - alg = KLMinScoreGradDescent(AD; n_samples=10, operator=ClipScale()) - q_true_unconstr = MeanFieldGaussian(Vector(μ_true), Diagonal(L_true)) - q_true = Bijectors.transformed(q_true_unconstr, binv) - - obj_est = estimate_objective(alg, q_true, model) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=1) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=3) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=10^5) - @test obj_est ≈ 0 atol=1e-2 - end - - @testset "determinism" begin - alg = KLMinScoreGradDescent(AD; n_samples=10, operator=ClipScale()) - - seed = (0x38bef07cf9cc549d) - rng = StableRNG(seed) - T = 10 - - q_out, _, _ = optimize(rng, alg, T, model, q0; show_progress=PROGRESS) - μ = q_out.dist.location - L = q_out.dist.scale - - rng_repl = StableRNG(seed) - q_out, _, _ = optimize(rng_repl, alg, T, model, q0; show_progress=PROGRESS) - μ_repl = q_out.dist.location - L_repl = q_out.dist.scale - @test μ == μ_repl - @test L == L_repl - end - end - - @testset "type stability realtype=$(realtype)" for realtype in [Float32, Float64] - modelstats = normallognormal_meanfield(Random.default_rng(), realtype) - (; model, n_dims, μ_true, L_true) = modelstats - - T = 1 - alg = KLMinScoreGradDescent(AD; n_samples=10, operator=ClipScale()) - q0_unconstr = MeanFieldGaussian( - zeros(realtype, n_dims), Diagonal(ones(realtype, n_dims)) - ) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, info, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - @test eltype(q_out.dist.location) == realtype - @test 
eltype(q_out.dist.scale) == realtype - @test typeof(first(info).elbo) == realtype - end - - @testset "convergence" begin - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, n_dims, μ_true, L_true) = modelstats - - T = 1000 - optimizer = Descent(1e-3) - alg = KLMinScoreGradDescent(AD; n_samples=100, optimizer, operator=ClipScale()) - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, _, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - Δλ0 = sum(abs2, q0.dist.location - μ_true) + sum(abs2, q0.dist.scale - L_true) - Δλ = sum(abs2, q_out.dist.location - μ_true) + sum(abs2, q_out.dist.scale - L_true) - - @test Δλ ≤ Δλ0/2 - end -end diff --git a/test/models/normallognormal.jl b/test/models/normallognormal.jl deleted file mode 100644 index 7c9f29c10..000000000 --- a/test/models/normallognormal.jl +++ /dev/null @@ -1,62 +0,0 @@ - -struct NormalLogNormal{MX,SX,MY,SY} - μ_x::MX - σ_x::SX - μ_y::MY - Σ_y::SY -end - -function LogDensityProblems.logdensity(model::NormalLogNormal, θ) - (; μ_x, σ_x, μ_y, Σ_y) = model - return logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) -end - -function LogDensityProblems.logdensity_and_gradient(model::NormalLogNormal, θ) - return ( - LogDensityProblems.logdensity(model, θ), - ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), - ) -end - -function LogDensityProblems.dimension(model::NormalLogNormal) - return length(model.μ_y) + 1 -end - -function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) - return LogDensityProblems.LogDensityOrder{1}() -end - -function Bijectors.bijector(model::NormalLogNormal) - (; μ_x, σ_x, μ_y, Σ_y) = model - return Bijectors.Stacked( - Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), - [1:1, 2:(1 + length(μ_y))], - ) -end - -function normallognormal_fullrank(::Random.AbstractRNG, realtype::Type) - n_y_dims = 5 - - σ0 = realtype(0.3) - μ = Fill(realtype(5.0), n_y_dims + 1) - L = Matrix(σ0 * I, n_y_dims + 1, n_y_dims + 1) - Σ = Hermitian(L * L') - - model = NormalLogNormal( - μ[1], L[1, 1], μ[2:end], PDMat(Σ[2:end, 2:end], Cholesky(L[2:end, 2:end], 'L', 0)) - ) - return TestModel(model, μ, LowerTriangular(L), n_y_dims + 1, 1 / σ0^2, false) -end - -function normallognormal_meanfield(::Random.AbstractRNG, realtype::Type) - n_y_dims = 5 - - σ0 = realtype(0.3) - μ = Fill(realtype(5), n_y_dims + 1) - σ = Fill(σ0, n_y_dims + 1) - L = Diagonal(σ) - - model = NormalLogNormal(μ[1], σ[1], μ[2:end], Diagonal(σ[2:end] .^ 2)) - - return TestModel(model, μ, L, n_y_dims + 1, 1 / σ0^2, true) -end diff --git a/test/runtests.jl b/test/runtests.jl index 64371ca47..f1c6c8167 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -81,7 +81,4 @@ if GROUP == "All" || GROUP == "AD" include("algorithms/klminrepgraddescent.jl") include("algorithms/klminscoregraddescent.jl") include("algorithms/klminrepgradproxdescent.jl") - include("algorithms/klminrepgraddescent_bijectors.jl") - include("algorithms/klminrepgradproxdescent_bijectors.jl") - include("algorithms/klminscoregraddescent_bijectors.jl") end From 9dc43bb8892af54d4bc924ea5db63004ac578c72 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:15:06 -0500 Subject: [PATCH 02/26] fix remove use of Bijectors in tests --- bench/benchmarks.jl | 5 +-- test/general/clip_scale.jl | 31 +++---------------- .../proximal_location_scale_entropy.jl | 8 +---- test/runtests.jl | 1 - 4 files changed, 7 insertions(+), 38 
deletions(-) diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index 08f688be4..31e073c51 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -1,7 +1,6 @@ using ADTypes using AdvancedVI using BenchmarkTools -using Bijectors using Distributions using DistributionsAD using Enzyme, ForwardDiff, ReverseDiff, Zygote, Mooncake @@ -58,9 +57,7 @@ begin ), ] - b = Bijectors.bijector(prob) - binv = inverse(b) - q = Bijectors.TransformedDistribution(family, binv) + q = family alg = KLMinRepGradDescent(adtype; optimizer=opt, entropy, operator=ClipScale()) SUITES[probname][objname][familyname][adname] = begin diff --git a/test/general/clip_scale.jl b/test/general/clip_scale.jl index 42582e428..a7331d5b5 100644 --- a/test/general/clip_scale.jl +++ b/test/general/clip_scale.jl @@ -1,10 +1,8 @@ @testset "interface ClipScale" begin @testset "MvLocationScale" begin - @testset "$(string(covtype)) $(realtype) $(bijector)" for covtype in - [:meanfield, :fullrank], - realtype in [Float32, Float64], - bijector in [nothing, :identity] + @testset "$(string(covtype)) $(realtype)" for covtype in [:meanfield, :fullrank], + realtype in [Float32, Float64] d = 5 μ = zeros(realtype, d) @@ -16,28 +14,18 @@ L = Diagonal(ones(realtype, d)) MeanFieldGaussian(μ, L) end - q = if isnothing(bijector) - q - else - Bijectors.TransformedDistribution(q, identity) - end params, re = Optimisers.destructure(q) opt_st = Optimisers.setup(Descent(1e-2), params) params′ = AdvancedVI.apply(ClipScale(ϵ), typeof(q), opt_st, params, re) q′ = re(params′) - if isnothing(bijector) - @test all(var(q′) .≥ ϵ^2) - else - @test all(var(q′.dist) .≥ ϵ^2) - end + @test all(var(q′) .≥ ϵ^2) end end @testset "MvLocationScaleLowRank" begin - @testset "$(realtype) $(bijector)" for realtype in [Float32, Float64], - bijector in [nothing, :identity] + @testset "$(realtype) $(bijector)" for realtype in [Float32, Float64] n_rank = 2 d = 5 @@ -48,22 +36,13 @@ q = MvLocationScaleLowRank( μ, D, U, Normal{realtype}(zero(realtype), one(realtype)) ) - q = if isnothing(bijector) - q - else - Bijectors.TransformedDistribution(q, bijector) - end params, re = Optimisers.destructure(q) opt_st = Optimisers.setup(Descent(1e-2), params) params′ = AdvancedVI.apply(ClipScale(ϵ), typeof(q), opt_st, params, re) q′ = re(params′) - if isnothing(bijector) - @test all(var(q′) .≥ ϵ^2) - else - @test all(var(q′.dist) .≥ ϵ^2) - end + @test all(var(q′.dist) .≥ ϵ^2) end end end diff --git a/test/general/proximal_location_scale_entropy.jl b/test/general/proximal_location_scale_entropy.jl index fb58535c6..817c08941 100644 --- a/test/general/proximal_location_scale_entropy.jl +++ b/test/general/proximal_location_scale_entropy.jl @@ -3,8 +3,7 @@ @testset "MvLocationScale" begin @testset "$(string(covtype)) $(realtype) $(bijector)" for covtype in [:meanfield, :fullrank], - realtype in [Float32, Float64], - bijector in [nothing, :identity] + realtype in [Float32, Float64] stepsize = 1e-2 optimizer = Descent(stepsize) @@ -22,11 +21,6 @@ elseif covtype == :meanfield MeanFieldGaussian(μ, L) end - q = if isnothing(bijector) - q - else - Bijectors.TransformedDistribution(q, identity) - end # The proximal operator for the entropy of a location scale distribution # solves the subproblem: diff --git a/test/runtests.jl b/test/runtests.jl index f1c6c8167..3369ece67 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,6 @@ using Test: @testset, @test using ADTypes using Base.Iterators -using Bijectors using DiffResults using Distributions using FillArrays From 
9479ed4bfbd11b79fba01161d93f449cfdf60d33 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:15:16 -0500 Subject: [PATCH 03/26] fix typo in HISTORY --- HISTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 9172492f5..ab809aea0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,7 +4,7 @@ Previously, `KLMinRepGradDescent`, `KLMinRepGradProxDescent`, `KLMinScoreGradDescent` only required the support of the target log-density problem to match that of `q`. This was implemented by giving a special treatment to `q <: Bijectors.TransformedDistribution` through the `Bijectors` extension. -This, however, resulted in a multiplicative complexity to maintaining the relevant bits. +This, however, resulted in a multiplicative complexity in maintaining the relevant bits. Since this is not the only way to deal with constrained supports, `Bijectors` extension is now removed. In addition, `KLMinRepGradDescent`, `KLMinRepGradProxDescent`, `KLMinScoreGradDescent` now expect an unconstrained target log-density problem. Instead, a tutorial has been added to the documentation on how to deal with a target log-density problem with constrained support. From 0142a82f470c93dcbbfb5813d292f033db92c5bd Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:17:11 -0500 Subject: [PATCH 04/26] bump AdvancedVI version --- bench/Project.toml | 2 +- docs/Project.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bench/Project.toml b/bench/Project.toml index a63fcfde5..7a29a1618 100644 --- a/bench/Project.toml +++ b/bench/Project.toml @@ -19,7 +19,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] ADTypes = "1" -AdvancedVI = "0.6" +AdvancedVI = "0.7" BenchmarkTools = "1" Distributions = "0.25.111" DistributionsAD = "0.6" diff --git a/docs/Project.toml b/docs/Project.toml index 084e8744f..c76e68889 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -25,7 +25,7 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" [compat] ADTypes = "1" Accessors = "0.1" -AdvancedVI = "0.6" +AdvancedVI = "0.7" Bijectors = "0.13.6, 0.14, 0.15" DataFrames = "1" DifferentiationInterface = "0.7" From bb47ced708cef718cf6bed779cec406c1d6102e6 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:17:51 -0500 Subject: [PATCH 05/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- test/general/clip_scale.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/general/clip_scale.jl b/test/general/clip_scale.jl index a7331d5b5..1f87ecb9c 100644 --- a/test/general/clip_scale.jl +++ b/test/general/clip_scale.jl @@ -26,7 +26,6 @@ @testset "MvLocationScaleLowRank" begin @testset "$(realtype) $(bijector)" for realtype in [Float32, Float64] - n_rank = 2 d = 5 μ = zeros(realtype, d) From 6fa2bc30dfecccf3d2c2a89ec109a42984fee97e Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:18:16 -0500 Subject: [PATCH 06/26] run formatter --- bench/benchmarks.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index 31e073c51..ee2788c16 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -32,9 +32,7 @@ end begin T = Float64 - for (probname, prob) in [ - ("normal", normal(; n_dims=10, realtype=T)) - ] + for (probname, prob) in [("normal", normal(; n_dims=10, realtype=T))] max_iter = 10^4 d = LogDensityProblems.dimension(prob) opt = Optimisers.Adam(T(1e-3)) From 
e201345fa6f02e0a21b3800ea0a38d6eb1b6c61c Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:22:11 -0500 Subject: [PATCH 07/26] fix removed include to removed file --- test/runtests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 3369ece67..e3844deba 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -51,7 +51,6 @@ struct TestModel{M,L,S,SC} is_meanfield::Bool end include("models/normal.jl") -include("models/normallognormal.jl") include("models/subsamplednormals.jl") if GROUP == "All" || GROUP == "GENERAL" From 66a22a55ec4522063939e90bc1fad3e69917a3d4 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:27:35 -0500 Subject: [PATCH 08/26] fix remove erroenous references to bijector --- test/general/clip_scale.jl | 2 +- test/general/proximal_location_scale_entropy.jl | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/test/general/clip_scale.jl b/test/general/clip_scale.jl index 1f87ecb9c..4de055707 100644 --- a/test/general/clip_scale.jl +++ b/test/general/clip_scale.jl @@ -25,7 +25,7 @@ end @testset "MvLocationScaleLowRank" begin - @testset "$(realtype) $(bijector)" for realtype in [Float32, Float64] + @testset "$(realtype)" for realtype in [Float32, Float64] n_rank = 2 d = 5 μ = zeros(realtype, d) diff --git a/test/general/proximal_location_scale_entropy.jl b/test/general/proximal_location_scale_entropy.jl index 817c08941..86a5edaa3 100644 --- a/test/general/proximal_location_scale_entropy.jl +++ b/test/general/proximal_location_scale_entropy.jl @@ -1,8 +1,7 @@ @testset "interface ProximalLocationScaleEntropy" begin @testset "MvLocationScale" begin - @testset "$(string(covtype)) $(realtype) $(bijector)" for covtype in - [:meanfield, :fullrank], + @testset "$(string(covtype)) $(realtype)" for covtype in [:meanfield, :fullrank] realtype in [Float32, Float64] stepsize = 1e-2 @@ -43,7 +42,7 @@ ) q′ = re(params′) - scale′ = isnothing(bijector) ? 
q′.scale : q′.dist.scale + scale′ = q′.scale grad_left = ReverseDiff.gradient( L_ -> first(logabsdet(LowerTriangular(reshape(L_, d, d)))), vec(scale′) From f5f7fcb1014713cb12b62e3aae8ba7134eb52bf7 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:34:14 -0500 Subject: [PATCH 09/26] fix missing comma --- test/general/proximal_location_scale_entropy.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/general/proximal_location_scale_entropy.jl b/test/general/proximal_location_scale_entropy.jl index 86a5edaa3..16f90ab0f 100644 --- a/test/general/proximal_location_scale_entropy.jl +++ b/test/general/proximal_location_scale_entropy.jl @@ -1,7 +1,7 @@ @testset "interface ProximalLocationScaleEntropy" begin @testset "MvLocationScale" begin - @testset "$(string(covtype)) $(realtype)" for covtype in [:meanfield, :fullrank] + @testset "$(string(covtype)) $(realtype)" for covtype in [:meanfield, :fullrank], realtype in [Float32, Float64] stepsize = 1e-2 From 7a6e902a169455a3b2c70105d3962eb7c0a6ae4a Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:44:07 -0500 Subject: [PATCH 10/26] fix remove include to removed file --- bench/benchmarks.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index ee2788c16..8c01abe20 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -16,7 +16,6 @@ BLAS.set_num_threads(min(4, Threads.nthreads())) @info sprint(versioninfo) @info "BLAS threads: $(BLAS.get_num_threads())" -include("normallognormal.jl") include("unconstrdist.jl") const SUITES = BenchmarkGroup() From 2a1755a7d2253bd86057307cd1fe7047b3ed914d Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:51:51 -0500 Subject: [PATCH 11/26] update READMe --- README.md | 66 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 4732d5c6e..a18d51445 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,13 @@ [![Tests](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Tests.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Tests.yml/badge.svg?branch=main) [![Coverage](https://codecov.io/gh/TuringLang/AdvancedVI.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/TuringLang/AdvancedVI.jl) -| AD Backend | Integration Status | -| ------------- | ------------- | -| [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) | [![ForwardDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml?query=branch%3Amain) | -| [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl) | [![ReverseDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml?query=branch%3Amain) | -| [Zygote](https://github.com/FluxML/Zygote.jl) | [![Zygote](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml?query=branch%3Amain) | -| [Mooncake](https://github.com/chalk-lab/Mooncake.jl) | [![Mooncake](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml?query=branch%3Amain) | -| 
[Enzyme](https://github.com/EnzymeAD/Enzyme.jl) | [![Enzyme](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml?query=branch%3Amain) | +| AD Backend | Integration Status | +|:---------------------------------------------------------- |:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) | [![ForwardDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml?query=branch%3Amain) | +| [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl) | [![ReverseDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml?query=branch%3Amain) | +| [Zygote](https://github.com/FluxML/Zygote.jl) | [![Zygote](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml?query=branch%3Amain) | +| [Mooncake](https://github.com/chalk-lab/Mooncake.jl) | [![Mooncake](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml?query=branch%3Amain) | +| [Enzyme](https://github.com/EnzymeAD/Enzyme.jl) | [![Enzyme](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml?query=branch%3Amain) | # AdvancedVI.jl @@ -69,7 +69,7 @@ end; Since the support of `σ` is constrained to be positive and most VI algorithms assume an unconstrained Euclidean support, we need to use a *bijector* to transform `θ`. We will use [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) for this purpose. -This corresponds to the automatic differentiation variational inference (ADVI) formulation[^KTRGB2017]. +The bijector corresponding to the joint support of our model can be constructed as follows: ```julia using Bijectors: Bijectors @@ -85,6 +85,36 @@ end; A simpler approach would be to use [`Turing`](https://github.com/TuringLang/Turing.jl), where a `Turing.Model` can be automatically be converted into a `LogDensityProblem` and a corresponding `bijector` is automatically generated. +Since most VI algorithms assume that the posterior is unconstrained, we will apply a change-of-variable to our model to make it unconstrained. +This amounts to wrapping it into a `LogDensityProblem` that applies the transformation and apply a Jacobian adjustment. 
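For reference, the wrapper shown below implements the standard change-of-variables identity, stated here for completeness, where $b$ denotes the bijector mapping the constrained support to $\mathbb{R}^d$:

```math
\log \pi_{\mathrm{unconstrained}}(\eta)
= \log \pi\left(b^{-1}(\eta)\right)
+ \log \left\lvert \det J_{b^{-1}}(\eta) \right\rvert
```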
+ +```julia +struct TransformedLogDensityProblem{Prob,Trans} + prob::Prob + transform::Trans +end + +function TransformedLogDensityProblem(prob, transform) + return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) +end + +function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) + (; prob, transform) = prob_trans + θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) + return LogDensityProblems.logdensity(prob, θ) + logabsdetjac +end + +function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) + return LogDensityProblems.dimension(prob_trans.prob) +end + +function LogDensityProblems.capabilities( + ::Type{TransformedLogDensityProblem{Prob,Trans}} +) where {Prob,Trans} + return LogDensityProblems.capabilities(Prob) +end; +``` + For the dataset, we will use the popular [sonar classification dataset](https://archive.ics.uci.edu/dataset/151/connectionist+bench+sonar+mines+vs+rocks) from the UCI repository. This can be automatically downloaded using [`OpenML`](https://github.com/JuliaAI/OpenML.jl). The sonar dataset corresponds to the dataset id 40. @@ -109,7 +139,10 @@ X = hcat(X, ones(size(X, 1))); The model can now be instantiated as follows: ```julia -model = LogReg(X, y); +prob = LogReg(X, y); +b = Bijectors.bijector(prob) +binv = Bijectors.inverse(b) +prob_trans = TransformedLogDensityProblem(prob, binv) ``` For the VI algorithm, we will use `KLMinRepGradDescent`: @@ -136,7 +169,7 @@ For this, it is straightforward to use `LogDensityProblemsAD`: using DifferentiationInterface: DifferentiationInterface using LogDensityProblemsAD: LogDensityProblemsAD -model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model); +prob_trans_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), prob_trans); ``` For the variational family, we will consider a `FullRankGaussian` approximation: @@ -144,7 +177,7 @@ For the variational family, we will consider a `FullRankGaussian` approximation: ```julia using LinearAlgebra -d = LogDensityProblems.dimension(model_ad) +d = LogDensityProblems.dimension(prob_trans_ad) q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d))) q = MeanFieldGaussian(zeros(d), Diagonal(ones(d))); ``` @@ -161,7 +194,15 @@ We can now run VI: ```julia max_iter = 10^3 -q, info, _ = AdvancedVI.optimize(alg, max_iter, model_ad, q_transformed;); +q_opt, info, _ = AdvancedVI.optimize(alg, max_iter, prob_trans_ad, q); +``` + +Recall that we applied a change-of-variable to the posterior to make it unconstrained. +This, however, is not the original constrained posterior that we wanted to approximate. +Therefore, we finally need to apply a change-of-variable to `q_opt` to make it approximate our original problem. + +```julia +q_trans = Bijectors.TransformedDistribution(q, binv) ``` For more examples and details, please refer to the documentation. @@ -169,4 +210,3 @@ For more examples and details, please refer to the documentation. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014, June). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. PMLR. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014, June). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. PMLR. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. 
-[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of machine learning research*. From f23eb7add69f38412f47dc16cffce84ae865638a Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 13:12:57 -0500 Subject: [PATCH 12/26] fix move benchmark model to main file --- bench/benchmarks.jl | 29 ++++++++++++++++++++++++++++- bench/unconstrdist.jl | 33 --------------------------------- 2 files changed, 28 insertions(+), 34 deletions(-) delete mode 100644 bench/unconstrdist.jl diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index 8c01abe20..a2819bd7d 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -16,7 +16,34 @@ BLAS.set_num_threads(min(4, Threads.nthreads())) @info sprint(versioninfo) @info "BLAS threads: $(BLAS.get_num_threads())" -include("unconstrdist.jl") +struct Dist{D<:ContinuousMultivariateDistribution} + dist::D +end + +function LogDensityProblems.logdensity(model::Dist, x) + return logpdf(model.dist, x) +end + +function LogDensityProblems.logdensity_and_gradient(model::Dist, θ) + return ( + LogDensityProblems.logdensity(model, θ), + ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), + ) +end + +function LogDensityProblems.dimension(model::Dist) + return length(model.dist) +end + +function LogDensityProblems.capabilities(::Type{<:Dist}) + return LogDensityProblems.LogDensityOrder{0}() +end + +function normal(; n_dims=10, realtype=Float64) + μ = fill(realtype(5), n_dims) + Σ = Diagonal(ones(realtype, n_dims)) + return Dist(MvNormal(μ, Σ)) +end const SUITES = BenchmarkGroup() diff --git a/bench/unconstrdist.jl b/bench/unconstrdist.jl deleted file mode 100644 index 164199f0b..000000000 --- a/bench/unconstrdist.jl +++ /dev/null @@ -1,33 +0,0 @@ - -struct UnconstrDist{D<:ContinuousMultivariateDistribution} - dist::D -end - -function LogDensityProblems.logdensity(model::UnconstrDist, x) - return logpdf(model.dist, x) -end - -function LogDensityProblems.logdensity_and_gradient(model::UnconstrDist, θ) - return ( - LogDensityProblems.logdensity(model, θ), - ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), - ) -end - -function LogDensityProblems.dimension(model::UnconstrDist) - return length(model.dist) -end - -function LogDensityProblems.capabilities(::Type{<:UnconstrDist}) - return LogDensityProblems.LogDensityOrder{0}() -end - -function Bijectors.bijector(model::UnconstrDist) - return identity -end - -function normal(; n_dims=10, realtype=Float64) - μ = fill(realtype(5), n_dims) - Σ = Diagonal(ones(realtype, n_dims)) - return UnconstrDist(MvNormal(μ, Σ)) -end From 4d8d95e217cb5ea46902e4ecc8b92d13addeda91 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 11:53:26 -0500 Subject: [PATCH 13/26] update docs to the new recommended use of Bijectors --- README.md | 6 +- docs/src/klminrepgraddescent.md | 130 ++++++------------------------ docs/src/tutorials/basic.md | 89 +++++++++++++------- docs/src/tutorials/flows.md | 94 +++++++++++++-------- docs/src/tutorials/subsampling.md | 73 +++++++---------- 5 files changed, 179 insertions(+), 213 deletions(-) diff --git a/README.md b/README.md index a18d51445..4e54b6e06 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ end; A simpler approach would be to use [`Turing`](https://github.com/TuringLang/Turing.jl), where a `Turing.Model` can be automatically be converted into a `LogDensityProblem` and a corresponding `bijector` is automatically generated. 
Since most VI algorithms assume that the posterior is unconstrained, we will apply a change-of-variable to our model to make it unconstrained.
-This amounts to wrapping it into a `LogDensityProblem` that applies the transformation and apply a Jacobian adjustment.
+This amounts to wrapping it into a `LogDensityProblem` that applies the transformation and the corresponding Jacobian adjustment.
@@ -178,7 +178,7 @@ For the variational family, we will consider a `FullRankGaussian` approximation:
using LinearAlgebra

d = LogDensityProblems.dimension(prob_trans_ad)
-q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d)))
+q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d)))
q = MeanFieldGaussian(zeros(d), Diagonal(ones(d)));
```
@@ -202,7 +202,7 @@ This, however, is not the original constrained posterior that we wanted to appro
Therefore, we finally need to apply a change-of-variable to `q_opt` to make it approximate our original problem.

```julia
-q_trans = Bijectors.TransformedDistribution(q, binv)
+q_trans = Bijectors.TransformedDistribution(q_opt, binv)
```

For more examples and details, please refer to the documentation.
diff --git a/docs/src/klminrepgraddescent.md b/docs/src/klminrepgraddescent.md
index c2b3f9264..64cb5f5bb 100644
--- a/docs/src/klminrepgraddescent.md
+++ b/docs/src/klminrepgraddescent.md
@@ -1,9 +1,7 @@
# [`KLMinRepGradDescent`](@id klminrepgraddescent)

-## Description
-
-This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters.
-Specifically, it uses the the *reparameterization gradient estimator*.
+This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters.
+Specifically, it uses the *reparameterization gradient* estimator.
As a result, this algorithm is best applicable when the target log-density is differentiable and the sampling process of the variational family is differentiable.
(See the [methodology section](@ref klminrepgraddescent_method) for more details.)
This algorithm is also commonly referred to as automatic differentiation variational inference, black-box variational inference with the reparameterization gradient, and stochastic gradient variational inference.
@@ -13,30 +11,15 @@ This algorithm is also commonly referred to as automatic differentiation variati
KLMinRepGradDescent
```

-The associated objective value can be estimated through the following:
-
-```@docs; canonical=false
-estimate_objective(
-    ::Random.AbstractRNG,
-    ::Union{<:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent},
-    ::Any,
-    ::Any;
-    ::Int,
-    ::AbstractEntropyEstimator,
-)
-```
-
## [Methodology](@id klminrepgraddescent_method)

-This algorithm aims to solve the problem
+This algorithm aims to solve the problem

```math
\mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right)
```

where $\mathcal{Q}$ is some family of distributions, often called the variational family, by running stochastic gradient descent in the (Euclidean) space of parameters.
-That is, for all $$q_{\lambda} \in \mathcal{Q}$$, we assume $$q_{\lambda}$$ there is a corresponding vector of parameters $$\lambda \in \Lambda$$, where the space of parameters is Euclidean such that $$\Lambda \subset \mathbb{R}^p$$.
- Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. Instead, the ELBO maximization strategy maximizes a surrogate objective, the *evidence lower bound* (ELBO; [^JGJS1999]) @@ -57,7 +40,7 @@ Algorithmically, `KLMinRepGradDescent` iterates the step where $\widehat{\nabla \mathrm{ELBO}}(q_{\lambda})$ is the reparameterization gradient estimate[^HC1983][^G1991][^R1992][^P1996] of the ELBO gradient and $$\mathrm{operator}$$ is an optional operator (*e.g.* projections, identity mapping). The reparameterization gradient, also known as the push-in gradient or the pathwise gradient, was introduced to VI in [^TL2014][^RMW2014][^KW2014]. -For the variational family $$\mathcal{Q}$$, suppose the process of sampling from $$q_{\lambda} \in \mathcal{Q}$$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that +For the variational family $\mathcal{Q} = \{q_{\lambda} \mid \lambda \in \Lambda\}$, suppose the process of sampling from $q_{\lambda}$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that ```math z \sim q_{\lambda} \qquad\Leftrightarrow\qquad @@ -80,50 +63,6 @@ where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. -## [Handling Constraints with `Bijectors`](@id bijectors) - -As mentioned in the docstring, `KLMinRepGradDescent` assumes that the variational approximation $$q_{\lambda}$$ and the target distribution $$\pi$$ have the same support for all $$\lambda \in \Lambda$$. -However, in general, it is most convenient to use variational families that have the whole Euclidean space $$\mathbb{R}^d$$ as their support. -This is the case for the [location-scale distributions](@ref locscale) provided by `AdvancedVI`. -For target distributions which the support is not the full $$\mathbb{R}^d$$, we can apply some transformation $$b$$ to $$q_{\lambda}$$ to match its support such that - -```math -z \sim q_{b,\lambda} \qquad\Leftrightarrow\qquad -z \stackrel{d}{=} b^{-1}\left(\eta\right);\quad \eta \sim q_{\lambda}, -``` - -where $$b$$ is often called a *bijector*, since it is often chosen among bijective transformations. -This idea is known as automatic differentiation VI[^KTRGB2017] and has subsequently been improved by Tensorflow Probability[^DLTBV2017]. -In Julia, [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl)[^FXTYG2020] provides a comprehensive collection of bijections. - -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. -[^DLTBV2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). Tensorflow distributions. arXiv. -[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020,. 
Bijectors. jl: Flexible transformations for probability distributions. In *Symposium on Advances in Approximate Bayesian Inference*. - One caveat of ADVI is that, after applying the bijection, a Jacobian adjustment needs to be applied. - That is, the objective is now -```math -\mathrm{ADVI}\left(\lambda\right) -\triangleq -\mathbb{E}_{\eta \sim q_{\lambda}}\left[ - \log \pi\left( b^{-1}\left( \eta \right) \right) - + \log \lvert J_{b^{-1}}\left(\eta\right) \rvert -\right] -+ \mathbb{H}\left(q_{\lambda}\right) -``` - -This is automatically handled by `AdvancedVI` through `TransformedDistribution` provided by `Bijectors.jl`. -See the following example: - -```julia -using Bijectors -q = MeanFieldGaussian(μ, L) -b = Bijectors.bijector(dist) -binv = inverse(b) -q_transformed = Bijectors.TransformedDistribution(q, binv) -``` - -By passing `q_transformed` to `optimize`, the Jacobian adjustment for the bijector `b` is automatically applied. -(See the [Basic Example](@ref basic) for a fully working example.) ## [Entropy Gradient Estimators](@id entropygrad) @@ -157,7 +96,6 @@ Depending on the variational family, this might be computationally inefficient o For example, if ``q_{\lambda}`` is a Gaussian with a full-rank covariance, a back-substitution must be performed at every step, making the per-iteration complexity ``\mathcal{O}(d^3)`` and reducing numerical stability. ```@setup repgradelbo -using Bijectors using FillArrays using LinearAlgebra using LogDensityProblems @@ -168,51 +106,38 @@ using Optimisers using ADTypes, ForwardDiff, ReverseDiff using AdvancedVI -struct NormalLogNormal{MX,SX,MY,SY} - μ_x::MX - σ_x::SX - μ_y::MY - Σ_y::SY +struct Dist{D} + dist::D end -function LogDensityProblems.logdensity(model::NormalLogNormal, θ) - (; μ_x, σ_x, μ_y, Σ_y) = model - logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) +function LogDensityProblems.logdensity(model::Dist, θ) + return logpdf(model.dist, θ) end -function LogDensityProblems.logdensity_and_gradient(model::NormalLogNormal, θ) +function LogDensityProblems.logdensity_and_gradient(model::Dist, θ) return ( LogDensityProblems.logdensity(model, θ), ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), ) end -function LogDensityProblems.dimension(model::NormalLogNormal) - length(model.μ_y) + 1 +function LogDensityProblems.dimension(model::Dist) + return length(model.dist) end -function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) +function LogDensityProblems.capabilities(::Type{<:Dist}) LogDensityProblems.LogDensityOrder{1}() end n_dims = 10 -μ_x = 2.0 -σ_x = 0.3 -μ_y = Fill(2.0, n_dims) -σ_y = Fill(1.0, n_dims) -model = NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y.^2)); +μ = Fill(2.0, n_dims) +σ = Fill(1.0, n_dims) +model = Dist(MvNormal(μ, Diagonal(σ.^2))); d = LogDensityProblems.dimension(model); -μ = zeros(d); -L = Diagonal(ones(d)); -q0 = AdvancedVI.MeanFieldGaussian(μ, L) - -function Bijectors.bijector(model::NormalLogNormal) - (; μ_x, σ_x, μ_y, Σ_y) = model - Bijectors.Stacked( - Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), - [1:1, 2:1+length(μ_y)]) -end +μ0 = zeros(d); +L0 = Diagonal(ones(d)); +q0 = AdvancedVI.MeanFieldGaussian(μ0, L0) ``` In this example, the true posterior is contained within the variational family. 
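A quick way to see this is the following sketch, which reuses `μ`, `σ`, and `model` from the setup above and assumes `mean` and `cov` are available for the location-scale family, as they are used elsewhere in these docs:

```julia
# The target is itself a diagonal Gaussian, so a mean-field Gaussian with matching
# location and scale reproduces its mean and covariance exactly.
q_star = AdvancedVI.MeanFieldGaussian(Vector(μ), Diagonal(Vector(σ)))
mean(q_star) ≈ mean(model.dist)  # expected to hold
cov(q_star) ≈ cov(model.dist)    # expected to hold
```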
@@ -223,10 +148,6 @@ Recall that the original ADVI objective with a closed-form entropy (CFE) is give ```@example repgradelbo n_montecarlo = 16; -b = Bijectors.bijector(model); -binv = inverse(b) - -q0_trans = Bijectors.TransformedDistribution(q0, binv) cfe = KLMinRepGradDescent( AutoReverseDiff(); @@ -250,12 +171,11 @@ nothing ``` ```@setup repgradelbo -max_iter = 3*10^3 +max_iter = 10^3 function callback(; params, restructure, kwargs...) - q = restructure(params).dist - dist2 = sum(abs2, q.location - vcat([μ_x], μ_y)) - + sum(abs2, diag(q.scale) - vcat(σ_x, σ_y)) + q = restructure(params) + dist2 = sum(abs2, q.location - μ) + sum(abs2, diag(q.scale) - σ) (dist = sqrt(dist2),) end @@ -263,7 +183,7 @@ _, info_cfe, _ = AdvancedVI.optimize( cfe, max_iter, model, - q0_trans; + q0; show_progress = false, callback = callback, ); @@ -272,7 +192,7 @@ _, info_stl, _ = AdvancedVI.optimize( stl, max_iter, model, - q0_trans; + q0; show_progress = false, callback = callback, ); @@ -281,7 +201,7 @@ _, info_stl, _ = AdvancedVI.optimize( stl, max_iter, model, - q0_trans; + q0; show_progress = false, callback = callback, ); @@ -364,7 +284,7 @@ _, info_qmc, _ = AdvancedVI.optimize( KLMinRepGradDescent(AutoReverseDiff(); n_samples=n_montecarlo, optimizer=Adam(1e-2), operator=ClipScale()), max_iter, model, - q0_trans; + q0; show_progress = false, callback = callback, ); diff --git a/docs/src/tutorials/basic.md b/docs/src/tutorials/basic.md index 8998b0789..53c9db794 100644 --- a/docs/src/tutorials/basic.md +++ b/docs/src/tutorials/basic.md @@ -50,14 +50,13 @@ end nothing ``` -Since the support of `σ` is constrained to be positive and most VI algorithms assume an unconstrained Euclidean support, we need to use a *bijector* to transform `θ`. -We will use [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) for this purpose. -This corresponds to the automatic differentiation variational inference (ADVI) formulation[^KTRGB2017]. +Notice that the support of `σ` is constrained to be positive. +Since most VI algorithms assume an unconstrained Euclidean support, we will apply a change-of-variable by a bijective map (a *bijector*) to the posterior to make it unconstrained. +For this, we will use the [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) package. In our case, we need a bijector that applies an identity map for the first `size(X,2)` coordinates, and map the last coordinate to the support of `LogNormal(0, 3)`. -This can be done as follows: +This can be constructed as follows: -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of machine learning research*. ```@example basic using Bijectors: Bijectors @@ -71,7 +70,37 @@ end nothing ``` -For more details, please refer to the documentation of [`Bijectors`](https://github.com/TuringLang/Bijectors.jl). +Next, we will wrap our original `LogDensityProblem` into a new `LogDensityProblem` that applies the transformation and the corresponding Jacobian adjustment. 
+ +```@example basic +struct TransformedLogDensityProblem{Prob,Trans} + prob::Prob + transform::Trans +end + +function TransformedLogDensityProblem(prob, transform) + return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) +end + +function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) + (; prob, transform) = prob_trans + θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) + return LogDensityProblems.logdensity(prob, θ) + logabsdetjac +end + +function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) + return LogDensityProblems.dimension(prob_trans.prob) +end + +function LogDensityProblems.capabilities( + ::Type{TransformedLogDensityProblem{Prob,Trans}} +) where {Prob,Trans} + return LogDensityProblems.capabilities(Prob) +end +nothing +``` + +For more details on the usage of bijectors, please refer to the documentation of [`Bijectors`](https://github.com/TuringLang/Bijectors.jl). For the dataset, we will use the popular [sonar classification dataset](https://archive.ics.uci.edu/dataset/151/connectionist+bench+sonar+mines+vs+rocks) from the UCI repository. This can be automatically downloaded using [`OpenML`](https://github.com/JuliaAI/OpenML.jl). @@ -99,7 +128,10 @@ nothing The model can now be instantiated as follows: ```@example basic -model = LogReg(X, y) +prob = LogReg(X, y); +b = Bijectors.bijector(prob) +binv = Bijectors.inverse(b) +prob_trans = TransformedLogDensityProblem(prob, binv) nothing ``` @@ -134,7 +166,7 @@ For this example, we will use `LogDensityProblemsAD` to equip our problem with a using DifferentiationInterface: DifferentiationInterface using LogDensityProblemsAD: LogDensityProblemsAD -model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model) +prob_trans_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoForwardDiff(), prob_trans) nothing ``` @@ -143,29 +175,20 @@ For the variational family, we will consider a `FullRankGaussian` approximation: ```@example basic using LinearAlgebra -d = LogDensityProblems.dimension(model_ad) -q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d))) +d = LogDensityProblems.dimension(prob_trans) +q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d))) nothing ``` -Now, `KLMinRepGradDescent` requires the variational approximation and the target log-density to have the same support. -Since `y` follows a log-normal prior, its support is bounded to be the positive half-space ``\mathbb{R}_+``. -Thus, we will use [Bijectors](https://github.com/TuringLang/Bijectors.jl) to match the support of our target posterior and the variational approximation. -The bijector can now be applied to `q` to match the support of the target problem. - -```@example basic -b = Bijectors.bijector(model) -binv = Bijectors.inverse(b) -q_transformed = Bijectors.TransformedDistribution(q, binv) -nothing -``` +`KLMinRepGradDescent` requires the variational approximation and the target log-density to have unconstrained support. +Since we applied a bijector to the posterior to make it unconstrained, and the support of a `FullRankGaussian` is already the full $\mathbb{R}^d$, we are ready to go. 
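As a quick sanity check that the wrapped problem really is unconstrained, its log-density should be finite at any point of $\mathbb{R}^d$, including points whose last coordinate is negative. A minimal sketch, assuming `prob_trans` and the `LogDensityProblems` import from the snippets above:

```julia
# Evaluate the transformed log-density at an arbitrary unconstrained point.
θ_test = randn(LogDensityProblems.dimension(prob_trans))
isfinite(LogDensityProblems.logdensity(prob_trans, θ_test))  # expected: true
```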
We can now run VI: ```@example basic max_iter = 10^4 q_out, info, _ = AdvancedVI.optimize( - alg, max_iter, model_ad, q_transformed; show_progress=false + alg, max_iter, prob_trans_ad, q; show_progress=false ) nothing ``` @@ -190,6 +213,15 @@ nothing ![](basic_example_elbo.svg) +Recall that we applied a change-of-variable to the posterior to make it unconstrained. +This, however, is not the original constrained posterior that we wanted to approximate. +Therefore, we finally need to apply a change-of-variable to `q_out` to make it approximate our original problem: + +```julia +q_trans = Bijectors.TransformedDistribution(q_out, binv) +``` + + ## Custom Callback The ELBO estimates above however, use only a handful of Monte Carlo samples. @@ -210,7 +242,7 @@ using StatsFuns: StatsFuns Approximate the posterior predictive probability for a logistic link function using Mackay's approximation (Bishop p. 220). """ function logistic_prediction(X, μ_β, Σ_β) - xtΣx = sum((model.X*Σ_β) .* model.X; dims=2)[:, 1] + xtΣx = sum((prob.X*Σ_β) .* prob.X; dims=2)[:, 1] κ = @. 1/sqrt(1 + π/8*xtΣx) return StatsFuns.logistic.(κ .* X*μ_β) end @@ -221,18 +253,19 @@ function callback(; iteration, averaged_params, restructure, kwargs...) # Use the averaged parameters (the eventual output of the algorithm) q_avg = restructure(averaged_params) + q_avg_trans = Bijectors.TransformedDistribution(q_avg, binv) # Compute predictions - μ_β = mean(q_avg.dist)[1:(end - 1)] # posterior mean of β - Σ_β = cov(q_avg.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β + μ_β = mean(q_avg_trans.dist)[1:(end - 1)] # posterior mean of β + Σ_β = cov(q_avg_trans.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β y_pred = logistic_prediction(X, μ_β, Σ_β) .> 0.5 # Prediction accuracy - acc = mean(y_pred .== model.y) + acc = mean(y_pred .== prob.y) # Higher fidelity estimate of the ELBO on the averaged parameters n_samples = 256 - elbo_callback = -estimate_objective(alg, q_avg, model; n_samples) + elbo_callback = -estimate_objective(alg, q_avg, prob_trans; n_samples) (elbo_callback=elbo_callback, accuracy=acc) else @@ -250,7 +283,7 @@ The `callback` can be supplied to `optimize`: ```@example basic max_iter = 10^4 q_out, info, _ = AdvancedVI.optimize( - alg, max_iter, model_ad, q_transformed; show_progress=false, callback=callback + alg, max_iter, prob_trans_ad, q; show_progress=false, callback=callback ) nothing ``` diff --git a/docs/src/tutorials/flows.md b/docs/src/tutorials/flows.md index 6334c946c..ae44758ce 100644 --- a/docs/src/tutorials/flows.md +++ b/docs/src/tutorials/flows.md @@ -22,7 +22,6 @@ Multiplicative degeneracy is not entirely made up and do come up in some models For example, in the 3-parameter (3-PL) item-response theory model and the N-mixture model used for estimating animal population. 
```@example flow -using Bijectors: Bijectors using Distributions using LogDensityProblems: LogDensityProblems @@ -56,7 +55,7 @@ Degenerate posteriors often indicate that there is not enough data to pin-point Therefore, for the purpose of illustration, we will use a single data point: ```@example flow -model = MultDegen([3.0]) +prob = MultDegen([3.0]) nothing ``` @@ -68,7 +67,7 @@ using Plots contour( range(0, 4; length=64), range(-3, 25; length=64), - (x, y) -> LogDensityProblems.logdensity(model, [x, y]); + (x, y) -> LogDensityProblems.logdensity(prob, [x, y]); xlabel="α", ylabel="β", clims=(-8, Inf), @@ -86,19 +85,6 @@ This sort of nonlinear correlation structure is difficult to model using only lo ## Gaussian Variational Family As usual, let's try to fit a multivariate Gaussian to this posterior. - -```@example flow -using ADTypes: ADTypes -using ReverseDiff: ReverseDiff -using DifferentiationInterface: DifferentiationInterface -using LogDensityProblemsAD: LogDensityProblemsAD - -model_ad = LogDensityProblemsAD.ADgradient( - ADTypes.AutoReverseDiff(; compile=true), model; x=[1.0, 1.0] -) -nothing -``` - Since $\alpha$ is constrained to the positive real half-space, we have to employ bijectors. For this, we use [Bijectors](https://github.com/TuringLang/Bijectors.jl): @@ -113,28 +99,72 @@ end nothing ``` +to transform the posterior to be unconstrained: + +```@example flow +struct TransformedLogDensityProblem{Prob,Trans} + prob::Prob + transform::Trans +end + +function TransformedLogDensityProblem(prob, transform) + return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) +end + +function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) + (; prob, transform) = prob_trans + θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) + return LogDensityProblems.logdensity(prob, θ) + logabsdetjac +end + +function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) + return LogDensityProblems.dimension(prob_trans.prob) +end + +function LogDensityProblems.capabilities( + ::Type{TransformedLogDensityProblem{Prob,Trans}} +) where {Prob,Trans} + return LogDensityProblems.capabilities(Prob) +end +nothing +``` + +Let's instantiate the model: + +```@example flow +using ADTypes: ADTypes +using ReverseDiff: ReverseDiff +using DifferentiationInterface: DifferentiationInterface +using LogDensityProblemsAD: LogDensityProblemsAD + +binv = Bijectors.inverse(Bijectors.bijector(prob)) +prob_trans = TransformedLogDensityProblem(prob, binv) +prob_trans_ad = LogDensityProblemsAD.ADgradient( + ADTypes.AutoForwardDiff(), prob_trans; x=[1.0, 1.0] +) +nothing +``` + For the algorithm, we will use the `KLMinRepGradProxDescent` objective. 
```@example flow using AdvancedVI using LinearAlgebra -d = LogDensityProblems.dimension(model_ad) +d = LogDensityProblems.dimension(prob_trans_ad) q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(I, d, d))) -binv = Bijectors.inverse(Bijectors.bijector(model)) -q_trans = Bijectors.TransformedDistribution(q, binv) - max_iter = 3*10^3 alg = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true)) -q_out, info, _ = AdvancedVI.optimize(alg, max_iter, model_ad, q_trans; show_progress=false) +q_out, info, _ = AdvancedVI.optimize(alg, max_iter, prob_trans_ad, q; show_progress=false) +q_out_trans = Bijectors.TransformedDistribution(q_out, binv) nothing ``` The resulting variational posterior can be visualized as follows: ```@example flow -samples = rand(q_out, 10000) +samples = rand(q_out_trans, 10000) histogram2d( samples[1, :], samples[2, :]; @@ -177,16 +207,7 @@ q_flow = realnvp(MvNormal(zeros(d), I), hidden_dims, n_layers; paramtype=Float64 nothing ``` -Recall that out posterior is constrained. -In most cases, flows assume an unconstrained support. -Therefore, just as with the Gaussian variational family, we can incorporate `Bijectors` to match the supports: - -```@example flow -q_flow_trans = Bijectors.TransformedDistribution(q_flow, binv) -nothing -``` - -For the variational inference algorithms, we will similarly minimize the KL divergence with stochastic gradient descent as originally proposed by Rezende and Mohamed[^RM2015]. +For the variational inference algorithm, we will similarly minimize the KL divergence with stochastic gradient descent as originally proposed by Rezende and Mohamed[^RM2015]. For this, however, we need to be mindful of the requirements of the variational algorithm. The default `entropy` gradient estimator of `KLMinRepGradDescent` is `ClosedFormEntropy()`, which assumes that the entropy of the variational family `entropy(q)` is available. For flows, the entropy is (usually) not available. Instead, we can use any gradient estimator that only relies on the log-density of the variational family `logpdf(q)`, `StickingTheLandingEntropy()` or `MonteCarloEntropy()`. @@ -199,9 +220,12 @@ Furthermore, Agrawal *et al.*[^AD2025] claim that using a larger number of Monte [^ASD2020]: Agrawal, A., Sheldon, D. R., & Domke, J. (2020). Advances in black-box VI: Normalizing flows, importance weighting, and optimization. In *Advances in Neural Information Processing Systems*, 33, 17358-17369. [^AD2025]: Agrawal, A., & Domke, J. (2024). Disentangling impact of capacity, objective, batchsize, estimators, and step-size on flow VI. In *Proceedings of the International Conference on Artificial Intelligence and Statistics*. 
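To see why the choice of entropy estimator matters here, note that a flow exposes `logpdf` (which is all that `StickingTheLandingEntropy()` and `MonteCarloEntropy()` require) but generally no closed-form `entropy`. A small sketch, assuming `q_flow` constructed above:

```julia
# Flows support density evaluation but not closed-form entropy.
z = rand(q_flow)
logpdf(q_flow, z)   # available; this is what the STL/Monte Carlo estimators rely on
# entropy(q_flow)   # no closed form is available for a RealNVP flow
```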
```@example flow +using Optimisers: Optimisers + alg_flow = KLMinRepGradDescent( ADTypes.AutoReverseDiff(; compile=true); n_samples=8, + optimizer=Optimisers.Adam(1e-2), operator=IdentityOperator(), entropy=StickingTheLandingEntropy(), ) @@ -211,9 +235,11 @@ nothing Without further due, let's now run VI: ```@example flow +max_iter = 300 q_flow_out, info_flow, _ = AdvancedVI.optimize( - alg_flow, max_iter, model_ad, q_flow_trans; show_progress=false + alg_flow, max_iter, prob_trans_ad, q_flow; show_progress=false ) +q_flow_out_trans = Bijectors.TransformedDistribution(q_flow_out, binv) nothing ``` @@ -230,7 +256,7 @@ nothing Finally, let's visualize the variational posterior: ```@example flow -samples = rand(q_flow_out, 10000) +samples = rand(q_flow_out_trans, 10000) histogram2d( samples[1, :], samples[2, :]; diff --git a/docs/src/tutorials/subsampling.md b/docs/src/tutorials/subsampling.md index 1816efb44..4f03bd757 100644 --- a/docs/src/tutorials/subsampling.md +++ b/docs/src/tutorials/subsampling.md @@ -10,7 +10,7 @@ In this tutorial, we will see how to perform subsampling with `KLMinRepGradProxD [^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. ## Setting Up Subsampling -We will consider the same hierarchical logistic regression example used in the [Basic Example](@ref basic). +We will consider a typical hierarchical logistic regression example. ```@example subsampling using LogDensityProblems: LogDensityProblems @@ -26,10 +26,11 @@ end function LogDensityProblems.logdensity(model::LogReg, θ) (; X, y, n_data) = model n, d = size(X) - β, σ = θ[1:size(X, 2)], θ[end] + β, logσ = θ[1:size(X, 2)], θ[end] + σ = exp(logσ) logprior_β = logpdf(MvNormal(Zeros(d), σ), β) - logprior_σ = logpdf(LogNormal(0, 3), σ) + logprior_σ = logpdf(Normal(0, 3), σ) logit = X*β loglike_y = mapreduce((li, yi) -> logpdf(BernoulliLogit(li), yi), +, logit, y) @@ -37,7 +38,7 @@ function LogDensityProblems.logdensity(model::LogReg, θ) end function LogDensityProblems.dimension(model::LogReg) - return size(model.X, 2) + 1 + return size(prob.X, 2) + 1 end function LogDensityProblems.capabilities(::Type{<:LogReg}) @@ -50,21 +51,6 @@ Notice that, to use subsampling, we need be able to rescale the likelihood stren That is, for the gradient of the log-density with a batch of data points of size `n` to be an unbiased estimate of the gradient using the full dataset of size `n_data`, we need to scale the likelihood by `n_data/n`. This part is critical to ensure that the algorithm correctly approximates the posterior with the full dataset. -As usual, we will set up a bijector: - -```@example subsampling -using Bijectors: Bijectors - -function Bijectors.bijector(model::LogReg) - d = size(model.X, 2) - return Bijectors.Stacked( - Bijectors.bijector.([MvNormal(Zeros(d), 1.0), LogNormal(0, 3)]), - [1:d, (d + 1):(d + 1)], - ) -end -nothing -``` - For the dataset, we will use one that is larger than that used in the [Basic Example](@ref basic). This is to properly assess the advantage of subsampling. In particular, we will utilize the "Phishing" dataset[^Tan2018], which consists of 10000 data points, each with 48 features. 
@@ -100,26 +86,30 @@ Let's now istantiate the model and set up automatic differentiation using [`LogD using ADTypes, ReverseDiff using LogDensityProblemsAD -model = LogReg(X, y, size(X, 1)) -model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model) +prob = LogReg(X, y, size(X, 1)) +prob_ad = LogDensityProblemsAD.ADgradient( + ADTypes.AutoReverseDiff(), + prob, + x=randn(LogDensityProblems.dimension(prob)) +) nothing ``` To enable subsampling, `LogReg` has to implement the method `AdvancedVI.subsample`. For our model, this is fairly simple: We only need to select the rows of `X` and the elements of `y` corresponding to the batch of data points. -As subtle point here is that we wrapped `model` with `LogDensityProblemsAD.ADgradient` into `model_ad`. -Therefore, `AdvancedVI` sees `model_ad` and not `model`. -This means we have to specialize `AdvancedVI.subsample` to `typeof(model_ad)` and not `LogReg`. +As subtle point here is that we wrapped `prob` with `LogDensityProblemsAD.ADgradient` into `prob_ad`. +Therefore, `AdvancedVI` sees `prob_ad` and not `prob`. +This means we have to specialize `AdvancedVI.subsample` to `typeof(prob_ad)` and not `LogReg`. ```@example subsampling using Accessors using AdvancedVI -function AdvancedVI.subsample(model::typeof(model_ad), idx) - (; X, y, n_data) = parent(model) - model′ = @set model.ℓ.X = X[idx, :] - model′′ = @set model′.ℓ.y = y[idx] - return model′′ +function AdvancedVI.subsample(prob::typeof(prob_ad), idx) + (; X, y, n_data) = parent(prob_ad) + prob′ = @set prob.ℓ.X = X[idx, :] + prob′′ = @set prob′.ℓ.y = y[idx] + return prob′′ end nothing ``` @@ -138,7 +128,7 @@ Here, we will use `ReshufflingBatchSubsampling`, which implements random reshuff We will us a batch size of 32, which results in `313 = length(subsampling) = ceil(Int, size(X,2)/32)` steps per epoch. ```@example subsampling -dataset = 1:size(model.X, 1) +dataset = 1:size(prob.X, 1) batchsize = 32 subsampling = ReshufflingBatchSubsampling(dataset, batchsize) alg_sub = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true); subsampling) @@ -160,7 +150,7 @@ nothing If we don't supply a subsampling strategy to `KLMinRepGradProxDescent`, subsampling will not be used. ```@example subsampling -alg_full = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true)) +alg_full = KLMinRepGradProxDescent(ADTypesAutoReverseDiff(; compile=true)) nothing ``` @@ -169,11 +159,8 @@ The variational family will be set up as follows: ```@example subsampling using LinearAlgebra -d = LogDensityProblems.dimension(model_ad) -q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d))) -b = Bijectors.bijector(model) -binv = Bijectors.inverse(b) -q_transformed = Bijectors.TransformedDistribution(q, binv) +d = LogDensityProblems.dimension(prob_ad) +q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d))) nothing ``` @@ -192,7 +179,7 @@ time_begin = nothing Approximate the posterior predictive probability for a logistic link function using Mackay's approximation (Bishop p. 220). """ function logistic_prediction(X, μ_β, Σ_β) - xtΣx = sum((model.X*Σ_β) .* model.X; dims=2)[:, 1] + xtΣx = sum((prob.X*Σ_β) .* prob.X; dims=2)[:, 1] κ = @. 1/sqrt(1 + π/8*xtΣx) return StatsFuns.logistic.(κ .* X*μ_β) end @@ -204,16 +191,16 @@ function callback(; iteration, averaged_params, restructure, kwargs...) 
q_avg = restructure(averaged_params) # Compute predictions using - μ_β = mean(q_avg.dist)[1:(end - 1)] # posterior mean of β - Σ_β = cov(q_avg.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β + μ_β = mean(q_avg)[1:(end - 1)] # posterior mean of β + Σ_β = cov(q_avg)[1:(end - 1), end - 1] # marginal posterior covariance of β y_pred = logistic_prediction(X, μ_β, Σ_β) .> 0.5 # Prediction accuracy - acc = mean(y_pred .== model.y) + acc = mean(y_pred .== prob.y) # Higher fidelity estimate of the ELBO on the averaged parameters n_samples = 256 - elbo_callback = -estimate_objective(alg_full, q_avg, model; n_samples) + elbo_callback = -estimate_objective(alg_full, q_avg, prob; n_samples) (elbo_callback=elbo_callback, accuracy=acc, time_elapsed=time() - time_begin) else @@ -223,12 +210,12 @@ end time_begin = time() _, info_full, _ = AdvancedVI.optimize( - alg_full, max_iter, model_ad, q_transformed; show_progress=false, callback + alg_full, max_iter, prob_ad, q; show_progress=true, callback ); time_begin = time() _, info_sub, _ = AdvancedVI.optimize( - alg_sub, max_iter, model_ad, q_transformed; show_progress=false, callback + alg_sub, max_iter, prob_ad, q; show_progress=true, callback ); nothing ``` From fba3d9fa362f96103c8ac7e7b572aed70f982e13 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 12:25:19 -0500 Subject: [PATCH 14/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/klminrepgraddescent.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/src/klminrepgraddescent.md b/docs/src/klminrepgraddescent.md index 64cb5f5bb..c239f3e4e 100644 --- a/docs/src/klminrepgraddescent.md +++ b/docs/src/klminrepgraddescent.md @@ -63,7 +63,6 @@ where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. - ## [Entropy Gradient Estimators](@id entropygrad) For the gradient of the entropy term, we provide three choices with varying requirements. From 4daaa791fe658df89e35911422719cb0d53c343f Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 12:25:34 -0500 Subject: [PATCH 15/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/tutorials/basic.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/src/tutorials/basic.md b/docs/src/tutorials/basic.md index 53c9db794..713d89904 100644 --- a/docs/src/tutorials/basic.md +++ b/docs/src/tutorials/basic.md @@ -221,7 +221,6 @@ Therefore, we finally need to apply a change-of-variable to `q_out` to make it a q_trans = Bijectors.TransformedDistribution(q_out, binv) ``` - ## Custom Callback The ELBO estimates above however, use only a handful of Monte Carlo samples. 
From a0f13d54f9232cf58ffb5673e29b192a6889bedc Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 12:26:10 -0500 Subject: [PATCH 16/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/tutorials/basic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/tutorials/basic.md b/docs/src/tutorials/basic.md index 713d89904..cf2d44a3a 100644 --- a/docs/src/tutorials/basic.md +++ b/docs/src/tutorials/basic.md @@ -50,7 +50,7 @@ end nothing ``` -Notice that the support of `σ` is constrained to be positive. +Notice that the support of `σ` is constrained to be positive. Since most VI algorithms assume an unconstrained Euclidean support, we will apply a change-of-variable by a bijective map (a *bijector*) to the posterior to make it unconstrained. For this, we will use the [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) package. From 9a64db7bc3a30df5004999088d0b1751dd307365 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 12:26:22 -0500 Subject: [PATCH 17/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/tutorials/subsampling.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/src/tutorials/subsampling.md b/docs/src/tutorials/subsampling.md index 4f03bd757..eab0fbf7f 100644 --- a/docs/src/tutorials/subsampling.md +++ b/docs/src/tutorials/subsampling.md @@ -88,9 +88,7 @@ using LogDensityProblemsAD prob = LogReg(X, y, size(X, 1)) prob_ad = LogDensityProblemsAD.ADgradient( - ADTypes.AutoReverseDiff(), - prob, - x=randn(LogDensityProblems.dimension(prob)) + ADTypes.AutoReverseDiff(), prob; x=randn(LogDensityProblems.dimension(prob)) ) nothing ``` From 048a31028bb49670d95caabbf16dad9d23ab3090 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 13:37:23 -0500 Subject: [PATCH 18/26] add constraint tutorial --- docs/make.jl | 1 + docs/src/tutorials/constrained.md | 275 ++++++++++++++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 docs/src/tutorials/constrained.md diff --git a/docs/make.jl b/docs/make.jl index 34c0080d3..8b8c9f892 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -21,6 +21,7 @@ makedocs(; "Scaling to Large Datasets" => "tutorials/subsampling.md", "Stan Models" => "tutorials/stan.md", "Normalizing Flows" => "tutorials/flows.md", + "Dealing with Constrained Posteriors" => "tutorials/constrained.md" ], "Algorithms" => [ "`KLMinRepGradDescent`" => "klminrepgraddescent.md", diff --git a/docs/src/tutorials/constrained.md b/docs/src/tutorials/constrained.md new file mode 100644 index 000000000..e1539fe39 --- /dev/null +++ b/docs/src/tutorials/constrained.md @@ -0,0 +1,275 @@ +# [Dealing with Constrained Posteriors](@id constrained) + +In this tutorial, we will demonstrate how to deal with constrained posteriors in more detail. +Formally, by constrained posteriors, we mean that the target posterior has a density defined over a space that does not span the "full" Euclidean space $\mathbb{R}^d$: +```math +\pi : \mathcal{X} \to \mathbb{R}_{> 0} , +``` +where $\mathcal{X} \subset \mathbb{R}^d$ but not $\mathcal{X} = \mathbb{R}^d$. + +For instance, consider the basic hierarchical model for estimating the mean of the data $y_1, \ldots, y_n$: +```math +\begin{aligned} + \sigma &\sim \operatorname{LogNormal}(\alpha, \beta) \\ + \mu &\sim \operatorname{Normal}(0, \sigma) \\ + y_i &\sim \operatorname{Normal}(\mu, \sigma) . 
+\end{aligned} +``` +The corresponding posterior +```math +\pi(\mu, \sigma \mid y_1, \ldots, y_n) += +\operatorname{LogNormal}(\sigma; \alpha, \beta) +\operatorname{Normal}(\mu; 0, \sigma) +\prod_{i=1}^n \operatorname{Normal}(y_i; \mu, \sigma) +``` +has a density with respect to the space +```math + \mathcal{X} = \mathbb{R}_{> 0} \times \mathbb{R} . +``` +There are also more complicated examples of constrained spaces. +For example, a $k$-dimensional variable with a Dirichlet prior will be constrained to live on a $k$-dimensional simplex. + +Now, most algorithms provided by `AdvancedVI`, such as: + +- `KLMinRepGradDescent` +- `KLMinRepGradProxDescent` +- `KLMinNaturalGradDescent` +- `FisherMinBatchMatch` + +tend to assume the target posterior is defined over the whole Euclidean space $\mathbb{R}^d$. +Therefore, to apply these algorithms, we need to do something about the constraints. +We will describe some recommended ways of doing this. + +## Transforming the Posterior +The most widely applicable way is to transform the posterior $\pi : \mathcal{X} \to \mathbb{R}_{>0}$ to be unconstrained. +That is, consider some bijective map $b : \mathcal{X} \to \mathbb{R}^{d}$ between the $\mathcal{X}$ and the associated Euclidean space $\mathbb{R}^{d}$. +Using the inverse of the map $b^{-1}$ and its Jacobian $\mathrm{J}_{b^{-1}}$, we can apply a change of variable to the posterior and obtain its unconstrained counterpart +```math +\pi_{b^{-1}}(\eta) : \mathbb{R}^d \to \mathbb{R}_{>0} = \pi(b^{-1}(\eta)) {\lvert \mathrm{J}_{b^{-1}}(\eta) \rvert} . +``` +This idea popularized by Stan[^CGHetal2017] and Tensorflow probability[^DLTetal2017] is, in fact, how most probabilistic programming frameworks enable the use of off-the-shelf Markov chain Monte Carlo algorithms. +In the context of variational inference, we will first approximate the unconstrained posterior as + +```math +q^* = \arg\min_{q \in \mathcal{Q}} \;\; \mathrm{D}(q, \pi_{b^{-1}}) . +``` + +and then transform the optimal unconstrained approximation $q^*$ to be constrained by again applying a change of variable as + +```math +q_{b}^* : \mathcal{X} \to \mathbb{R}_{>0} = q(b(z)) {\lvert \mathrm{J}_{b}(z) \rvert} . +``` + +Sampling from $q_{b}^*$ amounts to pushing each sample from $q$ into $b^{-1}$: + +```math +z \sim q_{b}^* \quad\Leftrightarrow\quad z \stackrel{\mathrm{d}}{=} b^{-1}(\eta) ; \quad \eta \sim q^* . +``` + +The idea of applying a change-of-variable to the variational approximation to match a constrained posterior was popularized by the automatic differentiation VI[^KTRGB2017]. + +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. Journal of machine learning research, 18(14), 1-45. + +Now, there are two ways how to do this in Julia. +First, let's define the constrained posterior example above using the `LogDensityProblems` interface for illustration: + +```@example constraints +using LogDensityProblems + +struct Mean + y::Vector{Float64} +end + +function LogDensityProblems.logdensity(prob::Mean, θ) + μ, σ = θ[1], θ[2] + ℓp_μ = logpdf(Normal(0, σ), μ) + ℓp_σ = logpdf(LogNormal(0, 3), σ) + ℓl_y = mapreduce(yi -> logpdf(Normal(μ, σ), yi), +, prob.y) + return ℓp_μ + ℓp_σ + ℓl_y +end + +LogDensityProblems.dimension(::Mean) = 2 + +LogDensityProblems.capabilities(::Type{Mean}) = LogDensityProblems.LogDensityOrder{0}() + +n_data = 30 +prob = Mean(randn(n_data)) +nothing +``` + +We need to find the right transformation associated with a `LogNormal` prior. 
+Most of the common bijective transformations can be found in [`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) package[^FXTYG2020]. +See the following: + +```@example constraints +using Bijectors + +b_σ = Bijectors.bijector(LogNormal(0, 1)) +``` + +and the inverse transformation can be obtained as + +```@example constraints +binv_σ = Bijectors.inverse(b_σ) +``` + +Multiple bijectors can also be stacked to form a joint bijector using `Bijectors.Stacked`. +For example: + +```@example constraints +function Bijectors.bijector(::Mean) + return Bijectors.Stacked( + Bijectors.bijector.([Normal(0, 1), LogNormal(1, 1)]), [1:1, 2:2], + ) +end + +b = Bijectors.bijector(prob) +binv = Bijectors.inverse(b) +``` + +Refer to the documentation of `Bijectors.jl` for more details. + + +## Wrap the `LogDensityProblem` + +The most general and easy way to obtain an unconstrained posterior using a `Bijector` is to wrap our original `LogDensityProblem` to form a new `LogDensityProblem`. +This approach only requires the user to implement the model-specific `Bijectors.bijector` function as above. +The rest can be done by simply copy-pasting the code below: + +```@example constraints +struct TransformedLogDensityProblem{Prob,BInv} + prob::Prob + binv::BInv +end + +function TransformedLogDensityProblem(prob) + b = Bijectors.bijector(prob) + binv = Bijectors.inverse(b) + return TransformedLogDensityProblem{typeof(prob),typeof(binv)}(prob, binv) +end + +function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) + (; prob, binv) = prob_trans + θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(binv, θ_trans) + return LogDensityProblems.logdensity(prob, θ) + logabsdetjac +end + +function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) + (; prob, binv) = prob_trans + b = Bijectors.inverse(binv) + d = LogDensityProblems.dimension(prob) + return prod(Bijectors.output_size(b, (d,))) +end + +function LogDensityProblems.capabilities( + ::Type{TransformedLogDensityProblem{Prob,BInv}} +) where {Prob,BInv} + return LogDensityProblems.capabilities(Prob) +end +nothing +``` + +Wrapping `prob` with `TransformedLogDensityProblem` yields our unconstrained posterior. + +```@example constraints +prob_trans = TransformedLogDensityProblem(prob) + +x = randn(LogDensityProblems.dimension(prob_trans)) # sample on an unconstrained support +LogDensityProblems.logdensity(prob_trans, x) +``` + +We can also wrap `prob_trans` with `LogDensityProblemsAD.ADGradient` to make it differentiable. +```@example constraints +using LogDensityProblemsAD +using ADTypes, ReverseDiff + +prob_trans_ad = LogDensityProblemsAD.ADgradient( + ADTypes.AutoReverseDiff(; compile=true), prob_trans; x = randn(2) +) +``` + +Let's now run VI to verify that it works. +Here, we will use `FisherMinBatchMatch`, which expects an unconstrained posterior. + +```@example constraints +using AdvancedVI +using LinearAlgebra + +d = LogDensityProblems.dimension(prob_trans_ad) +q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d))) + +q_opt, info, _ = AdvancedVI.optimize( + FisherMinBatchMatch(), 100, prob_trans_ad, q; show_progress=false +) +nothing +``` + +We have now obtained a variational approximation `q_opt` of the unconstrained posterior associated with `prob_trans`. +It remains to transform `q_opt` back to the constrained space we were originally interested in. +This can be done by wrapping it into a `Bijectors.TransformedDistribution`. 
+ +```@example constraints +q_opt_trans = Bijectors.TransformedDistribution(q_opt, binv) +``` + +```@example constraints +using Plots + +x = rand(q_opt_trans, 1000) + +Plots.stephist(x[2,:], normed=true, xlabel="Posterior of σ", label=nothing, xlims=(0, 2)) +Plots.vline!([1.0], label="True Value") +savefig("constrained_histogram.svg") +``` + +![](constrained_histogram.svg) + +We can see that the transformed posterior is indeed a meaningful approximation of the original posterior $\pi(\sigma \mid y_1, \ldots, y_n)$ we were interested in. + + +## Bake a Bijector into the `LogDensityProblem` + +A problem with the general approach above is that automatically differentiating through `TransformedLogDensityProblem` can be a bit inefficient (due to `Stacked`), especially with reverse-mode AD. +Therefore, another effective but less automatic approach is to bake the transformation and Jacobian adjustment into the `LogDensityProblem` itself. +Here is an example for our mean estimation model: + +```@example constraints +struct MeanTransformed{BInvS} + y::Vector{Float64} + binv_σ::BInvS +end + +function MeanTransformed(y::Vector{Float64}) + binv_σ = Bijectors.bijector(LogNormal(0, 3)) |> Bijectors.inverse + return MeanTransformed(y, binv_σ) +end + +function LogDensityProblems.logdensity(prob::MeanTransformed, θ) + (; y, binv_σ) = prob + μ = θ[1] + + # Apply bijector and compute Jacobian + σ, ℓabsdetjac_σ = with_logabsdet_jacobian(binv_σ, θ[2]) + + ℓp_μ = logpdf(Normal(0, σ), μ) + ℓp_σ = logpdf(LogNormal(0, 3), σ) + ℓl_y = mapreduce(yi -> logpdf(Normal(μ, σ), yi), +, prob.y) + return ℓp_μ + ℓp_σ + ℓl_y + ℓabsdetjac_σ +end + +LogDensityProblems.dimension(::MeanTransformed) = 2 + +LogDensityProblems.capabilities(::Type{MeanTransformed}) = LogDensityProblems.LogDensityOrder{0}() + +n_data = 30 +prob_bakedtrans = MeanTransformed(randn(n_data)) +nothing +``` + +Now, `prob_bakedtrans` can be used identically as `prob_trans` above. +For problems with larger dimensions, however, baking the bijector into the problem as above could be significantly more efficient. + +[^CGHetal2017]: Carpenter, B., Gelman, A., Hoffman, M. D., Lee, D., Goodrich, B., Betancourt, M., ... & Riddell, A. (2017). Stan: A probabilistic programming language. Journal of statistical software, 76, 1-32. +[^DLTetal2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). Tensorflow distributions. arXiv preprint arXiv:1711.10604. +[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020, February). Bijectors. jl: Flexible transformations for probability distributions. In Symposium on Advances in Approximate Bayesian Inference (pp. 1-17). PMLR. From 69ae57a389930741b0ef9d2c780e9394a165a538 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 13:38:56 -0500 Subject: [PATCH 19/26] fix missing import in docs --- docs/src/klminrepgraddescent.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/klminrepgraddescent.md b/docs/src/klminrepgraddescent.md index 64cb5f5bb..771cf99cd 100644 --- a/docs/src/klminrepgraddescent.md +++ b/docs/src/klminrepgraddescent.md @@ -63,7 +63,6 @@ where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). 
Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. - ## [Entropy Gradient Estimators](@id entropygrad) For the gradient of the entropy term, we provide three choices with varying requirements. @@ -96,6 +95,7 @@ Depending on the variational family, this might be computationally inefficient o For example, if ``q_{\lambda}`` is a Gaussian with a full-rank covariance, a back-substitution must be performed at every step, making the per-iteration complexity ``\mathcal{O}(d^3)`` and reducing numerical stability. ```@setup repgradelbo +using Distributions using FillArrays using LinearAlgebra using LogDensityProblems From af4ad180ff75d56c6bc485ee61c64c9c3e6a38d2 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 13:39:06 -0500 Subject: [PATCH 20/26] run furmatter to constrained --- docs/src/tutorials/constrained.md | 53 +++++++++++++++++++------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/docs/src/tutorials/constrained.md b/docs/src/tutorials/constrained.md index e1539fe39..0cfcde5d8 100644 --- a/docs/src/tutorials/constrained.md +++ b/docs/src/tutorials/constrained.md @@ -2,12 +2,15 @@ In this tutorial, we will demonstrate how to deal with constrained posteriors in more detail. Formally, by constrained posteriors, we mean that the target posterior has a density defined over a space that does not span the "full" Euclidean space $\mathbb{R}^d$: + ```math \pi : \mathcal{X} \to \mathbb{R}_{> 0} , ``` + where $\mathcal{X} \subset \mathbb{R}^d$ but not $\mathcal{X} = \mathbb{R}^d$. For instance, consider the basic hierarchical model for estimating the mean of the data $y_1, \ldots, y_n$: + ```math \begin{aligned} \sigma &\sim \operatorname{LogNormal}(\alpha, \beta) \\ @@ -15,7 +18,9 @@ For instance, consider the basic hierarchical model for estimating the mean of t y_i &\sim \operatorname{Normal}(\mu, \sigma) . \end{aligned} ``` -The corresponding posterior + +The corresponding posterior + ```math \pi(\mu, \sigma \mid y_1, \ldots, y_n) = @@ -23,31 +28,37 @@ The corresponding posterior \operatorname{Normal}(\mu; 0, \sigma) \prod_{i=1}^n \operatorname{Normal}(y_i; \mu, \sigma) ``` -has a density with respect to the space + +has a density with respect to the space + ```math \mathcal{X} = \mathbb{R}_{> 0} \times \mathbb{R} . ``` + There are also more complicated examples of constrained spaces. For example, a $k$-dimensional variable with a Dirichlet prior will be constrained to live on a $k$-dimensional simplex. Now, most algorithms provided by `AdvancedVI`, such as: -- `KLMinRepGradDescent` -- `KLMinRepGradProxDescent` -- `KLMinNaturalGradDescent` -- `FisherMinBatchMatch` + - `KLMinRepGradDescent` + - `KLMinRepGradProxDescent` + - `KLMinNaturalGradDescent` + - `FisherMinBatchMatch` tend to assume the target posterior is defined over the whole Euclidean space $\mathbb{R}^d$. Therefore, to apply these algorithms, we need to do something about the constraints. We will describe some recommended ways of doing this. ## Transforming the Posterior + The most widely applicable way is to transform the posterior $\pi : \mathcal{X} \to \mathbb{R}_{>0}$ to be unconstrained. That is, consider some bijective map $b : \mathcal{X} \to \mathbb{R}^{d}$ between the $\mathcal{X}$ and the associated Euclidean space $\mathbb{R}^{d}$. 
Using the inverse of the map $b^{-1}$ and its Jacobian $\mathrm{J}_{b^{-1}}$, we can apply a change of variable to the posterior and obtain its unconstrained counterpart + ```math \pi_{b^{-1}}(\eta) : \mathbb{R}^d \to \mathbb{R}_{>0} = \pi(b^{-1}(\eta)) {\lvert \mathrm{J}_{b^{-1}}(\eta) \rvert} . ``` + This idea popularized by Stan[^CGHetal2017] and Tensorflow probability[^DLTetal2017] is, in fact, how most probabilistic programming frameworks enable the use of off-the-shelf Markov chain Monte Carlo algorithms. In the context of variational inference, we will first approximate the unconstrained posterior as @@ -70,7 +81,6 @@ z \sim q_{b}^* \quad\Leftrightarrow\quad z \stackrel{\mathrm{d}}{=} b^{-1}(\eta) The idea of applying a change-of-variable to the variational approximation to match a constrained posterior was popularized by the automatic differentiation VI[^KTRGB2017]. [^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. Journal of machine learning research, 18(14), 1-45. - Now, there are two ways how to do this in Julia. First, let's define the constrained posterior example above using the `LogDensityProblems` interface for illustration: @@ -83,7 +93,7 @@ end function LogDensityProblems.logdensity(prob::Mean, θ) μ, σ = θ[1], θ[2] - ℓp_μ = logpdf(Normal(0, σ), μ) + ℓp_μ = logpdf(Normal(0, σ), μ) ℓp_σ = logpdf(LogNormal(0, 3), σ) ℓl_y = mapreduce(yi -> logpdf(Normal(μ, σ), yi), +, prob.y) return ℓp_μ + ℓp_σ + ℓl_y @@ -120,7 +130,7 @@ For example: ```@example constraints function Bijectors.bijector(::Mean) return Bijectors.Stacked( - Bijectors.bijector.([Normal(0, 1), LogNormal(1, 1)]), [1:1, 2:2], + Bijectors.bijector.([Normal(0, 1), LogNormal(1, 1)]), [1:1, 2:2] ) end @@ -130,11 +140,10 @@ binv = Bijectors.inverse(b) Refer to the documentation of `Bijectors.jl` for more details. - ## Wrap the `LogDensityProblem` The most general and easy way to obtain an unconstrained posterior using a `Bijector` is to wrap our original `LogDensityProblem` to form a new `LogDensityProblem`. -This approach only requires the user to implement the model-specific `Bijectors.bijector` function as above. +This approach only requires the user to implement the model-specific `Bijectors.bijector` function as above. The rest can be done by simply copy-pasting the code below: ```@example constraints @@ -179,13 +188,14 @@ x = randn(LogDensityProblems.dimension(prob_trans)) # sample on an unconstrained LogDensityProblems.logdensity(prob_trans, x) ``` -We can also wrap `prob_trans` with `LogDensityProblemsAD.ADGradient` to make it differentiable. +We can also wrap `prob_trans` with `LogDensityProblemsAD.ADGradient` to make it differentiable. 
+ ```@example constraints using LogDensityProblemsAD using ADTypes, ReverseDiff prob_trans_ad = LogDensityProblemsAD.ADgradient( - ADTypes.AutoReverseDiff(; compile=true), prob_trans; x = randn(2) + ADTypes.AutoReverseDiff(; compile=true), prob_trans; x=randn(2) ) ``` @@ -218,8 +228,8 @@ using Plots x = rand(q_opt_trans, 1000) -Plots.stephist(x[2,:], normed=true, xlabel="Posterior of σ", label=nothing, xlims=(0, 2)) -Plots.vline!([1.0], label="True Value") +Plots.stephist(x[2, :]; normed=true, xlabel="Posterior of σ", label=nothing, xlims=(0, 2)) +Plots.vline!([1.0]; label="True Value") savefig("constrained_histogram.svg") ``` @@ -227,7 +237,6 @@ savefig("constrained_histogram.svg") We can see that the transformed posterior is indeed a meaningful approximation of the original posterior $\pi(\sigma \mid y_1, \ldots, y_n)$ we were interested in. - ## Bake a Bijector into the `LogDensityProblem` A problem with the general approach above is that automatically differentiating through `TransformedLogDensityProblem` can be a bit inefficient (due to `Stacked`), especially with reverse-mode AD. @@ -241,18 +250,18 @@ struct MeanTransformed{BInvS} end function MeanTransformed(y::Vector{Float64}) - binv_σ = Bijectors.bijector(LogNormal(0, 3)) |> Bijectors.inverse + binv_σ = Bijectors.inverse(Bijectors.bijector(LogNormal(0, 3))) return MeanTransformed(y, binv_σ) end function LogDensityProblems.logdensity(prob::MeanTransformed, θ) (; y, binv_σ) = prob μ = θ[1] - + # Apply bijector and compute Jacobian - σ, ℓabsdetjac_σ = with_logabsdet_jacobian(binv_σ, θ[2]) + σ, ℓabsdetjac_σ = with_logabsdet_jacobian(binv_σ, θ[2]) - ℓp_μ = logpdf(Normal(0, σ), μ) + ℓp_μ = logpdf(Normal(0, σ), μ) ℓp_σ = logpdf(LogNormal(0, 3), σ) ℓl_y = mapreduce(yi -> logpdf(Normal(μ, σ), yi), +, prob.y) return ℓp_μ + ℓp_σ + ℓl_y + ℓabsdetjac_σ @@ -260,7 +269,9 @@ end LogDensityProblems.dimension(::MeanTransformed) = 2 -LogDensityProblems.capabilities(::Type{MeanTransformed}) = LogDensityProblems.LogDensityOrder{0}() +function LogDensityProblems.capabilities(::Type{MeanTransformed}) + LogDensityProblems.LogDensityOrder{0}() +end n_data = 30 prob_bakedtrans = MeanTransformed(randn(n_data)) From f9d7f0bf2de49de88ef0e0202f4abd90095f85cb Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 14:33:01 -0500 Subject: [PATCH 21/26] fix typo --- docs/src/tutorials/subsampling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/tutorials/subsampling.md b/docs/src/tutorials/subsampling.md index eab0fbf7f..6b5ef665b 100644 --- a/docs/src/tutorials/subsampling.md +++ b/docs/src/tutorials/subsampling.md @@ -148,7 +148,7 @@ nothing If we don't supply a subsampling strategy to `KLMinRepGradProxDescent`, subsampling will not be used. 
```@example subsampling -alg_full = KLMinRepGradProxDescent(ADTypesAutoReverseDiff(; compile=true)) +alg_full = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true)) nothing ``` From d6dede239e404b660366d02664e5a2164ba0c7e4 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:08:34 -0500 Subject: [PATCH 22/26] fix bins in normalizing flow tutorial --- docs/src/tutorials/flows.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/tutorials/flows.md b/docs/src/tutorials/flows.md index ae44758ce..772438a1b 100644 --- a/docs/src/tutorials/flows.md +++ b/docs/src/tutorials/flows.md @@ -169,7 +169,7 @@ histogram2d( samples[1, :], samples[2, :]; normalize=:pdf, - nbins=32, + nbins=(16, 16), xlabel="α", ylabel="β", xlims=(0, 4), @@ -261,7 +261,7 @@ histogram2d( samples[1, :], samples[2, :]; normalize=:pdf, - nbins=64, + nbins=(16, 16), xlabel="α", ylabel="β", xlims=(0, 4), From c7e8c4421fdb2652a94d1e388d700276722a0cc9 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:09:22 -0500 Subject: [PATCH 23/26] fix formatting --- docs/src/tutorials/constrained.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/tutorials/constrained.md b/docs/src/tutorials/constrained.md index 0cfcde5d8..857e6a3af 100644 --- a/docs/src/tutorials/constrained.md +++ b/docs/src/tutorials/constrained.md @@ -80,10 +80,10 @@ z \sim q_{b}^* \quad\Leftrightarrow\quad z \stackrel{\mathrm{d}}{=} b^{-1}(\eta) The idea of applying a change-of-variable to the variational approximation to match a constrained posterior was popularized by the automatic differentiation VI[^KTRGB2017]. -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. Journal of machine learning research, 18(14), 1-45. Now, there are two ways how to do this in Julia. First, let's define the constrained posterior example above using the `LogDensityProblems` interface for illustration: +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. Journal of machine learning research, 18(14), 1-45. 
```@example constraints using LogDensityProblems From 373abdd21fef6eb1d09a9158fde0bb9d79001e26 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:09:44 -0500 Subject: [PATCH 24/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 8b8c9f892..f85540832 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -21,7 +21,7 @@ makedocs(; "Scaling to Large Datasets" => "tutorials/subsampling.md", "Stan Models" => "tutorials/stan.md", "Normalizing Flows" => "tutorials/flows.md", - "Dealing with Constrained Posteriors" => "tutorials/constrained.md" + "Dealing with Constrained Posteriors" => "tutorials/constrained.md", ], "Algorithms" => [ "`KLMinRepGradDescent`" => "klminrepgraddescent.md", From 6a5c7ba15dbe2d45969977e26e7586cb118b5cf0 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:15:23 -0500 Subject: [PATCH 25/26] revert changes to documentation --- docs/make.jl | 1 - docs/src/klminrepgraddescent.md | 132 ++++++++++++++++++++++++------ docs/src/tutorials/basic.md | 88 +++++++------------- docs/src/tutorials/flows.md | 98 ++++++++-------------- docs/src/tutorials/subsampling.md | 69 ++++++++++------ 5 files changed, 212 insertions(+), 176 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 8b8c9f892..34c0080d3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -21,7 +21,6 @@ makedocs(; "Scaling to Large Datasets" => "tutorials/subsampling.md", "Stan Models" => "tutorials/stan.md", "Normalizing Flows" => "tutorials/flows.md", - "Dealing with Constrained Posteriors" => "tutorials/constrained.md" ], "Algorithms" => [ "`KLMinRepGradDescent`" => "klminrepgraddescent.md", diff --git a/docs/src/klminrepgraddescent.md b/docs/src/klminrepgraddescent.md index 771cf99cd..c2b3f9264 100644 --- a/docs/src/klminrepgraddescent.md +++ b/docs/src/klminrepgraddescent.md @@ -1,7 +1,9 @@ # [`KLMinRepGradDescent`](@id klminrepgraddescent) -This algorithms aim to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters. -Specifically, it uses the the *reparameterization gradient* estimator. +## Description + +This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters. +Specifically, it uses the the *reparameterization gradient estimator*. As a result, this algorithm is best applicable when the target log-density is differentiable and the sampling process of the variational family is differentiable. (See the [methodology section](@ref klminrepgraddescent_method) for more details.) This algorithm is also commonly referred to as automatic differentiation variational inference, black-box variational inference with the reparameterization gradient, and stochastic gradient variational inference. 
@@ -11,15 +13,30 @@ This algorithm is also commonly referred to as automatic differentiation variati KLMinRepGradDescent ``` +The associated objective value can be estimated through the following: + +```@docs; canonical=false +estimate_objective( + ::Random.AbstractRNG, + ::Union{<:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent}, + ::Any, + ::Any; + ::Int, + ::AbstractEntropyEstimator, +) +``` + ## [Methodology](@id klminrepgraddescent_method) -This algorithms aim to solve the problem +This algorithm aims to solve the problem ```math \mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right) ``` where $\mathcal{Q}$ is some family of distributions, often called the variational family, by running stochastic gradient descent in the (Euclidean) space of parameters. +That is, for all $$q_{\lambda} \in \mathcal{Q}$$, we assume $$q_{\lambda}$$ there is a corresponding vector of parameters $$\lambda \in \Lambda$$, where the space of parameters is Euclidean such that $$\Lambda \subset \mathbb{R}^p$$. + Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. Instead, the ELBO maximization strategy maximizes a surrogate objective, the *evidence lower bound* (ELBO; [^JGJS1999]) @@ -40,7 +57,7 @@ Algorithmically, `KLMinRepGradDescent` iterates the step where $\widehat{\nabla \mathrm{ELBO}}(q_{\lambda})$ is the reparameterization gradient estimate[^HC1983][^G1991][^R1992][^P1996] of the ELBO gradient and $$\mathrm{operator}$$ is an optional operator (*e.g.* projections, identity mapping). The reparameterization gradient, also known as the push-in gradient or the pathwise gradient, was introduced to VI in [^TL2014][^RMW2014][^KW2014]. -For the variational family $\mathcal{Q} = \{q_{\lambda} \mid \lambda \in \Lambda\}$, suppose the process of sampling from $q_{\lambda}$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that +For the variational family $$\mathcal{Q}$$, suppose the process of sampling from $$q_{\lambda} \in \mathcal{Q}$$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that ```math z \sim q_{\lambda} \qquad\Leftrightarrow\qquad @@ -63,6 +80,51 @@ where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. +## [Handling Constraints with `Bijectors`](@id bijectors) + +As mentioned in the docstring, `KLMinRepGradDescent` assumes that the variational approximation $$q_{\lambda}$$ and the target distribution $$\pi$$ have the same support for all $$\lambda \in \Lambda$$. +However, in general, it is most convenient to use variational families that have the whole Euclidean space $$\mathbb{R}^d$$ as their support. +This is the case for the [location-scale distributions](@ref locscale) provided by `AdvancedVI`. 
+For target distributions which the support is not the full $$\mathbb{R}^d$$, we can apply some transformation $$b$$ to $$q_{\lambda}$$ to match its support such that + +```math +z \sim q_{b,\lambda} \qquad\Leftrightarrow\qquad +z \stackrel{d}{=} b^{-1}\left(\eta\right);\quad \eta \sim q_{\lambda}, +``` + +where $$b$$ is often called a *bijector*, since it is often chosen among bijective transformations. +This idea is known as automatic differentiation VI[^KTRGB2017] and has subsequently been improved by Tensorflow Probability[^DLTBV2017]. +In Julia, [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl)[^FXTYG2020] provides a comprehensive collection of bijections. + +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. +[^DLTBV2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). Tensorflow distributions. arXiv. +[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020,. Bijectors. jl: Flexible transformations for probability distributions. In *Symposium on Advances in Approximate Bayesian Inference*. + One caveat of ADVI is that, after applying the bijection, a Jacobian adjustment needs to be applied. + That is, the objective is now +```math +\mathrm{ADVI}\left(\lambda\right) +\triangleq +\mathbb{E}_{\eta \sim q_{\lambda}}\left[ + \log \pi\left( b^{-1}\left( \eta \right) \right) + + \log \lvert J_{b^{-1}}\left(\eta\right) \rvert +\right] ++ \mathbb{H}\left(q_{\lambda}\right) +``` + +This is automatically handled by `AdvancedVI` through `TransformedDistribution` provided by `Bijectors.jl`. +See the following example: + +```julia +using Bijectors +q = MeanFieldGaussian(μ, L) +b = Bijectors.bijector(dist) +binv = inverse(b) +q_transformed = Bijectors.TransformedDistribution(q, binv) +``` + +By passing `q_transformed` to `optimize`, the Jacobian adjustment for the bijector `b` is automatically applied. +(See the [Basic Example](@ref basic) for a fully working example.) + ## [Entropy Gradient Estimators](@id entropygrad) For the gradient of the entropy term, we provide three choices with varying requirements. @@ -95,7 +157,7 @@ Depending on the variational family, this might be computationally inefficient o For example, if ``q_{\lambda}`` is a Gaussian with a full-rank covariance, a back-substitution must be performed at every step, making the per-iteration complexity ``\mathcal{O}(d^3)`` and reducing numerical stability. 
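To make the cost argument concrete, here is a rough standalone sketch (not part of the docs build) of what evaluating `logpdf` for a full-rank Gaussian parameterized by a lower-triangular scale `L` involves; the triangular solve below is what the STL estimator has to differentiate through at every step:

```julia
using LinearAlgebra

d = 5
μ, L = zeros(d), LowerTriangular(Matrix(1.0I, d, d))
z = randn(d)

# logpdf of N(μ, L*L') at z requires the back-substitution L \ (z - μ).
u = L \ (z - μ)
logp = -0.5 * (d * log(2π) + 2 * logdet(L) + dot(u, u))
```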
```@setup repgradelbo -using Distributions +using Bijectors using FillArrays using LinearAlgebra using LogDensityProblems @@ -106,38 +168,51 @@ using Optimisers using ADTypes, ForwardDiff, ReverseDiff using AdvancedVI -struct Dist{D} - dist::D +struct NormalLogNormal{MX,SX,MY,SY} + μ_x::MX + σ_x::SX + μ_y::MY + Σ_y::SY end -function LogDensityProblems.logdensity(model::Dist, θ) - return logpdf(model.dist, θ) +function LogDensityProblems.logdensity(model::NormalLogNormal, θ) + (; μ_x, σ_x, μ_y, Σ_y) = model + logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) end -function LogDensityProblems.logdensity_and_gradient(model::Dist, θ) +function LogDensityProblems.logdensity_and_gradient(model::NormalLogNormal, θ) return ( LogDensityProblems.logdensity(model, θ), ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), ) end -function LogDensityProblems.dimension(model::Dist) - return length(model.dist) +function LogDensityProblems.dimension(model::NormalLogNormal) + length(model.μ_y) + 1 end -function LogDensityProblems.capabilities(::Type{<:Dist}) +function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) LogDensityProblems.LogDensityOrder{1}() end n_dims = 10 -μ = Fill(2.0, n_dims) -σ = Fill(1.0, n_dims) -model = Dist(MvNormal(μ, Diagonal(σ.^2))); +μ_x = 2.0 +σ_x = 0.3 +μ_y = Fill(2.0, n_dims) +σ_y = Fill(1.0, n_dims) +model = NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y.^2)); d = LogDensityProblems.dimension(model); -μ0 = zeros(d); -L0 = Diagonal(ones(d)); -q0 = AdvancedVI.MeanFieldGaussian(μ0, L0) +μ = zeros(d); +L = Diagonal(ones(d)); +q0 = AdvancedVI.MeanFieldGaussian(μ, L) + +function Bijectors.bijector(model::NormalLogNormal) + (; μ_x, σ_x, μ_y, Σ_y) = model + Bijectors.Stacked( + Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), + [1:1, 2:1+length(μ_y)]) +end ``` In this example, the true posterior is contained within the variational family. @@ -148,6 +223,10 @@ Recall that the original ADVI objective with a closed-form entropy (CFE) is give ```@example repgradelbo n_montecarlo = 16; +b = Bijectors.bijector(model); +binv = inverse(b) + +q0_trans = Bijectors.TransformedDistribution(q0, binv) cfe = KLMinRepGradDescent( AutoReverseDiff(); @@ -171,11 +250,12 @@ nothing ``` ```@setup repgradelbo -max_iter = 10^3 +max_iter = 3*10^3 function callback(; params, restructure, kwargs...) 
- q = restructure(params) - dist2 = sum(abs2, q.location - μ) + sum(abs2, diag(q.scale) - σ) + q = restructure(params).dist + dist2 = sum(abs2, q.location - vcat([μ_x], μ_y)) + + sum(abs2, diag(q.scale) - vcat(σ_x, σ_y)) (dist = sqrt(dist2),) end @@ -183,7 +263,7 @@ _, info_cfe, _ = AdvancedVI.optimize( cfe, max_iter, model, - q0; + q0_trans; show_progress = false, callback = callback, ); @@ -192,7 +272,7 @@ _, info_stl, _ = AdvancedVI.optimize( stl, max_iter, model, - q0; + q0_trans; show_progress = false, callback = callback, ); @@ -201,7 +281,7 @@ _, info_stl, _ = AdvancedVI.optimize( stl, max_iter, model, - q0; + q0_trans; show_progress = false, callback = callback, ); @@ -284,7 +364,7 @@ _, info_qmc, _ = AdvancedVI.optimize( KLMinRepGradDescent(AutoReverseDiff(); n_samples=n_montecarlo, optimizer=Adam(1e-2), operator=ClipScale()), max_iter, model, - q0; + q0_trans; show_progress = false, callback = callback, ); diff --git a/docs/src/tutorials/basic.md b/docs/src/tutorials/basic.md index cf2d44a3a..8998b0789 100644 --- a/docs/src/tutorials/basic.md +++ b/docs/src/tutorials/basic.md @@ -50,13 +50,14 @@ end nothing ``` -Notice that the support of `σ` is constrained to be positive. -Since most VI algorithms assume an unconstrained Euclidean support, we will apply a change-of-variable by a bijective map (a *bijector*) to the posterior to make it unconstrained. -For this, we will use the [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) package. +Since the support of `σ` is constrained to be positive and most VI algorithms assume an unconstrained Euclidean support, we need to use a *bijector* to transform `θ`. +We will use [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) for this purpose. +This corresponds to the automatic differentiation variational inference (ADVI) formulation[^KTRGB2017]. In our case, we need a bijector that applies an identity map for the first `size(X,2)` coordinates, and map the last coordinate to the support of `LogNormal(0, 3)`. -This can be constructed as follows: +This can be done as follows: +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of machine learning research*. ```@example basic using Bijectors: Bijectors @@ -70,37 +71,7 @@ end nothing ``` -Next, we will wrap our original `LogDensityProblem` into a new `LogDensityProblem` that applies the transformation and the corresponding Jacobian adjustment. - -```@example basic -struct TransformedLogDensityProblem{Prob,Trans} - prob::Prob - transform::Trans -end - -function TransformedLogDensityProblem(prob, transform) - return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) -end - -function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) - (; prob, transform) = prob_trans - θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) - return LogDensityProblems.logdensity(prob, θ) + logabsdetjac -end - -function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) - return LogDensityProblems.dimension(prob_trans.prob) -end - -function LogDensityProblems.capabilities( - ::Type{TransformedLogDensityProblem{Prob,Trans}} -) where {Prob,Trans} - return LogDensityProblems.capabilities(Prob) -end -nothing -``` - -For more details on the usage of bijectors, please refer to the documentation of [`Bijectors`](https://github.com/TuringLang/Bijectors.jl). 
+For more details, please refer to the documentation of [`Bijectors`](https://github.com/TuringLang/Bijectors.jl).
 
 For the dataset, we will use the popular [sonar classification dataset](https://archive.ics.uci.edu/dataset/151/connectionist+bench+sonar+mines+vs+rocks) from the UCI repository.
 This can be automatically downloaded using [`OpenML`](https://github.com/JuliaAI/OpenML.jl).
@@ -128,10 +99,7 @@ nothing
 The model can now be instantiated as follows:
 
 ```@example basic
-prob = LogReg(X, y);
-b = Bijectors.bijector(prob)
-binv = Bijectors.inverse(b)
-prob_trans = TransformedLogDensityProblem(prob, binv)
+model = LogReg(X, y)
 nothing
 ```
 
@@ -166,7 +134,7 @@ For this example, we will use `LogDensityProblemsAD` to equip our problem with a
 using DifferentiationInterface: DifferentiationInterface
 using LogDensityProblemsAD: LogDensityProblemsAD
 
-prob_trans_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoForwardDiff(), prob_trans)
+model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model)
 nothing
 ```
 
@@ -175,20 +143,29 @@ For the variational family, we will consider a `FullRankGaussian` approximation:
 ```@example basic
 using LinearAlgebra
 
-d = LogDensityProblems.dimension(prob_trans)
-q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d)))
+d = LogDensityProblems.dimension(model_ad)
+q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d)))
 nothing
 ```
 
-`KLMinRepGradDescent` requires the variational approximation and the target log-density to have unconstrained support.
-Since we applied a bijector to the posterior to make it unconstrained, and the support of a `FullRankGaussian` is already the full $\mathbb{R}^d$, we are ready to go.
+Now, `KLMinRepGradDescent` requires the variational approximation and the target log-density to have the same support.
+Since `σ` follows a log-normal prior, its support is restricted to the positive half-line ``\mathbb{R}_+``.
+Thus, we will use [Bijectors](https://github.com/TuringLang/Bijectors.jl) to match the support of our target posterior and the variational approximation.
+The corresponding bijector can be applied to `q` as follows:
+
+```@example basic
+b = Bijectors.bijector(model)
+binv = Bijectors.inverse(b)
+q_transformed = Bijectors.TransformedDistribution(q, binv)
+nothing
+```
 
 We can now run VI:
 
 ```@example basic
 max_iter = 10^4
 q_out, info, _ = AdvancedVI.optimize(
-    alg, max_iter, prob_trans_ad, q; show_progress=false
+    alg, max_iter, model_ad, q_transformed; show_progress=false
 )
 nothing
 ```
@@ -213,14 +190,6 @@ nothing
 
 ![](basic_example_elbo.svg)
 
-Recall that we applied a change-of-variable to the posterior to make it unconstrained.
-This, however, is not the original constrained posterior that we wanted to approximate.
-Therefore, we finally need to apply a change-of-variable to `q_out` to make it approximate our original problem:
-
-```julia
-q_trans = Bijectors.TransformedDistribution(q_out, binv)
-```
-
 ## Custom Callback
 
 The ELBO estimates above, however, use only a handful of Monte Carlo samples.
@@ -241,7 +210,7 @@ using StatsFuns: StatsFuns
 Approximate the posterior predictive probability for a logistic link function using Mackay's approximation (Bishop p. 220).
 """
 function logistic_prediction(X, μ_β, Σ_β)
-    xtΣx = sum((prob.X*Σ_β) .* prob.X; dims=2)[:, 1]
+    xtΣx = sum((model.X*Σ_β) .* model.X; dims=2)[:, 1]
     κ = @.
1/sqrt(1 + π/8*xtΣx) return StatsFuns.logistic.(κ .* X*μ_β) end @@ -252,19 +221,18 @@ function callback(; iteration, averaged_params, restructure, kwargs...) # Use the averaged parameters (the eventual output of the algorithm) q_avg = restructure(averaged_params) - q_avg_trans = Bijectors.TransformedDistribution(q_avg, binv) # Compute predictions - μ_β = mean(q_avg_trans.dist)[1:(end - 1)] # posterior mean of β - Σ_β = cov(q_avg_trans.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β + μ_β = mean(q_avg.dist)[1:(end - 1)] # posterior mean of β + Σ_β = cov(q_avg.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β y_pred = logistic_prediction(X, μ_β, Σ_β) .> 0.5 # Prediction accuracy - acc = mean(y_pred .== prob.y) + acc = mean(y_pred .== model.y) # Higher fidelity estimate of the ELBO on the averaged parameters n_samples = 256 - elbo_callback = -estimate_objective(alg, q_avg, prob_trans; n_samples) + elbo_callback = -estimate_objective(alg, q_avg, model; n_samples) (elbo_callback=elbo_callback, accuracy=acc) else @@ -282,7 +250,7 @@ The `callback` can be supplied to `optimize`: ```@example basic max_iter = 10^4 q_out, info, _ = AdvancedVI.optimize( - alg, max_iter, prob_trans_ad, q; show_progress=false, callback=callback + alg, max_iter, model_ad, q_transformed; show_progress=false, callback=callback ) nothing ``` diff --git a/docs/src/tutorials/flows.md b/docs/src/tutorials/flows.md index 772438a1b..6334c946c 100644 --- a/docs/src/tutorials/flows.md +++ b/docs/src/tutorials/flows.md @@ -22,6 +22,7 @@ Multiplicative degeneracy is not entirely made up and do come up in some models For example, in the 3-parameter (3-PL) item-response theory model and the N-mixture model used for estimating animal population. ```@example flow +using Bijectors: Bijectors using Distributions using LogDensityProblems: LogDensityProblems @@ -55,7 +56,7 @@ Degenerate posteriors often indicate that there is not enough data to pin-point Therefore, for the purpose of illustration, we will use a single data point: ```@example flow -prob = MultDegen([3.0]) +model = MultDegen([3.0]) nothing ``` @@ -67,7 +68,7 @@ using Plots contour( range(0, 4; length=64), range(-3, 25; length=64), - (x, y) -> LogDensityProblems.logdensity(prob, [x, y]); + (x, y) -> LogDensityProblems.logdensity(model, [x, y]); xlabel="α", ylabel="β", clims=(-8, Inf), @@ -85,6 +86,19 @@ This sort of nonlinear correlation structure is difficult to model using only lo ## Gaussian Variational Family As usual, let's try to fit a multivariate Gaussian to this posterior. + +```@example flow +using ADTypes: ADTypes +using ReverseDiff: ReverseDiff +using DifferentiationInterface: DifferentiationInterface +using LogDensityProblemsAD: LogDensityProblemsAD + +model_ad = LogDensityProblemsAD.ADgradient( + ADTypes.AutoReverseDiff(; compile=true), model; x=[1.0, 1.0] +) +nothing +``` + Since $\alpha$ is constrained to the positive real half-space, we have to employ bijectors. 
 For this, we use [Bijectors](https://github.com/TuringLang/Bijectors.jl):
@@ -99,77 +113,33 @@ end
 nothing
 ```
 
-to transform the posterior to be unconstrained:
-
-```@example flow
-struct TransformedLogDensityProblem{Prob,Trans}
-    prob::Prob
-    transform::Trans
-end
-
-function TransformedLogDensityProblem(prob, transform)
-    return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform)
-end
-
-function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans)
-    (; prob, transform) = prob_trans
-    θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans)
-    return LogDensityProblems.logdensity(prob, θ) + logabsdetjac
-end
-
-function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem)
-    return LogDensityProblems.dimension(prob_trans.prob)
-end
-
-function LogDensityProblems.capabilities(
-    ::Type{TransformedLogDensityProblem{Prob,Trans}}
-) where {Prob,Trans}
-    return LogDensityProblems.capabilities(Prob)
-end
-nothing
-```
-
-Let's instantiate the model:
-
-```@example flow
-using ADTypes: ADTypes
-using ReverseDiff: ReverseDiff
-using DifferentiationInterface: DifferentiationInterface
-using LogDensityProblemsAD: LogDensityProblemsAD
-
-binv = Bijectors.inverse(Bijectors.bijector(prob))
-prob_trans = TransformedLogDensityProblem(prob, binv)
-prob_trans_ad = LogDensityProblemsAD.ADgradient(
-    ADTypes.AutoForwardDiff(), prob_trans; x=[1.0, 1.0]
-)
-nothing
-```
-
 For the algorithm, we will use the `KLMinRepGradProxDescent` objective.
 
 ```@example flow
 using AdvancedVI
 using LinearAlgebra
 
-d = LogDensityProblems.dimension(prob_trans_ad)
+d = LogDensityProblems.dimension(model_ad)
 q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(I, d, d)))
 
+binv = Bijectors.inverse(Bijectors.bijector(model))
+q_trans = Bijectors.TransformedDistribution(q, binv)
+
 max_iter = 3*10^3
 alg = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true))
-q_out, info, _ = AdvancedVI.optimize(alg, max_iter, prob_trans_ad, q; show_progress=false)
-q_out_trans = Bijectors.TransformedDistribution(q_out, binv)
+q_out, info, _ = AdvancedVI.optimize(alg, max_iter, model_ad, q_trans; show_progress=false)
 nothing
 ```
 
 The resulting variational posterior can be visualized as follows:
 
 ```@example flow
-samples = rand(q_out_trans, 10000)
+samples = rand(q_out, 10000)
 histogram2d(
     samples[1, :],
     samples[2, :];
     normalize=:pdf,
-    nbins=(16, 16),
+    nbins=32,
     xlabel="α",
     ylabel="β",
     xlims=(0, 4),
@@ -207,7 +177,16 @@ q_flow = realnvp(MvNormal(zeros(d), I), hidden_dims, n_layers; paramtype=Float64
 nothing
 ```
 
-For the variational inference algorithm, we will similarly minimize the KL divergence with stochastic gradient descent as originally proposed by Rezende and Mohamed[^RM2015].
+Recall that our posterior is constrained.
+In most cases, flows assume an unconstrained support.
+Therefore, just as with the Gaussian variational family, we can incorporate `Bijectors` to match the supports:
+
+```@example flow
+q_flow_trans = Bijectors.TransformedDistribution(q_flow, binv)
+nothing
+```
+
+For the variational inference algorithm, we will similarly minimize the KL divergence with stochastic gradient descent as originally proposed by Rezende and Mohamed[^RM2015].
 For this, however, we need to be mindful of the requirements of the variational algorithm.
 The default `entropy` gradient estimator of `KLMinRepGradDescent` is `ClosedFormEntropy()`, which assumes that the entropy of the variational family `entropy(q)` is available.
 For flows, the entropy is (usually) not available.
 Instead, we can use any gradient estimator that relies only on the log-density of the variational family `logpdf(q)`, namely `StickingTheLandingEntropy()` or `MonteCarloEntropy()`.
@@ -220,12 +199,9 @@ Furthermore, Agrawal *et al.*[^AD2025] claim that using a larger number of Monte
 [^ASD2020]: Agrawal, A., Sheldon, D. R., & Domke, J. (2020). Advances in black-box VI: Normalizing flows, importance weighting, and optimization. In *Advances in Neural Information Processing Systems*, 33, 17358-17369.
 [^AD2025]: Agrawal, A., & Domke, J. (2024). Disentangling impact of capacity, objective, batchsize, estimators, and step-size on flow VI. In *Proceedings of the International Conference on Artificial Intelligence and Statistics*.
 ```@example flow
-using Optimisers: Optimisers
-
 alg_flow = KLMinRepGradDescent(
     ADTypes.AutoReverseDiff(; compile=true);
     n_samples=8,
-    optimizer=Optimisers.Adam(1e-2),
     operator=IdentityOperator(),
     entropy=StickingTheLandingEntropy(),
 )
@@ -235,11 +211,9 @@ nothing
 Without further ado, let's now run VI:
 
 ```@example flow
-max_iter = 300
 q_flow_out, info_flow, _ = AdvancedVI.optimize(
-    alg_flow, max_iter, prob_trans_ad, q_flow; show_progress=false
+    alg_flow, max_iter, model_ad, q_flow_trans; show_progress=false
 )
-q_flow_out_trans = Bijectors.TransformedDistribution(q_flow_out, binv)
 nothing
 ```
 
@@ -256,12 +230,12 @@ nothing
 Finally, let's visualize the variational posterior:
 
 ```@example flow
-samples = rand(q_flow_out_trans, 10000)
+samples = rand(q_flow_out, 10000)
 histogram2d(
     samples[1, :],
     samples[2, :];
     normalize=:pdf,
-    nbins=(16, 16),
+    nbins=64,
     xlabel="α",
     ylabel="β",
     xlims=(0, 4),
diff --git a/docs/src/tutorials/subsampling.md b/docs/src/tutorials/subsampling.md
index 6b5ef665b..1816efb44 100644
--- a/docs/src/tutorials/subsampling.md
+++ b/docs/src/tutorials/subsampling.md
@@ -10,7 +10,7 @@ In this tutorial, we will see how to perform subsampling with `KLMinRepGradProxD
 [^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45.
 
 ## Setting Up Subsampling
-We will consider a typical hierarchical logistic regression example.
+We will consider the same hierarchical logistic regression example used in the [Basic Example](@ref basic).
 
 ```@example subsampling
 using LogDensityProblems: LogDensityProblems
@@ -26,11 +26,10 @@ end
 function LogDensityProblems.logdensity(model::LogReg, θ)
     (; X, y, n_data) = model
     n, d = size(X)
-    β, logσ = θ[1:size(X, 2)], θ[end]
-    σ = exp(logσ)
+    β, σ = θ[1:size(X, 2)], θ[end]
 
     logprior_β = logpdf(MvNormal(Zeros(d), σ), β)
-    logprior_σ = logpdf(Normal(0, 3), σ)
+    logprior_σ = logpdf(LogNormal(0, 3), σ)
 
     logit = X*β
     loglike_y = mapreduce((li, yi) -> logpdf(BernoulliLogit(li), yi), +, logit, y)
@@ -38,7 +37,7 @@ function LogDensityProblems.logdensity(model::LogReg, θ)
 end
 
 function LogDensityProblems.dimension(model::LogReg)
-    return size(prob.X, 2) + 1
+    return size(model.X, 2) + 1
 end
 
 function LogDensityProblems.capabilities(::Type{<:LogReg})
@@ -51,6 +50,21 @@ Notice that, to use subsampling, we need be able to rescale the likelihood stren
 That is, for the gradient of the log-density with a batch of data points of size `n` to be an unbiased estimate of the gradient using the full dataset of size `n_data`, we need to scale the likelihood by `n_data/n`.
 This part is critical to ensure that the algorithm correctly approximates the posterior with the full dataset.
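+Before moving on, here is a small self-contained sketch (toy numbers only, not tied to the model above) of why `n_data/n` is the right rescaling: averaged over one epoch of disjoint batches, the rescaled minibatch log-likelihood recovers the full-data log-likelihood, so the resulting gradient estimate is unbiased.
+
+```julia
+# Toy check of the n_data/n rescaling (hypothetical values, for illustration only).
+using Statistics: mean
+
+n_data, n = 6, 2
+loglike = randn(n_data)                  # per-datum log-likelihood terms
+batches = [1:2, 3:4, 5:6]                # one epoch of disjoint batches of size n
+scaled = [n_data/n * sum(loglike[b]) for b in batches]
+mean(scaled) ≈ sum(loglike)              # true: unbiased over the epoch
+```
+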
+As usual, we will set up a bijector:
+
+```@example subsampling
+using Bijectors: Bijectors
+
+function Bijectors.bijector(model::LogReg)
+    d = size(model.X, 2)
+    return Bijectors.Stacked(
+        Bijectors.bijector.([MvNormal(Zeros(d), 1.0), LogNormal(0, 3)]),
+        [1:d, (d + 1):(d + 1)],
+    )
+end
+nothing
+```
+
 For the dataset, we will use one that is larger than that used in the [Basic Example](@ref basic).
 This is to properly assess the advantage of subsampling.
 In particular, we will utilize the "Phishing" dataset[^Tan2018], which consists of 10000 data points, each with 48 features.
@@ -86,28 +100,26 @@ Let's now instantiate the model and set up automatic differentiation using [`LogD
 using ADTypes, ReverseDiff
 using LogDensityProblemsAD
 
-prob = LogReg(X, y, size(X, 1))
-prob_ad = LogDensityProblemsAD.ADgradient(
-    ADTypes.AutoReverseDiff(), prob; x=randn(LogDensityProblems.dimension(prob))
-)
+model = LogReg(X, y, size(X, 1))
+model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model)
 nothing
 ```
 
 To enable subsampling, `LogReg` has to implement the method `AdvancedVI.subsample`.
 For our model, this is fairly simple: We only need to select the rows of `X` and the elements of `y` corresponding to the batch of data points.
-As subtle point here is that we wrapped `prob` with `LogDensityProblemsAD.ADgradient` into `prob_ad`.
-Therefore, `AdvancedVI` sees `prob_ad` and not `prob`.
-This means we have to specialize `AdvancedVI.subsample` to `typeof(prob_ad)` and not `LogReg`.
+A subtle point here is that we wrapped `model` with `LogDensityProblemsAD.ADgradient` into `model_ad`.
+Therefore, `AdvancedVI` sees `model_ad` and not `model`.
+This means we have to specialize `AdvancedVI.subsample` to `typeof(model_ad)` and not `LogReg`.
 
 ```@example subsampling
 using Accessors
 using AdvancedVI
 
-function AdvancedVI.subsample(prob::typeof(prob_ad), idx)
-    (; X, y, n_data) = parent(prob_ad)
-    prob′ = @set prob.ℓ.X = X[idx, :]
-    prob′′ = @set prob′.ℓ.y = y[idx]
-    return prob′′
+function AdvancedVI.subsample(model::typeof(model_ad), idx)
+    (; X, y, n_data) = parent(model)
+    model′ = @set model.ℓ.X = X[idx, :]
+    model′′ = @set model′.ℓ.y = y[idx]
+    return model′′
 end
 nothing
 ```
@@ -126,7 +138,7 @@ Here, we will use `ReshufflingBatchSubsampling`, which implements random reshuff
 We will use a batch size of 32, which results in `313 = length(subsampling) = ceil(Int, size(X,1)/32)` steps per epoch.
 
 ```@example subsampling
-dataset = 1:size(prob.X, 1)
+dataset = 1:size(model.X, 1)
 batchsize = 32
 subsampling = ReshufflingBatchSubsampling(dataset, batchsize)
 alg_sub = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true); subsampling)
@@ -157,8 +169,11 @@ The variational family will be set up as follows:
 ```@example subsampling
 using LinearAlgebra
 
-d = LogDensityProblems.dimension(prob_ad)
-q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d)))
+d = LogDensityProblems.dimension(model_ad)
+q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d)))
+b = Bijectors.bijector(model)
+binv = Bijectors.inverse(b)
+q_transformed = Bijectors.TransformedDistribution(q, binv)
 nothing
 ```
 
@@ -177,7 +192,7 @@ time_begin = nothing
 Approximate the posterior predictive probability for a logistic link function using Mackay's approximation (Bishop p. 220).
 """
 function logistic_prediction(X, μ_β, Σ_β)
-    xtΣx = sum((prob.X*Σ_β) .* prob.X; dims=2)[:, 1]
+    xtΣx = sum((model.X*Σ_β) .* model.X; dims=2)[:, 1]
     κ = @.
1/sqrt(1 + π/8*xtΣx) return StatsFuns.logistic.(κ .* X*μ_β) end @@ -189,16 +204,16 @@ function callback(; iteration, averaged_params, restructure, kwargs...) q_avg = restructure(averaged_params) # Compute predictions using - μ_β = mean(q_avg)[1:(end - 1)] # posterior mean of β - Σ_β = cov(q_avg)[1:(end - 1), end - 1] # marginal posterior covariance of β + μ_β = mean(q_avg.dist)[1:(end - 1)] # posterior mean of β + Σ_β = cov(q_avg.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β y_pred = logistic_prediction(X, μ_β, Σ_β) .> 0.5 # Prediction accuracy - acc = mean(y_pred .== prob.y) + acc = mean(y_pred .== model.y) # Higher fidelity estimate of the ELBO on the averaged parameters n_samples = 256 - elbo_callback = -estimate_objective(alg_full, q_avg, prob; n_samples) + elbo_callback = -estimate_objective(alg_full, q_avg, model; n_samples) (elbo_callback=elbo_callback, accuracy=acc, time_elapsed=time() - time_begin) else @@ -208,12 +223,12 @@ end time_begin = time() _, info_full, _ = AdvancedVI.optimize( - alg_full, max_iter, prob_ad, q; show_progress=true, callback + alg_full, max_iter, model_ad, q_transformed; show_progress=false, callback ); time_begin = time() _, info_sub, _ = AdvancedVI.optimize( - alg_sub, max_iter, prob_ad, q; show_progress=true, callback + alg_sub, max_iter, model_ad, q_transformed; show_progress=false, callback ); nothing ``` From 8c657e0ff8409217de911500e64d257edecf38b1 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:27:16 -0500 Subject: [PATCH 26/26] revert changes to the README --- README.md | 68 ++++++++++++------------------------------------------- 1 file changed, 14 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 4e54b6e06..4732d5c6e 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,13 @@ [![Tests](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Tests.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Tests.yml/badge.svg?branch=main) [![Coverage](https://codecov.io/gh/TuringLang/AdvancedVI.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/TuringLang/AdvancedVI.jl) -| AD Backend | Integration Status | -|:---------------------------------------------------------- |:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) | [![ForwardDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml?query=branch%3Amain) | -| [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl) | [![ReverseDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml?query=branch%3Amain) | -| [Zygote](https://github.com/FluxML/Zygote.jl) | [![Zygote](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml?query=branch%3Amain) | -| [Mooncake](https://github.com/chalk-lab/Mooncake.jl) | [![Mooncake](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml?query=branch%3Amain) | 
-| [Enzyme](https://github.com/EnzymeAD/Enzyme.jl) | [![Enzyme](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml?query=branch%3Amain) | +| AD Backend | Integration Status | +| ------------- | ------------- | +| [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) | [![ForwardDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml?query=branch%3Amain) | +| [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl) | [![ReverseDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml?query=branch%3Amain) | +| [Zygote](https://github.com/FluxML/Zygote.jl) | [![Zygote](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml?query=branch%3Amain) | +| [Mooncake](https://github.com/chalk-lab/Mooncake.jl) | [![Mooncake](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml?query=branch%3Amain) | +| [Enzyme](https://github.com/EnzymeAD/Enzyme.jl) | [![Enzyme](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml?query=branch%3Amain) | # AdvancedVI.jl @@ -69,7 +69,7 @@ end; Since the support of `σ` is constrained to be positive and most VI algorithms assume an unconstrained Euclidean support, we need to use a *bijector* to transform `θ`. We will use [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) for this purpose. -The bijector corresponding to the joint support of our model can be constructed as follows: +This corresponds to the automatic differentiation variational inference (ADVI) formulation[^KTRGB2017]. ```julia using Bijectors: Bijectors @@ -85,36 +85,6 @@ end; A simpler approach would be to use [`Turing`](https://github.com/TuringLang/Turing.jl), where a `Turing.Model` can be automatically be converted into a `LogDensityProblem` and a corresponding `bijector` is automatically generated. -Since most VI algorithms assume that the posterior is unconstrained, we will apply a change-of-variable to our model to make it unconstrained. -This amounts to wrapping it into a `LogDensityProblem` that applies the transformation and the corresponding Jacobian adjustment. 
- -```julia -struct TransformedLogDensityProblem{Prob,Trans} - prob::Prob - transform::Trans -end - -function TransformedLogDensityProblem(prob, transform) - return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) -end - -function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) - (; prob, transform) = prob_trans - θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) - return LogDensityProblems.logdensity(prob, θ) + logabsdetjac -end - -function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) - return LogDensityProblems.dimension(prob_trans.prob) -end - -function LogDensityProblems.capabilities( - ::Type{TransformedLogDensityProblem{Prob,Trans}} -) where {Prob,Trans} - return LogDensityProblems.capabilities(Prob) -end; -``` - For the dataset, we will use the popular [sonar classification dataset](https://archive.ics.uci.edu/dataset/151/connectionist+bench+sonar+mines+vs+rocks) from the UCI repository. This can be automatically downloaded using [`OpenML`](https://github.com/JuliaAI/OpenML.jl). The sonar dataset corresponds to the dataset id 40. @@ -139,10 +109,7 @@ X = hcat(X, ones(size(X, 1))); The model can now be instantiated as follows: ```julia -prob = LogReg(X, y); -b = Bijectors.bijector(prob) -binv = Bijectors.inverse(b) -prob_trans = TransformedLogDensityProblem(prob, binv) +model = LogReg(X, y); ``` For the VI algorithm, we will use `KLMinRepGradDescent`: @@ -169,7 +136,7 @@ For this, it is straightforward to use `LogDensityProblemsAD`: using DifferentiationInterface: DifferentiationInterface using LogDensityProblemsAD: LogDensityProblemsAD -prob_trans_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), prob_trans); +model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model); ``` For the variational family, we will consider a `FullRankGaussian` approximation: @@ -177,8 +144,8 @@ For the variational family, we will consider a `FullRankGaussian` approximation: ```julia using LinearAlgebra -d = LogDensityProblems.dimension(prob_trans_ad) -q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d))) +d = LogDensityProblems.dimension(model_ad) +q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d))) q = MeanFieldGaussian(zeros(d), Diagonal(ones(d))); ``` @@ -194,15 +161,7 @@ We can now run VI: ```julia max_iter = 10^3 -q_opt, info, _ = AdvancedVI.optimize(alg, max_iter, prob_trans_ad, q); -``` - -Recall that we applied a change-of-variable to the posterior to make it unconstrained. -This, however, is not the original constrained posterior that we wanted to approximate. -Therefore, we finally need to apply a change-of-variable to `q_opt` to make it approximate our original problem. - -```julia -q_trans = Bijectors.TransformedDistribution(q_opt, binv) +q, info, _ = AdvancedVI.optimize(alg, max_iter, model_ad, q_transformed;); ``` For more examples and details, please refer to the documentation. @@ -210,3 +169,4 @@ For more examples and details, please refer to the documentation. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014, June). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. PMLR. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014, June). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. PMLR. [^KW2014]: Kingma, D. P., & Welling, M. (2014). 
Auto-encoding variational bayes. In *International Conference on Learning Representations*. +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of machine learning research*.