From 4bf2fce48f6aec9834446232b6b2e561181bb534 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 10:31:35 -0500 Subject: [PATCH 01/26] remove the bijectors extension and relevant tests, benchmarks --- HISTORY.md | 11 ++ Project.toml | 6 +- bench/Project.toml | 2 - bench/benchmarks.jl | 1 - bench/normallognormal.jl | 44 ------ ext/AdvancedVIBijectorsExt.jl | 129 ------------------ test/Project.toml | 2 - .../klminrepgraddescent_bijectors.jl | 94 ------------- .../klminrepgradproxdescent_bijectors.jl | 89 ------------ .../klminscoregraddescent_bijectors.jl | 86 ------------ test/models/normallognormal.jl | 62 --------- test/runtests.jl | 3 - 12 files changed, 12 insertions(+), 517 deletions(-) delete mode 100644 bench/normallognormal.jl delete mode 100644 ext/AdvancedVIBijectorsExt.jl delete mode 100644 test/algorithms/klminrepgraddescent_bijectors.jl delete mode 100644 test/algorithms/klminrepgradproxdescent_bijectors.jl delete mode 100644 test/algorithms/klminscoregraddescent_bijectors.jl delete mode 100644 test/models/normallognormal.jl diff --git a/HISTORY.md b/HISTORY.md index 32a58da21..9172492f5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,14 @@ +# Release 0.7 + +## Removal of special treatment to `Bijectors.TransformedDistribution` + +Previously, `KLMinRepGradDescent`, `KLMinRepGradProxDescent`, `KLMinScoreGradDescent` only required the support of the target log-density problem to match that of `q`. +This was implemented by giving a special treatment to `q <: Bijectors.TransformedDistribution` through the `Bijectors` extension. +This, however, resulted in a multiplicative complexity to maintaining the relevant bits. +Since this is not the only way to deal with constrained supports, `Bijectors` extension is now removed. +In addition, `KLMinRepGradDescent`, `KLMinRepGradProxDescent`, `KLMinScoreGradDescent` now expect an unconstrained target log-density problem. +Instead, a tutorial has been added to the documentation on how to deal with a target log-density problem with constrained support. 
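As a rough sketch of the pattern the new tutorial describes (all names below, such as `ToyTarget` and `UnconstrainedTarget`, are illustrative and not part of the AdvancedVI API): the constrained target is wrapped in a `LogDensityProblem` that evaluates it on the unconstrained space and adds the Jacobian adjustment.

```julia
using Bijectors, Distributions, LogDensityProblems

# Toy target: first coordinate unconstrained, second coordinate positive.
struct ToyTarget end
function LogDensityProblems.logdensity(::ToyTarget, θ)
    return logpdf(Normal(0.0, 1.0), θ[1]) + logpdf(LogNormal(0.0, 1.0), θ[2])
end
LogDensityProblems.dimension(::ToyTarget) = 2
LogDensityProblems.capabilities(::Type{<:ToyTarget}) = LogDensityProblems.LogDensityOrder{0}()

# Wrapper that evaluates the target on the unconstrained space with a Jacobian adjustment.
struct UnconstrainedTarget{P,T}
    prob::P
    transform::T  # maps unconstrained values to the constrained support
end
function LogDensityProblems.logdensity(u::UnconstrainedTarget, η)
    θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(u.transform, η)
    return LogDensityProblems.logdensity(u.prob, θ) + logabsdetjac
end
LogDensityProblems.dimension(u::UnconstrainedTarget) = LogDensityProblems.dimension(u.prob)
LogDensityProblems.capabilities(::Type{<:UnconstrainedTarget}) = LogDensityProblems.LogDensityOrder{0}()

# Match the support of ToyTarget coordinate-by-coordinate.
b = Bijectors.Stacked(
    Bijectors.bijector.([Normal(0.0, 1.0), LogNormal(0.0, 1.0)]), [1:1, 2:2]
)
prob_unconstrained = UnconstrainedTarget(ToyTarget(), Bijectors.inverse(b))
# `prob_unconstrained` can then be passed to the algorithms above (after adding
# gradient support, e.g. via LogDensityProblemsAD).
```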
+ # Release 0.6 ## New Algorithms diff --git a/Project.toml b/Project.toml index 437f5216d..92589cb86 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "AdvancedVI" uuid = "b5ca4192-6429-45e5-a2d9-87aec30a685c" -version = "0.6" +version = "0.7" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" @@ -20,13 +20,11 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [weakdeps] -Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" [extensions] -AdvancedVIBijectorsExt = ["Bijectors", "Optimisers"] AdvancedVIEnzymeExt = ["Enzyme", "ChainRulesCore"] AdvancedVIMooncakeExt = ["Mooncake", "ChainRulesCore"] AdvancedVIReverseDiffExt = ["ReverseDiff", "ChainRulesCore"] @@ -34,7 +32,6 @@ AdvancedVIReverseDiffExt = ["ReverseDiff", "ChainRulesCore"] [compat] ADTypes = "1" Accessors = "0.1" -Bijectors = "0.13, 0.14, 0.15" ChainRulesCore = "1" DiffResults = "1" DifferentiationInterface = "0.6, 0.7" @@ -54,7 +51,6 @@ StatsBase = "0.32, 0.33, 0.34" julia = "1.10, 1.11.2" [extras] -Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" diff --git a/bench/Project.toml b/bench/Project.toml index 0ca4fc686..a63fcfde5 100644 --- a/bench/Project.toml +++ b/bench/Project.toml @@ -2,7 +2,6 @@ ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" AdvancedVI = "b5ca4192-6429-45e5-a2d9-87aec30a685c" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DistributionsAD = "ced4e74d-a319-5a8a-b0ac-84af2272839c" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" @@ -22,7 +21,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" ADTypes = "1" AdvancedVI = "0.6" BenchmarkTools = "1" -Bijectors = "0.13, 0.14, 0.15" Distributions = "0.25.111" DistributionsAD = "0.6" Enzyme = "0.13.7" diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index ad40b3534..08f688be4 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -34,7 +34,6 @@ begin T = Float64 for (probname, prob) in [ - ("normal + bijector", normallognormal(; n_dims=10, realtype=T)) ("normal", normal(; n_dims=10, realtype=T)) ] max_iter = 10^4 diff --git a/bench/normallognormal.jl b/bench/normallognormal.jl deleted file mode 100644 index 4cfc9af1a..000000000 --- a/bench/normallognormal.jl +++ /dev/null @@ -1,44 +0,0 @@ -struct NormalLogNormal{MX,SX,MY,SY} - μ_x::MX - σ_x::SX - μ_y::MY - Σ_y::SY -end - -function LogDensityProblems.logdensity(model::NormalLogNormal, θ) - (; μ_x, σ_x, μ_y, Σ_y) = model - log_density_x = logpdf(LogNormal(μ_x, σ_x), θ[1]) - log_density_y = logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) - return log_density_x + log_density_y -end - -function LogDensityProblems.logdensity_and_gradient(model::NormalLogNormal, θ) - return ( - LogDensityProblems.logdensity(model, θ), - ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), - ) -end - -function LogDensityProblems.dimension(model::NormalLogNormal) - return length(model.μ_y) + 1 -end - -function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) - return LogDensityProblems.LogDensityOrder{1}() -end - -function Bijectors.bijector(model::NormalLogNormal) - (; μ_x, σ_x, μ_y, Σ_y) = model - return Bijectors.Stacked( - 
Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), - [1:1, 2:(1 + length(μ_y))], - ) -end - -function normallognormal(; n_dims=10, realtype=Float64) - μ_x = realtype(5.0) - σ_x = realtype(0.3) - μ_y = Fill(realtype(5.0), n_dims) - σ_y = Fill(realtype(0.3), n_dims) - return NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y .^ 2)) -end diff --git a/ext/AdvancedVIBijectorsExt.jl b/ext/AdvancedVIBijectorsExt.jl deleted file mode 100644 index 0e85d4d43..000000000 --- a/ext/AdvancedVIBijectorsExt.jl +++ /dev/null @@ -1,129 +0,0 @@ -module AdvancedVIBijectorsExt - -using AdvancedVI -using DiffResults: DiffResults -using Bijectors -using LinearAlgebra -using Optimisers -using Random - -function AdvancedVI.init( - rng::Random.AbstractRNG, - alg::AdvancedVI.ParamSpaceSGD, - q_init::Bijectors.TransformedDistribution, - prob, -) - (; adtype, optimizer, averager, objective, operator) = alg - if q_init.dist isa AdvancedVI.MvLocationScale && - operator isa AdvancedVI.IdentityOperator - @warn( - "IdentityOperator is used with a variational family <:MvLocationScale. Optimization can easily fail under this combination due to singular scale matrices. Consider using the operator `ClipScale` in the algorithm instead.", - ) - end - params, re = Optimisers.destructure(q_init) - opt_st = Optimisers.setup(optimizer, params) - obj_st = AdvancedVI.init(rng, objective, adtype, q_init, prob, params, re) - avg_st = AdvancedVI.init(averager, params) - grad_buf = DiffResults.DiffResult(zero(eltype(params)), similar(params)) - return ( - prob=prob, - q=q_init, - iteration=0, - grad_buf=grad_buf, - opt_st=opt_st, - obj_st=obj_st, - avg_st=avg_st, - ) -end - -function AdvancedVI.apply( - op::ClipScale, - ::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScale}}, - state, - params, - restructure, -) - q = restructure(params) - ϵ = convert(eltype(params), op.epsilon) - - # Project the scale matrix to the set of positive definite triangular matrices - diag_idx = diagind(q.dist.scale) - @. q.dist.scale[diag_idx] = max(q.dist.scale[diag_idx], ϵ) - - params, _ = Optimisers.destructure(q) - - return params -end - -function AdvancedVI.apply( - op::ClipScale, - ::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScaleLowRank}}, - state, - params, - restructure, -) - q = restructure(params) - ϵ = convert(eltype(params), op.epsilon) - - @. q.dist.scale_diag = max(q.dist.scale_diag, ϵ) - - params, _ = Optimisers.destructure(q) - - return params -end - -function AdvancedVI.apply( - ::AdvancedVI.ProximalLocationScaleEntropy, - ::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScale}}, - leaf::Optimisers.Leaf{<:Union{<:DoG,<:DoWG,<:Descent},S}, - params, - restructure, -) where {S} - q = restructure(params) - - stepsize = AdvancedVI.stepsize_from_optimizer_state(leaf.rule, leaf.state) - diag_idx = diagind(q.dist.scale) - scale_diag = q.dist.scale[diag_idx] - @. 
q.dist.scale[diag_idx] = - scale_diag + 1 / 2 * (sqrt(scale_diag^2 + 4 * stepsize) - scale_diag) - - params, _ = Optimisers.destructure(q) - - return params -end - -function AdvancedVI.reparam_with_entropy( - rng::Random.AbstractRNG, - q::Bijectors.TransformedDistribution, - q_stop::Bijectors.TransformedDistribution, - n_samples::Int, - ent_est::AdvancedVI.AbstractEntropyEstimator, -) - transform = q.transform - q_unconst = q.dist - q_unconst_stop = q_stop.dist - - # Draw samples and compute entropy of the unconstrained distribution - unconstr_samples, unconst_entropy = AdvancedVI.reparam_with_entropy( - rng, q_unconst, q_unconst_stop, n_samples, ent_est - ) - - # Apply bijector to samples while estimating its jacobian - unconstr_iter = AdvancedVI.eachsample(unconstr_samples) - unconstr_init = first(unconstr_iter) - samples_init, logjac_init = with_logabsdet_jacobian(transform, unconstr_init) - samples_and_logjac = mapreduce( - AdvancedVI.catsamples_and_acc, - Iterators.drop(unconstr_iter, 1); - init=(reshape(samples_init, (:, 1)), logjac_init), - ) do sample - with_logabsdet_jacobian(transform, sample) - end - samples = first(samples_and_logjac) - logjac = last(samples_and_logjac) / n_samples - - entropy = unconst_entropy + logjac - return samples, entropy -end - -end diff --git a/test/Project.toml b/test/Project.toml index db7d9068a..f657e28c1 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,6 +1,5 @@ [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" -Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" @@ -25,7 +24,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] ADTypes = "0.2.1, 1" -Bijectors = "0.13, 0.14, 0.15" DiffResults = "1" DifferentiationInterface = "0.6, 0.7" Distributions = "0.25.111" diff --git a/test/algorithms/klminrepgraddescent_bijectors.jl b/test/algorithms/klminrepgraddescent_bijectors.jl deleted file mode 100644 index 0d26a309a..000000000 --- a/test/algorithms/klminrepgraddescent_bijectors.jl +++ /dev/null @@ -1,94 +0,0 @@ - -@testset "KLMinRepGradDescent with Bijectors" begin - begin - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, n_dims, μ_true, L_true) = modelstats - - b = Bijectors.bijector(model) - binv = inverse(b) - - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - @testset "estimate_objective" begin - alg = KLMinRepGradDescent(AD; operator=ClipScale()) - q_true_unconstr = MeanFieldGaussian(Vector(μ_true), Diagonal(L_true)) - q_true = Bijectors.transformed(q_true_unconstr, binv) - - obj_est = estimate_objective(alg, q_true, model) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=1) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=3) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=10^5) - @test obj_est ≈ 0 atol=1e-2 - end - - @testset "determinism" begin - alg = KLMinRepGradDescent(AD; operator=ClipScale()) - - seed = (0x38bef07cf9cc549d) - rng = StableRNG(seed) - T = 10 - - q_out, _, _ = optimize(rng, alg, T, model, q0; show_progress=PROGRESS) - μ = q_out.dist.location - L = q_out.dist.scale - - rng_repl = StableRNG(seed) - q_out, _, _ = optimize(rng_repl, alg, T, model, q0; show_progress=PROGRESS) - μ_repl = q_out.dist.location - 
L_repl = q_out.dist.scale - @test μ == μ_repl - @test L == L_repl - end - - @testset "warn MvLocationScale with IdentityOperator" begin - @test_warn "IdentityOperator" begin - alg′ = KLMinRepGradDescent(AD; operator=IdentityOperator()) - optimize(alg′, 1, model, q0; show_progress=false) - end - end - end - - @testset "type stability realtype=$(realtype)" for realtype in [Float32, Float64] - modelstats = normallognormal_meanfield(Random.default_rng(), realtype) - (; model, n_dims, μ_true, L_true) = modelstats - - T = 1 - alg = KLMinRepGradDescent(AD; n_samples=10, operator=ClipScale()) - q0_unconstr = MeanFieldGaussian( - zeros(realtype, n_dims), Diagonal(ones(realtype, n_dims)) - ) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, info, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - @test eltype(q_out.dist.location) == realtype - @test eltype(q_out.dist.scale) == realtype - @test typeof(first(info).elbo) == realtype - end - - @testset "convergence $(entropy)" for entropy in - [ClosedFormEntropy(), StickingTheLandingEntropy()] - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, μ_true, L_true, is_meanfield) = modelstats - - T = 1000 - optimizer = Descent(1e-3) - alg = KLMinRepGradDescent(AD; entropy, optimizer, operator=ClipScale()) - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, _, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - Δλ0 = sum(abs2, q0.dist.location - μ_true) + sum(abs2, q0.dist.scale - L_true) - Δλ = sum(abs2, q_out.dist.location - μ_true) + sum(abs2, q_out.dist.scale - L_true) - - @test Δλ ≤ Δλ0/2 - end -end diff --git a/test/algorithms/klminrepgradproxdescent_bijectors.jl b/test/algorithms/klminrepgradproxdescent_bijectors.jl deleted file mode 100644 index d2714b5aa..000000000 --- a/test/algorithms/klminrepgradproxdescent_bijectors.jl +++ /dev/null @@ -1,89 +0,0 @@ - -@testset "KLMinRepGradProxDescent with Bijectors" begin - begin - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, n_dims, μ_true, L_true) = modelstats - - b = Bijectors.bijector(model) - binv = inverse(b) - - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - @testset "estimate_objective" begin - alg = KLMinRepGradProxDescent(AD) - q_true_unconstr = MeanFieldGaussian(Vector(μ_true), Diagonal(L_true)) - q_true = Bijectors.transformed(q_true_unconstr, binv) - - obj_est = estimate_objective(alg, q_true, model) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=1) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=3) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=10^5) - @test obj_est ≈ 0 atol=1e-3 - end - - @testset "determinism" begin - alg = KLMinRepGradProxDescent(AD) - - seed = (0x38bef07cf9cc549d) - rng = StableRNG(seed) - T = 10 - - q_out, _, _ = optimize(rng, alg, T, model, q0; show_progress=PROGRESS) - μ = q_out.dist.location - L = q_out.dist.scale - - rng_repl = StableRNG(seed) - q_out, _, _ = optimize(rng_repl, alg, T, model, q0; show_progress=PROGRESS) - μ_repl = q_out.dist.location - L_repl = q_out.dist.scale - @test μ == μ_repl - @test L == L_repl - end - end - - @testset "type stability realtype=$(realtype)" for realtype in [Float32, Float64] - modelstats = normallognormal_meanfield(Random.default_rng(), realtype) - (; model, n_dims, 
μ_true, L_true) = modelstats - - T = 1 - alg = KLMinRepGradProxDescent(AD; n_samples=10) - q0_unconstr = MeanFieldGaussian( - zeros(realtype, n_dims), Diagonal(ones(realtype, n_dims)) - ) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, info, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - @test eltype(q_out.dist.location) == realtype - @test eltype(q_out.dist.scale) == realtype - @test typeof(first(info).elbo) == realtype - end - - @testset "convergence $(entropy)" for entropy_zerograd in [ - ClosedFormEntropyZeroGradient(), StickingTheLandingEntropyZeroGradient() - ] - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, μ_true, L_true, is_meanfield) = modelstats - - T = 1000 - optimizer = Descent(1e-3) - alg = KLMinRepGradProxDescent(AD; n_samples=10, optimizer, entropy_zerograd) - - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, _, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - Δλ0 = sum(abs2, q0.dist.location - μ_true) + sum(abs2, q0.dist.scale - L_true) - Δλ = sum(abs2, q_out.dist.location - μ_true) + sum(abs2, q_out.dist.scale - L_true) - - @test Δλ ≤ Δλ0/2 - end -end diff --git a/test/algorithms/klminscoregraddescent_bijectors.jl b/test/algorithms/klminscoregraddescent_bijectors.jl deleted file mode 100644 index 782d60616..000000000 --- a/test/algorithms/klminscoregraddescent_bijectors.jl +++ /dev/null @@ -1,86 +0,0 @@ - -@testset "KLMinScoreGradDescent with Bijectors" begin - begin - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, n_dims, μ_true, L_true) = modelstats - - b = Bijectors.bijector(model) - binv = inverse(b) - - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - @testset "estimate_objective" begin - alg = KLMinScoreGradDescent(AD; n_samples=10, operator=ClipScale()) - q_true_unconstr = MeanFieldGaussian(Vector(μ_true), Diagonal(L_true)) - q_true = Bijectors.transformed(q_true_unconstr, binv) - - obj_est = estimate_objective(alg, q_true, model) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=1) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=3) - @test isfinite(obj_est) - - obj_est = estimate_objective(alg, q_true, model; n_samples=10^5) - @test obj_est ≈ 0 atol=1e-2 - end - - @testset "determinism" begin - alg = KLMinScoreGradDescent(AD; n_samples=10, operator=ClipScale()) - - seed = (0x38bef07cf9cc549d) - rng = StableRNG(seed) - T = 10 - - q_out, _, _ = optimize(rng, alg, T, model, q0; show_progress=PROGRESS) - μ = q_out.dist.location - L = q_out.dist.scale - - rng_repl = StableRNG(seed) - q_out, _, _ = optimize(rng_repl, alg, T, model, q0; show_progress=PROGRESS) - μ_repl = q_out.dist.location - L_repl = q_out.dist.scale - @test μ == μ_repl - @test L == L_repl - end - end - - @testset "type stability realtype=$(realtype)" for realtype in [Float32, Float64] - modelstats = normallognormal_meanfield(Random.default_rng(), realtype) - (; model, n_dims, μ_true, L_true) = modelstats - - T = 1 - alg = KLMinScoreGradDescent(AD; n_samples=10, operator=ClipScale()) - q0_unconstr = MeanFieldGaussian( - zeros(realtype, n_dims), Diagonal(ones(realtype, n_dims)) - ) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, info, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - @test eltype(q_out.dist.location) == realtype - @test 
eltype(q_out.dist.scale) == realtype - @test typeof(first(info).elbo) == realtype - end - - @testset "convergence" begin - modelstats = normallognormal_meanfield(Random.default_rng(), Float64) - (; model, n_dims, μ_true, L_true) = modelstats - - T = 1000 - optimizer = Descent(1e-3) - alg = KLMinScoreGradDescent(AD; n_samples=100, optimizer, operator=ClipScale()) - q0_unconstr = MeanFieldGaussian(zeros(n_dims), Diagonal(ones(n_dims))) - q0 = Bijectors.transformed(q0_unconstr, binv) - - q_out, _, _ = optimize(alg, T, model, q0; show_progress=PROGRESS) - - Δλ0 = sum(abs2, q0.dist.location - μ_true) + sum(abs2, q0.dist.scale - L_true) - Δλ = sum(abs2, q_out.dist.location - μ_true) + sum(abs2, q_out.dist.scale - L_true) - - @test Δλ ≤ Δλ0/2 - end -end diff --git a/test/models/normallognormal.jl b/test/models/normallognormal.jl deleted file mode 100644 index 7c9f29c10..000000000 --- a/test/models/normallognormal.jl +++ /dev/null @@ -1,62 +0,0 @@ - -struct NormalLogNormal{MX,SX,MY,SY} - μ_x::MX - σ_x::SX - μ_y::MY - Σ_y::SY -end - -function LogDensityProblems.logdensity(model::NormalLogNormal, θ) - (; μ_x, σ_x, μ_y, Σ_y) = model - return logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) -end - -function LogDensityProblems.logdensity_and_gradient(model::NormalLogNormal, θ) - return ( - LogDensityProblems.logdensity(model, θ), - ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), - ) -end - -function LogDensityProblems.dimension(model::NormalLogNormal) - return length(model.μ_y) + 1 -end - -function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) - return LogDensityProblems.LogDensityOrder{1}() -end - -function Bijectors.bijector(model::NormalLogNormal) - (; μ_x, σ_x, μ_y, Σ_y) = model - return Bijectors.Stacked( - Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), - [1:1, 2:(1 + length(μ_y))], - ) -end - -function normallognormal_fullrank(::Random.AbstractRNG, realtype::Type) - n_y_dims = 5 - - σ0 = realtype(0.3) - μ = Fill(realtype(5.0), n_y_dims + 1) - L = Matrix(σ0 * I, n_y_dims + 1, n_y_dims + 1) - Σ = Hermitian(L * L') - - model = NormalLogNormal( - μ[1], L[1, 1], μ[2:end], PDMat(Σ[2:end, 2:end], Cholesky(L[2:end, 2:end], 'L', 0)) - ) - return TestModel(model, μ, LowerTriangular(L), n_y_dims + 1, 1 / σ0^2, false) -end - -function normallognormal_meanfield(::Random.AbstractRNG, realtype::Type) - n_y_dims = 5 - - σ0 = realtype(0.3) - μ = Fill(realtype(5), n_y_dims + 1) - σ = Fill(σ0, n_y_dims + 1) - L = Diagonal(σ) - - model = NormalLogNormal(μ[1], σ[1], μ[2:end], Diagonal(σ[2:end] .^ 2)) - - return TestModel(model, μ, L, n_y_dims + 1, 1 / σ0^2, true) -end diff --git a/test/runtests.jl b/test/runtests.jl index 64371ca47..f1c6c8167 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -81,7 +81,4 @@ if GROUP == "All" || GROUP == "AD" include("algorithms/klminrepgraddescent.jl") include("algorithms/klminscoregraddescent.jl") include("algorithms/klminrepgradproxdescent.jl") - include("algorithms/klminrepgraddescent_bijectors.jl") - include("algorithms/klminrepgradproxdescent_bijectors.jl") - include("algorithms/klminscoregraddescent_bijectors.jl") end From 9dc43bb8892af54d4bc924ea5db63004ac578c72 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:15:06 -0500 Subject: [PATCH 02/26] fix remove use of Bijectors in tests --- bench/benchmarks.jl | 5 +-- test/general/clip_scale.jl | 31 +++---------------- .../proximal_location_scale_entropy.jl | 8 +---- test/runtests.jl | 1 - 4 files changed, 7 insertions(+), 38 
deletions(-) diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index 08f688be4..31e073c51 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -1,7 +1,6 @@ using ADTypes using AdvancedVI using BenchmarkTools -using Bijectors using Distributions using DistributionsAD using Enzyme, ForwardDiff, ReverseDiff, Zygote, Mooncake @@ -58,9 +57,7 @@ begin ), ] - b = Bijectors.bijector(prob) - binv = inverse(b) - q = Bijectors.TransformedDistribution(family, binv) + q = family alg = KLMinRepGradDescent(adtype; optimizer=opt, entropy, operator=ClipScale()) SUITES[probname][objname][familyname][adname] = begin diff --git a/test/general/clip_scale.jl b/test/general/clip_scale.jl index 42582e428..a7331d5b5 100644 --- a/test/general/clip_scale.jl +++ b/test/general/clip_scale.jl @@ -1,10 +1,8 @@ @testset "interface ClipScale" begin @testset "MvLocationScale" begin - @testset "$(string(covtype)) $(realtype) $(bijector)" for covtype in - [:meanfield, :fullrank], - realtype in [Float32, Float64], - bijector in [nothing, :identity] + @testset "$(string(covtype)) $(realtype)" for covtype in [:meanfield, :fullrank], + realtype in [Float32, Float64] d = 5 μ = zeros(realtype, d) @@ -16,28 +14,18 @@ L = Diagonal(ones(realtype, d)) MeanFieldGaussian(μ, L) end - q = if isnothing(bijector) - q - else - Bijectors.TransformedDistribution(q, identity) - end params, re = Optimisers.destructure(q) opt_st = Optimisers.setup(Descent(1e-2), params) params′ = AdvancedVI.apply(ClipScale(ϵ), typeof(q), opt_st, params, re) q′ = re(params′) - if isnothing(bijector) - @test all(var(q′) .≥ ϵ^2) - else - @test all(var(q′.dist) .≥ ϵ^2) - end + @test all(var(q′) .≥ ϵ^2) end end @testset "MvLocationScaleLowRank" begin - @testset "$(realtype) $(bijector)" for realtype in [Float32, Float64], - bijector in [nothing, :identity] + @testset "$(realtype) $(bijector)" for realtype in [Float32, Float64] n_rank = 2 d = 5 @@ -48,22 +36,13 @@ q = MvLocationScaleLowRank( μ, D, U, Normal{realtype}(zero(realtype), one(realtype)) ) - q = if isnothing(bijector) - q - else - Bijectors.TransformedDistribution(q, bijector) - end params, re = Optimisers.destructure(q) opt_st = Optimisers.setup(Descent(1e-2), params) params′ = AdvancedVI.apply(ClipScale(ϵ), typeof(q), opt_st, params, re) q′ = re(params′) - if isnothing(bijector) - @test all(var(q′) .≥ ϵ^2) - else - @test all(var(q′.dist) .≥ ϵ^2) - end + @test all(var(q′.dist) .≥ ϵ^2) end end end diff --git a/test/general/proximal_location_scale_entropy.jl b/test/general/proximal_location_scale_entropy.jl index fb58535c6..817c08941 100644 --- a/test/general/proximal_location_scale_entropy.jl +++ b/test/general/proximal_location_scale_entropy.jl @@ -3,8 +3,7 @@ @testset "MvLocationScale" begin @testset "$(string(covtype)) $(realtype) $(bijector)" for covtype in [:meanfield, :fullrank], - realtype in [Float32, Float64], - bijector in [nothing, :identity] + realtype in [Float32, Float64] stepsize = 1e-2 optimizer = Descent(stepsize) @@ -22,11 +21,6 @@ elseif covtype == :meanfield MeanFieldGaussian(μ, L) end - q = if isnothing(bijector) - q - else - Bijectors.TransformedDistribution(q, identity) - end # The proximal operator for the entropy of a location scale distribution # solves the subproblem: diff --git a/test/runtests.jl b/test/runtests.jl index f1c6c8167..3369ece67 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,6 @@ using Test: @testset, @test using ADTypes using Base.Iterators -using Bijectors using DiffResults using Distributions using FillArrays From 
9479ed4bfbd11b79fba01161d93f449cfdf60d33 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:15:16 -0500 Subject: [PATCH 03/26] fix typo in HISTORY --- HISTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 9172492f5..ab809aea0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,7 +4,7 @@ Previously, `KLMinRepGradDescent`, `KLMinRepGradProxDescent`, `KLMinScoreGradDescent` only required the support of the target log-density problem to match that of `q`. This was implemented by giving a special treatment to `q <: Bijectors.TransformedDistribution` through the `Bijectors` extension. -This, however, resulted in a multiplicative complexity to maintaining the relevant bits. +This, however, resulted in a multiplicative complexity in maintaining the relevant bits. Since this is not the only way to deal with constrained supports, `Bijectors` extension is now removed. In addition, `KLMinRepGradDescent`, `KLMinRepGradProxDescent`, `KLMinScoreGradDescent` now expect an unconstrained target log-density problem. Instead, a tutorial has been added to the documentation on how to deal with a target log-density problem with constrained support. From 0142a82f470c93dcbbfb5813d292f033db92c5bd Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:17:11 -0500 Subject: [PATCH 04/26] bump AdvancedVI version --- bench/Project.toml | 2 +- docs/Project.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bench/Project.toml b/bench/Project.toml index a63fcfde5..7a29a1618 100644 --- a/bench/Project.toml +++ b/bench/Project.toml @@ -19,7 +19,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] ADTypes = "1" -AdvancedVI = "0.6" +AdvancedVI = "0.7" BenchmarkTools = "1" Distributions = "0.25.111" DistributionsAD = "0.6" diff --git a/docs/Project.toml b/docs/Project.toml index 084e8744f..c76e68889 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -25,7 +25,7 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" [compat] ADTypes = "1" Accessors = "0.1" -AdvancedVI = "0.6" +AdvancedVI = "0.7" Bijectors = "0.13.6, 0.14, 0.15" DataFrames = "1" DifferentiationInterface = "0.7" From bb47ced708cef718cf6bed779cec406c1d6102e6 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:17:51 -0500 Subject: [PATCH 05/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- test/general/clip_scale.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/general/clip_scale.jl b/test/general/clip_scale.jl index a7331d5b5..1f87ecb9c 100644 --- a/test/general/clip_scale.jl +++ b/test/general/clip_scale.jl @@ -26,7 +26,6 @@ @testset "MvLocationScaleLowRank" begin @testset "$(realtype) $(bijector)" for realtype in [Float32, Float64] - n_rank = 2 d = 5 μ = zeros(realtype, d) From 6fa2bc30dfecccf3d2c2a89ec109a42984fee97e Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:18:16 -0500 Subject: [PATCH 06/26] run formatter --- bench/benchmarks.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index 31e073c51..ee2788c16 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -32,9 +32,7 @@ end begin T = Float64 - for (probname, prob) in [ - ("normal", normal(; n_dims=10, realtype=T)) - ] + for (probname, prob) in [("normal", normal(; n_dims=10, realtype=T))] max_iter = 10^4 d = LogDensityProblems.dimension(prob) opt = Optimisers.Adam(T(1e-3)) From 
e201345fa6f02e0a21b3800ea0a38d6eb1b6c61c Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:22:11 -0500 Subject: [PATCH 07/26] fix removed include to removed file --- test/runtests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 3369ece67..e3844deba 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -51,7 +51,6 @@ struct TestModel{M,L,S,SC} is_meanfield::Bool end include("models/normal.jl") -include("models/normallognormal.jl") include("models/subsamplednormals.jl") if GROUP == "All" || GROUP == "GENERAL" From 66a22a55ec4522063939e90bc1fad3e69917a3d4 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:27:35 -0500 Subject: [PATCH 08/26] fix remove erroenous references to bijector --- test/general/clip_scale.jl | 2 +- test/general/proximal_location_scale_entropy.jl | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/test/general/clip_scale.jl b/test/general/clip_scale.jl index 1f87ecb9c..4de055707 100644 --- a/test/general/clip_scale.jl +++ b/test/general/clip_scale.jl @@ -25,7 +25,7 @@ end @testset "MvLocationScaleLowRank" begin - @testset "$(realtype) $(bijector)" for realtype in [Float32, Float64] + @testset "$(realtype)" for realtype in [Float32, Float64] n_rank = 2 d = 5 μ = zeros(realtype, d) diff --git a/test/general/proximal_location_scale_entropy.jl b/test/general/proximal_location_scale_entropy.jl index 817c08941..86a5edaa3 100644 --- a/test/general/proximal_location_scale_entropy.jl +++ b/test/general/proximal_location_scale_entropy.jl @@ -1,8 +1,7 @@ @testset "interface ProximalLocationScaleEntropy" begin @testset "MvLocationScale" begin - @testset "$(string(covtype)) $(realtype) $(bijector)" for covtype in - [:meanfield, :fullrank], + @testset "$(string(covtype)) $(realtype)" for covtype in [:meanfield, :fullrank] realtype in [Float32, Float64] stepsize = 1e-2 @@ -43,7 +42,7 @@ ) q′ = re(params′) - scale′ = isnothing(bijector) ? 
q′.scale : q′.dist.scale + scale′ = q′.scale grad_left = ReverseDiff.gradient( L_ -> first(logabsdet(LowerTriangular(reshape(L_, d, d)))), vec(scale′) From f5f7fcb1014713cb12b62e3aae8ba7134eb52bf7 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:34:14 -0500 Subject: [PATCH 09/26] fix missing comma --- test/general/proximal_location_scale_entropy.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/general/proximal_location_scale_entropy.jl b/test/general/proximal_location_scale_entropy.jl index 86a5edaa3..16f90ab0f 100644 --- a/test/general/proximal_location_scale_entropy.jl +++ b/test/general/proximal_location_scale_entropy.jl @@ -1,7 +1,7 @@ @testset "interface ProximalLocationScaleEntropy" begin @testset "MvLocationScale" begin - @testset "$(string(covtype)) $(realtype)" for covtype in [:meanfield, :fullrank] + @testset "$(string(covtype)) $(realtype)" for covtype in [:meanfield, :fullrank], realtype in [Float32, Float64] stepsize = 1e-2 From 7a6e902a169455a3b2c70105d3962eb7c0a6ae4a Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:44:07 -0500 Subject: [PATCH 10/26] fix remove include to removed file --- bench/benchmarks.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index ee2788c16..8c01abe20 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -16,7 +16,6 @@ BLAS.set_num_threads(min(4, Threads.nthreads())) @info sprint(versioninfo) @info "BLAS threads: $(BLAS.get_num_threads())" -include("normallognormal.jl") include("unconstrdist.jl") const SUITES = BenchmarkGroup() From 2a1755a7d2253bd86057307cd1fe7047b3ed914d Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 11:51:51 -0500 Subject: [PATCH 11/26] update READMe --- README.md | 66 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 4732d5c6e..a18d51445 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,13 @@ [![Tests](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Tests.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Tests.yml/badge.svg?branch=main) [![Coverage](https://codecov.io/gh/TuringLang/AdvancedVI.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/TuringLang/AdvancedVI.jl) -| AD Backend | Integration Status | -| ------------- | ------------- | -| [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) | [![ForwardDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml?query=branch%3Amain) | -| [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl) | [![ReverseDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml?query=branch%3Amain) | -| [Zygote](https://github.com/FluxML/Zygote.jl) | [![Zygote](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml?query=branch%3Amain) | -| [Mooncake](https://github.com/chalk-lab/Mooncake.jl) | [![Mooncake](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml?query=branch%3Amain) | -| 
[Enzyme](https://github.com/EnzymeAD/Enzyme.jl) | [![Enzyme](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml?query=branch%3Amain) | +| AD Backend | Integration Status | +|:---------------------------------------------------------- |:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) | [![ForwardDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml?query=branch%3Amain) | +| [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl) | [![ReverseDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml?query=branch%3Amain) | +| [Zygote](https://github.com/FluxML/Zygote.jl) | [![Zygote](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml?query=branch%3Amain) | +| [Mooncake](https://github.com/chalk-lab/Mooncake.jl) | [![Mooncake](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml?query=branch%3Amain) | +| [Enzyme](https://github.com/EnzymeAD/Enzyme.jl) | [![Enzyme](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml?query=branch%3Amain) | # AdvancedVI.jl @@ -69,7 +69,7 @@ end; Since the support of `σ` is constrained to be positive and most VI algorithms assume an unconstrained Euclidean support, we need to use a *bijector* to transform `θ`. We will use [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) for this purpose. -This corresponds to the automatic differentiation variational inference (ADVI) formulation[^KTRGB2017]. +The bijector corresponding to the joint support of our model can be constructed as follows: ```julia using Bijectors: Bijectors @@ -85,6 +85,36 @@ end; A simpler approach would be to use [`Turing`](https://github.com/TuringLang/Turing.jl), where a `Turing.Model` can be automatically be converted into a `LogDensityProblem` and a corresponding `bijector` is automatically generated. +Since most VI algorithms assume that the posterior is unconstrained, we will apply a change-of-variable to our model to make it unconstrained. +This amounts to wrapping it into a `LogDensityProblem` that applies the transformation and apply a Jacobian adjustment. 
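For reference, the wrapper shown below implements the standard change-of-variables identity, stated here for completeness, where $b$ denotes the bijector mapping the constrained support to $\mathbb{R}^d$:

```math
\log \pi_{\mathrm{unconstrained}}(\eta)
= \log \pi\left(b^{-1}(\eta)\right)
+ \log \left\lvert \det J_{b^{-1}}(\eta) \right\rvert
```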
+ +```julia +struct TransformedLogDensityProblem{Prob,Trans} + prob::Prob + transform::Trans +end + +function TransformedLogDensityProblem(prob, transform) + return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) +end + +function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) + (; prob, transform) = prob_trans + θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) + return LogDensityProblems.logdensity(prob, θ) + logabsdetjac +end + +function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) + return LogDensityProblems.dimension(prob_trans.prob) +end + +function LogDensityProblems.capabilities( + ::Type{TransformedLogDensityProblem{Prob,Trans}} +) where {Prob,Trans} + return LogDensityProblems.capabilities(Prob) +end; +``` + For the dataset, we will use the popular [sonar classification dataset](https://archive.ics.uci.edu/dataset/151/connectionist+bench+sonar+mines+vs+rocks) from the UCI repository. This can be automatically downloaded using [`OpenML`](https://github.com/JuliaAI/OpenML.jl). The sonar dataset corresponds to the dataset id 40. @@ -109,7 +139,10 @@ X = hcat(X, ones(size(X, 1))); The model can now be instantiated as follows: ```julia -model = LogReg(X, y); +prob = LogReg(X, y); +b = Bijectors.bijector(prob) +binv = Bijectors.inverse(b) +prob_trans = TransformedLogDensityProblem(prob, binv) ``` For the VI algorithm, we will use `KLMinRepGradDescent`: @@ -136,7 +169,7 @@ For this, it is straightforward to use `LogDensityProblemsAD`: using DifferentiationInterface: DifferentiationInterface using LogDensityProblemsAD: LogDensityProblemsAD -model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model); +prob_trans_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), prob_trans); ``` For the variational family, we will consider a `FullRankGaussian` approximation: @@ -144,7 +177,7 @@ For the variational family, we will consider a `FullRankGaussian` approximation: ```julia using LinearAlgebra -d = LogDensityProblems.dimension(model_ad) +d = LogDensityProblems.dimension(prob_trans_ad) q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d))) q = MeanFieldGaussian(zeros(d), Diagonal(ones(d))); ``` @@ -161,7 +194,15 @@ We can now run VI: ```julia max_iter = 10^3 -q, info, _ = AdvancedVI.optimize(alg, max_iter, model_ad, q_transformed;); +q_opt, info, _ = AdvancedVI.optimize(alg, max_iter, prob_trans_ad, q); +``` + +Recall that we applied a change-of-variable to the posterior to make it unconstrained. +This, however, is not the original constrained posterior that we wanted to approximate. +Therefore, we finally need to apply a change-of-variable to `q_opt` to make it approximate our original problem. + +```julia +q_trans = Bijectors.TransformedDistribution(q, binv) ``` For more examples and details, please refer to the documentation. @@ -169,4 +210,3 @@ For more examples and details, please refer to the documentation. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014, June). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. PMLR. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014, June). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. PMLR. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. 
-[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of machine learning research*. From f23eb7add69f38412f47dc16cffce84ae865638a Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Fri, 21 Nov 2025 13:12:57 -0500 Subject: [PATCH 12/26] fix move benchmark model to main file --- bench/benchmarks.jl | 29 ++++++++++++++++++++++++++++- bench/unconstrdist.jl | 33 --------------------------------- 2 files changed, 28 insertions(+), 34 deletions(-) delete mode 100644 bench/unconstrdist.jl diff --git a/bench/benchmarks.jl b/bench/benchmarks.jl index 8c01abe20..a2819bd7d 100644 --- a/bench/benchmarks.jl +++ b/bench/benchmarks.jl @@ -16,7 +16,34 @@ BLAS.set_num_threads(min(4, Threads.nthreads())) @info sprint(versioninfo) @info "BLAS threads: $(BLAS.get_num_threads())" -include("unconstrdist.jl") +struct Dist{D<:ContinuousMultivariateDistribution} + dist::D +end + +function LogDensityProblems.logdensity(model::Dist, x) + return logpdf(model.dist, x) +end + +function LogDensityProblems.logdensity_and_gradient(model::Dist, θ) + return ( + LogDensityProblems.logdensity(model, θ), + ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), + ) +end + +function LogDensityProblems.dimension(model::Dist) + return length(model.dist) +end + +function LogDensityProblems.capabilities(::Type{<:Dist}) + return LogDensityProblems.LogDensityOrder{0}() +end + +function normal(; n_dims=10, realtype=Float64) + μ = fill(realtype(5), n_dims) + Σ = Diagonal(ones(realtype, n_dims)) + return Dist(MvNormal(μ, Σ)) +end const SUITES = BenchmarkGroup() diff --git a/bench/unconstrdist.jl b/bench/unconstrdist.jl deleted file mode 100644 index 164199f0b..000000000 --- a/bench/unconstrdist.jl +++ /dev/null @@ -1,33 +0,0 @@ - -struct UnconstrDist{D<:ContinuousMultivariateDistribution} - dist::D -end - -function LogDensityProblems.logdensity(model::UnconstrDist, x) - return logpdf(model.dist, x) -end - -function LogDensityProblems.logdensity_and_gradient(model::UnconstrDist, θ) - return ( - LogDensityProblems.logdensity(model, θ), - ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), - ) -end - -function LogDensityProblems.dimension(model::UnconstrDist) - return length(model.dist) -end - -function LogDensityProblems.capabilities(::Type{<:UnconstrDist}) - return LogDensityProblems.LogDensityOrder{0}() -end - -function Bijectors.bijector(model::UnconstrDist) - return identity -end - -function normal(; n_dims=10, realtype=Float64) - μ = fill(realtype(5), n_dims) - Σ = Diagonal(ones(realtype, n_dims)) - return UnconstrDist(MvNormal(μ, Σ)) -end From 4d8d95e217cb5ea46902e4ecc8b92d13addeda91 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 11:53:26 -0500 Subject: [PATCH 13/26] update docs to the new recommended use of Bijectors --- README.md | 6 +- docs/src/klminrepgraddescent.md | 130 ++++++------------------------ docs/src/tutorials/basic.md | 89 +++++++++++++------- docs/src/tutorials/flows.md | 94 +++++++++++++-------- docs/src/tutorials/subsampling.md | 73 +++++++---------- 5 files changed, 179 insertions(+), 213 deletions(-) diff --git a/README.md b/README.md index a18d51445..4e54b6e06 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ end; A simpler approach would be to use [`Turing`](https://github.com/TuringLang/Turing.jl), where a `Turing.Model` can be automatically be converted into a `LogDensityProblem` and a corresponding `bijector` is automatically generated. 
Since most VI algorithms assume that the posterior is unconstrained, we will apply a change-of-variable to our model to make it unconstrained.
-This amounts to wrapping it into a `LogDensityProblem` that applies the transformation and apply a Jacobian adjustment.
+This amounts to wrapping it into a `LogDensityProblem` that applies the transformation and the corresponding Jacobian adjustment.
@@ -178,7 +178,7 @@ For the variational family, we will consider a `FullRankGaussian` approximation:
using LinearAlgebra

d = LogDensityProblems.dimension(prob_trans_ad)
-q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d)))
+q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d)))
q = MeanFieldGaussian(zeros(d), Diagonal(ones(d)));
```
@@ -202,7 +202,7 @@ This, however, is not the original constrained posterior that we wanted to appro
Therefore, we finally need to apply a change-of-variable to `q_opt` to make it approximate our original problem.

```julia
-q_trans = Bijectors.TransformedDistribution(q, binv)
+q_trans = Bijectors.TransformedDistribution(q_opt, binv)
```

For more examples and details, please refer to the documentation.
diff --git a/docs/src/klminrepgraddescent.md b/docs/src/klminrepgraddescent.md
index c2b3f9264..64cb5f5bb 100644
--- a/docs/src/klminrepgraddescent.md
+++ b/docs/src/klminrepgraddescent.md
@@ -1,9 +1,7 @@
# [`KLMinRepGradDescent`](@id klminrepgraddescent)

-## Description
-
-This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters.
-Specifically, it uses the the *reparameterization gradient estimator*.
+This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters.
+Specifically, it uses the *reparameterization gradient* estimator.
As a result, this algorithm is best applicable when the target log-density is differentiable and the sampling process of the variational family is differentiable.
(See the [methodology section](@ref klminrepgraddescent_method) for more details.)
This algorithm is also commonly referred to as automatic differentiation variational inference, black-box variational inference with the reparameterization gradient, and stochastic gradient variational inference.
@@ -13,30 +11,15 @@ This algorithm is also commonly referred to as automatic differentiation variati
KLMinRepGradDescent
```

-The associated objective value can be estimated through the following:
-
-```@docs; canonical=false
-estimate_objective(
-    ::Random.AbstractRNG,
-    ::Union{<:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent},
-    ::Any,
-    ::Any;
-    ::Int,
-    ::AbstractEntropyEstimator,
-)
-```
-
## [Methodology](@id klminrepgraddescent_method)

-This algorithm aims to solve the problem
+This algorithm aims to solve the problem

```math
\mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right)
```

where $\mathcal{Q}$ is some family of distributions, often called the variational family, by running stochastic gradient descent in the (Euclidean) space of parameters.
-That is, for all $$q_{\lambda} \in \mathcal{Q}$$, we assume $$q_{\lambda}$$ there is a corresponding vector of parameters $$\lambda \in \Lambda$$, where the space of parameters is Euclidean such that $$\Lambda \subset \mathbb{R}^p$$.
- Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. Instead, the ELBO maximization strategy maximizes a surrogate objective, the *evidence lower bound* (ELBO; [^JGJS1999]) @@ -57,7 +40,7 @@ Algorithmically, `KLMinRepGradDescent` iterates the step where $\widehat{\nabla \mathrm{ELBO}}(q_{\lambda})$ is the reparameterization gradient estimate[^HC1983][^G1991][^R1992][^P1996] of the ELBO gradient and $$\mathrm{operator}$$ is an optional operator (*e.g.* projections, identity mapping). The reparameterization gradient, also known as the push-in gradient or the pathwise gradient, was introduced to VI in [^TL2014][^RMW2014][^KW2014]. -For the variational family $$\mathcal{Q}$$, suppose the process of sampling from $$q_{\lambda} \in \mathcal{Q}$$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that +For the variational family $\mathcal{Q} = \{q_{\lambda} \mid \lambda \in \Lambda\}$, suppose the process of sampling from $q_{\lambda}$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that ```math z \sim q_{\lambda} \qquad\Leftrightarrow\qquad @@ -80,50 +63,6 @@ where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. -## [Handling Constraints with `Bijectors`](@id bijectors) - -As mentioned in the docstring, `KLMinRepGradDescent` assumes that the variational approximation $$q_{\lambda}$$ and the target distribution $$\pi$$ have the same support for all $$\lambda \in \Lambda$$. -However, in general, it is most convenient to use variational families that have the whole Euclidean space $$\mathbb{R}^d$$ as their support. -This is the case for the [location-scale distributions](@ref locscale) provided by `AdvancedVI`. -For target distributions which the support is not the full $$\mathbb{R}^d$$, we can apply some transformation $$b$$ to $$q_{\lambda}$$ to match its support such that - -```math -z \sim q_{b,\lambda} \qquad\Leftrightarrow\qquad -z \stackrel{d}{=} b^{-1}\left(\eta\right);\quad \eta \sim q_{\lambda}, -``` - -where $$b$$ is often called a *bijector*, since it is often chosen among bijective transformations. -This idea is known as automatic differentiation VI[^KTRGB2017] and has subsequently been improved by Tensorflow Probability[^DLTBV2017]. -In Julia, [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl)[^FXTYG2020] provides a comprehensive collection of bijections. - -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. -[^DLTBV2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). Tensorflow distributions. arXiv. -[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020,. 
Bijectors. jl: Flexible transformations for probability distributions. In *Symposium on Advances in Approximate Bayesian Inference*. - One caveat of ADVI is that, after applying the bijection, a Jacobian adjustment needs to be applied. - That is, the objective is now -```math -\mathrm{ADVI}\left(\lambda\right) -\triangleq -\mathbb{E}_{\eta \sim q_{\lambda}}\left[ - \log \pi\left( b^{-1}\left( \eta \right) \right) - + \log \lvert J_{b^{-1}}\left(\eta\right) \rvert -\right] -+ \mathbb{H}\left(q_{\lambda}\right) -``` - -This is automatically handled by `AdvancedVI` through `TransformedDistribution` provided by `Bijectors.jl`. -See the following example: - -```julia -using Bijectors -q = MeanFieldGaussian(μ, L) -b = Bijectors.bijector(dist) -binv = inverse(b) -q_transformed = Bijectors.TransformedDistribution(q, binv) -``` - -By passing `q_transformed` to `optimize`, the Jacobian adjustment for the bijector `b` is automatically applied. -(See the [Basic Example](@ref basic) for a fully working example.) ## [Entropy Gradient Estimators](@id entropygrad) @@ -157,7 +96,6 @@ Depending on the variational family, this might be computationally inefficient o For example, if ``q_{\lambda}`` is a Gaussian with a full-rank covariance, a back-substitution must be performed at every step, making the per-iteration complexity ``\mathcal{O}(d^3)`` and reducing numerical stability. ```@setup repgradelbo -using Bijectors using FillArrays using LinearAlgebra using LogDensityProblems @@ -168,51 +106,38 @@ using Optimisers using ADTypes, ForwardDiff, ReverseDiff using AdvancedVI -struct NormalLogNormal{MX,SX,MY,SY} - μ_x::MX - σ_x::SX - μ_y::MY - Σ_y::SY +struct Dist{D} + dist::D end -function LogDensityProblems.logdensity(model::NormalLogNormal, θ) - (; μ_x, σ_x, μ_y, Σ_y) = model - logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) +function LogDensityProblems.logdensity(model::Dist, θ) + return logpdf(model.dist, θ) end -function LogDensityProblems.logdensity_and_gradient(model::NormalLogNormal, θ) +function LogDensityProblems.logdensity_and_gradient(model::Dist, θ) return ( LogDensityProblems.logdensity(model, θ), ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), ) end -function LogDensityProblems.dimension(model::NormalLogNormal) - length(model.μ_y) + 1 +function LogDensityProblems.dimension(model::Dist) + return length(model.dist) end -function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) +function LogDensityProblems.capabilities(::Type{<:Dist}) LogDensityProblems.LogDensityOrder{1}() end n_dims = 10 -μ_x = 2.0 -σ_x = 0.3 -μ_y = Fill(2.0, n_dims) -σ_y = Fill(1.0, n_dims) -model = NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y.^2)); +μ = Fill(2.0, n_dims) +σ = Fill(1.0, n_dims) +model = Dist(MvNormal(μ, Diagonal(σ.^2))); d = LogDensityProblems.dimension(model); -μ = zeros(d); -L = Diagonal(ones(d)); -q0 = AdvancedVI.MeanFieldGaussian(μ, L) - -function Bijectors.bijector(model::NormalLogNormal) - (; μ_x, σ_x, μ_y, Σ_y) = model - Bijectors.Stacked( - Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), - [1:1, 2:1+length(μ_y)]) -end +μ0 = zeros(d); +L0 = Diagonal(ones(d)); +q0 = AdvancedVI.MeanFieldGaussian(μ0, L0) ``` In this example, the true posterior is contained within the variational family. 
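A quick way to see this is the following sketch, which reuses `μ`, `σ`, and `model` from the setup above and assumes `mean` and `cov` are available for the location-scale family, as they are used elsewhere in these docs:

```julia
# The target is itself a diagonal Gaussian, so a mean-field Gaussian with matching
# location and scale reproduces its mean and covariance exactly.
q_star = AdvancedVI.MeanFieldGaussian(Vector(μ), Diagonal(Vector(σ)))
mean(q_star) ≈ mean(model.dist)  # expected to hold
cov(q_star) ≈ cov(model.dist)    # expected to hold
```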
@@ -223,10 +148,6 @@ Recall that the original ADVI objective with a closed-form entropy (CFE) is give ```@example repgradelbo n_montecarlo = 16; -b = Bijectors.bijector(model); -binv = inverse(b) - -q0_trans = Bijectors.TransformedDistribution(q0, binv) cfe = KLMinRepGradDescent( AutoReverseDiff(); @@ -250,12 +171,11 @@ nothing ``` ```@setup repgradelbo -max_iter = 3*10^3 +max_iter = 10^3 function callback(; params, restructure, kwargs...) - q = restructure(params).dist - dist2 = sum(abs2, q.location - vcat([μ_x], μ_y)) - + sum(abs2, diag(q.scale) - vcat(σ_x, σ_y)) + q = restructure(params) + dist2 = sum(abs2, q.location - μ) + sum(abs2, diag(q.scale) - σ) (dist = sqrt(dist2),) end @@ -263,7 +183,7 @@ _, info_cfe, _ = AdvancedVI.optimize( cfe, max_iter, model, - q0_trans; + q0; show_progress = false, callback = callback, ); @@ -272,7 +192,7 @@ _, info_stl, _ = AdvancedVI.optimize( stl, max_iter, model, - q0_trans; + q0; show_progress = false, callback = callback, ); @@ -281,7 +201,7 @@ _, info_stl, _ = AdvancedVI.optimize( stl, max_iter, model, - q0_trans; + q0; show_progress = false, callback = callback, ); @@ -364,7 +284,7 @@ _, info_qmc, _ = AdvancedVI.optimize( KLMinRepGradDescent(AutoReverseDiff(); n_samples=n_montecarlo, optimizer=Adam(1e-2), operator=ClipScale()), max_iter, model, - q0_trans; + q0; show_progress = false, callback = callback, ); diff --git a/docs/src/tutorials/basic.md b/docs/src/tutorials/basic.md index 8998b0789..53c9db794 100644 --- a/docs/src/tutorials/basic.md +++ b/docs/src/tutorials/basic.md @@ -50,14 +50,13 @@ end nothing ``` -Since the support of `σ` is constrained to be positive and most VI algorithms assume an unconstrained Euclidean support, we need to use a *bijector* to transform `θ`. -We will use [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) for this purpose. -This corresponds to the automatic differentiation variational inference (ADVI) formulation[^KTRGB2017]. +Notice that the support of `σ` is constrained to be positive. +Since most VI algorithms assume an unconstrained Euclidean support, we will apply a change-of-variable by a bijective map (a *bijector*) to the posterior to make it unconstrained. +For this, we will use the [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) package. In our case, we need a bijector that applies an identity map for the first `size(X,2)` coordinates, and map the last coordinate to the support of `LogNormal(0, 3)`. -This can be done as follows: +This can be constructed as follows: -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of machine learning research*. ```@example basic using Bijectors: Bijectors @@ -71,7 +70,37 @@ end nothing ``` -For more details, please refer to the documentation of [`Bijectors`](https://github.com/TuringLang/Bijectors.jl). +Next, we will wrap our original `LogDensityProblem` into a new `LogDensityProblem` that applies the transformation and the corresponding Jacobian adjustment. 
+ +```@example basic +struct TransformedLogDensityProblem{Prob,Trans} + prob::Prob + transform::Trans +end + +function TransformedLogDensityProblem(prob, transform) + return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) +end + +function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) + (; prob, transform) = prob_trans + θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) + return LogDensityProblems.logdensity(prob, θ) + logabsdetjac +end + +function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) + return LogDensityProblems.dimension(prob_trans.prob) +end + +function LogDensityProblems.capabilities( + ::Type{TransformedLogDensityProblem{Prob,Trans}} +) where {Prob,Trans} + return LogDensityProblems.capabilities(Prob) +end +nothing +``` + +For more details on the usage of bijectors, please refer to the documentation of [`Bijectors`](https://github.com/TuringLang/Bijectors.jl). For the dataset, we will use the popular [sonar classification dataset](https://archive.ics.uci.edu/dataset/151/connectionist+bench+sonar+mines+vs+rocks) from the UCI repository. This can be automatically downloaded using [`OpenML`](https://github.com/JuliaAI/OpenML.jl). @@ -99,7 +128,10 @@ nothing The model can now be instantiated as follows: ```@example basic -model = LogReg(X, y) +prob = LogReg(X, y); +b = Bijectors.bijector(prob) +binv = Bijectors.inverse(b) +prob_trans = TransformedLogDensityProblem(prob, binv) nothing ``` @@ -134,7 +166,7 @@ For this example, we will use `LogDensityProblemsAD` to equip our problem with a using DifferentiationInterface: DifferentiationInterface using LogDensityProblemsAD: LogDensityProblemsAD -model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model) +prob_trans_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoForwardDiff(), prob_trans) nothing ``` @@ -143,29 +175,20 @@ For the variational family, we will consider a `FullRankGaussian` approximation: ```@example basic using LinearAlgebra -d = LogDensityProblems.dimension(model_ad) -q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d))) +d = LogDensityProblems.dimension(prob_trans) +q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d))) nothing ``` -Now, `KLMinRepGradDescent` requires the variational approximation and the target log-density to have the same support. -Since `y` follows a log-normal prior, its support is bounded to be the positive half-space ``\mathbb{R}_+``. -Thus, we will use [Bijectors](https://github.com/TuringLang/Bijectors.jl) to match the support of our target posterior and the variational approximation. -The bijector can now be applied to `q` to match the support of the target problem. - -```@example basic -b = Bijectors.bijector(model) -binv = Bijectors.inverse(b) -q_transformed = Bijectors.TransformedDistribution(q, binv) -nothing -``` +`KLMinRepGradDescent` requires the variational approximation and the target log-density to have unconstrained support. +Since we applied a bijector to the posterior to make it unconstrained, and the support of a `FullRankGaussian` is already the full $\mathbb{R}^d$, we are ready to go. 
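As a quick sanity check that the wrapped problem really is unconstrained, its log-density should be finite at any point of $\mathbb{R}^d$, including points whose last coordinate is negative. A minimal sketch, assuming `prob_trans` and the `LogDensityProblems` import from the snippets above:

```julia
# Evaluate the transformed log-density at an arbitrary unconstrained point.
θ_test = randn(LogDensityProblems.dimension(prob_trans))
isfinite(LogDensityProblems.logdensity(prob_trans, θ_test))  # expected: true
```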
We can now run VI: ```@example basic max_iter = 10^4 q_out, info, _ = AdvancedVI.optimize( - alg, max_iter, model_ad, q_transformed; show_progress=false + alg, max_iter, prob_trans_ad, q; show_progress=false ) nothing ``` @@ -190,6 +213,15 @@ nothing ![](basic_example_elbo.svg) +Recall that we applied a change-of-variable to the posterior to make it unconstrained. +This, however, is not the original constrained posterior that we wanted to approximate. +Therefore, we finally need to apply a change-of-variable to `q_out` to make it approximate our original problem: + +```julia +q_trans = Bijectors.TransformedDistribution(q_out, binv) +``` + + ## Custom Callback The ELBO estimates above however, use only a handful of Monte Carlo samples. @@ -210,7 +242,7 @@ using StatsFuns: StatsFuns Approximate the posterior predictive probability for a logistic link function using Mackay's approximation (Bishop p. 220). """ function logistic_prediction(X, μ_β, Σ_β) - xtΣx = sum((model.X*Σ_β) .* model.X; dims=2)[:, 1] + xtΣx = sum((prob.X*Σ_β) .* prob.X; dims=2)[:, 1] κ = @. 1/sqrt(1 + π/8*xtΣx) return StatsFuns.logistic.(κ .* X*μ_β) end @@ -221,18 +253,19 @@ function callback(; iteration, averaged_params, restructure, kwargs...) # Use the averaged parameters (the eventual output of the algorithm) q_avg = restructure(averaged_params) + q_avg_trans = Bijectors.TransformedDistribution(q_avg, binv) # Compute predictions - μ_β = mean(q_avg.dist)[1:(end - 1)] # posterior mean of β - Σ_β = cov(q_avg.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β + μ_β = mean(q_avg_trans.dist)[1:(end - 1)] # posterior mean of β + Σ_β = cov(q_avg_trans.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β y_pred = logistic_prediction(X, μ_β, Σ_β) .> 0.5 # Prediction accuracy - acc = mean(y_pred .== model.y) + acc = mean(y_pred .== prob.y) # Higher fidelity estimate of the ELBO on the averaged parameters n_samples = 256 - elbo_callback = -estimate_objective(alg, q_avg, model; n_samples) + elbo_callback = -estimate_objective(alg, q_avg, prob_trans; n_samples) (elbo_callback=elbo_callback, accuracy=acc) else @@ -250,7 +283,7 @@ The `callback` can be supplied to `optimize`: ```@example basic max_iter = 10^4 q_out, info, _ = AdvancedVI.optimize( - alg, max_iter, model_ad, q_transformed; show_progress=false, callback=callback + alg, max_iter, prob_trans_ad, q; show_progress=false, callback=callback ) nothing ``` diff --git a/docs/src/tutorials/flows.md b/docs/src/tutorials/flows.md index 6334c946c..ae44758ce 100644 --- a/docs/src/tutorials/flows.md +++ b/docs/src/tutorials/flows.md @@ -22,7 +22,6 @@ Multiplicative degeneracy is not entirely made up and do come up in some models For example, in the 3-parameter (3-PL) item-response theory model and the N-mixture model used for estimating animal population. 
```@example flow -using Bijectors: Bijectors using Distributions using LogDensityProblems: LogDensityProblems @@ -56,7 +55,7 @@ Degenerate posteriors often indicate that there is not enough data to pin-point Therefore, for the purpose of illustration, we will use a single data point: ```@example flow -model = MultDegen([3.0]) +prob = MultDegen([3.0]) nothing ``` @@ -68,7 +67,7 @@ using Plots contour( range(0, 4; length=64), range(-3, 25; length=64), - (x, y) -> LogDensityProblems.logdensity(model, [x, y]); + (x, y) -> LogDensityProblems.logdensity(prob, [x, y]); xlabel="α", ylabel="β", clims=(-8, Inf), @@ -86,19 +85,6 @@ This sort of nonlinear correlation structure is difficult to model using only lo ## Gaussian Variational Family As usual, let's try to fit a multivariate Gaussian to this posterior. - -```@example flow -using ADTypes: ADTypes -using ReverseDiff: ReverseDiff -using DifferentiationInterface: DifferentiationInterface -using LogDensityProblemsAD: LogDensityProblemsAD - -model_ad = LogDensityProblemsAD.ADgradient( - ADTypes.AutoReverseDiff(; compile=true), model; x=[1.0, 1.0] -) -nothing -``` - Since $\alpha$ is constrained to the positive real half-space, we have to employ bijectors. For this, we use [Bijectors](https://github.com/TuringLang/Bijectors.jl): @@ -113,28 +99,72 @@ end nothing ``` +to transform the posterior to be unconstrained: + +```@example flow +struct TransformedLogDensityProblem{Prob,Trans} + prob::Prob + transform::Trans +end + +function TransformedLogDensityProblem(prob, transform) + return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) +end + +function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) + (; prob, transform) = prob_trans + θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) + return LogDensityProblems.logdensity(prob, θ) + logabsdetjac +end + +function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) + return LogDensityProblems.dimension(prob_trans.prob) +end + +function LogDensityProblems.capabilities( + ::Type{TransformedLogDensityProblem{Prob,Trans}} +) where {Prob,Trans} + return LogDensityProblems.capabilities(Prob) +end +nothing +``` + +Let's instantiate the model: + +```@example flow +using ADTypes: ADTypes +using ReverseDiff: ReverseDiff +using DifferentiationInterface: DifferentiationInterface +using LogDensityProblemsAD: LogDensityProblemsAD + +binv = Bijectors.inverse(Bijectors.bijector(prob)) +prob_trans = TransformedLogDensityProblem(prob, binv) +prob_trans_ad = LogDensityProblemsAD.ADgradient( + ADTypes.AutoForwardDiff(), prob_trans; x=[1.0, 1.0] +) +nothing +``` + For the algorithm, we will use the `KLMinRepGradProxDescent` objective. 
```@example flow using AdvancedVI using LinearAlgebra -d = LogDensityProblems.dimension(model_ad) +d = LogDensityProblems.dimension(prob_trans_ad) q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(I, d, d))) -binv = Bijectors.inverse(Bijectors.bijector(model)) -q_trans = Bijectors.TransformedDistribution(q, binv) - max_iter = 3*10^3 alg = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true)) -q_out, info, _ = AdvancedVI.optimize(alg, max_iter, model_ad, q_trans; show_progress=false) +q_out, info, _ = AdvancedVI.optimize(alg, max_iter, prob_trans_ad, q; show_progress=false) +q_out_trans = Bijectors.TransformedDistribution(q_out, binv) nothing ``` The resulting variational posterior can be visualized as follows: ```@example flow -samples = rand(q_out, 10000) +samples = rand(q_out_trans, 10000) histogram2d( samples[1, :], samples[2, :]; @@ -177,16 +207,7 @@ q_flow = realnvp(MvNormal(zeros(d), I), hidden_dims, n_layers; paramtype=Float64 nothing ``` -Recall that out posterior is constrained. -In most cases, flows assume an unconstrained support. -Therefore, just as with the Gaussian variational family, we can incorporate `Bijectors` to match the supports: - -```@example flow -q_flow_trans = Bijectors.TransformedDistribution(q_flow, binv) -nothing -``` - -For the variational inference algorithms, we will similarly minimize the KL divergence with stochastic gradient descent as originally proposed by Rezende and Mohamed[^RM2015]. +For the variational inference algorithm, we will similarly minimize the KL divergence with stochastic gradient descent as originally proposed by Rezende and Mohamed[^RM2015]. For this, however, we need to be mindful of the requirements of the variational algorithm. The default `entropy` gradient estimator of `KLMinRepGradDescent` is `ClosedFormEntropy()`, which assumes that the entropy of the variational family `entropy(q)` is available. For flows, the entropy is (usually) not available. Instead, we can use any gradient estimator that only relies on the log-density of the variational family `logpdf(q)`, `StickingTheLandingEntropy()` or `MonteCarloEntropy()`. @@ -199,9 +220,12 @@ Furthermore, Agrawal *et al.*[^AD2025] claim that using a larger number of Monte [^ASD2020]: Agrawal, A., Sheldon, D. R., & Domke, J. (2020). Advances in black-box VI: Normalizing flows, importance weighting, and optimization. In *Advances in Neural Information Processing Systems*, 33, 17358-17369. [^AD2025]: Agrawal, A., & Domke, J. (2024). Disentangling impact of capacity, objective, batchsize, estimators, and step-size on flow VI. In *Proceedings of the International Conference on Artificial Intelligence and Statistics*. 
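To see why the choice of entropy estimator matters here, note that a flow exposes `logpdf` (which is all that `StickingTheLandingEntropy()` and `MonteCarloEntropy()` require) but generally no closed-form `entropy`. A small sketch, assuming `q_flow` constructed above:

```julia
# Flows support density evaluation but not closed-form entropy.
z = rand(q_flow)
logpdf(q_flow, z)   # available; this is what the STL/Monte Carlo estimators rely on
# entropy(q_flow)   # no closed form is available for a RealNVP flow
```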
```@example flow +using Optimisers: Optimisers + alg_flow = KLMinRepGradDescent( ADTypes.AutoReverseDiff(; compile=true); n_samples=8, + optimizer=Optimisers.Adam(1e-2), operator=IdentityOperator(), entropy=StickingTheLandingEntropy(), ) @@ -211,9 +235,11 @@ nothing Without further due, let's now run VI: ```@example flow +max_iter = 300 q_flow_out, info_flow, _ = AdvancedVI.optimize( - alg_flow, max_iter, model_ad, q_flow_trans; show_progress=false + alg_flow, max_iter, prob_trans_ad, q_flow; show_progress=false ) +q_flow_out_trans = Bijectors.TransformedDistribution(q_flow_out, binv) nothing ``` @@ -230,7 +256,7 @@ nothing Finally, let's visualize the variational posterior: ```@example flow -samples = rand(q_flow_out, 10000) +samples = rand(q_flow_out_trans, 10000) histogram2d( samples[1, :], samples[2, :]; diff --git a/docs/src/tutorials/subsampling.md b/docs/src/tutorials/subsampling.md index 1816efb44..4f03bd757 100644 --- a/docs/src/tutorials/subsampling.md +++ b/docs/src/tutorials/subsampling.md @@ -10,7 +10,7 @@ In this tutorial, we will see how to perform subsampling with `KLMinRepGradProxD [^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. ## Setting Up Subsampling -We will consider the same hierarchical logistic regression example used in the [Basic Example](@ref basic). +We will consider a typical hierarchical logistic regression example. ```@example subsampling using LogDensityProblems: LogDensityProblems @@ -26,10 +26,11 @@ end function LogDensityProblems.logdensity(model::LogReg, θ) (; X, y, n_data) = model n, d = size(X) - β, σ = θ[1:size(X, 2)], θ[end] + β, logσ = θ[1:size(X, 2)], θ[end] + σ = exp(logσ) logprior_β = logpdf(MvNormal(Zeros(d), σ), β) - logprior_σ = logpdf(LogNormal(0, 3), σ) + logprior_σ = logpdf(Normal(0, 3), σ) logit = X*β loglike_y = mapreduce((li, yi) -> logpdf(BernoulliLogit(li), yi), +, logit, y) @@ -37,7 +38,7 @@ function LogDensityProblems.logdensity(model::LogReg, θ) end function LogDensityProblems.dimension(model::LogReg) - return size(model.X, 2) + 1 + return size(prob.X, 2) + 1 end function LogDensityProblems.capabilities(::Type{<:LogReg}) @@ -50,21 +51,6 @@ Notice that, to use subsampling, we need be able to rescale the likelihood stren That is, for the gradient of the log-density with a batch of data points of size `n` to be an unbiased estimate of the gradient using the full dataset of size `n_data`, we need to scale the likelihood by `n_data/n`. This part is critical to ensure that the algorithm correctly approximates the posterior with the full dataset. -As usual, we will set up a bijector: - -```@example subsampling -using Bijectors: Bijectors - -function Bijectors.bijector(model::LogReg) - d = size(model.X, 2) - return Bijectors.Stacked( - Bijectors.bijector.([MvNormal(Zeros(d), 1.0), LogNormal(0, 3)]), - [1:d, (d + 1):(d + 1)], - ) -end -nothing -``` - For the dataset, we will use one that is larger than that used in the [Basic Example](@ref basic). This is to properly assess the advantage of subsampling. In particular, we will utilize the "Phishing" dataset[^Tan2018], which consists of 10000 data points, each with 48 features. 
@@ -100,26 +86,30 @@ Let's now istantiate the model and set up automatic differentiation using [`LogD using ADTypes, ReverseDiff using LogDensityProblemsAD -model = LogReg(X, y, size(X, 1)) -model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model) +prob = LogReg(X, y, size(X, 1)) +prob_ad = LogDensityProblemsAD.ADgradient( + ADTypes.AutoReverseDiff(), + prob, + x=randn(LogDensityProblems.dimension(prob)) +) nothing ``` To enable subsampling, `LogReg` has to implement the method `AdvancedVI.subsample`. For our model, this is fairly simple: We only need to select the rows of `X` and the elements of `y` corresponding to the batch of data points. -As subtle point here is that we wrapped `model` with `LogDensityProblemsAD.ADgradient` into `model_ad`. -Therefore, `AdvancedVI` sees `model_ad` and not `model`. -This means we have to specialize `AdvancedVI.subsample` to `typeof(model_ad)` and not `LogReg`. +As subtle point here is that we wrapped `prob` with `LogDensityProblemsAD.ADgradient` into `prob_ad`. +Therefore, `AdvancedVI` sees `prob_ad` and not `prob`. +This means we have to specialize `AdvancedVI.subsample` to `typeof(prob_ad)` and not `LogReg`. ```@example subsampling using Accessors using AdvancedVI -function AdvancedVI.subsample(model::typeof(model_ad), idx) - (; X, y, n_data) = parent(model) - model′ = @set model.ℓ.X = X[idx, :] - model′′ = @set model′.ℓ.y = y[idx] - return model′′ +function AdvancedVI.subsample(prob::typeof(prob_ad), idx) + (; X, y, n_data) = parent(prob_ad) + prob′ = @set prob.ℓ.X = X[idx, :] + prob′′ = @set prob′.ℓ.y = y[idx] + return prob′′ end nothing ``` @@ -138,7 +128,7 @@ Here, we will use `ReshufflingBatchSubsampling`, which implements random reshuff We will us a batch size of 32, which results in `313 = length(subsampling) = ceil(Int, size(X,2)/32)` steps per epoch. ```@example subsampling -dataset = 1:size(model.X, 1) +dataset = 1:size(prob.X, 1) batchsize = 32 subsampling = ReshufflingBatchSubsampling(dataset, batchsize) alg_sub = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true); subsampling) @@ -160,7 +150,7 @@ nothing If we don't supply a subsampling strategy to `KLMinRepGradProxDescent`, subsampling will not be used. ```@example subsampling -alg_full = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true)) +alg_full = KLMinRepGradProxDescent(ADTypesAutoReverseDiff(; compile=true)) nothing ``` @@ -169,11 +159,8 @@ The variational family will be set up as follows: ```@example subsampling using LinearAlgebra -d = LogDensityProblems.dimension(model_ad) -q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d))) -b = Bijectors.bijector(model) -binv = Bijectors.inverse(b) -q_transformed = Bijectors.TransformedDistribution(q, binv) +d = LogDensityProblems.dimension(prob_ad) +q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d))) nothing ``` @@ -192,7 +179,7 @@ time_begin = nothing Approximate the posterior predictive probability for a logistic link function using Mackay's approximation (Bishop p. 220). """ function logistic_prediction(X, μ_β, Σ_β) - xtΣx = sum((model.X*Σ_β) .* model.X; dims=2)[:, 1] + xtΣx = sum((prob.X*Σ_β) .* prob.X; dims=2)[:, 1] κ = @. 1/sqrt(1 + π/8*xtΣx) return StatsFuns.logistic.(κ .* X*μ_β) end @@ -204,16 +191,16 @@ function callback(; iteration, averaged_params, restructure, kwargs...) 
q_avg = restructure(averaged_params) # Compute predictions using - μ_β = mean(q_avg.dist)[1:(end - 1)] # posterior mean of β - Σ_β = cov(q_avg.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β + μ_β = mean(q_avg)[1:(end - 1)] # posterior mean of β + Σ_β = cov(q_avg)[1:(end - 1), end - 1] # marginal posterior covariance of β y_pred = logistic_prediction(X, μ_β, Σ_β) .> 0.5 # Prediction accuracy - acc = mean(y_pred .== model.y) + acc = mean(y_pred .== prob.y) # Higher fidelity estimate of the ELBO on the averaged parameters n_samples = 256 - elbo_callback = -estimate_objective(alg_full, q_avg, model; n_samples) + elbo_callback = -estimate_objective(alg_full, q_avg, prob; n_samples) (elbo_callback=elbo_callback, accuracy=acc, time_elapsed=time() - time_begin) else @@ -223,12 +210,12 @@ end time_begin = time() _, info_full, _ = AdvancedVI.optimize( - alg_full, max_iter, model_ad, q_transformed; show_progress=false, callback + alg_full, max_iter, prob_ad, q; show_progress=true, callback ); time_begin = time() _, info_sub, _ = AdvancedVI.optimize( - alg_sub, max_iter, model_ad, q_transformed; show_progress=false, callback + alg_sub, max_iter, prob_ad, q; show_progress=true, callback ); nothing ``` From fba3d9fa362f96103c8ac7e7b572aed70f982e13 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 12:25:19 -0500 Subject: [PATCH 14/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/klminrepgraddescent.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/src/klminrepgraddescent.md b/docs/src/klminrepgraddescent.md index 64cb5f5bb..c239f3e4e 100644 --- a/docs/src/klminrepgraddescent.md +++ b/docs/src/klminrepgraddescent.md @@ -63,7 +63,6 @@ where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. - ## [Entropy Gradient Estimators](@id entropygrad) For the gradient of the entropy term, we provide three choices with varying requirements. From 4daaa791fe658df89e35911422719cb0d53c343f Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 12:25:34 -0500 Subject: [PATCH 15/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/tutorials/basic.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/src/tutorials/basic.md b/docs/src/tutorials/basic.md index 53c9db794..713d89904 100644 --- a/docs/src/tutorials/basic.md +++ b/docs/src/tutorials/basic.md @@ -221,7 +221,6 @@ Therefore, we finally need to apply a change-of-variable to `q_out` to make it a q_trans = Bijectors.TransformedDistribution(q_out, binv) ``` - ## Custom Callback The ELBO estimates above however, use only a handful of Monte Carlo samples. 
From a0f13d54f9232cf58ffb5673e29b192a6889bedc Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 12:26:10 -0500 Subject: [PATCH 16/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/tutorials/basic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/tutorials/basic.md b/docs/src/tutorials/basic.md index 713d89904..cf2d44a3a 100644 --- a/docs/src/tutorials/basic.md +++ b/docs/src/tutorials/basic.md @@ -50,7 +50,7 @@ end nothing ``` -Notice that the support of `σ` is constrained to be positive. +Notice that the support of `σ` is constrained to be positive. Since most VI algorithms assume an unconstrained Euclidean support, we will apply a change-of-variable by a bijective map (a *bijector*) to the posterior to make it unconstrained. For this, we will use the [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) package. From 9a64db7bc3a30df5004999088d0b1751dd307365 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 12:26:22 -0500 Subject: [PATCH 17/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/tutorials/subsampling.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/src/tutorials/subsampling.md b/docs/src/tutorials/subsampling.md index 4f03bd757..eab0fbf7f 100644 --- a/docs/src/tutorials/subsampling.md +++ b/docs/src/tutorials/subsampling.md @@ -88,9 +88,7 @@ using LogDensityProblemsAD prob = LogReg(X, y, size(X, 1)) prob_ad = LogDensityProblemsAD.ADgradient( - ADTypes.AutoReverseDiff(), - prob, - x=randn(LogDensityProblems.dimension(prob)) + ADTypes.AutoReverseDiff(), prob; x=randn(LogDensityProblems.dimension(prob)) ) nothing ``` From 048a31028bb49670d95caabbf16dad9d23ab3090 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 13:37:23 -0500 Subject: [PATCH 18/26] add constraint tutorial --- docs/make.jl | 1 + docs/src/tutorials/constrained.md | 275 ++++++++++++++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 docs/src/tutorials/constrained.md diff --git a/docs/make.jl b/docs/make.jl index 34c0080d3..8b8c9f892 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -21,6 +21,7 @@ makedocs(; "Scaling to Large Datasets" => "tutorials/subsampling.md", "Stan Models" => "tutorials/stan.md", "Normalizing Flows" => "tutorials/flows.md", + "Dealing with Constrained Posteriors" => "tutorials/constrained.md" ], "Algorithms" => [ "`KLMinRepGradDescent`" => "klminrepgraddescent.md", diff --git a/docs/src/tutorials/constrained.md b/docs/src/tutorials/constrained.md new file mode 100644 index 000000000..e1539fe39 --- /dev/null +++ b/docs/src/tutorials/constrained.md @@ -0,0 +1,275 @@ +# [Dealing with Constrained Posteriors](@id constrained) + +In this tutorial, we will demonstrate how to deal with constrained posteriors in more detail. +Formally, by constrained posteriors, we mean that the target posterior has a density defined over a space that does not span the "full" Euclidean space $\mathbb{R}^d$: +```math +\pi : \mathcal{X} \to \mathbb{R}_{> 0} , +``` +where $\mathcal{X} \subset \mathbb{R}^d$ but not $\mathcal{X} = \mathbb{R}^d$. + +For instance, consider the basic hierarchical model for estimating the mean of the data $y_1, \ldots, y_n$: +```math +\begin{aligned} + \sigma &\sim \operatorname{LogNormal}(\alpha, \beta) \\ + \mu &\sim \operatorname{Normal}(0, \sigma) \\ + y_i &\sim \operatorname{Normal}(\mu, \sigma) . 
+\end{aligned} +``` +The corresponding posterior +```math +\pi(\mu, \sigma \mid y_1, \ldots, y_n) += +\operatorname{LogNormal}(\sigma; \alpha, \beta) +\operatorname{Normal}(\mu; 0, \sigma) +\prod_{i=1}^n \operatorname{Normal}(y_i; \mu, \sigma) +``` +has a density with respect to the space +```math + \mathcal{X} = \mathbb{R}_{> 0} \times \mathbb{R} . +``` +There are also more complicated examples of constrained spaces. +For example, a $k$-dimensional variable with a Dirichlet prior will be constrained to live on a $k$-dimensional simplex. + +Now, most algorithms provided by `AdvancedVI`, such as: + +- `KLMinRepGradDescent` +- `KLMinRepGradProxDescent` +- `KLMinNaturalGradDescent` +- `FisherMinBatchMatch` + +tend to assume the target posterior is defined over the whole Euclidean space $\mathbb{R}^d$. +Therefore, to apply these algorithms, we need to do something about the constraints. +We will describe some recommended ways of doing this. + +## Transforming the Posterior +The most widely applicable way is to transform the posterior $\pi : \mathcal{X} \to \mathbb{R}_{>0}$ to be unconstrained. +That is, consider some bijective map $b : \mathcal{X} \to \mathbb{R}^{d}$ between the $\mathcal{X}$ and the associated Euclidean space $\mathbb{R}^{d}$. +Using the inverse of the map $b^{-1}$ and its Jacobian $\mathrm{J}_{b^{-1}}$, we can apply a change of variable to the posterior and obtain its unconstrained counterpart +```math +\pi_{b^{-1}}(\eta) : \mathbb{R}^d \to \mathbb{R}_{>0} = \pi(b^{-1}(\eta)) {\lvert \mathrm{J}_{b^{-1}}(\eta) \rvert} . +``` +This idea popularized by Stan[^CGHetal2017] and Tensorflow probability[^DLTetal2017] is, in fact, how most probabilistic programming frameworks enable the use of off-the-shelf Markov chain Monte Carlo algorithms. +In the context of variational inference, we will first approximate the unconstrained posterior as + +```math +q^* = \arg\min_{q \in \mathcal{Q}} \;\; \mathrm{D}(q, \pi_{b^{-1}}) . +``` + +and then transform the optimal unconstrained approximation $q^*$ to be constrained by again applying a change of variable as + +```math +q_{b}^* : \mathcal{X} \to \mathbb{R}_{>0} = q(b(z)) {\lvert \mathrm{J}_{b}(z) \rvert} . +``` + +Sampling from $q_{b}^*$ amounts to pushing each sample from $q$ into $b^{-1}$: + +```math +z \sim q_{b}^* \quad\Leftrightarrow\quad z \stackrel{\mathrm{d}}{=} b^{-1}(\eta) ; \quad \eta \sim q^* . +``` + +The idea of applying a change-of-variable to the variational approximation to match a constrained posterior was popularized by the automatic differentiation VI[^KTRGB2017]. + +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. Journal of machine learning research, 18(14), 1-45. + +Now, there are two ways how to do this in Julia. +First, let's define the constrained posterior example above using the `LogDensityProblems` interface for illustration: + +```@example constraints +using LogDensityProblems + +struct Mean + y::Vector{Float64} +end + +function LogDensityProblems.logdensity(prob::Mean, θ) + μ, σ = θ[1], θ[2] + ℓp_μ = logpdf(Normal(0, σ), μ) + ℓp_σ = logpdf(LogNormal(0, 3), σ) + ℓl_y = mapreduce(yi -> logpdf(Normal(μ, σ), yi), +, prob.y) + return ℓp_μ + ℓp_σ + ℓl_y +end + +LogDensityProblems.dimension(::Mean) = 2 + +LogDensityProblems.capabilities(::Type{Mean}) = LogDensityProblems.LogDensityOrder{0}() + +n_data = 30 +prob = Mean(randn(n_data)) +nothing +``` + +We need to find the right transformation associated with a `LogNormal` prior. 
+Most of the common bijective transformations can be found in [`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) package[^FXTYG2020]. +See the following: + +```@example constraints +using Bijectors + +b_σ = Bijectors.bijector(LogNormal(0, 1)) +``` + +and the inverse transformation can be obtained as + +```@example constraints +binv_σ = Bijectors.inverse(b_σ) +``` + +Multiple bijectors can also be stacked to form a joint bijector using `Bijectors.Stacked`. +For example: + +```@example constraints +function Bijectors.bijector(::Mean) + return Bijectors.Stacked( + Bijectors.bijector.([Normal(0, 1), LogNormal(1, 1)]), [1:1, 2:2], + ) +end + +b = Bijectors.bijector(prob) +binv = Bijectors.inverse(b) +``` + +Refer to the documentation of `Bijectors.jl` for more details. + + +## Wrap the `LogDensityProblem` + +The most general and easy way to obtain an unconstrained posterior using a `Bijector` is to wrap our original `LogDensityProblem` to form a new `LogDensityProblem`. +This approach only requires the user to implement the model-specific `Bijectors.bijector` function as above. +The rest can be done by simply copy-pasting the code below: + +```@example constraints +struct TransformedLogDensityProblem{Prob,BInv} + prob::Prob + binv::BInv +end + +function TransformedLogDensityProblem(prob) + b = Bijectors.bijector(prob) + binv = Bijectors.inverse(b) + return TransformedLogDensityProblem{typeof(prob),typeof(binv)}(prob, binv) +end + +function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) + (; prob, binv) = prob_trans + θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(binv, θ_trans) + return LogDensityProblems.logdensity(prob, θ) + logabsdetjac +end + +function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) + (; prob, binv) = prob_trans + b = Bijectors.inverse(binv) + d = LogDensityProblems.dimension(prob) + return prod(Bijectors.output_size(b, (d,))) +end + +function LogDensityProblems.capabilities( + ::Type{TransformedLogDensityProblem{Prob,BInv}} +) where {Prob,BInv} + return LogDensityProblems.capabilities(Prob) +end +nothing +``` + +Wrapping `prob` with `TransformedLogDensityProblem` yields our unconstrained posterior. + +```@example constraints +prob_trans = TransformedLogDensityProblem(prob) + +x = randn(LogDensityProblems.dimension(prob_trans)) # sample on an unconstrained support +LogDensityProblems.logdensity(prob_trans, x) +``` + +We can also wrap `prob_trans` with `LogDensityProblemsAD.ADGradient` to make it differentiable. +```@example constraints +using LogDensityProblemsAD +using ADTypes, ReverseDiff + +prob_trans_ad = LogDensityProblemsAD.ADgradient( + ADTypes.AutoReverseDiff(; compile=true), prob_trans; x = randn(2) +) +``` + +Let's now run VI to verify that it works. +Here, we will use `FisherMinBatchMatch`, which expects an unconstrained posterior. + +```@example constraints +using AdvancedVI +using LinearAlgebra + +d = LogDensityProblems.dimension(prob_trans_ad) +q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d))) + +q_opt, info, _ = AdvancedVI.optimize( + FisherMinBatchMatch(), 100, prob_trans_ad, q; show_progress=false +) +nothing +``` + +We have now obtained a variational approximation `q_opt` of the unconstrained posterior associated with `prob_trans`. +It remains to transform `q_opt` back to the constrained space we were originally interested in. +This can be done by wrapping it into a `Bijectors.TransformedDistribution`. 
+ +```@example constraints +q_opt_trans = Bijectors.TransformedDistribution(q_opt, binv) +``` + +```@example constraints +using Plots + +x = rand(q_opt_trans, 1000) + +Plots.stephist(x[2,:], normed=true, xlabel="Posterior of σ", label=nothing, xlims=(0, 2)) +Plots.vline!([1.0], label="True Value") +savefig("constrained_histogram.svg") +``` + +![](constrained_histogram.svg) + +We can see that the transformed posterior is indeed a meaningful approximation of the original posterior $\pi(\sigma \mid y_1, \ldots, y_n)$ we were interested in. + + +## Bake a Bijector into the `LogDensityProblem` + +A problem with the general approach above is that automatically differentiating through `TransformedLogDensityProblem` can be a bit inefficient (due to `Stacked`), especially with reverse-mode AD. +Therefore, another effective but less automatic approach is to bake the transformation and Jacobian adjustment into the `LogDensityProblem` itself. +Here is an example for our mean estimation model: + +```@example constraints +struct MeanTransformed{BInvS} + y::Vector{Float64} + binv_σ::BInvS +end + +function MeanTransformed(y::Vector{Float64}) + binv_σ = Bijectors.bijector(LogNormal(0, 3)) |> Bijectors.inverse + return MeanTransformed(y, binv_σ) +end + +function LogDensityProblems.logdensity(prob::MeanTransformed, θ) + (; y, binv_σ) = prob + μ = θ[1] + + # Apply bijector and compute Jacobian + σ, ℓabsdetjac_σ = with_logabsdet_jacobian(binv_σ, θ[2]) + + ℓp_μ = logpdf(Normal(0, σ), μ) + ℓp_σ = logpdf(LogNormal(0, 3), σ) + ℓl_y = mapreduce(yi -> logpdf(Normal(μ, σ), yi), +, prob.y) + return ℓp_μ + ℓp_σ + ℓl_y + ℓabsdetjac_σ +end + +LogDensityProblems.dimension(::MeanTransformed) = 2 + +LogDensityProblems.capabilities(::Type{MeanTransformed}) = LogDensityProblems.LogDensityOrder{0}() + +n_data = 30 +prob_bakedtrans = MeanTransformed(randn(n_data)) +nothing +``` + +Now, `prob_bakedtrans` can be used identically as `prob_trans` above. +For problems with larger dimensions, however, baking the bijector into the problem as above could be significantly more efficient. + +[^CGHetal2017]: Carpenter, B., Gelman, A., Hoffman, M. D., Lee, D., Goodrich, B., Betancourt, M., ... & Riddell, A. (2017). Stan: A probabilistic programming language. Journal of statistical software, 76, 1-32. +[^DLTetal2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). Tensorflow distributions. arXiv preprint arXiv:1711.10604. +[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020, February). Bijectors. jl: Flexible transformations for probability distributions. In Symposium on Advances in Approximate Bayesian Inference (pp. 1-17). PMLR. From 69ae57a389930741b0ef9d2c780e9394a165a538 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 13:38:56 -0500 Subject: [PATCH 19/26] fix missing import in docs --- docs/src/klminrepgraddescent.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/klminrepgraddescent.md b/docs/src/klminrepgraddescent.md index 64cb5f5bb..771cf99cd 100644 --- a/docs/src/klminrepgraddescent.md +++ b/docs/src/klminrepgraddescent.md @@ -63,7 +63,6 @@ where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). 
Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. - ## [Entropy Gradient Estimators](@id entropygrad) For the gradient of the entropy term, we provide three choices with varying requirements. @@ -96,6 +95,7 @@ Depending on the variational family, this might be computationally inefficient o For example, if ``q_{\lambda}`` is a Gaussian with a full-rank covariance, a back-substitution must be performed at every step, making the per-iteration complexity ``\mathcal{O}(d^3)`` and reducing numerical stability. ```@setup repgradelbo +using Distributions using FillArrays using LinearAlgebra using LogDensityProblems From af4ad180ff75d56c6bc485ee61c64c9c3e6a38d2 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 13:39:06 -0500 Subject: [PATCH 20/26] run furmatter to constrained --- docs/src/tutorials/constrained.md | 53 +++++++++++++++++++------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/docs/src/tutorials/constrained.md b/docs/src/tutorials/constrained.md index e1539fe39..0cfcde5d8 100644 --- a/docs/src/tutorials/constrained.md +++ b/docs/src/tutorials/constrained.md @@ -2,12 +2,15 @@ In this tutorial, we will demonstrate how to deal with constrained posteriors in more detail. Formally, by constrained posteriors, we mean that the target posterior has a density defined over a space that does not span the "full" Euclidean space $\mathbb{R}^d$: + ```math \pi : \mathcal{X} \to \mathbb{R}_{> 0} , ``` + where $\mathcal{X} \subset \mathbb{R}^d$ but not $\mathcal{X} = \mathbb{R}^d$. For instance, consider the basic hierarchical model for estimating the mean of the data $y_1, \ldots, y_n$: + ```math \begin{aligned} \sigma &\sim \operatorname{LogNormal}(\alpha, \beta) \\ @@ -15,7 +18,9 @@ For instance, consider the basic hierarchical model for estimating the mean of t y_i &\sim \operatorname{Normal}(\mu, \sigma) . \end{aligned} ``` -The corresponding posterior + +The corresponding posterior + ```math \pi(\mu, \sigma \mid y_1, \ldots, y_n) = @@ -23,31 +28,37 @@ The corresponding posterior \operatorname{Normal}(\mu; 0, \sigma) \prod_{i=1}^n \operatorname{Normal}(y_i; \mu, \sigma) ``` -has a density with respect to the space + +has a density with respect to the space + ```math \mathcal{X} = \mathbb{R}_{> 0} \times \mathbb{R} . ``` + There are also more complicated examples of constrained spaces. For example, a $k$-dimensional variable with a Dirichlet prior will be constrained to live on a $k$-dimensional simplex. Now, most algorithms provided by `AdvancedVI`, such as: -- `KLMinRepGradDescent` -- `KLMinRepGradProxDescent` -- `KLMinNaturalGradDescent` -- `FisherMinBatchMatch` + - `KLMinRepGradDescent` + - `KLMinRepGradProxDescent` + - `KLMinNaturalGradDescent` + - `FisherMinBatchMatch` tend to assume the target posterior is defined over the whole Euclidean space $\mathbb{R}^d$. Therefore, to apply these algorithms, we need to do something about the constraints. We will describe some recommended ways of doing this. ## Transforming the Posterior + The most widely applicable way is to transform the posterior $\pi : \mathcal{X} \to \mathbb{R}_{>0}$ to be unconstrained. That is, consider some bijective map $b : \mathcal{X} \to \mathbb{R}^{d}$ between the $\mathcal{X}$ and the associated Euclidean space $\mathbb{R}^{d}$. 
Using the inverse of the map $b^{-1}$ and its Jacobian $\mathrm{J}_{b^{-1}}$, we can apply a change of variable to the posterior and obtain its unconstrained counterpart + ```math \pi_{b^{-1}}(\eta) : \mathbb{R}^d \to \mathbb{R}_{>0} = \pi(b^{-1}(\eta)) {\lvert \mathrm{J}_{b^{-1}}(\eta) \rvert} . ``` + This idea popularized by Stan[^CGHetal2017] and Tensorflow probability[^DLTetal2017] is, in fact, how most probabilistic programming frameworks enable the use of off-the-shelf Markov chain Monte Carlo algorithms. In the context of variational inference, we will first approximate the unconstrained posterior as @@ -70,7 +81,6 @@ z \sim q_{b}^* \quad\Leftrightarrow\quad z \stackrel{\mathrm{d}}{=} b^{-1}(\eta) The idea of applying a change-of-variable to the variational approximation to match a constrained posterior was popularized by the automatic differentiation VI[^KTRGB2017]. [^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. Journal of machine learning research, 18(14), 1-45. - Now, there are two ways how to do this in Julia. First, let's define the constrained posterior example above using the `LogDensityProblems` interface for illustration: @@ -83,7 +93,7 @@ end function LogDensityProblems.logdensity(prob::Mean, θ) μ, σ = θ[1], θ[2] - ℓp_μ = logpdf(Normal(0, σ), μ) + ℓp_μ = logpdf(Normal(0, σ), μ) ℓp_σ = logpdf(LogNormal(0, 3), σ) ℓl_y = mapreduce(yi -> logpdf(Normal(μ, σ), yi), +, prob.y) return ℓp_μ + ℓp_σ + ℓl_y @@ -120,7 +130,7 @@ For example: ```@example constraints function Bijectors.bijector(::Mean) return Bijectors.Stacked( - Bijectors.bijector.([Normal(0, 1), LogNormal(1, 1)]), [1:1, 2:2], + Bijectors.bijector.([Normal(0, 1), LogNormal(1, 1)]), [1:1, 2:2] ) end @@ -130,11 +140,10 @@ binv = Bijectors.inverse(b) Refer to the documentation of `Bijectors.jl` for more details. - ## Wrap the `LogDensityProblem` The most general and easy way to obtain an unconstrained posterior using a `Bijector` is to wrap our original `LogDensityProblem` to form a new `LogDensityProblem`. -This approach only requires the user to implement the model-specific `Bijectors.bijector` function as above. +This approach only requires the user to implement the model-specific `Bijectors.bijector` function as above. The rest can be done by simply copy-pasting the code below: ```@example constraints @@ -179,13 +188,14 @@ x = randn(LogDensityProblems.dimension(prob_trans)) # sample on an unconstrained LogDensityProblems.logdensity(prob_trans, x) ``` -We can also wrap `prob_trans` with `LogDensityProblemsAD.ADGradient` to make it differentiable. +We can also wrap `prob_trans` with `LogDensityProblemsAD.ADGradient` to make it differentiable. 
+ ```@example constraints using LogDensityProblemsAD using ADTypes, ReverseDiff prob_trans_ad = LogDensityProblemsAD.ADgradient( - ADTypes.AutoReverseDiff(; compile=true), prob_trans; x = randn(2) + ADTypes.AutoReverseDiff(; compile=true), prob_trans; x=randn(2) ) ``` @@ -218,8 +228,8 @@ using Plots x = rand(q_opt_trans, 1000) -Plots.stephist(x[2,:], normed=true, xlabel="Posterior of σ", label=nothing, xlims=(0, 2)) -Plots.vline!([1.0], label="True Value") +Plots.stephist(x[2, :]; normed=true, xlabel="Posterior of σ", label=nothing, xlims=(0, 2)) +Plots.vline!([1.0]; label="True Value") savefig("constrained_histogram.svg") ``` @@ -227,7 +237,6 @@ savefig("constrained_histogram.svg") We can see that the transformed posterior is indeed a meaningful approximation of the original posterior $\pi(\sigma \mid y_1, \ldots, y_n)$ we were interested in. - ## Bake a Bijector into the `LogDensityProblem` A problem with the general approach above is that automatically differentiating through `TransformedLogDensityProblem` can be a bit inefficient (due to `Stacked`), especially with reverse-mode AD. @@ -241,18 +250,18 @@ struct MeanTransformed{BInvS} end function MeanTransformed(y::Vector{Float64}) - binv_σ = Bijectors.bijector(LogNormal(0, 3)) |> Bijectors.inverse + binv_σ = Bijectors.inverse(Bijectors.bijector(LogNormal(0, 3))) return MeanTransformed(y, binv_σ) end function LogDensityProblems.logdensity(prob::MeanTransformed, θ) (; y, binv_σ) = prob μ = θ[1] - + # Apply bijector and compute Jacobian - σ, ℓabsdetjac_σ = with_logabsdet_jacobian(binv_σ, θ[2]) + σ, ℓabsdetjac_σ = with_logabsdet_jacobian(binv_σ, θ[2]) - ℓp_μ = logpdf(Normal(0, σ), μ) + ℓp_μ = logpdf(Normal(0, σ), μ) ℓp_σ = logpdf(LogNormal(0, 3), σ) ℓl_y = mapreduce(yi -> logpdf(Normal(μ, σ), yi), +, prob.y) return ℓp_μ + ℓp_σ + ℓl_y + ℓabsdetjac_σ @@ -260,7 +269,9 @@ end LogDensityProblems.dimension(::MeanTransformed) = 2 -LogDensityProblems.capabilities(::Type{MeanTransformed}) = LogDensityProblems.LogDensityOrder{0}() +function LogDensityProblems.capabilities(::Type{MeanTransformed}) + LogDensityProblems.LogDensityOrder{0}() +end n_data = 30 prob_bakedtrans = MeanTransformed(randn(n_data)) From f9d7f0bf2de49de88ef0e0202f4abd90095f85cb Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 14:33:01 -0500 Subject: [PATCH 21/26] fix typo --- docs/src/tutorials/subsampling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/tutorials/subsampling.md b/docs/src/tutorials/subsampling.md index eab0fbf7f..6b5ef665b 100644 --- a/docs/src/tutorials/subsampling.md +++ b/docs/src/tutorials/subsampling.md @@ -148,7 +148,7 @@ nothing If we don't supply a subsampling strategy to `KLMinRepGradProxDescent`, subsampling will not be used. 
```@example subsampling -alg_full = KLMinRepGradProxDescent(ADTypesAutoReverseDiff(; compile=true)) +alg_full = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true)) nothing ``` From d6dede239e404b660366d02664e5a2164ba0c7e4 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:08:34 -0500 Subject: [PATCH 22/26] fix bins in normalizing flow tutorial --- docs/src/tutorials/flows.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/tutorials/flows.md b/docs/src/tutorials/flows.md index ae44758ce..772438a1b 100644 --- a/docs/src/tutorials/flows.md +++ b/docs/src/tutorials/flows.md @@ -169,7 +169,7 @@ histogram2d( samples[1, :], samples[2, :]; normalize=:pdf, - nbins=32, + nbins=(16, 16), xlabel="α", ylabel="β", xlims=(0, 4), @@ -261,7 +261,7 @@ histogram2d( samples[1, :], samples[2, :]; normalize=:pdf, - nbins=64, + nbins=(16, 16), xlabel="α", ylabel="β", xlims=(0, 4), From c7e8c4421fdb2652a94d1e388d700276722a0cc9 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:09:22 -0500 Subject: [PATCH 23/26] fix formatting --- docs/src/tutorials/constrained.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/tutorials/constrained.md b/docs/src/tutorials/constrained.md index 0cfcde5d8..857e6a3af 100644 --- a/docs/src/tutorials/constrained.md +++ b/docs/src/tutorials/constrained.md @@ -80,10 +80,10 @@ z \sim q_{b}^* \quad\Leftrightarrow\quad z \stackrel{\mathrm{d}}{=} b^{-1}(\eta) The idea of applying a change-of-variable to the variational approximation to match a constrained posterior was popularized by the automatic differentiation VI[^KTRGB2017]. -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. Journal of machine learning research, 18(14), 1-45. Now, there are two ways how to do this in Julia. First, let's define the constrained posterior example above using the `LogDensityProblems` interface for illustration: +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. Journal of machine learning research, 18(14), 1-45. 
```@example constraints using LogDensityProblems From 373abdd21fef6eb1d09a9158fde0bb9d79001e26 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:09:44 -0500 Subject: [PATCH 24/26] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 8b8c9f892..f85540832 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -21,7 +21,7 @@ makedocs(; "Scaling to Large Datasets" => "tutorials/subsampling.md", "Stan Models" => "tutorials/stan.md", "Normalizing Flows" => "tutorials/flows.md", - "Dealing with Constrained Posteriors" => "tutorials/constrained.md" + "Dealing with Constrained Posteriors" => "tutorials/constrained.md", ], "Algorithms" => [ "`KLMinRepGradDescent`" => "klminrepgraddescent.md", From 6a5c7ba15dbe2d45969977e26e7586cb118b5cf0 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:15:23 -0500 Subject: [PATCH 25/26] revert changes to documentation --- docs/make.jl | 1 - docs/src/klminrepgraddescent.md | 132 ++++++++++++++++++++++++------ docs/src/tutorials/basic.md | 88 +++++++------------- docs/src/tutorials/flows.md | 98 ++++++++-------------- docs/src/tutorials/subsampling.md | 69 ++++++++++------ 5 files changed, 212 insertions(+), 176 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 8b8c9f892..34c0080d3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -21,7 +21,6 @@ makedocs(; "Scaling to Large Datasets" => "tutorials/subsampling.md", "Stan Models" => "tutorials/stan.md", "Normalizing Flows" => "tutorials/flows.md", - "Dealing with Constrained Posteriors" => "tutorials/constrained.md" ], "Algorithms" => [ "`KLMinRepGradDescent`" => "klminrepgraddescent.md", diff --git a/docs/src/klminrepgraddescent.md b/docs/src/klminrepgraddescent.md index 771cf99cd..c2b3f9264 100644 --- a/docs/src/klminrepgraddescent.md +++ b/docs/src/klminrepgraddescent.md @@ -1,7 +1,9 @@ # [`KLMinRepGradDescent`](@id klminrepgraddescent) -This algorithms aim to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters. -Specifically, it uses the the *reparameterization gradient* estimator. +## Description + +This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters. +Specifically, it uses the the *reparameterization gradient estimator*. As a result, this algorithm is best applicable when the target log-density is differentiable and the sampling process of the variational family is differentiable. (See the [methodology section](@ref klminrepgraddescent_method) for more details.) This algorithm is also commonly referred to as automatic differentiation variational inference, black-box variational inference with the reparameterization gradient, and stochastic gradient variational inference. 
@@ -11,15 +13,30 @@ This algorithm is also commonly referred to as automatic differentiation variati KLMinRepGradDescent ``` +The associated objective value can be estimated through the following: + +```@docs; canonical=false +estimate_objective( + ::Random.AbstractRNG, + ::Union{<:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent}, + ::Any, + ::Any; + ::Int, + ::AbstractEntropyEstimator, +) +``` + ## [Methodology](@id klminrepgraddescent_method) -This algorithms aim to solve the problem +This algorithm aims to solve the problem ```math \mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right) ``` where $\mathcal{Q}$ is some family of distributions, often called the variational family, by running stochastic gradient descent in the (Euclidean) space of parameters. +That is, for all $$q_{\lambda} \in \mathcal{Q}$$, we assume $$q_{\lambda}$$ there is a corresponding vector of parameters $$\lambda \in \Lambda$$, where the space of parameters is Euclidean such that $$\Lambda \subset \mathbb{R}^p$$. + Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. Instead, the ELBO maximization strategy maximizes a surrogate objective, the *evidence lower bound* (ELBO; [^JGJS1999]) @@ -40,7 +57,7 @@ Algorithmically, `KLMinRepGradDescent` iterates the step where $\widehat{\nabla \mathrm{ELBO}}(q_{\lambda})$ is the reparameterization gradient estimate[^HC1983][^G1991][^R1992][^P1996] of the ELBO gradient and $$\mathrm{operator}$$ is an optional operator (*e.g.* projections, identity mapping). The reparameterization gradient, also known as the push-in gradient or the pathwise gradient, was introduced to VI in [^TL2014][^RMW2014][^KW2014]. -For the variational family $\mathcal{Q} = \{q_{\lambda} \mid \lambda \in \Lambda\}$, suppose the process of sampling from $q_{\lambda}$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that +For the variational family $$\mathcal{Q}$$, suppose the process of sampling from $$q_{\lambda} \in \mathcal{Q}$$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that ```math z \sim q_{\lambda} \qquad\Leftrightarrow\qquad @@ -63,6 +80,51 @@ where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. [^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. +## [Handling Constraints with `Bijectors`](@id bijectors) + +As mentioned in the docstring, `KLMinRepGradDescent` assumes that the variational approximation $$q_{\lambda}$$ and the target distribution $$\pi$$ have the same support for all $$\lambda \in \Lambda$$. +However, in general, it is most convenient to use variational families that have the whole Euclidean space $$\mathbb{R}^d$$ as their support. +This is the case for the [location-scale distributions](@ref locscale) provided by `AdvancedVI`. 
+For target distributions which the support is not the full $$\mathbb{R}^d$$, we can apply some transformation $$b$$ to $$q_{\lambda}$$ to match its support such that + +```math +z \sim q_{b,\lambda} \qquad\Leftrightarrow\qquad +z \stackrel{d}{=} b^{-1}\left(\eta\right);\quad \eta \sim q_{\lambda}, +``` + +where $$b$$ is often called a *bijector*, since it is often chosen among bijective transformations. +This idea is known as automatic differentiation VI[^KTRGB2017] and has subsequently been improved by Tensorflow Probability[^DLTBV2017]. +In Julia, [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl)[^FXTYG2020] provides a comprehensive collection of bijections. + +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. +[^DLTBV2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). Tensorflow distributions. arXiv. +[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020,. Bijectors. jl: Flexible transformations for probability distributions. In *Symposium on Advances in Approximate Bayesian Inference*. + One caveat of ADVI is that, after applying the bijection, a Jacobian adjustment needs to be applied. + That is, the objective is now +```math +\mathrm{ADVI}\left(\lambda\right) +\triangleq +\mathbb{E}_{\eta \sim q_{\lambda}}\left[ + \log \pi\left( b^{-1}\left( \eta \right) \right) + + \log \lvert J_{b^{-1}}\left(\eta\right) \rvert +\right] ++ \mathbb{H}\left(q_{\lambda}\right) +``` + +This is automatically handled by `AdvancedVI` through `TransformedDistribution` provided by `Bijectors.jl`. +See the following example: + +```julia +using Bijectors +q = MeanFieldGaussian(μ, L) +b = Bijectors.bijector(dist) +binv = inverse(b) +q_transformed = Bijectors.TransformedDistribution(q, binv) +``` + +By passing `q_transformed` to `optimize`, the Jacobian adjustment for the bijector `b` is automatically applied. +(See the [Basic Example](@ref basic) for a fully working example.) + ## [Entropy Gradient Estimators](@id entropygrad) For the gradient of the entropy term, we provide three choices with varying requirements. @@ -95,7 +157,7 @@ Depending on the variational family, this might be computationally inefficient o For example, if ``q_{\lambda}`` is a Gaussian with a full-rank covariance, a back-substitution must be performed at every step, making the per-iteration complexity ``\mathcal{O}(d^3)`` and reducing numerical stability. 
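To make the cost argument concrete, here is a rough standalone sketch (not part of the docs build) of what evaluating `logpdf` for a full-rank Gaussian parameterized by a lower-triangular scale `L` involves; the triangular solve below is what the STL estimator has to differentiate through at every step:

```julia
using LinearAlgebra

d = 5
μ, L = zeros(d), LowerTriangular(Matrix(1.0I, d, d))
z = randn(d)

# logpdf of N(μ, L*L') at z requires the back-substitution L \ (z - μ).
u = L \ (z - μ)
logp = -0.5 * (d * log(2π) + 2 * logdet(L) + dot(u, u))
```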
```@setup repgradelbo -using Distributions +using Bijectors using FillArrays using LinearAlgebra using LogDensityProblems @@ -106,38 +168,51 @@ using Optimisers using ADTypes, ForwardDiff, ReverseDiff using AdvancedVI -struct Dist{D} - dist::D +struct NormalLogNormal{MX,SX,MY,SY} + μ_x::MX + σ_x::SX + μ_y::MY + Σ_y::SY end -function LogDensityProblems.logdensity(model::Dist, θ) - return logpdf(model.dist, θ) +function LogDensityProblems.logdensity(model::NormalLogNormal, θ) + (; μ_x, σ_x, μ_y, Σ_y) = model + logpdf(LogNormal(μ_x, σ_x), θ[1]) + logpdf(MvNormal(μ_y, Σ_y), θ[2:end]) end -function LogDensityProblems.logdensity_and_gradient(model::Dist, θ) +function LogDensityProblems.logdensity_and_gradient(model::NormalLogNormal, θ) return ( LogDensityProblems.logdensity(model, θ), ForwardDiff.gradient(Base.Fix1(LogDensityProblems.logdensity, model), θ), ) end -function LogDensityProblems.dimension(model::Dist) - return length(model.dist) +function LogDensityProblems.dimension(model::NormalLogNormal) + length(model.μ_y) + 1 end -function LogDensityProblems.capabilities(::Type{<:Dist}) +function LogDensityProblems.capabilities(::Type{<:NormalLogNormal}) LogDensityProblems.LogDensityOrder{1}() end n_dims = 10 -μ = Fill(2.0, n_dims) -σ = Fill(1.0, n_dims) -model = Dist(MvNormal(μ, Diagonal(σ.^2))); +μ_x = 2.0 +σ_x = 0.3 +μ_y = Fill(2.0, n_dims) +σ_y = Fill(1.0, n_dims) +model = NormalLogNormal(μ_x, σ_x, μ_y, Diagonal(σ_y.^2)); d = LogDensityProblems.dimension(model); -μ0 = zeros(d); -L0 = Diagonal(ones(d)); -q0 = AdvancedVI.MeanFieldGaussian(μ0, L0) +μ = zeros(d); +L = Diagonal(ones(d)); +q0 = AdvancedVI.MeanFieldGaussian(μ, L) + +function Bijectors.bijector(model::NormalLogNormal) + (; μ_x, σ_x, μ_y, Σ_y) = model + Bijectors.Stacked( + Bijectors.bijector.([LogNormal(μ_x, σ_x), MvNormal(μ_y, Σ_y)]), + [1:1, 2:1+length(μ_y)]) +end ``` In this example, the true posterior is contained within the variational family. @@ -148,6 +223,10 @@ Recall that the original ADVI objective with a closed-form entropy (CFE) is give ```@example repgradelbo n_montecarlo = 16; +b = Bijectors.bijector(model); +binv = inverse(b) + +q0_trans = Bijectors.TransformedDistribution(q0, binv) cfe = KLMinRepGradDescent( AutoReverseDiff(); @@ -171,11 +250,12 @@ nothing ``` ```@setup repgradelbo -max_iter = 10^3 +max_iter = 3*10^3 function callback(; params, restructure, kwargs...) 
- q = restructure(params) - dist2 = sum(abs2, q.location - μ) + sum(abs2, diag(q.scale) - σ) + q = restructure(params).dist + dist2 = sum(abs2, q.location - vcat([μ_x], μ_y)) + + sum(abs2, diag(q.scale) - vcat(σ_x, σ_y)) (dist = sqrt(dist2),) end @@ -183,7 +263,7 @@ _, info_cfe, _ = AdvancedVI.optimize( cfe, max_iter, model, - q0; + q0_trans; show_progress = false, callback = callback, ); @@ -192,7 +272,7 @@ _, info_stl, _ = AdvancedVI.optimize( stl, max_iter, model, - q0; + q0_trans; show_progress = false, callback = callback, ); @@ -201,7 +281,7 @@ _, info_stl, _ = AdvancedVI.optimize( stl, max_iter, model, - q0; + q0_trans; show_progress = false, callback = callback, ); @@ -284,7 +364,7 @@ _, info_qmc, _ = AdvancedVI.optimize( KLMinRepGradDescent(AutoReverseDiff(); n_samples=n_montecarlo, optimizer=Adam(1e-2), operator=ClipScale()), max_iter, model, - q0; + q0_trans; show_progress = false, callback = callback, ); diff --git a/docs/src/tutorials/basic.md b/docs/src/tutorials/basic.md index cf2d44a3a..8998b0789 100644 --- a/docs/src/tutorials/basic.md +++ b/docs/src/tutorials/basic.md @@ -50,13 +50,14 @@ end nothing ``` -Notice that the support of `σ` is constrained to be positive. -Since most VI algorithms assume an unconstrained Euclidean support, we will apply a change-of-variable by a bijective map (a *bijector*) to the posterior to make it unconstrained. -For this, we will use the [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) package. +Since the support of `σ` is constrained to be positive and most VI algorithms assume an unconstrained Euclidean support, we need to use a *bijector* to transform `θ`. +We will use [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) for this purpose. +This corresponds to the automatic differentiation variational inference (ADVI) formulation[^KTRGB2017]. In our case, we need a bijector that applies an identity map for the first `size(X,2)` coordinates, and map the last coordinate to the support of `LogNormal(0, 3)`. -This can be constructed as follows: +This can be done as follows: +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of machine learning research*. ```@example basic using Bijectors: Bijectors @@ -70,37 +71,7 @@ end nothing ``` -Next, we will wrap our original `LogDensityProblem` into a new `LogDensityProblem` that applies the transformation and the corresponding Jacobian adjustment. - -```@example basic -struct TransformedLogDensityProblem{Prob,Trans} - prob::Prob - transform::Trans -end - -function TransformedLogDensityProblem(prob, transform) - return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) -end - -function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) - (; prob, transform) = prob_trans - θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) - return LogDensityProblems.logdensity(prob, θ) + logabsdetjac -end - -function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) - return LogDensityProblems.dimension(prob_trans.prob) -end - -function LogDensityProblems.capabilities( - ::Type{TransformedLogDensityProblem{Prob,Trans}} -) where {Prob,Trans} - return LogDensityProblems.capabilities(Prob) -end -nothing -``` - -For more details on the usage of bijectors, please refer to the documentation of [`Bijectors`](https://github.com/TuringLang/Bijectors.jl). 
+For more details, please refer to the documentation of [`Bijectors`](https://github.com/TuringLang/Bijectors.jl).
 
 For the dataset, we will use the popular [sonar classification dataset](https://archive.ics.uci.edu/dataset/151/connectionist+bench+sonar+mines+vs+rocks) from the UCI repository.
 This can be automatically downloaded using [`OpenML`](https://github.com/JuliaAI/OpenML.jl).
@@ -128,10 +99,7 @@ nothing
 The model can now be instantiated as follows:
 
 ```@example basic
-prob = LogReg(X, y);
-b = Bijectors.bijector(prob)
-binv = Bijectors.inverse(b)
-prob_trans = TransformedLogDensityProblem(prob, binv)
+model = LogReg(X, y)
 nothing
 ```
 
@@ -166,7 +134,7 @@ For this example, we will use `LogDensityProblemsAD` to equip our problem with a
 using DifferentiationInterface: DifferentiationInterface
 using LogDensityProblemsAD: LogDensityProblemsAD
 
-prob_trans_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoForwardDiff(), prob_trans)
+model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model)
 nothing
 ```
 
@@ -175,20 +143,29 @@ For the variational family, we will consider a `FullRankGaussian` approximation:
 ```@example basic
 using LinearAlgebra
 
-d = LogDensityProblems.dimension(prob_trans)
-q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d)))
+d = LogDensityProblems.dimension(model_ad)
+q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d)))
 nothing
 ```
 
-`KLMinRepGradDescent` requires the variational approximation and the target log-density to have unconstrained support.
-Since we applied a bijector to the posterior to make it unconstrained, and the support of a `FullRankGaussian` is already the full $\mathbb{R}^d$, we are ready to go.
+Now, `KLMinRepGradDescent` requires the variational approximation and the target log-density to have the same support.
+Since `σ` follows a log-normal prior, its support is restricted to the positive half-line ``\mathbb{R}_+``.
+Thus, we will use [Bijectors](https://github.com/TuringLang/Bijectors.jl) to match the support of our target posterior and the variational approximation.
+The corresponding bijector can be applied to `q` as follows:
+
+```@example basic
+b = Bijectors.bijector(model)
+binv = Bijectors.inverse(b)
+q_transformed = Bijectors.TransformedDistribution(q, binv)
+nothing
+```
 
 We can now run VI:
 
 ```@example basic
 max_iter = 10^4
 q_out, info, _ = AdvancedVI.optimize(
-    alg, max_iter, prob_trans_ad, q; show_progress=false
+    alg, max_iter, model_ad, q_transformed; show_progress=false
 )
 nothing
 ```
@@ -213,14 +190,6 @@ nothing
 
 ![](basic_example_elbo.svg)
 
-Recall that we applied a change-of-variable to the posterior to make it unconstrained.
-This, however, is not the original constrained posterior that we wanted to approximate.
-Therefore, we finally need to apply a change-of-variable to `q_out` to make it approximate our original problem:
-
-```julia
-q_trans = Bijectors.TransformedDistribution(q_out, binv)
-```
-
 ## Custom Callback
 
 The ELBO estimates above, however, use only a handful of Monte Carlo samples.
@@ -241,7 +210,7 @@ using StatsFuns: StatsFuns
 Approximate the posterior predictive probability for a logistic link function using Mackay's approximation (Bishop p. 220).
 """
 function logistic_prediction(X, μ_β, Σ_β)
-    xtΣx = sum((prob.X*Σ_β) .* prob.X; dims=2)[:, 1]
+    xtΣx = sum((model.X*Σ_β) .* model.X; dims=2)[:, 1]
     κ = @.
1/sqrt(1 + π/8*xtΣx) return StatsFuns.logistic.(κ .* X*μ_β) end @@ -252,19 +221,18 @@ function callback(; iteration, averaged_params, restructure, kwargs...) # Use the averaged parameters (the eventual output of the algorithm) q_avg = restructure(averaged_params) - q_avg_trans = Bijectors.TransformedDistribution(q_avg, binv) # Compute predictions - μ_β = mean(q_avg_trans.dist)[1:(end - 1)] # posterior mean of β - Σ_β = cov(q_avg_trans.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β + μ_β = mean(q_avg.dist)[1:(end - 1)] # posterior mean of β + Σ_β = cov(q_avg.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β y_pred = logistic_prediction(X, μ_β, Σ_β) .> 0.5 # Prediction accuracy - acc = mean(y_pred .== prob.y) + acc = mean(y_pred .== model.y) # Higher fidelity estimate of the ELBO on the averaged parameters n_samples = 256 - elbo_callback = -estimate_objective(alg, q_avg, prob_trans; n_samples) + elbo_callback = -estimate_objective(alg, q_avg, model; n_samples) (elbo_callback=elbo_callback, accuracy=acc) else @@ -282,7 +250,7 @@ The `callback` can be supplied to `optimize`: ```@example basic max_iter = 10^4 q_out, info, _ = AdvancedVI.optimize( - alg, max_iter, prob_trans_ad, q; show_progress=false, callback=callback + alg, max_iter, model_ad, q_transformed; show_progress=false, callback=callback ) nothing ``` diff --git a/docs/src/tutorials/flows.md b/docs/src/tutorials/flows.md index 772438a1b..6334c946c 100644 --- a/docs/src/tutorials/flows.md +++ b/docs/src/tutorials/flows.md @@ -22,6 +22,7 @@ Multiplicative degeneracy is not entirely made up and do come up in some models For example, in the 3-parameter (3-PL) item-response theory model and the N-mixture model used for estimating animal population. ```@example flow +using Bijectors: Bijectors using Distributions using LogDensityProblems: LogDensityProblems @@ -55,7 +56,7 @@ Degenerate posteriors often indicate that there is not enough data to pin-point Therefore, for the purpose of illustration, we will use a single data point: ```@example flow -prob = MultDegen([3.0]) +model = MultDegen([3.0]) nothing ``` @@ -67,7 +68,7 @@ using Plots contour( range(0, 4; length=64), range(-3, 25; length=64), - (x, y) -> LogDensityProblems.logdensity(prob, [x, y]); + (x, y) -> LogDensityProblems.logdensity(model, [x, y]); xlabel="α", ylabel="β", clims=(-8, Inf), @@ -85,6 +86,19 @@ This sort of nonlinear correlation structure is difficult to model using only lo ## Gaussian Variational Family As usual, let's try to fit a multivariate Gaussian to this posterior. + +```@example flow +using ADTypes: ADTypes +using ReverseDiff: ReverseDiff +using DifferentiationInterface: DifferentiationInterface +using LogDensityProblemsAD: LogDensityProblemsAD + +model_ad = LogDensityProblemsAD.ADgradient( + ADTypes.AutoReverseDiff(; compile=true), model; x=[1.0, 1.0] +) +nothing +``` + Since $\alpha$ is constrained to the positive real half-space, we have to employ bijectors. 
 For this, we use [Bijectors](https://github.com/TuringLang/Bijectors.jl):
@@ -99,77 +113,33 @@ end
 nothing
 ```
 
-to transform the posterior to be unconstrained:
-
-```@example flow
-struct TransformedLogDensityProblem{Prob,Trans}
-    prob::Prob
-    transform::Trans
-end
-
-function TransformedLogDensityProblem(prob, transform)
-    return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform)
-end
-
-function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans)
-    (; prob, transform) = prob_trans
-    θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans)
-    return LogDensityProblems.logdensity(prob, θ) + logabsdetjac
-end
-
-function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem)
-    return LogDensityProblems.dimension(prob_trans.prob)
-end
-
-function LogDensityProblems.capabilities(
-    ::Type{TransformedLogDensityProblem{Prob,Trans}}
-) where {Prob,Trans}
-    return LogDensityProblems.capabilities(Prob)
-end
-nothing
-```
-
-Let's instantiate the model:
-
-```@example flow
-using ADTypes: ADTypes
-using ReverseDiff: ReverseDiff
-using DifferentiationInterface: DifferentiationInterface
-using LogDensityProblemsAD: LogDensityProblemsAD
-
-binv = Bijectors.inverse(Bijectors.bijector(prob))
-prob_trans = TransformedLogDensityProblem(prob, binv)
-prob_trans_ad = LogDensityProblemsAD.ADgradient(
-    ADTypes.AutoForwardDiff(), prob_trans; x=[1.0, 1.0]
-)
-nothing
-```
-
 For the algorithm, we will use the `KLMinRepGradProxDescent` objective.
 
 ```@example flow
 using AdvancedVI
 using LinearAlgebra
 
-d = LogDensityProblems.dimension(prob_trans_ad)
+d = LogDensityProblems.dimension(model_ad)
 q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(I, d, d)))
 
+binv = Bijectors.inverse(Bijectors.bijector(model))
+q_trans = Bijectors.TransformedDistribution(q, binv)
+
 max_iter = 3*10^3
 alg = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true))
-q_out, info, _ = AdvancedVI.optimize(alg, max_iter, prob_trans_ad, q; show_progress=false)
-q_out_trans = Bijectors.TransformedDistribution(q_out, binv)
+q_out, info, _ = AdvancedVI.optimize(alg, max_iter, model_ad, q_trans; show_progress=false)
 nothing
 ```
 
 The resulting variational posterior can be visualized as follows:
 
 ```@example flow
-samples = rand(q_out_trans, 10000)
+samples = rand(q_out, 10000)
 histogram2d(
     samples[1, :],
     samples[2, :];
     normalize=:pdf,
-    nbins=(16, 16),
+    nbins=32,
     xlabel="α",
     ylabel="β",
     xlims=(0, 4),
@@ -207,7 +177,16 @@ q_flow = realnvp(MvNormal(zeros(d), I), hidden_dims, n_layers; paramtype=Float64
 nothing
 ```
 
-For the variational inference algorithm, we will similarly minimize the KL divergence with stochastic gradient descent as originally proposed by Rezende and Mohamed[^RM2015].
+Recall that our posterior is constrained.
+In most cases, flows assume an unconstrained support.
+Therefore, just as with the Gaussian variational family, we can incorporate `Bijectors` to match the supports:
+
+```@example flow
+q_flow_trans = Bijectors.TransformedDistribution(q_flow, binv)
+nothing
+```
+
+For the variational inference algorithm, we will similarly minimize the KL divergence with stochastic gradient descent as originally proposed by Rezende and Mohamed[^RM2015].
 For this, however, we need to be mindful of the requirements of the variational algorithm.
 The default `entropy` gradient estimator of `KLMinRepGradDescent` is `ClosedFormEntropy()`, which assumes that the entropy of the variational family `entropy(q)` is available.
 For flows, the entropy is (usually) not available.
 Instead, we can use any gradient estimator that relies only on the log-density of the variational family `logpdf(q)`, namely `StickingTheLandingEntropy()` or `MonteCarloEntropy()`.
@@ -220,12 +199,9 @@ Furthermore, Agrawal *et al.*[^AD2025] claim that using a larger number of Monte
 [^ASD2020]: Agrawal, A., Sheldon, D. R., & Domke, J. (2020). Advances in black-box VI: Normalizing flows, importance weighting, and optimization. In *Advances in Neural Information Processing Systems*, 33, 17358-17369.
 [^AD2025]: Agrawal, A., & Domke, J. (2024). Disentangling impact of capacity, objective, batchsize, estimators, and step-size on flow VI. In *Proceedings of the International Conference on Artificial Intelligence and Statistics*.
 ```@example flow
-using Optimisers: Optimisers
-
 alg_flow = KLMinRepGradDescent(
     ADTypes.AutoReverseDiff(; compile=true);
     n_samples=8,
-    optimizer=Optimisers.Adam(1e-2),
     operator=IdentityOperator(),
     entropy=StickingTheLandingEntropy(),
 )
@@ -235,11 +211,9 @@ nothing
 Without further ado, let's now run VI:
 
 ```@example flow
-max_iter = 300
 q_flow_out, info_flow, _ = AdvancedVI.optimize(
-    alg_flow, max_iter, prob_trans_ad, q_flow; show_progress=false
+    alg_flow, max_iter, model_ad, q_flow_trans; show_progress=false
 )
-q_flow_out_trans = Bijectors.TransformedDistribution(q_flow_out, binv)
 nothing
 ```
 
@@ -256,12 +230,12 @@ nothing
 Finally, let's visualize the variational posterior:
 
 ```@example flow
-samples = rand(q_flow_out_trans, 10000)
+samples = rand(q_flow_out, 10000)
 histogram2d(
     samples[1, :],
     samples[2, :];
     normalize=:pdf,
-    nbins=(16, 16),
+    nbins=64,
     xlabel="α",
     ylabel="β",
     xlims=(0, 4),
diff --git a/docs/src/tutorials/subsampling.md b/docs/src/tutorials/subsampling.md
index 6b5ef665b..1816efb44 100644
--- a/docs/src/tutorials/subsampling.md
+++ b/docs/src/tutorials/subsampling.md
@@ -10,7 +10,7 @@ In this tutorial, we will see how to perform subsampling with `KLMinRepGradProxD
 [^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45.
 
 ## Setting Up Subsampling
-We will consider a typical hierarchical logistic regression example.
+We will consider the same hierarchical logistic regression example used in the [Basic Example](@ref basic).
 
 ```@example subsampling
 using LogDensityProblems: LogDensityProblems
@@ -26,11 +26,10 @@ end
 function LogDensityProblems.logdensity(model::LogReg, θ)
     (; X, y, n_data) = model
     n, d = size(X)
-    β, logσ = θ[1:size(X, 2)], θ[end]
-    σ = exp(logσ)
+    β, σ = θ[1:size(X, 2)], θ[end]
 
     logprior_β = logpdf(MvNormal(Zeros(d), σ), β)
-    logprior_σ = logpdf(Normal(0, 3), σ)
+    logprior_σ = logpdf(LogNormal(0, 3), σ)
 
     logit = X*β
     loglike_y = mapreduce((li, yi) -> logpdf(BernoulliLogit(li), yi), +, logit, y)
@@ -38,7 +37,7 @@ function LogDensityProblems.logdensity(model::LogReg, θ)
 end
 
 function LogDensityProblems.dimension(model::LogReg)
-    return size(prob.X, 2) + 1
+    return size(model.X, 2) + 1
 end
 
 function LogDensityProblems.capabilities(::Type{<:LogReg})
@@ -51,6 +50,21 @@ Notice that, to use subsampling, we need be able to rescale the likelihood stren
 That is, for the gradient of the log-density with a batch of data points of size `n` to be an unbiased estimate of the gradient using the full dataset of size `n_data`, we need to scale the likelihood by `n_data/n`.
 This part is critical to ensure that the algorithm correctly approximates the posterior with the full dataset.
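+Before moving on, here is a small self-contained sketch (toy numbers only, not tied to the model above) of why `n_data/n` is the right rescaling: averaged over one epoch of disjoint batches, the rescaled minibatch log-likelihood recovers the full-data log-likelihood, so the resulting gradient estimate is unbiased.
+
+```julia
+# Toy check of the n_data/n rescaling (hypothetical values, for illustration only).
+using Statistics: mean
+
+n_data, n = 6, 2
+loglike = randn(n_data)                  # per-datum log-likelihood terms
+batches = [1:2, 3:4, 5:6]                # one epoch of disjoint batches of size n
+scaled = [n_data/n * sum(loglike[b]) for b in batches]
+mean(scaled) ≈ sum(loglike)              # true: unbiased over the epoch
+```
+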
+As usual, we will set up a bijector:
+
+```@example subsampling
+using Bijectors: Bijectors
+
+function Bijectors.bijector(model::LogReg)
+    d = size(model.X, 2)
+    return Bijectors.Stacked(
+        Bijectors.bijector.([MvNormal(Zeros(d), 1.0), LogNormal(0, 3)]),
+        [1:d, (d + 1):(d + 1)],
+    )
+end
+nothing
+```
+
 For the dataset, we will use one that is larger than that used in the [Basic Example](@ref basic).
 This is to properly assess the advantage of subsampling.
 In particular, we will utilize the "Phishing" dataset[^Tan2018], which consists of 10000 data points, each with 48 features.
@@ -86,28 +100,26 @@ Let's now instantiate the model and set up automatic differentiation using [`LogD
 using ADTypes, ReverseDiff
 using LogDensityProblemsAD
 
-prob = LogReg(X, y, size(X, 1))
-prob_ad = LogDensityProblemsAD.ADgradient(
-    ADTypes.AutoReverseDiff(), prob; x=randn(LogDensityProblems.dimension(prob))
-)
+model = LogReg(X, y, size(X, 1))
+model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model)
 nothing
 ```
 
 To enable subsampling, `LogReg` has to implement the method `AdvancedVI.subsample`.
 For our model, this is fairly simple: We only need to select the rows of `X` and the elements of `y` corresponding to the batch of data points.
-As subtle point here is that we wrapped `prob` with `LogDensityProblemsAD.ADgradient` into `prob_ad`.
-Therefore, `AdvancedVI` sees `prob_ad` and not `prob`.
-This means we have to specialize `AdvancedVI.subsample` to `typeof(prob_ad)` and not `LogReg`.
+A subtle point here is that we wrapped `model` with `LogDensityProblemsAD.ADgradient` into `model_ad`.
+Therefore, `AdvancedVI` sees `model_ad` and not `model`.
+This means we have to specialize `AdvancedVI.subsample` to `typeof(model_ad)` and not `LogReg`.
 
 ```@example subsampling
 using Accessors
 using AdvancedVI
 
-function AdvancedVI.subsample(prob::typeof(prob_ad), idx)
-    (; X, y, n_data) = parent(prob_ad)
-    prob′ = @set prob.ℓ.X = X[idx, :]
-    prob′′ = @set prob′.ℓ.y = y[idx]
-    return prob′′
+function AdvancedVI.subsample(model::typeof(model_ad), idx)
+    (; X, y, n_data) = parent(model)
+    model′ = @set model.ℓ.X = X[idx, :]
+    model′′ = @set model′.ℓ.y = y[idx]
+    return model′′
 end
 nothing
 ```
@@ -126,7 +138,7 @@ Here, we will use `ReshufflingBatchSubsampling`, which implements random reshuff
 We will use a batch size of 32, which results in `313 = length(subsampling) = ceil(Int, size(X,1)/32)` steps per epoch.
 
 ```@example subsampling
-dataset = 1:size(prob.X, 1)
+dataset = 1:size(model.X, 1)
 batchsize = 32
 subsampling = ReshufflingBatchSubsampling(dataset, batchsize)
 alg_sub = KLMinRepGradProxDescent(ADTypes.AutoReverseDiff(; compile=true); subsampling)
@@ -157,8 +169,11 @@ The variational family will be set up as follows:
 ```@example subsampling
 using LinearAlgebra
 
-d = LogDensityProblems.dimension(prob_ad)
-q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d)))
+d = LogDensityProblems.dimension(model_ad)
+q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d)))
+b = Bijectors.bijector(model)
+binv = Bijectors.inverse(b)
+q_transformed = Bijectors.TransformedDistribution(q, binv)
 nothing
 ```
 
@@ -177,7 +192,7 @@ time_begin = nothing
 Approximate the posterior predictive probability for a logistic link function using Mackay's approximation (Bishop p. 220).
 """
 function logistic_prediction(X, μ_β, Σ_β)
-    xtΣx = sum((prob.X*Σ_β) .* prob.X; dims=2)[:, 1]
+    xtΣx = sum((model.X*Σ_β) .* model.X; dims=2)[:, 1]
     κ = @.
1/sqrt(1 + π/8*xtΣx) return StatsFuns.logistic.(κ .* X*μ_β) end @@ -189,16 +204,16 @@ function callback(; iteration, averaged_params, restructure, kwargs...) q_avg = restructure(averaged_params) # Compute predictions using - μ_β = mean(q_avg)[1:(end - 1)] # posterior mean of β - Σ_β = cov(q_avg)[1:(end - 1), end - 1] # marginal posterior covariance of β + μ_β = mean(q_avg.dist)[1:(end - 1)] # posterior mean of β + Σ_β = cov(q_avg.dist)[1:(end - 1), end - 1] # marginal posterior covariance of β y_pred = logistic_prediction(X, μ_β, Σ_β) .> 0.5 # Prediction accuracy - acc = mean(y_pred .== prob.y) + acc = mean(y_pred .== model.y) # Higher fidelity estimate of the ELBO on the averaged parameters n_samples = 256 - elbo_callback = -estimate_objective(alg_full, q_avg, prob; n_samples) + elbo_callback = -estimate_objective(alg_full, q_avg, model; n_samples) (elbo_callback=elbo_callback, accuracy=acc, time_elapsed=time() - time_begin) else @@ -208,12 +223,12 @@ end time_begin = time() _, info_full, _ = AdvancedVI.optimize( - alg_full, max_iter, prob_ad, q; show_progress=true, callback + alg_full, max_iter, model_ad, q_transformed; show_progress=false, callback ); time_begin = time() _, info_sub, _ = AdvancedVI.optimize( - alg_sub, max_iter, prob_ad, q; show_progress=true, callback + alg_sub, max_iter, model_ad, q_transformed; show_progress=false, callback ); nothing ``` From 8c657e0ff8409217de911500e64d257edecf38b1 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sat, 22 Nov 2025 15:27:16 -0500 Subject: [PATCH 26/26] revert changes to the README --- README.md | 68 ++++++++++++------------------------------------------- 1 file changed, 14 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 4e54b6e06..4732d5c6e 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,13 @@ [![Tests](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Tests.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Tests.yml/badge.svg?branch=main) [![Coverage](https://codecov.io/gh/TuringLang/AdvancedVI.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/TuringLang/AdvancedVI.jl) -| AD Backend | Integration Status | -|:---------------------------------------------------------- |:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) | [![ForwardDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml?query=branch%3Amain) | -| [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl) | [![ReverseDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml?query=branch%3Amain) | -| [Zygote](https://github.com/FluxML/Zygote.jl) | [![Zygote](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml?query=branch%3Amain) | -| [Mooncake](https://github.com/chalk-lab/Mooncake.jl) | [![Mooncake](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml?query=branch%3Amain) | 
-| [Enzyme](https://github.com/EnzymeAD/Enzyme.jl) | [![Enzyme](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml?query=branch%3Amain) | +| AD Backend | Integration Status | +| ------------- | ------------- | +| [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) | [![ForwardDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ForwardDiff.yml?query=branch%3Amain) | +| [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl) | [![ReverseDiff](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/ReverseDiff.yml?query=branch%3Amain) | +| [Zygote](https://github.com/FluxML/Zygote.jl) | [![Zygote](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Zygote.yml?query=branch%3Amain) | +| [Mooncake](https://github.com/chalk-lab/Mooncake.jl) | [![Mooncake](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Mooncake.yml?query=branch%3Amain) | +| [Enzyme](https://github.com/EnzymeAD/Enzyme.jl) | [![Enzyme](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml/badge.svg?branch=main)](https://github.com/TuringLang/AdvancedVI.jl/actions/workflows/Enzyme.yml?query=branch%3Amain) | # AdvancedVI.jl @@ -69,7 +69,7 @@ end; Since the support of `σ` is constrained to be positive and most VI algorithms assume an unconstrained Euclidean support, we need to use a *bijector* to transform `θ`. We will use [`Bijectors`](https://github.com/TuringLang/Bijectors.jl) for this purpose. -The bijector corresponding to the joint support of our model can be constructed as follows: +This corresponds to the automatic differentiation variational inference (ADVI) formulation[^KTRGB2017]. ```julia using Bijectors: Bijectors @@ -85,36 +85,6 @@ end; A simpler approach would be to use [`Turing`](https://github.com/TuringLang/Turing.jl), where a `Turing.Model` can be automatically be converted into a `LogDensityProblem` and a corresponding `bijector` is automatically generated. -Since most VI algorithms assume that the posterior is unconstrained, we will apply a change-of-variable to our model to make it unconstrained. -This amounts to wrapping it into a `LogDensityProblem` that applies the transformation and the corresponding Jacobian adjustment. 
- -```julia -struct TransformedLogDensityProblem{Prob,Trans} - prob::Prob - transform::Trans -end - -function TransformedLogDensityProblem(prob, transform) - return TransformedLogDensityProblem{typeof(prob),typeof(transform)}(prob, transform) -end - -function LogDensityProblems.logdensity(prob_trans::TransformedLogDensityProblem, θ_trans) - (; prob, transform) = prob_trans - θ, logabsdetjac = Bijectors.with_logabsdet_jacobian(transform, θ_trans) - return LogDensityProblems.logdensity(prob, θ) + logabsdetjac -end - -function LogDensityProblems.dimension(prob_trans::TransformedLogDensityProblem) - return LogDensityProblems.dimension(prob_trans.prob) -end - -function LogDensityProblems.capabilities( - ::Type{TransformedLogDensityProblem{Prob,Trans}} -) where {Prob,Trans} - return LogDensityProblems.capabilities(Prob) -end; -``` - For the dataset, we will use the popular [sonar classification dataset](https://archive.ics.uci.edu/dataset/151/connectionist+bench+sonar+mines+vs+rocks) from the UCI repository. This can be automatically downloaded using [`OpenML`](https://github.com/JuliaAI/OpenML.jl). The sonar dataset corresponds to the dataset id 40. @@ -139,10 +109,7 @@ X = hcat(X, ones(size(X, 1))); The model can now be instantiated as follows: ```julia -prob = LogReg(X, y); -b = Bijectors.bijector(prob) -binv = Bijectors.inverse(b) -prob_trans = TransformedLogDensityProblem(prob, binv) +model = LogReg(X, y); ``` For the VI algorithm, we will use `KLMinRepGradDescent`: @@ -169,7 +136,7 @@ For this, it is straightforward to use `LogDensityProblemsAD`: using DifferentiationInterface: DifferentiationInterface using LogDensityProblemsAD: LogDensityProblemsAD -prob_trans_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), prob_trans); +model_ad = LogDensityProblemsAD.ADgradient(ADTypes.AutoReverseDiff(), model); ``` For the variational family, we will consider a `FullRankGaussian` approximation: @@ -177,8 +144,8 @@ For the variational family, we will consider a `FullRankGaussian` approximation: ```julia using LinearAlgebra -d = LogDensityProblems.dimension(prob_trans_ad) -q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.6*I, d, d))) +d = LogDensityProblems.dimension(model_ad) +q = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(0.37*I, d, d))) q = MeanFieldGaussian(zeros(d), Diagonal(ones(d))); ``` @@ -194,15 +161,7 @@ We can now run VI: ```julia max_iter = 10^3 -q_opt, info, _ = AdvancedVI.optimize(alg, max_iter, prob_trans_ad, q); -``` - -Recall that we applied a change-of-variable to the posterior to make it unconstrained. -This, however, is not the original constrained posterior that we wanted to approximate. -Therefore, we finally need to apply a change-of-variable to `q_opt` to make it approximate our original problem. - -```julia -q_trans = Bijectors.TransformedDistribution(q_opt, binv) +q, info, _ = AdvancedVI.optimize(alg, max_iter, model_ad, q_transformed;); ``` For more examples and details, please refer to the documentation. @@ -210,3 +169,4 @@ For more examples and details, please refer to the documentation. [^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014, June). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. PMLR. [^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014, June). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. PMLR. [^KW2014]: Kingma, D. P., & Welling, M. (2014). 
Auto-encoding variational bayes. In *International Conference on Learning Representations*. +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of machine learning research*.