From e332d8c87348b7d39529508edc96b7fbda9e72aa Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 15 Sep 2025 12:04:53 -0400 Subject: [PATCH 01/21] remove the type `ParamSpaceSGD` --- src/AdvancedVI.jl | 2 +- src/algorithms/paramspacesgd/constructors.jl | 140 +++++++++++++++++- src/algorithms/paramspacesgd/paramspacesgd.jl | 97 ++++-------- test/general/optimize.jl | 7 +- 4 files changed, 167 insertions(+), 79 deletions(-) diff --git a/src/AdvancedVI.jl b/src/AdvancedVI.jl index 6f51a9f0..20f1c345 100644 --- a/src/AdvancedVI.jl +++ b/src/AdvancedVI.jl @@ -279,7 +279,6 @@ include("optimize.jl") ## Parameter Space SGD include("algorithms/paramspacesgd/abstractobjective.jl") -include("algorithms/paramspacesgd/paramspacesgd.jl") export ParamSpaceSGD @@ -319,6 +318,7 @@ export RepGradELBO, SubsampledObjective include("algorithms/paramspacesgd/constructors.jl") +include("algorithms/paramspacesgd/paramspacesgd.jl") export KLMinRepGradDescent, KLMinRepGradProxDescent, KLMinScoreGradDescent, ADVI, BBVI diff --git a/src/algorithms/paramspacesgd/constructors.jl b/src/algorithms/paramspacesgd/constructors.jl index 2ec0ae41..d3f7bef6 100644 --- a/src/algorithms/paramspacesgd/constructors.jl +++ b/src/algorithms/paramspacesgd/constructors.jl @@ -18,6 +18,22 @@ KL divergence minimization by running stochastic gradient descent with the repar - `operator::AbstractOperator`: Operator to be applied after each gradient descent step. (default: `IdentityOperator()`) - `subsampling::Union{<:Nothing,<:AbstractSubsampling}`: Data point subsampling strategy. If `nothing`, subsampling is not used. (default: `nothing`) +# Output +- `q_averaged`: The variational approximation formed by the averaged SGD iterates. + +# Callback +The callback function `callback` has a signature of + + callback(; rng, iteration, restructure, params, averaged_params, restructure, gradient) + +The arguments are as follows: +- `rng`: Random number generator internally used by the algorithm. 
+- `iteration`: The index of the current iteration. +- `restructure`: Function that restructures the variational approximation from the variational parameters. Calling `restructure(params)` reconstructs the current variational approximation. +- `params`: Current variational parameters. +- `averaged_params`: Variational parameters averaged according to the averaging strategy. +- `gradient`: The estimated (possibly stochastic) gradient. + # Requirements - The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. This requires the variational approximation to be marked as a functor through `Functors.@functor`. - The variational approximation ``q_{\\lambda}`` implements `rand`. @@ -25,6 +41,30 @@ KL divergence minimization by running stochastic gradient descent with the repar - The target `LogDensityProblems.logdensity(prob, x)` must be differentiable with respect to `x` by the selected AD backend. - Additonal requirements on `q` may apply depending on the choice of `entropy`. 
""" +struct KLMinRepGradDescent{ + Obj<:Union{<:RepGradELBO,<:SubsampledObjective}, + AD<:ADTypes.AbstractADType, + Opt<:Optimisers.AbstractRule, + Avg<:AbstractAverager, + Op<:AbstractOperator, +} <: AbstractVariationalAlgorithm + objective::Obj + adtype::AD + optimizer::Opt + averager::Avg + operator::Op +end + +struct KLMinRepGradDescentState{P,Q,GradBuf,OptSt,ObjSt,AvgSt} + prob::P + q::Q + iteration::Int + grad_buf::GradBuf + opt_st::OptSt + obj_st::ObjSt + avg_st::AvgSt +end + function KLMinRepGradDescent( adtype::ADTypes.AbstractADType; entropy::Union{<:ClosedFormEntropy,<:StickingTheLandingEntropy,<:MonteCarloEntropy}=ClosedFormEntropy(), @@ -39,7 +79,11 @@ function KLMinRepGradDescent( else SubsampledObjective(RepGradELBO(n_samples; entropy=entropy), subsampling) end - return ParamSpaceSGD(objective, adtype, optimizer, averager, operator) + return KLMinRepGradDescent{ + typeof(objective),typeof(adtype),typeof(optimizer),typeof(averager),typeof(operator) + }( + objective, adtype, optimizer, averager, operator + ) end const ADVI = KLMinRepGradDescent @@ -63,12 +107,52 @@ Thus, only the entropy estimators with a "ZeroGradient" suffix are allowed. - `averager::AbstractAverager`: Parameter averaging strategy. (default: `PolynomialAveraging()`) - `subsampling::Union{<:Nothing,<:AbstractSubsampling}`: Data point subsampling strategy. If `nothing`, subsampling is not used. (default: `nothing`) +# Output +- `q_averaged`: The variational approximation formed by the averaged SGD iterates. + +# Callback +The callback function `callback` has a signature of + + callback(; rng, iteration, restructure, params, averaged_params, restructure, gradient) + +The arguments are as follows: +- `rng`: Random number generator internally used by the algorithm. +- `iteration`: The index of the current iteration. +- `restructure`: Function that restructures the variational approximation from the variational parameters. 
Calling `restructure(params)` reconstructs the current variational approximation. +- `params`: Current variational parameters. +- `averaged_params`: Variational parameters averaged according to the averaging strategy. +- `gradient`: The estimated (possibly stochastic) gradient. + # Requirements - The variational family is `MvLocationScale`. - The target distribution and the variational approximation have the same support. - The target `LogDensityProblems.logdensity(prob, x)` must be differentiable with respect to `x` by the selected AD backend. - Additonal requirements on `q` may apply depending on the choice of `entropy_zerograd`. """ +struct KLMinRepGradProxDescent{ + Obj<:Union{<:RepGradELBO,<:SubsampledObjective}, + AD<:ADTypes.AbstractADType, + Opt<:Optimisers.AbstractRule, + Avg<:AbstractAverager, + Op<:ProximalLocationScaleEntropy, +} <: AbstractVariationalAlgorithm + objective::Obj + adtype::AD + optimizer::Opt + averager::Avg + operator::Op +end + +struct KLMinRepGradProxDescentState{P,Q,GradBuf,OptSt,ObjSt,AvgSt} + prob::P + q::Q + iteration::Int + grad_buf::GradBuf + opt_st::OptSt + obj_st::ObjSt + avg_st::AvgSt +end + function KLMinRepGradProxDescent( adtype::ADTypes.AbstractADType; entropy_zerograd::Union{ @@ -85,7 +169,11 @@ function KLMinRepGradProxDescent( else SubsampledObjective(RepGradELBO(n_samples; entropy=entropy_zerograd), subsampling) end - return ParamSpaceSGD(objective, adtype, optimizer, averager, operator) + return KLMinRepGradProxDescent{ + typeof(objective),typeof(adtype),typeof(optimizer),typeof(averager),typeof(operator) + }( + objective, adtype, optimizer, averager, operator + ) end """ @@ -106,15 +194,55 @@ KL divergence minimization by running stochastic gradient descent with the score - `operator::Union{<:IdentityOperator, <:ClipScale}`: Operator to be applied after each gradient descent step. (default: `IdentityOperator()`) - `subsampling::Union{<:Nothing,<:AbstractSubsampling}`: Data point subsampling strategy. 
If `nothing`, subsampling is not used. (default: `nothing`) +# Output +- `q_averaged`: The variational approximation formed by the averaged SGD iterates. + +# Callback +The callback function `callback` has a signature of + + callback(; rng, iteration, restructure, params, averaged_params, restructure, gradient) + +The arguments are as follows: +- `rng`: Random number generator internally used by the algorithm. +- `iteration`: The index of the current iteration. +- `restructure`: Function that restructures the variational approximation from the variational parameters. Calling `restructure(params)` reconstructs the current variational approximation. +- `params`: Current variational parameters. +- `averaged_params`: Variational parameters averaged according to the averaging strategy. +- `gradient`: The estimated (possibly stochastic) gradient. + # Requirements - The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. This requires the variational approximation to be marked as a functor through `Functors.@functor`. - The variational approximation ``q_{\\lambda}`` implements `rand`. - The variational approximation ``q_{\\lambda}`` implements `logpdf(q, x)`, which should also be differentiable with respect to `x`. - The target distribution and the variational approximation have the same support. 
""" +struct KLMinScoreGradDescent{ + Obj<:Union{<:ScoreGradELBO,<:SubsampledObjective}, + AD<:ADTypes.AbstractADType, + Opt<:Optimisers.AbstractRule, + Avg<:AbstractAverager, + Op<:AbstractOperator, +} <: AbstractVariationalAlgorithm + objective::Obj + adtype::AD + optimizer::Opt + averager::Avg + operator::Op +end + +struct KLMinScoreGradDescentState{P,Q,GradBuf,OptSt,ObjSt,AvgSt} + prob::P + q::Q + iteration::Int + grad_buf::GradBuf + opt_st::OptSt + obj_st::ObjSt + avg_st::AvgSt +end + function KLMinScoreGradDescent( adtype::ADTypes.AbstractADType; - optimizer::Union{<:Descent,<:DoG,<:DoWG}=DoWG(), + optimizer::Optimisers.AbstractRule=DoWG(), n_samples::Int=1, averager::AbstractAverager=PolynomialAveraging(), operator::AbstractOperator=IdentityOperator(), @@ -125,7 +253,11 @@ function KLMinScoreGradDescent( else SubsampledObjective(ScoreGradELBO(n_samples), subsampling) end - return ParamSpaceSGD(objective, adtype, optimizer, averager, operator) + return KLMinScoreGradDescent{ + typeof(objective),typeof(adtype),typeof(optimizer),typeof(averager),typeof(operator) + }( + objective, adtype, optimizer, averager, operator + ) end const BBVI = KLMinScoreGradDescent diff --git a/src/algorithms/paramspacesgd/paramspacesgd.jl b/src/algorithms/paramspacesgd/paramspacesgd.jl index 92bbb0e5..67954d99 100644 --- a/src/algorithms/paramspacesgd/paramspacesgd.jl +++ b/src/algorithms/paramspacesgd/paramspacesgd.jl @@ -1,68 +1,9 @@ -""" - ParamSpaceSGD( - objective::AbstractVariationalObjective, - adtype::ADTypes.AbstractADType, - optimizer::Optimisers.AbstractRule, - averager::AbstractAverager, - operator::AbstractOperator, - ) - -This algorithm applies stochastic gradient descent (SGD) to the variational `objective` over the (Euclidean) space of variational parameters. - -The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. 
-This requires the variational approximation to be marked as a functor through `Functors.@functor`. - -!!! note - Different objective may impose different requirements on `adtype`, variational family, `optimizer`, and `operator`. It is therefore important to check the documentation corresponding to each specific objective. Essentially, each objective should be thought as forming its own unique algorithm. - -# Arguments -- `objective`: Variational Objective. -- `adtype`: Automatic differentiation backend. -- `optimizer`: Optimizer used for inference. -- `averager` : Parameter averaging strategy. -- `operator` : Operator applied to the parameters after each optimization step. - -# Output -- `q_averaged`: The variational approximation formed from the averaged SGD iterates. - -# Callback -The callback function `callback` has a signature of - - callback(; rng, iteration, restructure, params, averaged_params, restructure, gradient) - -The arguments are as follows: -- `rng`: Random number generator internally used by the algorithm. -- `iteration`: The index of the current iteration. -- `restructure`: Function that restructures the variational approximation from the variational parameters. Calling `restructure(params)` reconstructs the current variational approximation. -- `params`: Current variational parameters. -- `averaged_params`: Variational parameters averaged according to the averaging strategy. -- `gradient`: The estimated (possibly stochastic) gradient. 
- -""" -struct ParamSpaceSGD{ - Obj<:AbstractVariationalObjective, - AD<:ADTypes.AbstractADType, - Opt<:Optimisers.AbstractRule, - Avg<:AbstractAverager, - Op<:AbstractOperator, -} <: AbstractVariationalAlgorithm - objective::Obj - adtype::AD - optimizer::Opt - averager::Avg - operator::Op -end - -struct ParamSpaceSGDState{P,Q,GradBuf,OptSt,ObjSt,AvgSt} - prob::P - q::Q - iteration::Int - grad_buf::GradBuf - opt_st::OptSt - obj_st::ObjSt - avg_st::AvgSt -end +const ParamSpaceSGD = Union{ + <:KLMinRepGradDescent, + <:KLMinRepGradProxDescent, + <:KLMinScoreGradDescent, +} function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) (; adtype, optimizer, averager, objective, operator) = alg @@ -76,7 +17,15 @@ function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) obj_st = init(rng, objective, adtype, q_init, prob, params, re) avg_st = init(averager, params) grad_buf = DiffResults.DiffResult(zero(eltype(params)), similar(params)) - return ParamSpaceSGDState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) + if alg isa KLMinRepGradDescent + return KLMinRepGradDescentState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) + elseif alg isa KLMinRepGradProxDescent + return KLMinRepGradProxDescentState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) + elseif alg isa KLMinScoreGradDescent + return KLMinScoreGradDescentState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) + else + nothing + end end function output(alg::ParamSpaceSGD, state) @@ -104,9 +53,21 @@ function step( params = apply(operator, typeof(q), opt_st, params, re) avg_st = apply(averager, avg_st, params) - state = ParamSpaceSGDState( - prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st - ) + state = if alg isa KLMinRepGradDescent + KLMinRepGradDescentState( + prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st + ) + elseif alg isa KLMinRepGradProxDescent + KLMinRepGradProxDescentState( + prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st + 
) + elseif alg isa KLMinScoreGradDescent + KLMinScoreGradDescentState( + prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st + ) + else + nothing + end if !isnothing(callback) averaged_params = value(averager, avg_st) diff --git a/test/general/optimize.jl b/test/general/optimize.jl index 71c3e4fb..126dc2e4 100644 --- a/test/general/optimize.jl +++ b/test/general/optimize.jl @@ -9,12 +9,7 @@ (; model, μ_true, L_true, n_dims, is_meanfield) = modelstats q0 = MeanFieldGaussian(zeros(Float64, n_dims), Diagonal(ones(Float64, n_dims))) - obj = RepGradELBO(10) - - optimizer = Optimisers.Adam(1e-2) - averager = PolynomialAveraging() - - alg = ParamSpaceSGD(obj, AD, optimizer, averager, IdentityOperator()) + alg = KLMinRepGradDescent(AD; optimizer=Optimisers.Adam(1e-2), operator=ClipScale()) @testset "default_rng" begin optimize(alg, T, model, q0; show_progress=false) From 1f35cc97db596bc19a0b71d8468ca794759e4a89 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 15 Sep 2025 12:08:27 -0400 Subject: [PATCH 02/21] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/algorithms/paramspacesgd/paramspacesgd.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/algorithms/paramspacesgd/paramspacesgd.jl b/src/algorithms/paramspacesgd/paramspacesgd.jl index 67954d99..df22eda0 100644 --- a/src/algorithms/paramspacesgd/paramspacesgd.jl +++ b/src/algorithms/paramspacesgd/paramspacesgd.jl @@ -1,8 +1,6 @@ const ParamSpaceSGD = Union{ - <:KLMinRepGradDescent, - <:KLMinRepGradProxDescent, - <:KLMinScoreGradDescent, + <:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent } function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) From c8404b6debe18b071e69e20ea796b7239f0113b0 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 15 Sep 2025 12:08:34 -0400 Subject: [PATCH 03/21] run formatter Co-authored-by: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com> --- src/algorithms/paramspacesgd/paramspacesgd.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/algorithms/paramspacesgd/paramspacesgd.jl b/src/algorithms/paramspacesgd/paramspacesgd.jl index df22eda0..8c9a7b47 100644 --- a/src/algorithms/paramspacesgd/paramspacesgd.jl +++ b/src/algorithms/paramspacesgd/paramspacesgd.jl @@ -18,7 +18,9 @@ function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) if alg isa KLMinRepGradDescent return KLMinRepGradDescentState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) elseif alg isa KLMinRepGradProxDescent - return KLMinRepGradProxDescentState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) + return KLMinRepGradProxDescentState( + prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st + ) elseif alg isa KLMinScoreGradDescent return KLMinScoreGradDescentState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) else From 0cc7538c2156f7f7d992149501d41ae9003c4097 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 15 Sep 2025 12:08:40 -0400 Subject: [PATCH 04/21] run formatter Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/algorithms/paramspacesgd/paramspacesgd.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/algorithms/paramspacesgd/paramspacesgd.jl b/src/algorithms/paramspacesgd/paramspacesgd.jl index 8c9a7b47..17b3e5e7 100644 --- a/src/algorithms/paramspacesgd/paramspacesgd.jl +++ b/src/algorithms/paramspacesgd/paramspacesgd.jl @@ -54,9 +54,7 @@ function step( avg_st = apply(averager, avg_st, params) state = if alg isa KLMinRepGradDescent - KLMinRepGradDescentState( - prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st - ) + KLMinRepGradDescentState(prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st) elseif alg isa KLMinRepGradProxDescent KLMinRepGradProxDescentState( prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st From 
ede91c61ccf40f134c5c0d64bcc3e820041ce6f8 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 14:35:58 -0400 Subject: [PATCH 05/21] fix rename file paramspacesgd.jl to interface.jl --- src/algorithms/paramspacesgd/{paramspacesgd.jl => interface.jl} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/algorithms/paramspacesgd/{paramspacesgd.jl => interface.jl} (100%) diff --git a/src/algorithms/paramspacesgd/paramspacesgd.jl b/src/algorithms/paramspacesgd/interface.jl similarity index 100% rename from src/algorithms/paramspacesgd/paramspacesgd.jl rename to src/algorithms/paramspacesgd/interface.jl From 683a09d98b7cb692d5eff75fa140d2796f9ef4e9 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 14:40:56 -0400 Subject: [PATCH 06/21] throw invalid state for unknown paramspacesgd type --- src/algorithms/paramspacesgd/interface.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/algorithms/paramspacesgd/interface.jl b/src/algorithms/paramspacesgd/interface.jl index 17b3e5e7..ce39857c 100644 --- a/src/algorithms/paramspacesgd/interface.jl +++ b/src/algorithms/paramspacesgd/interface.jl @@ -24,7 +24,7 @@ function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) elseif alg isa KLMinScoreGradDescent return KLMinScoreGradDescentState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) else - nothing + throw(InvalidStateException()) end end @@ -64,7 +64,7 @@ function step( prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st ) else - nothing + throw(InvalidStateException()) end if !isnothing(callback) From 570fe11bde8d780a848d63bcd76f2258fe655156 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 14:41:12 -0400 Subject: [PATCH 07/21] add docstring for union type of paramspacesgd algorithms --- src/algorithms/paramspacesgd/interface.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/algorithms/paramspacesgd/interface.jl b/src/algorithms/paramspacesgd/interface.jl index 
ce39857c..4627f56f 100644 --- a/src/algorithms/paramspacesgd/interface.jl +++ b/src/algorithms/paramspacesgd/interface.jl @@ -1,4 +1,10 @@ + +""" +This family of algorithms (`<:KLMinRepGradDescent`,`<:KLMinRepGradProxDescent`,`<:KLMinScoreGradDescent`) applies stochastic gradient descent (SGD) to the variational `objective` over the (Euclidean) space of variational parameters. +The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. +This requires the variational approximation to be marked as a functor through `Functors.@functor`. +""" const ParamSpaceSGD = Union{ <:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent } From 2d5f3735574d2a9433be4812889566637bc738b8 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 14:46:38 -0400 Subject: [PATCH 08/21] fix remove custom state types for paramspacesgd algorithms --- src/algorithms/paramspacesgd/interface.jl | 43 ++++++++++------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/src/algorithms/paramspacesgd/interface.jl b/src/algorithms/paramspacesgd/interface.jl index 4627f56f..88a2623c 100644 --- a/src/algorithms/paramspacesgd/interface.jl +++ b/src/algorithms/paramspacesgd/interface.jl @@ -1,5 +1,4 @@ - """ This family of algorithms (`<:KLMinRepGradDescent`,`<:KLMinRepGradProxDescent`,`<:KLMinScoreGradDescent`) applies stochastic gradient descent (SGD) to the variational `objective` over the (Euclidean) space of variational parameters. The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. 
@@ -21,17 +20,15 @@ function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) obj_st = init(rng, objective, adtype, q_init, prob, params, re) avg_st = init(averager, params) grad_buf = DiffResults.DiffResult(zero(eltype(params)), similar(params)) - if alg isa KLMinRepGradDescent - return KLMinRepGradDescentState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) - elseif alg isa KLMinRepGradProxDescent - return KLMinRepGradProxDescentState( - prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st - ) - elseif alg isa KLMinScoreGradDescent - return KLMinScoreGradDescentState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) - else - throw(InvalidStateException()) - end + return ( + prob=prob, + q=q_init, + iteration=0, + grad_buf=grad_buf, + opt_st=opt_st, + obj_st=obj_st, + avg_st=avg_st, + ) end function output(alg::ParamSpaceSGD, state) @@ -59,19 +56,15 @@ function step( params = apply(operator, typeof(q), opt_st, params, re) avg_st = apply(averager, avg_st, params) - state = if alg isa KLMinRepGradDescent - KLMinRepGradDescentState(prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st) - elseif alg isa KLMinRepGradProxDescent - KLMinRepGradProxDescentState( - prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st - ) - elseif alg isa KLMinScoreGradDescent - KLMinScoreGradDescentState( - prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st - ) - else - throw(InvalidStateException()) - end + state = ( + prob=prob, + q=re(params), + iteration=iteration, + grad_buf=grad_buf, + opt_st=opt_st, + obj_st=obj_st, + avg_st=avg_st, + ) if !isnothing(callback) averaged_params = value(averager, avg_st) From e0221eb9ae7300bf67bde45bcc0a5295cfe6f134 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 14:47:58 -0400 Subject: [PATCH 09/21] fix remove custom state types for paramspacesgd --- src/algorithms/paramspacesgd/constructors.jl | 30 -------------------- 1 file changed, 30 deletions(-) diff --git 
a/src/algorithms/paramspacesgd/constructors.jl b/src/algorithms/paramspacesgd/constructors.jl index d3f7bef6..32c85b22 100644 --- a/src/algorithms/paramspacesgd/constructors.jl +++ b/src/algorithms/paramspacesgd/constructors.jl @@ -55,16 +55,6 @@ struct KLMinRepGradDescent{ operator::Op end -struct KLMinRepGradDescentState{P,Q,GradBuf,OptSt,ObjSt,AvgSt} - prob::P - q::Q - iteration::Int - grad_buf::GradBuf - opt_st::OptSt - obj_st::ObjSt - avg_st::AvgSt -end - function KLMinRepGradDescent( adtype::ADTypes.AbstractADType; entropy::Union{<:ClosedFormEntropy,<:StickingTheLandingEntropy,<:MonteCarloEntropy}=ClosedFormEntropy(), @@ -143,16 +133,6 @@ struct KLMinRepGradProxDescent{ operator::Op end -struct KLMinRepGradProxDescentState{P,Q,GradBuf,OptSt,ObjSt,AvgSt} - prob::P - q::Q - iteration::Int - grad_buf::GradBuf - opt_st::OptSt - obj_st::ObjSt - avg_st::AvgSt -end - function KLMinRepGradProxDescent( adtype::ADTypes.AbstractADType; entropy_zerograd::Union{ @@ -230,16 +210,6 @@ struct KLMinScoreGradDescent{ operator::Op end -struct KLMinScoreGradDescentState{P,Q,GradBuf,OptSt,ObjSt,AvgSt} - prob::P - q::Q - iteration::Int - grad_buf::GradBuf - opt_st::OptSt - obj_st::ObjSt - avg_st::AvgSt -end - function KLMinScoreGradDescent( adtype::ADTypes.AbstractADType; optimizer::Optimisers.AbstractRule=DoWG(), From e51ab3cbf77e06cd081c7ae93a5a224c7734603b Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 14:50:57 -0400 Subject: [PATCH 10/21] fix file path --- src/AdvancedVI.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AdvancedVI.jl b/src/AdvancedVI.jl index 20f1c345..811015ec 100644 --- a/src/AdvancedVI.jl +++ b/src/AdvancedVI.jl @@ -318,7 +318,7 @@ export RepGradELBO, SubsampledObjective include("algorithms/paramspacesgd/constructors.jl") -include("algorithms/paramspacesgd/paramspacesgd.jl") +include("algorithms/paramspacesgd/interface.jl") export KLMinRepGradDescent, KLMinRepGradProxDescent, KLMinScoreGradDescent, ADVI, BBVI From 
e49c6807dd3aa371e80be0110bbb7676981d32eb Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 15:38:22 -0400 Subject: [PATCH 11/21] fix bug in BijectorsExt --- ext/AdvancedVIBijectorsExt.jl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ext/AdvancedVIBijectorsExt.jl b/ext/AdvancedVIBijectorsExt.jl index a28dc9d0..0e85d4d4 100644 --- a/ext/AdvancedVIBijectorsExt.jl +++ b/ext/AdvancedVIBijectorsExt.jl @@ -25,7 +25,15 @@ function AdvancedVI.init( obj_st = AdvancedVI.init(rng, objective, adtype, q_init, prob, params, re) avg_st = AdvancedVI.init(averager, params) grad_buf = DiffResults.DiffResult(zero(eltype(params)), similar(params)) - return AdvancedVI.ParamSpaceSGDState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) + return ( + prob=prob, + q=q_init, + iteration=0, + grad_buf=grad_buf, + opt_st=opt_st, + obj_st=obj_st, + avg_st=avg_st, + ) end function AdvancedVI.apply( From 3c5b56f2bce387ea76174187811919ed71f288b7 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 15:38:35 -0400 Subject: [PATCH 12/21] fix include `SubSampleObjective` as part of `ParamSpaceSGD` --- src/algorithms/paramspacesgd/interface.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/algorithms/paramspacesgd/interface.jl b/src/algorithms/paramspacesgd/interface.jl index 88a2623c..2147b875 100644 --- a/src/algorithms/paramspacesgd/interface.jl +++ b/src/algorithms/paramspacesgd/interface.jl @@ -1,11 +1,11 @@ """ -This family of algorithms (`<:KLMinRepGradDescent`,`<:KLMinRepGradProxDescent`,`<:KLMinScoreGradDescent`) applies stochastic gradient descent (SGD) to the variational `objective` over the (Euclidean) space of variational parameters. +This family of algorithms (`<:KLMinRepGradDescent`,`<:KLMinRepGradProxDescent`,`<:KLMinScoreGradDescent`,`<:SubsampledObjective`) applies stochastic gradient descent (SGD) to the variational `objective` over the (Euclidean) space of variational parameters. 
The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. This requires the variational approximation to be marked as a functor through `Functors.@functor`. """ const ParamSpaceSGD = Union{ - <:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent + <:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent,<:SubsampledObjective } function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) From 30f51601efe64173f481b181acef5c57e5609a14 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 15:43:44 -0400 Subject: [PATCH 13/21] fix formatting Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/algorithms/paramspacesgd/interface.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/algorithms/paramspacesgd/interface.jl b/src/algorithms/paramspacesgd/interface.jl index 2147b875..d863720d 100644 --- a/src/algorithms/paramspacesgd/interface.jl +++ b/src/algorithms/paramspacesgd/interface.jl @@ -5,7 +5,10 @@ The trainable parameters in the variational approximation are expected to be ext This requires the variational approximation to be marked as a functor through `Functors.@functor`. 
""" const ParamSpaceSGD = Union{ - <:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent,<:SubsampledObjective + <:KLMinRepGradDescent, + <:KLMinRepGradProxDescent, + <:KLMinScoreGradDescent, + <:SubsampledObjective, } function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) From 008c4ea4c0bbeb7562f48ab5e3c28301bdb2b542 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 15:48:00 -0400 Subject: [PATCH 14/21] fix revert adding SubsampledObjective into ParamSpaceSGD --- src/algorithms/paramspacesgd/interface.jl | 7 ++----- test/algorithms/paramspacesgd/subsampledobj.jl | 3 +-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/algorithms/paramspacesgd/interface.jl b/src/algorithms/paramspacesgd/interface.jl index d863720d..88a2623c 100644 --- a/src/algorithms/paramspacesgd/interface.jl +++ b/src/algorithms/paramspacesgd/interface.jl @@ -1,14 +1,11 @@ """ -This family of algorithms (`<:KLMinRepGradDescent`,`<:KLMinRepGradProxDescent`,`<:KLMinScoreGradDescent`,`<:SubsampledObjective`) applies stochastic gradient descent (SGD) to the variational `objective` over the (Euclidean) space of variational parameters. +This family of algorithms (`<:KLMinRepGradDescent`,`<:KLMinRepGradProxDescent`,`<:KLMinScoreGradDescent`) applies stochastic gradient descent (SGD) to the variational `objective` over the (Euclidean) space of variational parameters. The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. This requires the variational approximation to be marked as a functor through `Functors.@functor`. 
""" const ParamSpaceSGD = Union{ - <:KLMinRepGradDescent, - <:KLMinRepGradProxDescent, - <:KLMinScoreGradDescent, - <:SubsampledObjective, + <:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent } function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) diff --git a/test/algorithms/paramspacesgd/subsampledobj.jl b/test/algorithms/paramspacesgd/subsampledobj.jl index f7e81d55..f6ac1bf9 100644 --- a/test/algorithms/paramspacesgd/subsampledobj.jl +++ b/test/algorithms/paramspacesgd/subsampledobj.jl @@ -63,8 +63,7 @@ end @testset "determinism" begin T = 128 sub = ReshufflingBatchSubsampling(1:n_data, 1) - sub_obj = SubsampledObjective(full_obj, sub) - alg = ParamSpaceSGD(sub_obj, AD, DoWG(), PolynomialAveraging(), ClipScale()) + alg = KLMinRepGradDescent(AD; subsampling=sub) rng = StableRNG(seed) q_avg, _, _ = optimize(rng, alg, T, prob, q0; show_progress=false) From 8a1890224d9629c89f6f463743101ff85761fcfe Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 15:53:06 -0400 Subject: [PATCH 15/21] refactor flatten algorithms --- src/algorithms/{paramspacesgd => }/abstractobjective.jl | 0 src/algorithms/{paramspacesgd => }/constructors.jl | 0 src/algorithms/{paramspacesgd => }/entropy.jl | 0 src/algorithms/{paramspacesgd => }/interface.jl | 0 src/algorithms/{paramspacesgd => }/repgradelbo.jl | 0 src/algorithms/{paramspacesgd => }/scoregradelbo.jl | 0 src/algorithms/{paramspacesgd => }/subsampledobjective.jl | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename src/algorithms/{paramspacesgd => }/abstractobjective.jl (100%) rename src/algorithms/{paramspacesgd => }/constructors.jl (100%) rename src/algorithms/{paramspacesgd => }/entropy.jl (100%) rename src/algorithms/{paramspacesgd => }/interface.jl (100%) rename src/algorithms/{paramspacesgd => }/repgradelbo.jl (100%) rename src/algorithms/{paramspacesgd => }/scoregradelbo.jl (100%) rename src/algorithms/{paramspacesgd => }/subsampledobjective.jl (100%) diff --git 
a/src/algorithms/paramspacesgd/abstractobjective.jl b/src/algorithms/abstractobjective.jl similarity index 100% rename from src/algorithms/paramspacesgd/abstractobjective.jl rename to src/algorithms/abstractobjective.jl diff --git a/src/algorithms/paramspacesgd/constructors.jl b/src/algorithms/constructors.jl similarity index 100% rename from src/algorithms/paramspacesgd/constructors.jl rename to src/algorithms/constructors.jl diff --git a/src/algorithms/paramspacesgd/entropy.jl b/src/algorithms/entropy.jl similarity index 100% rename from src/algorithms/paramspacesgd/entropy.jl rename to src/algorithms/entropy.jl diff --git a/src/algorithms/paramspacesgd/interface.jl b/src/algorithms/interface.jl similarity index 100% rename from src/algorithms/paramspacesgd/interface.jl rename to src/algorithms/interface.jl diff --git a/src/algorithms/paramspacesgd/repgradelbo.jl b/src/algorithms/repgradelbo.jl similarity index 100% rename from src/algorithms/paramspacesgd/repgradelbo.jl rename to src/algorithms/repgradelbo.jl diff --git a/src/algorithms/paramspacesgd/scoregradelbo.jl b/src/algorithms/scoregradelbo.jl similarity index 100% rename from src/algorithms/paramspacesgd/scoregradelbo.jl rename to src/algorithms/scoregradelbo.jl diff --git a/src/algorithms/paramspacesgd/subsampledobjective.jl b/src/algorithms/subsampledobjective.jl similarity index 100% rename from src/algorithms/paramspacesgd/subsampledobjective.jl rename to src/algorithms/subsampledobjective.jl From b002e1e5dbfd8c1aeca3d909422beb854b27dfec Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 15:57:51 -0400 Subject: [PATCH 16/21] fix error update paths in main file --- src/AdvancedVI.jl | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/AdvancedVI.jl b/src/AdvancedVI.jl index 811015ec..bb34071b 100644 --- a/src/AdvancedVI.jl +++ b/src/AdvancedVI.jl @@ -277,12 +277,11 @@ export optimize include("utils.jl") include("optimize.jl") -## Parameter Space SGD 
-include("algorithms/paramspacesgd/abstractobjective.jl") - -export ParamSpaceSGD ## Parameter Space SGD Implementations + +include("algorithms/abstractobjective.jl") + ### ELBO Maximization abstract type AbstractEntropyEstimator end @@ -303,10 +302,10 @@ Estimate the entropy of `q`. """ function estimate_entropy end -include("algorithms/paramspacesgd/subsampledobjective.jl") -include("algorithms/paramspacesgd/repgradelbo.jl") -include("algorithms/paramspacesgd/scoregradelbo.jl") -include("algorithms/paramspacesgd/entropy.jl") +include("algorithms/subsampledobjective.jl") +include("algorithms/repgradelbo.jl") +include("algorithms/scoregradelbo.jl") +include("algorithms/entropy.jl") export RepGradELBO, ScoreGradELBO, @@ -317,8 +316,8 @@ export RepGradELBO, StickingTheLandingEntropyZeroGradient, SubsampledObjective -include("algorithms/paramspacesgd/constructors.jl") -include("algorithms/paramspacesgd/interface.jl") +include("algorithms/constructors.jl") +include("algorithms/interface.jl") export KLMinRepGradDescent, KLMinRepGradProxDescent, KLMinScoreGradDescent, ADVI, BBVI From 1ba361f11ff82b037c87f33f6cd967d3ad44ed9a Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 16:06:38 -0400 Subject: [PATCH 17/21] refactor flatten the tests to reflect new structure --- src/AdvancedVI.jl | 1 - test/algorithms/{paramspacesgd => }/repgradelbo.jl | 0 test/algorithms/{paramspacesgd => }/repgradelbo_locationscale.jl | 0 .../{paramspacesgd => }/repgradelbo_locationscale_bijectors.jl | 0 .../{paramspacesgd => }/repgradelbo_proximal_locationscale.jl | 0 .../repgradelbo_proximal_locationscale_bijectors.jl | 0 test/algorithms/{paramspacesgd => }/scoregradelbo.jl | 0 .../{paramspacesgd => }/scoregradelbo_locationscale.jl | 0 .../{paramspacesgd => }/scoregradelbo_locationscale_bijectors.jl | 0 test/algorithms/{paramspacesgd => }/subsampledobj.jl | 1 + 10 files changed, 1 insertion(+), 1 deletion(-) rename test/algorithms/{paramspacesgd => }/repgradelbo.jl (100%) rename 
test/algorithms/{paramspacesgd => }/repgradelbo_locationscale.jl (100%) rename test/algorithms/{paramspacesgd => }/repgradelbo_locationscale_bijectors.jl (100%) rename test/algorithms/{paramspacesgd => }/repgradelbo_proximal_locationscale.jl (100%) rename test/algorithms/{paramspacesgd => }/repgradelbo_proximal_locationscale_bijectors.jl (100%) rename test/algorithms/{paramspacesgd => }/scoregradelbo.jl (100%) rename test/algorithms/{paramspacesgd => }/scoregradelbo_locationscale.jl (100%) rename test/algorithms/{paramspacesgd => }/scoregradelbo_locationscale_bijectors.jl (100%) rename test/algorithms/{paramspacesgd => }/subsampledobj.jl (99%) diff --git a/src/AdvancedVI.jl b/src/AdvancedVI.jl index bb34071b..37b16b0e 100644 --- a/src/AdvancedVI.jl +++ b/src/AdvancedVI.jl @@ -277,7 +277,6 @@ export optimize include("utils.jl") include("optimize.jl") - ## Parameter Space SGD Implementations include("algorithms/abstractobjective.jl") diff --git a/test/algorithms/paramspacesgd/repgradelbo.jl b/test/algorithms/repgradelbo.jl similarity index 100% rename from test/algorithms/paramspacesgd/repgradelbo.jl rename to test/algorithms/repgradelbo.jl diff --git a/test/algorithms/paramspacesgd/repgradelbo_locationscale.jl b/test/algorithms/repgradelbo_locationscale.jl similarity index 100% rename from test/algorithms/paramspacesgd/repgradelbo_locationscale.jl rename to test/algorithms/repgradelbo_locationscale.jl diff --git a/test/algorithms/paramspacesgd/repgradelbo_locationscale_bijectors.jl b/test/algorithms/repgradelbo_locationscale_bijectors.jl similarity index 100% rename from test/algorithms/paramspacesgd/repgradelbo_locationscale_bijectors.jl rename to test/algorithms/repgradelbo_locationscale_bijectors.jl diff --git a/test/algorithms/paramspacesgd/repgradelbo_proximal_locationscale.jl b/test/algorithms/repgradelbo_proximal_locationscale.jl similarity index 100% rename from test/algorithms/paramspacesgd/repgradelbo_proximal_locationscale.jl rename to 
test/algorithms/repgradelbo_proximal_locationscale.jl diff --git a/test/algorithms/paramspacesgd/repgradelbo_proximal_locationscale_bijectors.jl b/test/algorithms/repgradelbo_proximal_locationscale_bijectors.jl similarity index 100% rename from test/algorithms/paramspacesgd/repgradelbo_proximal_locationscale_bijectors.jl rename to test/algorithms/repgradelbo_proximal_locationscale_bijectors.jl diff --git a/test/algorithms/paramspacesgd/scoregradelbo.jl b/test/algorithms/scoregradelbo.jl similarity index 100% rename from test/algorithms/paramspacesgd/scoregradelbo.jl rename to test/algorithms/scoregradelbo.jl diff --git a/test/algorithms/paramspacesgd/scoregradelbo_locationscale.jl b/test/algorithms/scoregradelbo_locationscale.jl similarity index 100% rename from test/algorithms/paramspacesgd/scoregradelbo_locationscale.jl rename to test/algorithms/scoregradelbo_locationscale.jl diff --git a/test/algorithms/paramspacesgd/scoregradelbo_locationscale_bijectors.jl b/test/algorithms/scoregradelbo_locationscale_bijectors.jl similarity index 100% rename from test/algorithms/paramspacesgd/scoregradelbo_locationscale_bijectors.jl rename to test/algorithms/scoregradelbo_locationscale_bijectors.jl diff --git a/test/algorithms/paramspacesgd/subsampledobj.jl b/test/algorithms/subsampledobj.jl similarity index 99% rename from test/algorithms/paramspacesgd/subsampledobj.jl rename to test/algorithms/subsampledobj.jl index f6ac1bf9..05572325 100644 --- a/test/algorithms/paramspacesgd/subsampledobj.jl +++ b/test/algorithms/subsampledobj.jl @@ -64,6 +64,7 @@ end T = 128 sub = ReshufflingBatchSubsampling(1:n_data, 1) alg = KLMinRepGradDescent(AD; subsampling=sub) + sub_obj = alg.objective rng = StableRNG(seed) q_avg, _, _ = optimize(rng, alg, T, prob, q0; show_progress=false) From 86baa07b72b3c9ac26ae67af18467e4e7e31fdc7 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 16:07:32 -0400 Subject: [PATCH 18/21] fix file include path in tests --- test/runtests.jl | 18 
+++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 2cf9474c..2dae5b31 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -71,13 +71,13 @@ if GROUP == "All" || GROUP == "AD" include("general/ad.jl") include("general/mixedad_logdensity.jl") - include("algorithms/paramspacesgd/subsampledobj.jl") - include("algorithms/paramspacesgd/repgradelbo.jl") - include("algorithms/paramspacesgd/scoregradelbo.jl") - include("algorithms/paramspacesgd/repgradelbo_locationscale.jl") - include("algorithms/paramspacesgd/repgradelbo_locationscale_bijectors.jl") - include("algorithms/paramspacesgd/repgradelbo_proximal_locationscale.jl") - include("algorithms/paramspacesgd/repgradelbo_proximal_locationscale_bijectors.jl") - include("algorithms/paramspacesgd/scoregradelbo_locationscale.jl") - include("algorithms/paramspacesgd/scoregradelbo_locationscale_bijectors.jl") + include("algorithms/subsampledobj.jl") + include("algorithms/repgradelbo.jl") + include("algorithms/scoregradelbo.jl") + include("algorithms/repgradelbo_locationscale.jl") + include("algorithms/repgradelbo_locationscale_bijectors.jl") + include("algorithms/repgradelbo_proximal_locationscale.jl") + include("algorithms/repgradelbo_proximal_locationscale_bijectors.jl") + include("algorithms/scoregradelbo_locationscale.jl") + include("algorithms/scoregradelbo_locationscale_bijectors.jl") end From 67e93755dec3676c0c069a542a8490826fd5d47b Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 16:17:23 -0400 Subject: [PATCH 19/21] fix missing operator in subsampledobj tests --- test/algorithms/subsampledobj.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/algorithms/subsampledobj.jl b/test/algorithms/subsampledobj.jl index 05572325..35ff4b43 100644 --- a/test/algorithms/subsampledobj.jl +++ b/test/algorithms/subsampledobj.jl @@ -44,7 +44,7 @@ end @testset "algorithm constructors" begin @testset for batchsize in [1, 3, 
4] sub = ReshufflingBatchSubsampling(1:n_data, batchsize) - alg = KLMinRepGradDescent(AD; n_samples=10, subsampling=sub) + alg = KLMinRepGradDescent(AD; n_samples=10, subsampling=sub, operator=ClipScale()) _, info, _ = optimize(alg, 10, prob, q0; show_progress=false) @test isfinite(last(info).elbo) @@ -63,7 +63,7 @@ end @testset "determinism" begin T = 128 sub = ReshufflingBatchSubsampling(1:n_data, 1) - alg = KLMinRepGradDescent(AD; subsampling=sub) + alg = KLMinRepGradDescent(AD; subsampling=sub, operator=ClipScale()) sub_obj = alg.objective rng = StableRNG(seed) From 9b2eabb9ef5eb1ed236e35117c132200dd0aa2a5 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Mon, 13 Oct 2025 16:19:10 -0400 Subject: [PATCH 20/21] fix formatting Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- test/algorithms/subsampledobj.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/algorithms/subsampledobj.jl b/test/algorithms/subsampledobj.jl index 35ff4b43..c5b8720e 100644 --- a/test/algorithms/subsampledobj.jl +++ b/test/algorithms/subsampledobj.jl @@ -44,7 +44,9 @@ end @testset "algorithm constructors" begin @testset for batchsize in [1, 3, 4] sub = ReshufflingBatchSubsampling(1:n_data, batchsize) - alg = KLMinRepGradDescent(AD; n_samples=10, subsampling=sub, operator=ClipScale()) + alg = KLMinRepGradDescent( + AD; n_samples=10, subsampling=sub, operator=ClipScale() + ) _, info, _ = optimize(alg, 10, prob, q0; show_progress=false) @test isfinite(last(info).elbo) From 922e5d79110cbb492ae3b84dd1f828fd439ae8c9 Mon Sep 17 00:00:00 2001 From: Kyurae Kim Date: Sun, 19 Oct 2025 20:31:52 -0400 Subject: [PATCH 21/21] update docs --- docs/make.jl | 14 +-- docs/src/families.md | 2 +- docs/src/index.md | 1 - .../repgradelbo.md => klminrepgraddescent.md} | 99 ++++++++++++------- docs/src/klminrepgradproxdescent.md | 61 ++++++++++++ docs/src/klminscoregraddescent.md | 79 +++++++++++++++ docs/src/paramspacesgd/general.md | 98 
------------------ docs/src/paramspacesgd/klminrepgraddescent.md | 10 -- .../paramspacesgd/klminrepgradproxdescent.md | 11 --- .../paramspacesgd/klminscoregraddescent.md | 15 --- docs/src/paramspacesgd/objectives.md | 35 ------- docs/src/paramspacesgd/scoregradelbo.md | 45 --------- 12 files changed, 206 insertions(+), 264 deletions(-) rename docs/src/{paramspacesgd/repgradelbo.md => klminrepgraddescent.md} (79%) create mode 100644 docs/src/klminrepgradproxdescent.md create mode 100644 docs/src/klminscoregraddescent.md delete mode 100644 docs/src/paramspacesgd/general.md delete mode 100644 docs/src/paramspacesgd/klminrepgraddescent.md delete mode 100644 docs/src/paramspacesgd/klminrepgradproxdescent.md delete mode 100644 docs/src/paramspacesgd/klminscoregraddescent.md delete mode 100644 docs/src/paramspacesgd/objectives.md delete mode 100644 docs/src/paramspacesgd/scoregradelbo.md diff --git a/docs/make.jl b/docs/make.jl index a3ae15bc..3c1d3a2d 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -23,17 +23,9 @@ makedocs(; "Normalizing Flows" => "tutorials/flows.md", ], "Algorithms" => [ - "KLMinRepGradDescent" => "paramspacesgd/klminrepgraddescent.md", - "KLMinRepGradProxDescent" => "paramspacesgd/klminrepgradproxdescent.md", - "KLMinScoreGradDescent" => "paramspacesgd/klminscoregraddescent.md", - "Parameter Space SGD" => [ - "General" => "paramspacesgd/general.md", - "Objectives" => [ - "Overview" => "paramspacesgd/objectives.md", - "RepGradELBO" => "paramspacesgd/repgradelbo.md", - "ScoreGradELBO" => "paramspacesgd/scoregradelbo.md", - ], - ], + "KLMinRepGradDescent" => "klminrepgraddescent.md", + "KLMinRepGradProxDescent" => "klminrepgradproxdescent.md", + "KLMinScoreGradDescent" => "klminscoregraddescent.md", ], "Variational Families" => "families.md", "Optimization" => "optimization.md", diff --git a/docs/src/families.md b/docs/src/families.md index 761769f3..a7f2e19b 100644 --- a/docs/src/families.md +++ b/docs/src/families.md @@ -1,6 +1,6 @@ # 
[Reparameterizable Variational Families](@id families) -The [RepGradELBO](@ref repgradelbo) objective assumes that the members of the variational family have a differentiable sampling path. +Algorithms such as [`KLMinRepGradDescent`](@ref klminrepgraddescent) assume that the members of the variational family have a differentiable sampling path. We provide multiple pre-packaged variational families that can be readily used. ## [The `LocationScale` Family](@id locscale) diff --git a/docs/src/index.md b/docs/src/index.md index c02c2cd1..5728e6f5 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -10,7 +10,6 @@ VI algorithms perform scalable and computationally efficient Bayesian inference # List of Algorithms - - [ParamSpaceSGD](@ref paramspacesgd) - [KLMinRepGradDescent](@ref klminrepgraddescent) (alias of `ADVI`) - [KLMinRepGradProxDescent](@ref klminrepgradproxdescent) - [KLMinScoreGradDescent](@ref klminscoregraddescent) (alias of `BBVI`) diff --git a/docs/src/paramspacesgd/repgradelbo.md b/docs/src/klminrepgraddescent.md similarity index 79% rename from docs/src/paramspacesgd/repgradelbo.md rename to docs/src/klminrepgraddescent.md index f61940be..0deec5a0 100644 --- a/docs/src/paramspacesgd/repgradelbo.md +++ b/docs/src/klminrepgraddescent.md @@ -1,49 +1,75 @@ -# [Reparameterization Gradient Estimator](@id repgradelbo) +# [`KLMinRepGradDescent`](@id klminrepgraddescent) -## Overview +## Description -The `RepGradELBO` objective implements the reparameterization gradient estimator[^HC1983][^G1991][^R1992][^P1996] of the ELBO gradient. -The reparameterization gradient, also known as the push-in gradient or the pathwise gradient, was introduced to VI in [^TL2014][^RMW2014][^KW2014].
-For the variational family $\mathcal{Q} = \{q_{\lambda} \mid \lambda \in \Lambda\}$, suppose the process of sampling from $q_{\lambda}$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that +This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters. +Specifically, it uses the *reparameterization gradient estimator*. +As a result, this algorithm is best applicable when the target log-density is differentiable and the sampling process of the variational family is differentiable. +(See the [methodology section](@ref klminrepgraddescent_method) for more details.) +This algorithm is also commonly referred to as automatic differentiation variational inference, black-box variational inference with the reparameterization gradient, and stochastic gradient variational inference. +`KLMinRepGradDescent` is also an alias of `ADVI`. + +```@docs +KLMinRepGradDescent +``` + +## [Methodology](@id klminrepgraddescent_method) + +This algorithm aims to solve the problem -[^HC1983]: Ho, Y. C., & Cao, X. (1983). Perturbation analysis and optimization of queueing networks. Journal of optimization theory and Applications, 40(4), 559-582. -[^G1991]: Glasserman, P. (1991). Gradient estimation via perturbation analysis (Vol. 116). Springer Science & Business Media. -[^R1992]: Rubinstein, R. Y. (1992). Sensitivity analysis of discrete event systems by the “push out” method. Annals of Operations Research, 39(1), 229-250. -[^P1996]: Pflug, G. C. (1996). Optimization of stochastic models: the interface between simulation and optimization (Vol. 373). Springer Science & Business Media. -[^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. -[^RMW2014]: Rezende, D.
J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. -[^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. ```math -z \sim q_{\lambda} \qquad\Leftrightarrow\qquad -z \stackrel{d}{=} T_{\lambda}\left(\epsilon\right);\quad \epsilon \sim \varphi \; . + \mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right) ``` -In these cases, denoting the target log denstiy as $\log \pi$, we can effectively estimate the gradient of the ELBO by directly differentiating the stochastic estimate of the ELBO objective +where $\mathcal{Q}$ is some family of distributions, often called the variational family, by running stochastic gradient descent in the (Euclidean) space of parameters. +That is, for all $$q_{\lambda} \in \mathcal{Q}$$, we assume $$q_{\lambda}$$ there is a corresponding vector of parameters $$\lambda \in \Lambda$$, where the space of parameters is Euclidean such that $$\Lambda \subset \mathbb{R}^p$$. + +Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. +Instead, the ELBO maximization strategy maximizes a surrogate objective, the *evidence lower bound* (ELBO; [^JGJS1999]) ```math - \widehat{\mathrm{ELBO}}\left(\lambda\right) = \frac{1}{M}\sum^M_{m=1} \log \pi\left(T_{\lambda}\left(\epsilon_m\right)\right) + \mathbb{H}\left(q_{\lambda}\right), + \mathrm{ELBO}\left(q\right) \triangleq \mathbb{E}_{\theta \sim q} \log \pi\left(\theta\right) + \mathbb{H}\left(q\right), ``` -where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. -The resulting gradient estimate is called the reparameterization gradient estimator. +which is equivalent to the KL up to an additive constant (the evidence). 
-In addition to the reparameterization gradient, `AdvancedVI` provides the following features: +Algorithmically, `KLMinRepGradDescent` iterates the step - 1. **Posteriors with constrained supports** are handled through [`Bijectors`](https://github.com/TuringLang/Bijectors.jl), which is known as the automatic differentiation VI (ADVI; [^KTRGB2017]) formulation. (See [this section](@ref bijectors).) - 2. **The gradient of the entropy** can be estimated through various strategies depending on the capabilities of the variational family. (See [this section](@ref entropygrad).) +```math + \lambda_{t+1} = \mathrm{operator}\big( + \lambda_{t} + \gamma_t \widehat{\nabla_{\lambda} \mathrm{ELBO}} (q_{\lambda_t}) + \big) , +``` -## `RepGradELBO` +where $\widehat{\nabla \mathrm{ELBO}}(q_{\lambda})$ is the reparameterization gradient estimate[^HC1983][^G1991][^R1992][^P1996] of the ELBO gradient and $$\mathrm{operator}$$ is an optional operator (*e.g.* projections, identity mapping). -To use the reparameterization gradient, `AdvancedVI` provides the following variational objective: +The reparameterization gradient, also known as the push-in gradient or the pathwise gradient, was introduced to VI in [^TL2014][^RMW2014][^KW2014]. +For the variational family $$\mathcal{Q}$$, suppose the process of sampling from $$q_{\lambda} \in \mathcal{Q}$$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that -```@docs -RepGradELBO +```math +z \sim q_{\lambda} \qquad\Leftrightarrow\qquad +z \stackrel{d}{=} T_{\lambda}\left(\epsilon\right);\quad \epsilon \sim \varphi \; . 
``` -## [Handling Constraints with `Bijectors`](@id bijectors) +In these cases, denoting the target log density as $\log \pi$, we can effectively estimate the gradient of the ELBO by directly differentiating the stochastic estimate of the ELBO objective -As mentioned in the docstring, the `RepGradELBO` objective assumes that the variational approximation $$q_{\lambda}$$ and the target distribution $$\pi$$ have the same support for all $$\lambda \in \Lambda$$. +```math + \widehat{\mathrm{ELBO}}\left(q_{\lambda}\right) = \frac{1}{M}\sum^M_{m=1} \log \pi\left(T_{\lambda}\left(\epsilon_m\right)\right) + \mathbb{H}\left(q_{\lambda}\right), +``` + +where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. + +[^JGJS1999]: Jordan, M. I., Ghahramani, Z., Jaakkola, T. S., & Saul, L. K. (1999). An introduction to variational methods for graphical models. Machine learning, 37, 183-233. +[^HC1983]: Ho, Y. C., & Cao, X. (1983). Perturbation analysis and optimization of queueing networks. Journal of optimization theory and Applications, 40(4), 559-582. +[^G1991]: Glasserman, P. (1991). Gradient estimation via perturbation analysis (Vol. 116). Springer Science & Business Media. +[^R1992]: Rubinstein, R. Y. (1992). Sensitivity analysis of discrete event systems by the “push out” method. Annals of Operations Research, 39(1), 229-250. +[^P1996]: Pflug, G. C. (1996). Optimization of stochastic models: the interface between simulation and optimization (Vol. 373). Springer Science & Business Media. +[^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. +[^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. +[^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes.
In *International Conference on Learning Representations*. +## [Handling Constraints with `Bijectors`](@id bijectors) +As mentioned in the docstring, `KLMinRepGradDescent` assumes that the variational approximation $$q_{\lambda}$$ and the target distribution $$\pi$$ have the same support for all $$\lambda \in \Lambda$$. However, in general, it is most convenient to use variational families that have the whole Euclidean space $$\mathbb{R}^d$$ as their support. This is the case for the [location-scale distributions](@ref locscale) provided by `AdvancedVI`. For target distributions which the support is not the full $$\mathbb{R}^d$$, we can apply some transformation $$b$$ to $$q_{\lambda}$$ to match its support such that @@ -57,9 +83,11 @@ where $$b$$ is often called a *bijector*, since it is often chosen among bijecti This idea is known as automatic differentiation VI[^KTRGB2017] and has subsequently been improved by Tensorflow Probability[^DLTBV2017]. In Julia, [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl)[^FXTYG2020] provides a comprehensive collection of bijections. -One caveat of ADVI is that, after applying the bijection, a Jacobian adjustment needs to be applied. -That is, the objective is now - +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. +[^DLTBV2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). Tensorflow distributions. arXiv. +[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020,. Bijectors. jl: Flexible transformations for probability distributions. In *Symposium on Advances in Approximate Bayesian Inference*. + One caveat of ADVI is that, after applying the bijection, a Jacobian adjustment needs to be applied. 
+ That is, the objective is now ```math \mathrm{ADVI}\left(\lambda\right) \triangleq @@ -84,13 +112,10 @@ q_transformed = Bijectors.TransformedDistribution(q, binv) By passing `q_transformed` to `optimize`, the Jacobian adjustment for the bijector `b` is automatically applied. (See the [Basic Example](@ref basic) for a fully working example.) -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*. -[^DLTBV2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). Tensorflow distributions. arXiv. -[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020,. Bijectors. jl: Flexible transformations for probability distributions. In *Symposium on Advances in Approximate Bayesian Inference*. -## [Entropy Estimators](@id entropygrad) +## [Entropy Gradient Estimators](@id entropygrad) For the gradient of the entropy term, we provide three choices with varying requirements. -The user can select the entropy estimator by passing it as a keyword argument when constructing the `RepGradELBO` objective. +The user can select the entropy estimator by passing it as a keyword argument when constructing the algorithm object. | Estimator | `entropy(q)` | `logpdf(q)` | Type | |:--------------------------- |:------------:|:-----------:|:-------------------------------- | @@ -179,7 +204,7 @@ end In this example, the true posterior is contained within the variational family. This setting is known as "perfect variational family specification." -In this case, the `RepGradELBO` estimator with `StickingTheLandingEntropy` is the only estimator known to converge exponentially fast ("linear convergence") to the true solution. +In this case, `KLMinRepGradDescent` with `StickingTheLandingEntropy` is the only estimator known to converge exponentially fast ("linear convergence") to the true solution. 
Recall that the original ADVI objective with a closed-form entropy (CFE) is given as follows: @@ -281,7 +306,7 @@ Furthermore, in a lot of cases, a low-accuracy solution may be sufficient. [^KMG2024]: Kim, K., Ma, Y., & Gardner, J. (2024). Linear Convergence of Black-Box Variational Inference: Should We Stick the Landing?. In International Conference on Artificial Intelligence and Statistics (pp. 235-243). PMLR. ## Advanced Usage -There are two major ways to customize the behavior of `RepGradELBO` +There are two major ways to customize the behavior of `KLMinRepGradDescent` - Customize the `Distributions` functions: `rand(q)`, `entropy(q)`, `logpdf(q)`. - Customize `AdvancedVI.reparam_with_entropy`. diff --git a/docs/src/klminrepgradproxdescent.md b/docs/src/klminrepgradproxdescent.md new file mode 100644 index 00000000..24931a03 --- /dev/null +++ b/docs/src/klminrepgradproxdescent.md @@ -0,0 +1,61 @@ +# [`KLMinRepGradProxDescent`](@id klminrepgradproxdescent) + +## Description + +This algorithm is a slight variation of [`KLMinRepGradDescent`](@ref klminrepgraddescent) specialized to [location-scale families](@ref locscale). +Therefore, it also aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence over the space of parameters. +But instead, it uses stochastic proximal gradient descent with the [proximal operator](@ref proximalocationscaleentropy) of the entropy of location-scale variational families as discussed in: [^D2020][^KMG2024][^DGG2023]. +The remainder of the section will only discuss details specific to `KLMinRepGradProxDescent`. +Thus, for general usage and additional details, please refer to the docs of `KLMinRepGradDescent` instead. + +```@docs +KLMinRepGradProxDescent +``` + +It implements the stochastic proximal gradient descent-based algorithm described in [^D2020][^KMG2024][^DGG2023]. + +## Methodology + +Recall that [KLMinRepGradDescent](@ref klminrepgraddescent) maximizes the ELBO.
+Now, the ELBO can be re-written as follows: + +```math + \mathrm{ELBO}\left(q\right) \triangleq \mathcal{E}\left(q\right) + \mathbb{H}\left(q\right), +``` + +where + +```math + \mathcal{E}\left(q\right) = \mathbb{E}_{\theta \sim q} \log \pi\left(\theta\right) +``` + +is often referred to as the *negative energy functional*. +`KLMinRepGradProxDescent` attempts to address the fact that maximizing the whole ELBO can be unstable due to non-smoothness of $$\mathbb{H}\left(q\right)$$[^D2020]. +For this, `KLMinRepGradProxDescent` relies on proximal stochastic gradient descent, where the problematic term $$\mathbb{H}\left(q\right)$$ is separately handled via a *proximal operator*. +Specifically, `KLMinRepGradProxDescent` first estimates the gradient of the energy $$\mathcal{E}\left(q\right)$$ only via the reparameterization gradient estimator. +Let us denote this as $$\widehat{\nabla_{\lambda} \mathcal{E}}\left(q_{\lambda}\right)$$. +Then `KLMinRepGradProxDescent` iterates the step + +```math + \lambda_{t+1} = \mathrm{prox}_{-\gamma_t \mathbb{H}}\big( + \lambda_{t} + \gamma_t \widehat{\nabla_{\lambda} \mathcal{E}}(q_{\lambda_t}) + \big) , +``` + +where + +```math +\mathrm{prox}_{h}(\lambda_t) += \argmin_{\lambda \in \Lambda}\left\{ + h(\lambda) + \frac{1}{2} {\lVert \lambda - \lambda_t \rVert}_2^2 +\right\} +``` + +is a proximal operator for the entropy. +As long as $$\mathrm{prox}_{-\gamma_t \mathbb{H}}$$ can be evaluated efficiently, this scheme can side-step the fact that $$\mathbb{H}(\lambda)$$ is difficult to deal with via gradient descent. +For location-scale families, it turns out the proximal operator of the entropy can be evaluated efficiently[^D2020], which is implemented as [`ProximalLocationScaleEntropy`](@ref proximalocationscaleentropy). +This has been empirically shown to be more robust[^D2020][^KMG2024]. + +[^D2020]: Domke, J. (2020). Provable smoothness guarantees for black-box variational inference. In *International Conference on Machine Learning*.
+[^KMG2024]: Kim, K., Ma, Y., & Gardner, J. (2024). Linear Convergence of Black-Box Variational Inference: Should We Stick the Landing?. In International Conference on Artificial Intelligence and Statistics (pp. 235-243). PMLR. +[^DGG2023]: Domke, J., Gower, R., & Garrigos, G. (2023). Provable convergence guarantees for black-box variational inference. Advances in neural information processing systems, 36, 66289-66327. diff --git a/docs/src/klminscoregraddescent.md b/docs/src/klminscoregraddescent.md new file mode 100644 index 00000000..dd1a9667 --- /dev/null +++ b/docs/src/klminscoregraddescent.md @@ -0,0 +1,79 @@ +# [`KLMinScoreGradDescent`](@id klminscoregraddescent) + +## Description + +This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters. +Specifically, it uses the *score gradient* estimator, which is similar to the algorithm that was originally referred to as black-box variational inference (BBVI; [^RGB2014][^WW2013]). +(The term BBVI has also recently been used to refer to the more general setup of maximizing the ELBO in parameter space. We are using the more narrow definition, which restricts to the use of the score gradient.) +However, instead of using the vanilla score gradient estimator, we differentiate the "VarGrad" objective[^RBNRA2020], which results in the score gradient variance-reduced by the leave-one-out control variate[^SK2014][^KvHW2019]. +`KLMinScoreGradDescent` is also an alias of `BBVI`. + +[^RBNRA2020]: Richter, L., Boustati, A., Nüsken, N., Ruiz, F., & Akyildiz, O. D. (2020). Vargrad: a low-variance gradient estimator for variational inference. Advances in Neural Information Processing Systems, 33, 13481-13492. +[^SK2014]: Salimans, T., & Knowles, D. A. (2014). On using control variates with stochastic approximation for variational bayes and its connection to stochastic linear regression. arXiv preprint arXiv:1401.1022.
+```@docs +KLMinScoreGradDescent +``` + +## Methodology + +This algorithm aims to solve the problem + +```math + \mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right) +``` + +where $\mathcal{Q}$ is some family of distributions, often called the variational family, by running stochastic gradient descent in the (Euclidean) space of parameters. +That is, for all $$q_{\lambda} \in \mathcal{Q}$$, we assume there is a corresponding vector of parameters $$\lambda \in \Lambda$$, where the space of parameters is Euclidean such that $$\Lambda \subset \mathbb{R}^p$$. + +Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. +Instead, the ELBO maximization strategy maximizes a surrogate objective, the *evidence lower bound* (ELBO; [^JGJS1999]) + +```math + \mathrm{ELBO}\left(q\right) \triangleq \mathbb{E}_{\theta \sim q} \log \pi\left(\theta\right) + \mathbb{H}\left(q\right), +``` + +which is equivalent to the negative KL divergence up to an additive constant (the log-evidence). + +Algorithmically, `KLMinScoreGradDescent` iterates the step + +```math + \lambda_{t+1} = \mathrm{operator}\big( + \lambda_{t} + \gamma_t \widehat{\nabla_{\lambda} \mathrm{ELBO}} (q_{\lambda_t}) + \big) , +``` + +where $\widehat{\nabla_{\lambda} \mathrm{ELBO}}(q_{\lambda})$ is the score gradient estimate[^G1990][^KR1996][^RSU1996][^W1992] of the ELBO gradient and $$\mathrm{operator}$$ is an optional operator (*e.g.* projections, identity mapping). + +Let us describe the score gradient estimator[^G1990][^KR1996][^RSU1996][^W1992] of the ELBO gradient, also known as the score function method and the REINFORCE gradient. +For variational inference, the use of the score gradient was proposed in [^WW2013][^RGB2014]. 
+Unlike the reparameterization gradient, the score gradient does not require the target log density to be differentiable, and does not differentiate through the sampling process of the variational approximation $q$. +Instead, it only requires gradients of the log density $\log q$. +For this reason, the score gradient is the standard method to deal with discrete variables and targets with discrete support. +In more detail, the score gradient uses the Fisher log-derivative identity: For any regular $f$, + +```math +\nabla_{\lambda} \mathbb{E}_{z \sim q_{\lambda}} f(z) += +\mathbb{E}_{z \sim q_{\lambda}}\left[ f(z) \nabla_{\lambda} \log q_{\lambda}(z) \right] \; . +``` + +The ELBO corresponds to the case where $f = \log \pi - \log q$, where $\log \pi$ is the target log density. +Instead of implementing the canonical score gradient, `KLMinScoreGradDescent` internally uses the "VarGrad" objective[^RBNRA2020]: + +```math +\widehat{\mathrm{VarGrad}}(\lambda) += +\mathrm{Var}\left( \log q_{\lambda}(z_i) - \log \pi\left(z_i\right) \right) \; , +``` + +where the variance is computed over the samples $z_1, \ldots, z_m \sim q_{\lambda}$. +Differentiating the VarGrad objective corresponds to the canonical score gradient combined with the "leave-one-out" control variate[^SK2014][^KvHW2019]. + +[^JGJS1999]: Jordan, M. I., Ghahramani, Z., Jaakkola, T. S., & Saul, L. K. (1999). An introduction to variational methods for graphical models. Machine learning, 37, 183-233. +[^G1990]: Glynn, P. W. (1990). Likelihood ratio gradient estimation for stochastic systems. Communications of the ACM, 33(10), 75-84. +[^KR1996]: Kleijnen, J. P., & Rubinstein, R. Y. (1996). Optimization and sensitivity analysis of computer simulation models by the score function method. European Journal of Operational Research, 88(3), 413-427. +[^RSU1996]: Rubinstein, R. Y., Shapiro, A., & Uryasev, S. (1996). The score function method. Encyclopedia of Management Sciences, 1363-1366. +[^W1992]: Williams, R. J. (1992). 
Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine learning, 8, 229-256. +[^WW2013]: Wingate, D., & Weber, T. (2013). Automated variational inference in probabilistic programming. arXiv preprint arXiv:1301.1299. +[^RGB2014]: Ranganath, R., Gerrish, S., & Blei, D. (2014). Black box variational inference. In Artificial intelligence and statistics (pp. 814-822). PMLR. +[^KvHW2019]: Kool, W., van Hoof, H., & Welling, M. (2019). Buy 4 reinforce samples, get a baseline for free!. diff --git a/docs/src/paramspacesgd/general.md b/docs/src/paramspacesgd/general.md deleted file mode 100644 index 347a9dc8..00000000 --- a/docs/src/paramspacesgd/general.md +++ /dev/null @@ -1,98 +0,0 @@ - -# [General](@id paramspacesgd) - -`ParamSpaceSGD` SGD is a general algorithm for leveraging automatic differentiation and SGD. -Furthermore, it operates in the space of *variational parameters*. -Consider the case where each member $q_{\lambda} \in \mathcal{Q}$ of the variational family $\mathcal{Q}$ is uniquely represented through a collection of parameters $\lambda \in \Lambda \subseteq \mathbb{R}^p$. -That is, - -```math -\mathcal{Q} = \{q_{\lambda} \mid \lambda \in \Lambda \}, -``` -Then, as implied by the name, `ParamSpaceSGD` runs SGD on $\Lambda$, the (Euclidean) space of parameters. - -Any algorithm that operates by iterating the following steps can easily be implemented via `ParamSpaceSGD`: - -1. Obtain an unbiased estimate of the target objective. -2. Obtain an estimate of the gradient of the objective by differentiating the objective estimate with respect to the parameters. -3. Perform gradient descent with the stochastic gradient estimate. - -After some simplifications, each `step` of `ParamSpaceSGD` can be described as follows: - -```julia -function step(rng, alg::ParamSpaceSGD, state, callback, objargs...; kwargs...) 
- (; adtype, problem, objective, operator, averager) = alg - (; q, iteration, grad_buf, opt_st, obj_st, avg_st) = state - iteration += 1 - - # Extract variational parameters of `q` - params, re = Optimisers.destructure(q) - - # Estimate gradient and update the `DiffResults` buffer `grad_buf`. - grad_buf, obj_st, info = estimate_gradient!(...) - - # Gradient descent step. - grad = DiffResults.gradient(grad_buf) - opt_st, params = Optimisers.update!(opt_st, params, grad) - - # Apply operator - params = apply(operator, typeof(q), opt_st, params, re) - - # Apply parameter averaging - avg_st = apply(averager, avg_st, params) - - # Updated state - state = ParamSpaceSGDState(re(params), iteration, grad_buf, opt_st, obj_st, avg_st) - state, false, info -end -``` -The output of `ParamSpaceSGD` is the final state of `averager`. -Furthermore, `operator` can be anything from an identity mapping, a projection operator, a proximal operator, and so on. - -## `ParamSpaceSGD` -The constructor for `ParamSpaceSGD` is as follows: - -```@docs -ParamSpaceSGD -``` - -## Objective Interface - -To define an instance of a `ParamSpaceSGD` algorithm, it suffices to implement the `AbstractVariationalObjective` interface. -First, we need to define a subtype of `AbstractVariationalObjective`: - -```@docs -AdvancedVI.AbstractVariationalObjective -``` - -In addition, we need to implement some methods associated with the objective. -First, each objective may maintain a state such as buffers, online estimates of control variates, batch iterators for subsampling, and so on. -Such things should be initialized by implementing the following: - -```@docs -AdvancedVI.init( - ::Random.AbstractRNG, - ::AdvancedVI.AbstractVariationalObjective, - ::ADTypes.AbstractADType, - ::Any, - ::Any, - ::Any, - ::Any, -) -``` -If this method is not implemented, the state will be automatically be `nothing`. 
- -Next, the key functionality of estimating stochastic gradients should be implemented through the following: - -```@docs -AdvancedVI.estimate_gradient! -``` - -`AdvancedVI` only interacts with each variational objective by querying gradient estimates. -In a lot of cases, however, it is convinient to be able to estimate the current value of the objective. -For example, for monitoring convergence. -This should be done through the following: - -```@docs -AdvancedVI.estimate_objective -``` diff --git a/docs/src/paramspacesgd/klminrepgraddescent.md b/docs/src/paramspacesgd/klminrepgraddescent.md deleted file mode 100644 index 6212085d..00000000 --- a/docs/src/paramspacesgd/klminrepgraddescent.md +++ /dev/null @@ -1,10 +0,0 @@ -# [`KLMinRepGradDescent`](@id klminrepgraddescent) - -This is a convenience constructor for [`ParamSpaceSGD`](@ref paramspacesgd) with the [`RepGradELBO`](@ref repgradelbo) objective. -This is equivalent to the algorithm commonly referred as automatic differentiation variational inference[^KTRGB2017]. -`KLMinRepGradDescent` is also an alias of `ADVI` . - -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. -```@docs -KLMinRepGradDescent -``` diff --git a/docs/src/paramspacesgd/klminrepgradproxdescent.md b/docs/src/paramspacesgd/klminrepgradproxdescent.md deleted file mode 100644 index 831c86bd..00000000 --- a/docs/src/paramspacesgd/klminrepgradproxdescent.md +++ /dev/null @@ -1,11 +0,0 @@ -# [`KLMinRepGradProxDescent`](@id klminrepgradproxdescent) - -This is a convenience constructor for [`ParamSpaceSGD`](@ref paramspacesgd) with the [`RepGradELBO`](@ref repgradelbo) objective with a proximal operator of the entropy (see [here](@ref proximalocationscaleentropy)) of location-scale variational families. 
-It implements the stochastic proximal gradient descent-based algorithm described in: [^D2020][^KMG2024][^DGG2023]. - -[^D2020]: Domke, J. (2020). Provable smoothness guarantees for black-box variational inference. In *International Conference on Machine Learning*. -[^KMG2024]: Kim, K., Ma, Y., & Gardner, J. (2024). Linear Convergence of Black-Box Variational Inference: Should We Stick the Landing?. In International Conference on Artificial Intelligence and Statistics (pp. 235-243). PMLR. -[^DGG2023]: Domke, J., Gower, R., & Garrigos, G. (2023). Provable convergence guarantees for black-box variational inference. Advances in neural information processing systems, 36, 66289-66327. -```@docs -KLMinRepGradProxDescent -``` diff --git a/docs/src/paramspacesgd/klminscoregraddescent.md b/docs/src/paramspacesgd/klminscoregraddescent.md deleted file mode 100644 index 514a1410..00000000 --- a/docs/src/paramspacesgd/klminscoregraddescent.md +++ /dev/null @@ -1,15 +0,0 @@ -# [`KLMinScoreGradDescent`](@id klminscoregraddescent) - -This is a convenience constructor for [`ParamSpaceSGD`](@ref paramspacesgd) with the [`ScoreGradELBO`](@ref scoregradelbo) objective. -This is similar to the algorithm that was originally referred to as black-box variational inference (BBVI; [^RGB2014][^WW2013]). -(The term BBVI has also recently been used to refer to the more general setup of maximizing the ELBO in parameter space. We are using the more narrow definition, which restricts to the use of the score gradient.) -However, instead of using the vanilla score gradient estimator, we differentiate the "VarGrad" objective[^RBNRA2020], which results in the score gradient variance-reduced by the leave-one-out control variate[^SK2014][^KvHW2019]. - -[^RGB2014]: Ranganath, R., Gerrish, S., & Blei, D. (2014, April). Black box variational inference. In *Artificial Intelligence and Statistics* (pp. 814-822). PMLR. -[^WW2013]: Wingate, D., & Weber, T. (2013). 
Automated variational inference in probabilistic programming. arXiv preprint arXiv:1301.1299. -[^RBNRA2020]: Richter, L., Boustati, A., Nüsken, N., Ruiz, F., & Akyildiz, O. D. (2020). Vargrad: a low-variance gradient estimator for variational inference. Advances in Neural Information Processing Systems, 33, 13481-13492. -[^SK2014]: Salimans, T., & Knowles, D. A. (2014). On using control variates with stochastic approximation for variational bayes and its connection to stochastic linear regression. arXiv preprint arXiv:1401.1022. -[^KvHW2019]: Kool, W., van Hoof, H., & Welling, M. (2019). Buy 4 reinforce samples, get a baseline for free!. -```@docs -KLMinScoreGradDescent -``` diff --git a/docs/src/paramspacesgd/objectives.md b/docs/src/paramspacesgd/objectives.md deleted file mode 100644 index d3a822cf..00000000 --- a/docs/src/paramspacesgd/objectives.md +++ /dev/null @@ -1,35 +0,0 @@ -# Overview of Algorithms - -This section will provide an overview of the algorithm form by each objectives provided by `AdvancedVI`. - -## Evidence Lower Bound Maximization - -Evidence lower bound (ELBO) maximization[^JGJS1999] is a general family of algorithms that minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence between the target distribution ``\pi`` and a variational approximation ``q_{\lambda}``. -More generally, it aims to solve the problem - -```math - \mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right) \; , -``` - -where $\mathcal{Q}$ is some family of distributions, often called the variational family. -Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. 
-Instead, the ELBO maximization strategy maximizes a surrogate objective, the *ELBO*: - -```math - \mathrm{ELBO}\left(q\right) \triangleq \mathbb{E}_{\theta \sim q} \log \pi\left(\theta\right) + \mathbb{H}\left(q\right), -``` - -which is equivalent to the KL up to an additive constant (the evidence). -The ELBO and its gradient can be readily estimated through various strategies. -Overall, ELBO maximization algorithms aim to solve the problem: - -```math - \mathrm{minimize}_{q \in \mathcal{Q}}\quad -\mathrm{ELBO}\left(q\right). -``` - -Multiple ways to solve this problem exist, each leading to a different variational inference algorithm. `AdvancedVI` provides the following objectives: - - - [RepGradELBO](@ref repgradelbo): Implements the reparameterization gradient estimator of the ELBO gradient. - - [ScoreGradELBO](@ref scoregradelbo): Implements the score gradient estimator of the ELBO gradient. - -[^JGJS1999]: Jordan, M. I., Ghahramani, Z., Jaakkola, T. S., & Saul, L. K. (1999). An introduction to variational methods for graphical models. Machine learning, 37, 183-233. diff --git a/docs/src/paramspacesgd/scoregradelbo.md b/docs/src/paramspacesgd/scoregradelbo.md deleted file mode 100644 index c86ceca1..00000000 --- a/docs/src/paramspacesgd/scoregradelbo.md +++ /dev/null @@ -1,45 +0,0 @@ -# [Score Gradient Estimator](@id scoregradelbo) - -## Overview - -The `ScoreGradELBO` implements the score gradient estimator[^G1990][^KR1996][^RSU1996][^W1992] of the ELBO gradient, also known as the score function method and the REINFORCE gradient. -For variational inference, the use of the score gradient was proposed in [^WW2013][^RGB2014]. -Unlike the [reparameterization gradient](@ref repgradelbo), the score gradient does not require the target log density to be differentiable, and does not differentiate through the sampling process of the variational approximation $q$. -Instead, it only requires gradients of the log density $\log q$. 
-For this reason, the score gradient is the standard method to deal with discrete variables and targets with discrete support. - -[^G1990]: Glynn, P. W. (1990). Likelihood ratio gradient estimation for stochastic systems. Communications of the ACM, 33(10), 75-84. -[^KR1996]: Kleijnen, J. P., & Rubinstein, R. Y. (1996). Optimization and sensitivity analysis of computer simulation models by the score function method. European Journal of Operational Research, 88(3), 413-427. -[^RSU1996]: Rubinstein, R. Y., Shapiro, A., & Uryasev, S. (1996). The score function method. Encyclopedia of Management Sciences, 1363-1366. -[^W1992]: Williams, R. J. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine learning, 8, 229-256. -[^WW2013]: Wingate, D., & Weber, T. (2013). Automated variational inference in probabilistic programming. arXiv preprint arXiv:1301.1299. -[^RGB2014]: Ranganath, R., Gerrish, S., & Blei, D. (2014). Black box variational inference. In Artificial intelligence and statistics (pp. 814-822). PMLR. - In more detail, the score gradient uses the Fisher log-derivative identity: For any regular $f$, -```math -\nabla_{\lambda} \mathbb{E}_{z \sim q_{\lambda}} f -= -\mathbb{E}_{z \sim q_{\lambda}}\left[ f(z) \log q_{\lambda}(z) \right] \; . -``` - -The ELBO corresponds to the case where $f = \log \pi / \log q$, where $\log \pi$ is the target log density. - -Instead of implementing the canonical score gradient, `ScoreGradELBO` uses the "VarGrad" objective[^RBNRA2020]: - -```math -\widehat{\mathrm{VarGrad}}(\lambda) -= -\mathrm{Var}\left( \log q_{\lambda}(z_i) - \log \pi\left(z_i\right) \right) \; , -``` - -where the variance is computed over the samples $z_1, \ldots, z_m \sim q_{\lambda}$. -Differentiating the VarGrad objective corresponds to the canonical score gradient combined with the "leave-one-out" control variate[^SK2014][^KvHW2019]. 
- -[^RBNRA2020]: Richter, L., Boustati, A., Nüsken, N., Ruiz, F., & Akyildiz, O. D. (2020). Vargrad: a low-variance gradient estimator for variational inference. Advances in Neural Information Processing Systems, 33, 13481-13492. -[^SK2014]: Salimans, T., & Knowles, D. A. (2014). On using control variates with stochastic approximation for variational bayes and its connection to stochastic linear regression. arXiv preprint arXiv:1401.1022. -[^KvHW2019]: Kool, W., van Hoof, H., & Welling, M. (2019). Buy 4 reinforce samples, get a baseline for free!. - Since the expectation of the `VarGrad` objective (not its gradient) is not exactly the ELBO, we separately obtain an unbiased estimate of the ELBO to be returned by [`estimate_objective`](@ref). -## `ScoreGradELBO` - -```@docs -ScoreGradELBO -```