From 0c1ef5b027c38dcc70f056e1925ab8e73c21d6ab Mon Sep 17 00:00:00 2001
From: Spencer Lyon
Date: Mon, 25 Jan 2016 09:41:55 -0500
Subject: [PATCH] refactor ddp types

---
 src/markov/ddp.jl | 326 ++++++++++++++++++++++++++--------------------
 1 file changed, 188 insertions(+), 138 deletions(-)

diff --git a/src/markov/ddp.jl b/src/markov/ddp.jl
index 5731582c..81d6648f 100644
--- a/src/markov/ddp.jl
+++ b/src/markov/ddp.jl
@@ -31,28 +31,43 @@ DiscreteDP type for specifying paramters for discrete dynamic programming model
 - `R::Array{T,NR}` : Reward Array
 - `Q::Array{T,NQ}` : Transition Probability Array
 - `beta::Float64` : Discount Factor
+- `s_indices::Nullable{Vector{Tind}}`: State Indices. Null unless using
+  SA formulation
+- `a_indices::Nullable{Vector{Tind}}`: Action Indices. Null unless using
+  SA formulation
+- `a_indptr::Nullable{Vector{Tind}}`: Action Index Pointers. Null unless using
+  SA formulation

 ##### Returns

 - `ddp::DiscreteDP` : DiscreteDP object

 """
-
-abstract AbstractDiscreteDP{T}
-
-type DiscreteDP{T<:Real,NQ,NR,Tbeta<:Real} <: AbstractDiscreteDP{T}
-    R::Array{T,NR} #-Reward Array-#
-    Q::Array{T,NQ} #-Transition Probability Array-#
-    beta::Tbeta #-Discount Factor-#
+type DiscreteDP{T<:Real,NQ,NR,Tbeta<:Real,Tind}
+    R::Array{T,NR}                     # Reward Array
+    Q::Array{T,NQ}                     # Transition Probability Array
+    beta::Tbeta                        # Discount Factor
+    s_indices::Nullable{Vector{Tind}}  # State Indices
+    a_indices::Nullable{Vector{Tind}}  # Action Indices
+    a_indptr::Nullable{Vector{Tind}}   # Action Index Pointers

     function DiscreteDP(R::Array, Q::Array, beta::Real)
         # verify input integrity 1
-        NQ != 3 && throw(ArgumentError("Q must be 3-dimensional without s-a formulation"))
-        NR != 2 && throw(ArgumentError("R must be 2-dimensional without s-a formulation"))
+        if NQ != 3
+            msg = "Q must be 3-dimensional without s-a formulation"
+            throw(ArgumentError(msg))
+        end
+        if NR != 2
+            msg = "R must be 2-dimensional without s-a formulation"
+            throw(ArgumentError(msg))
+        end
         beta < 0 || beta >= 1 && throw(ArgumentError("beta must be [0, 1)"))
+
         # verify input integrity 2
         num_states, num_actions = size(R)
-        (size(Q) != (num_states,num_actions,num_states)) && throw(ArgumentError("shapes of R and Q must be (n,m) and (n,m,n)"))
+        if size(Q) != (num_states, num_actions, num_states)
+            throw(ArgumentError("shapes of R and Q must be (n,m) and (n,m,n)"))
+        end

         # check feasibility
         R_max = _s_wise_max(R)
@@ -63,70 +78,96 @@ type DiscreteDP{T<:Real,NQ,NR,Tbeta<:Real} <: AbstractDiscreteDP{T}
             some action: violated for state $s"))
         end

-        new(R, Q, beta)
-    end
-end
+        # here the indices and indptr are null.
+        s_indices = Nullable{Vector{Int}}()
+        a_indices = Nullable{Vector{Int}}()
+        a_indptr = Nullable{Vector{Int}}()
-type DiscreteDP_sa{T<:Real,NQ,NR,Tbeta<:Real,Tind<:Integer} <: AbstractDiscreteDP{T}
-    R::Array{T,NR} #-Reward Array-#
-    Q::Array{T,NQ} #-Transition Probability Array-#
-    beta::Tbeta #-Discount Factor-#
-    s_indices::Vector{Tind} #-State Indices-#
-    a_indices::Vector{Tind} #-Action Indices-#
-    a_indptr::Vector{Tind} #-Action Index Pointers-#
+
+        new(R, Q, beta, s_indices, a_indices, a_indptr)
+    end

-    function DiscreteDP_sa(R::Array, Q::Array, beta::Real, s_indices::Vector, a_indices::Vector)
+    function DiscreteDP(R::Array, Q::Array, beta::Real, s_indices::Vector,
+                        a_indices::Vector)
         # verify input integrity 1
-        NQ != 2 && throw(ArgumentError("Q must be 2-dimensional with s-a formulation"))
-        NR != 1 && throw(ArgumentError("R must be 1-dimensional with s-a formulation"))
+        if NQ != 2
+            throw(ArgumentError("Q must be 2-dimensional with s-a formulation"))
+        end
+        if NR != 1
+            throw(ArgumentError("R must be 1-dimensional with s-a formulation"))
+        end
         beta < 0 || beta >= 1 && throw(ArgumentError("beta must be [0, 1)"))
+
         # verify input integrity 2
         num_sa_pairs, num_states = size(Q)
-        length(R) != num_sa_pairs && throw(ArgumentError("shapes of R and Q must be (L,) and (L,n)"))
-        ([length(s_indices); length(a_indices)] != fill(num_sa_pairs,2)) && throw(ArgumentError("length of s_indices and a_indices must be equal to the number of s-a pairs"))
+        if length(R) != num_sa_pairs
+            throw(ArgumentError("shapes of R and Q must be (L,) and (L,n)"))
+        end
+        if length(s_indices) != num_sa_pairs
+            msg = "length of s_indices must be equal to the number of s-a pairs"
+            throw(ArgumentError(msg))
+        end
+        if length(a_indices) != num_sa_pairs
+            msg = "length of a_indices must be equal to the number of s-a pairs"
+            throw(ArgumentError(msg))
+        end

-        if _has_sorted_sa_indices(s_indices,a_indices)
-            a_indptr = Array(Int64, num_states+1)
-            _generate_a_indptr!(num_states, s_indices, a_indptr)
+
+        if _has_sorted_sa_indices(s_indices, a_indices)
+            a_indptr = Array(Int64, num_states+1)
+            _generate_a_indptr!(num_states, s_indices, a_indptr)
         else
-            # transpose matrix to use Julia's CSC; now rows are actions and columns are states (this is why it's called as_ptr instead of sa_ptr)
-            as_ptr = sparse(a_indices, s_indices, collect(1:num_sa_pairs))
-            a_indices = as_ptr.rowval
-            a_indptr = as_ptr.colptr
-
-            R = R[as_ptr.nzval]
-            Q = Q[as_ptr.nzval,:]
-
-            _s_indices = Array(eltype(a_indices),num_sa_pairs)
-            for i in 1:num_states, j in a_indptr[i]:(a_indptr[i+1]-1)
-                _s_indices[j] = i
-            end
-            copy!(s_indices,_s_indices)
+            # transpose matrix to use Julia's CSC; now rows are actions and
+            # columns are states (this is why it's called as_ptr not sa_ptr)
+            as_ptr = sparse(a_indices, s_indices, collect(1:num_sa_pairs))
+            a_indices = as_ptr.rowval
+            a_indptr = as_ptr.colptr
+
+            R = R[as_ptr.nzval]
+            Q = Q[as_ptr.nzval, :]
+
+            _s_indices = Array(eltype(a_indices), num_sa_pairs)
+            for i in 1:num_states, j in a_indptr[i]:(a_indptr[i+1]-1)
+                _s_indices[j] = i
+            end
+            copy!(s_indices, _s_indices)
         end

         # check feasibility
-        diff = a_indptr[2:end]-a_indptr[1:end-1]
-        if any(diff .== 0.0)
+        aptr_diff = diff(a_indptr)
+        if any(aptr_diff .== 0.0)
             # First state index such that no action is available
-            s = find(diff .== 0.0) #-Only Gives True
+            s = find(aptr_diff .== 0.0) # Only Gives True
             throw(ArgumentError("for every state at least one action must be available: violated for state $s"))
         end

+        # package into nullables before constructing type
+        s_indices = Nullable{Vector{Tind}}(s_indices)
+        a_indices = Nullable{Vector{Tind}}(a_indices)
+        a_indptr = Nullable{Vector{Tind}}(a_indptr)
+
         new(R, Q, beta, s_indices, a_indices, a_indptr)
     end
 end

-# necessary for dispatch to fill in type parameters {T,NQ,NR,Tbeta}
+# necessary for dispatch to fill in type parameters {T,NQ,NR,Tbeta}. Notice
+# that we set `Tind=Int` in this case, which is ok because the indices are null
 DiscreteDP{T,NQ,NR,Tbeta}(R::Array{T,NR}, Q::Array{T,NQ}, beta::Tbeta) =
-    DiscreteDP{T,NQ,NR,Tbeta}(R, Q, beta)
+    DiscreteDP{T,NQ,NR,Tbeta,Int}(R, Q, beta)
+
+function DiscreteDP{T,NQ,NR,Tbeta,Tind}(R::Array{T,NR}, Q::Array{T,NQ},
+                                        beta::Tbeta, s_indices::Vector{Tind},
+                                        a_indices::Vector{Tind})
+    DiscreteDP{T,NQ,NR,Tbeta,Tind}(R, Q, beta, s_indices, a_indices)
+end

-DiscreteDP{T,NQ,NR,Tbeta,Tind}(R::Array{T,NR}, Q::Array{T,NQ}, beta::Tbeta, s_indices::Vector{Tind}, a_indices::Vector{Tind}) =
-    DiscreteDP_sa{T,NQ,NR,Tbeta,Tind}(R, Q, beta, s_indices, a_indices)
+# create an alias for each of the common formulations
+typealias DDP{T,Tbeta,Tind} DiscreteDP{T,3,2,Tbeta,Tind}
+typealias DDPsa{T,Tbeta,Tind} DiscreteDP{T,2,1,Tbeta,Tind}

 #~Property Functions~#
-num_states(ddp::DiscreteDP) = size(ddp.R, 1)
-num_states(ddp::DiscreteDP_sa) = size(ddp.Q, 2)
+num_states(ddp::DDP) = size(ddp.R, 1)
+num_states(ddp::DDPsa) = size(ddp.Q, 2)
 num_actions(ddp::DiscreteDP) = size(ddp.R, 2)

 abstract DDPAlgorithm
@@ -140,7 +181,7 @@ solving the model

 ##### Parameters

-- `ddp::AbstractDiscreteDP` : DiscreteDP object
+- `ddp::DiscreteDP` : DiscreteDP object

 ##### Returns

@@ -154,8 +195,8 @@ type DPSolveResult{Algo<:DDPAlgorithm,Tval<:Real}
     sigma::Array{Int,1}
     mc::MarkovChain

-    function DPSolveResult(ddp::AbstractDiscreteDP)
-        v = s_wise_max(ddp, ddp.R) #Initialise v with v_init
+    function DPSolveResult(ddp::DiscreteDP)
+        v = s_wise_max(ddp, ddp.R) # Initialise v with v_init
         ddpr = new(v, similar(v), 0, similar(v, Int))

         # fill in sigma with proper values
@@ -164,7 +205,7 @@ type DPSolveResult{Algo<:DDPAlgorithm,Tval<:Real}
     end

     # method to pass initial value function (skip the s-wise max)
-    function DPSolveResult(ddp::AbstractDiscreteDP, v::Vector)
+    function DPSolveResult(ddp::DiscreteDP, v::Vector)
         ddpr = new(v, similar(v), 0, similar(v, Int))

         # fill in sigma with proper values
@@ -183,7 +224,7 @@ for a value function v.

 ##### Parameters

-- `ddp::AbstractDiscreteDP` : Object that contains the model parameters
+- `ddp::DiscreteDP` : Object that contains the model parameters
 - `v::Vector{T<:AbstractFloat}`: The current guess of the value function
 - `Tv::Vector{T<:AbstractFloat}`: A buffer array to hold the updated value
   function. Initial value not used and will be overwritten
@@ -195,7 +236,7 @@ for a value function v.
 - `Tv::Vector` : Updated value function vector
 - `sigma::Vector` : Updated policiy function vector
 """
-function bellman_operator!(ddp::AbstractDiscreteDP, v::Vector, Tv::Vector, sigma::Vector)
+function bellman_operator!(ddp::DiscreteDP, v::Vector, Tv::Vector, sigma::Vector)
     vals = ddp.R + ddp.beta * ddp.Q * v
     s_wise_max!(ddp, vals, Tv, sigma)
     Tv, sigma
@@ -209,7 +250,7 @@ Apply the Bellman operator using `v=ddpr.v`, `Tv=ddpr.Tv`, and `sigma=ddpr.sigma

 Updates `ddpr.Tv` and `ddpr.sigma` inplace
 """
-bellman_operator!(ddp::AbstractDiscreteDP, ddpr::DPSolveResult) =
+bellman_operator!(ddp::DiscreteDP, ddpr::DPSolveResult) =
     bellman_operator!(ddp, ddpr.v, ddpr.Tv, ddpr.sigma)

 """
@@ -221,7 +262,7 @@ corresponding policy rule

 ##### Parameters

-- `ddp::AbstractDiscreteDP`: The ddp model
+- `ddp::DiscreteDP`: The ddp model
 - `v::Vector{T<:AbstractFloat}`: The current guess of the value function.
   This array will be overwritten
 - `sigma::Vector`: A buffer array to hold the policy function. Initial
@@ -232,11 +273,10 @@ corresponding policy rule
 - `Tv::Vector`: Updated value function vector
 - `sigma::Vector{T<:Integer}`: Policy rule
 """
-bellman_operator!{T<:AbstractFloat}(ddp::AbstractDiscreteDP, v::Vector{T}, sigma::Vector) =
+bellman_operator!{T<:AbstractFloat}(ddp::DiscreteDP, v::Vector{T}, sigma::Vector) =
     bellman_operator!(ddp, v, v, sigma)

 # method to allow dispatch on rationals
-# TODO: add a test for this
 # TODO from albep: not sure how to update this to the state-action pair formulation
 function bellman_operator!{T1<:Rational,T2<:Rational,NR,NQ,T3<:Rational}(ddp::DiscreteDP{T1,NR,NQ,T2},
                                                                          v::Vector{T3},
@@ -250,14 +290,14 @@ for a given value function v.

 ##### Parameters

-- `ddp::AbstractDiscreteDP`: The ddp model
+- `ddp::DiscreteDP`: The ddp model
 - `v::Vector`: The current guess of the value function

 ##### Returns

 - `Tv::Vector` : Updated value function vector
 """
-bellman_operator(ddp::AbstractDiscreteDP, v::Vector) =
+bellman_operator(ddp::DiscreteDP, v::Vector) =
     s_wise_max(ddp, ddp.R + ddp.beta * ddp.Q * v)

 # ---------------------- #
@@ -269,7 +309,7 @@ Compute the v-greedy policy

 ##### Parameters

-- `ddp::AbstractDiscreteDP` : Object that contains the model parameters
+- `ddp::DiscreteDP` : Object that contains the model parameters
 - `ddpr::DPSolveResult` : Object that contains result variables

 ##### Returns

@@ -281,10 +321,10 @@ Compute the v-greedy policy

 modifies ddpr.sigma and ddpr.Tv in place
 """
-compute_greedy!(ddp::AbstractDiscreteDP, ddpr::DPSolveResult) =
+compute_greedy!(ddp::DiscreteDP, ddpr::DPSolveResult) =
     (bellman_operator!(ddp, ddpr); ddpr.sigma)

-function compute_greedy{TV<:Real}(ddp::AbstractDiscreteDP, v::Vector{TV})
+function compute_greedy{TV<:Real}(ddp::DiscreteDP, v::Vector{TV})
     Tv = similar(v)
     sigma = ones(Int, length(v))
     bellman_operator!(ddp, v, Tv, sigma)
@@ -300,7 +340,7 @@ Method of `evaluate_policy` that extracts sigma from a `DPSolveResult`

 See other docstring for details
 """
-evaluate_policy(ddp::AbstractDiscreteDP, ddpr::DPSolveResult) =
+evaluate_policy(ddp::DiscreteDP, ddpr::DPSolveResult) =
     evaluate_policy(ddp, ddpr.sigma)

 """
@@ -308,7 +348,7 @@ Compute the value of a policy.

 ##### Parameters

-- `ddp::AbstractDiscreteDP` : Object that contains the model parameters
+- `ddp::DiscreteDP` : Object that contains the model parameters
 - `sigma::Vector{T<:Integer}` : Policy rule vector

 ##### Returns

 - `v_sigma::Array{Float64}` : Value vector of `sigma`, of length n.
""" -function evaluate_policy{T<:Integer}(ddp::AbstractDiscreteDP, sigma::Vector{T}) +function evaluate_policy{T<:Integer}(ddp::DiscreteDP, sigma::Vector{T}) R_sigma, Q_sigma = RQ_sigma(ddp, sigma) b = R_sigma A = I - ddp.beta * Q_sigma @@ -332,7 +372,7 @@ Solve the dynamic programming problem. ##### Parameters -- `ddp::AbstractDiscreteDP` : Object that contains the Model Parameters +- `ddp::DiscreteDP` : Object that contains the Model Parameters - `method::Type{Tind2sub(size(ddp.R), x), sigma) @@ -418,9 +458,9 @@ function RQ_sigma{T<:Integer}(ddp::DiscreteDP, sigma::Array{T}) end # TODO: express it in a similar way as above to exploit Julia's column major order -function RQ_sigma{T<:Integer}(ddp::DiscreteDP_sa, sigma::Array{T}) +function RQ_sigma{T<:Integer}(ddp::DDPsa, sigma::Array{T}) sigma_indices = Array(T, num_states(ddp)) - _find_indices!(ddp.a_indices, ddp.a_indptr, sigma, sigma_indices) + _find_indices!(get(ddp.a_indices), get(ddp.a_indptr), sigma, sigma_indices) R_sigma = ddp.R[sigma_indices] Q_sigma = ddp.Q[sigma_indices, :] return R_sigma, Q_sigma @@ -430,10 +470,15 @@ end # Internal methods # # ---------------- # s_wise_max(ddp::DiscreteDP, vals::Matrix) = _s_wise_max(vals) -s_wise_max(ddp::DiscreteDP_sa, vals::Vector) = _s_wise_max!(ddp.a_indices, ddp.a_indptr, vals, Array(Float64, num_states(ddp))) +function s_wise_max(ddp::DDPsa, vals::Vector) + _s_wise_max!(get(ddp.a_indices), get(ddp.a_indptr), vals, + Array(Float64, num_states(ddp))) +end -s_wise_max!(ddp::DiscreteDP, vals::Matrix, out::Vector, out_argmax::Vector) = _s_wise_max!(vals, out, out_argmax) -s_wise_max!(ddp::DiscreteDP_sa, vals::Vector, out::Vector, out_argmax::Vector) = _s_wise_max!(ddp.a_indices, ddp.a_indptr, vals, out, out_argmax) +s_wise_max!(ddp::DiscreteDP, vals::Matrix, out::Vector, out_argmax::Vector) = + _s_wise_max!(vals, out, out_argmax) +s_wise_max!(ddp::DDPsa, vals::Vector, out::Vector, out_argmax::Vector) = + _s_wise_max!(get(ddp.a_indices), get(ddp.a_indptr), vals, out, out_argmax) """ Return the `Vector` `max_a vals(s, a)`, where `vals` is represented as a @@ -461,19 +506,20 @@ _s_wise_max!(vals::Matrix, out::Vector, out_argmax::Vector) = Populate `out` with `max_a vals(s, a)`, where `vals` is represented as a `Vector` of size `(num_sa_pairs,)`. 
""" -function _s_wise_max!(a_indices::Vector, a_indptr::Vector, vals::Vector, out::Vector) - n = length(out) - for i in 1:n - if a_indptr[i] != a_indptr[i+1] - m = a_indptr[i] - for j in a_indptr[i]+1:(a_indptr[i+1]-1) - if vals[j] > vals[m] - m = j +function _s_wise_max!(a_indices::Vector, a_indptr::Vector, vals::Vector, + out::Vector) + n = length(out) + for i in 1:n + if a_indptr[i] != a_indptr[i+1] + m = a_indptr[i] + for j in a_indptr[i]+1:(a_indptr[i+1]-1) + if vals[j] > vals[m] + m = j + end + end + out[i] = vals[m] end - end - out[i] = vals[m] end - end end """ @@ -483,20 +529,21 @@ Populate `out` with `max_a vals(s, a)`, where `vals` is represented as a Also fills `out_argmax` with the linear index associated with the indmax in each row """ -function _s_wise_max!(a_indices::Vector, a_indptr::Vector, vals::Vector, out::Vector, out_argmax::Vector) - n = length(out) - for i in 1:n - if a_indptr[i] != a_indptr[i+1] - m = a_indptr[i] - for j in a_indptr[i]+1:(a_indptr[i+1]-1) - if vals[j] > vals[m] - m = j +function _s_wise_max!(a_indices::Vector, a_indptr::Vector, vals::Vector, + out::Vector, out_argmax::Vector) + n = length(out) + for i in 1:n + if a_indptr[i] != a_indptr[i+1] + m = a_indptr[i] + for j in a_indptr[i]+1:(a_indptr[i+1]-1) + if vals[j] > vals[m] + m = j + end + end + out[i] = vals[m] + out_argmax[i] = a_indices[m] end - end - out[i] = vals[m] - out_argmax[i] = a_indices[m] end - end end """ @@ -511,21 +558,22 @@ Returns bool: Whether `s_indices` and `a_indices` are sorted. """ function _has_sorted_sa_indices(s_indices::Vector, a_indices::Vector) - L = length(s_indices) - ev = NaN - for i in 1:L-1 - ((s_indices[i] == s_indices[i+1]) && (a_indices[i] == a_indices[i+1])) && throw(ArgumentError("duplicate state-action pair")) - if s_indices[i] > s_indices[i+1] - ev = false - break - end - if (s_indices[i] == s_indices[i+1]) && (a_indices[i] > a_indices[i+1]) - ev = false - break + L = length(s_indices) + ev = true + for i in 1:L-1 + if s_indices[i] != s_indices[i+1] || a_indices[i] != a_indices[i+1] + throw(ArgumentError("duplicate state-action pair")) + end + if s_indices[i] > s_indices[i+1] + ev = false + break + end + if (s_indices[i] == s_indices[i+1]) && (a_indices[i] > a_indices[i+1]) + ev = false + break + end end - ev = true - end - ev + ev end """ @@ -541,25 +589,28 @@ s_indices : Vector{Int} out : Vector{Int} with length = num_states+1 """ function _generate_a_indptr!(num_states::Int, s_indices::Vector, out::Vector) - idx = 1 - out[1] = 1 - for s in 1:num_states-1 - while(s_indices[idx] == s) - idx += 1 + idx = 1 + out[1] = 1 + for s in 1:num_states-1 + while(s_indices[idx] == s) + idx += 1 + end + out[s+1] = idx end - out[s+1] = idx - end - out[num_states+1] = length(s_indices)+1 # need this +1 to be consistent with Julia's sparse pointers: colptr[i]:(colptr[i+1]-1) - out + # need this +1 to be consistent with Julia's sparse pointers: + # colptr[i]:(colptr[i+1]-1) + out[num_states+1] = length(s_indices)+1 + out end -function _find_indices!(a_indices::Vector, a_indptr::Vector, sigma::Vector, out::Vector) - n = length(sigma) - for i in 1:n, j in a_indptr[i]:(a_indptr[i+1]-1) - if sigma[i] == a_indices[j] - out[i] = j +function _find_indices!(a_indices::Vector, a_indptr::Vector, sigma::Vector, + out::Vector) + n = length(sigma) + for i in 1:n, j in a_indptr[i]:(a_indptr[i+1]-1) + if sigma[i] == a_indices[j] + out[i] = j + end end - end end #= @@ -578,7 +629,6 @@ function *{T}(A::Array{T,3}, v::Vector) return reshape(out, shape[1:end-1]) end - 
 Base.ind2sub(ddp::DiscreteDP, x::Vector) =
     map(_ -> ind2sub(size(ddp.R), _)[2], x)

@@ -586,7 +636,7 @@ Impliments Value Iteration
 NOTE: See `solve` for further details
 """
-function _solve!(ddp::AbstractDiscreteDP, ddpr::DPSolveResult{VFI}, max_iter::Integer,
+function _solve!(ddp::DiscreteDP, ddpr::DPSolveResult{VFI}, max_iter::Integer,
                  epsilon::Real, k::Integer)
     if ddp.beta == 0.0
         tol = Inf
     else
@@ -615,10 +665,10 @@ end
 Policy Function Iteration

 NOTE: The epsilon is ignored in this method. It is only here so dispatch can
-  go from `solve(::AbstractDiscreteDP, ::Type{Algo})` to any of the algorithms.
+  go from `solve(::DiscreteDP, ::Type{Algo})` to any of the algorithms.
 See `solve` for further details
 """
-function _solve!(ddp::AbstractDiscreteDP, ddpr::DPSolveResult{PFI}, max_iter::Integer,
+function _solve!(ddp::DiscreteDP, ddpr::DPSolveResult{PFI}, max_iter::Integer,
                  epsilon::Real, k::Integer)

     old_sigma = copy(ddpr.sigma)
@@ -643,7 +693,7 @@ midrange(x::Vector) = mean(extrema(x))
 """
 Modified Policy Function Iteration
 """
-function _solve!(ddp::AbstractDiscreteDP, ddpr::DPSolveResult{MPFI}, max_iter::Integer,
+function _solve!(ddp::DiscreteDP, ddpr::DPSolveResult{MPFI}, max_iter::Integer,
                  epsilon::Real, k::Integer)
     beta = ddp.beta
     fill!(ddpr.v, minimum(ddp.R[ddp.R .> -Inf]) / (1.0 - beta))
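
Usage sketch (not part of the patch): the snippet below shows the two constructor forms that this refactor folds into the single `DiscreteDP` type -- the dense (n, m)/(n, m, n) formulation and the state-action-pair formulation. It assumes the `solve` entry point and the `PFI` algorithm type defined elsewhere in this file keep their current interface; the rewards, transition probabilities, and index vectors are made-up illustration values for a 2-state, 2-action model with one infeasible pair.

    using QuantEcon

    beta = 0.95

    # dense formulation: R is (n, m), Q is (n, m, n); the index fields stay null
    R = [5.0 10.0; -1.0 -Inf]        # R[s, a]; -Inf marks the infeasible pair
    Q = Array(Float64, 2, 2, 2)      # Q[s, a, s']
    Q[1, 1, :] = [0.5, 0.5]
    Q[1, 2, :] = [0.0, 1.0]
    Q[2, 1, :] = [0.0, 1.0]
    Q[2, 2, :] = [0.5, 0.5]          # arbitrary row for the infeasible pair
    ddp = DiscreteDP(R, Q, beta)

    # state-action-pair formulation: same model with the infeasible pair dropped
    s_indices = [1, 1, 2]            # state of each s-a pair
    a_indices = [1, 2, 1]            # action of each s-a pair
    R_sa = [5.0, 10.0, -1.0]         # length L = 3
    Q_sa = [0.5 0.5; 0.0 1.0; 0.0 1.0]   # (L, n)
    ddp_sa = DiscreteDP(R_sa, Q_sa, beta, s_indices, a_indices)

    res = solve(ddp, PFI)            # DPSolveResult: res.v, res.sigma, res.mc
    res_sa = solve(ddp_sa, PFI)      # same call works for the s-a formulation

Both constructors now return the same parametric type, so downstream methods dispatch on the `DDP`/`DDPsa` aliases rather than on two unrelated types.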