# Using autodiff to check gradient/Hessians

In [1]:
using Revise
using DataFrames, Random, GLM, QuasiCopula
using ForwardDiff, Test, LinearAlgebra
using LinearAlgebra: BlasReal, copytri!
using ToeplitzMatrices
using BenchmarkTools
using SnpArrays
using ForwardDiff
# using MendelPlots
ENV["COLUMNS"] = 240

BLAS.set_num_threads(1)
Threads.nthreads()

# simulate PLINK data where each SNP has at least min_ma copies of the alt allele
function simulate_random_snparray(s::Union{String, UndefInitializer}, n::Int64,
        p::Int64, mafs::Vector{Float64}; min_ma::Int = 1)
    all(0.0 .<= mafs .<= 0.5) || throw(ArgumentError("vector of minor allele frequencies must be in (0, 0.5)"))
#     any(mafs .<= 0.0005) && @warn("Provided minor allele frequencies contain entries smaller than 0.0005, simulation may take long if sample size is small and min_ma = $min_ma is large")

    #first simulate a random {0, 1, 2} matrix with each SNP drawn from Binomial(2, r[i])
    A1 = BitArray(undef, n, p) 
    A2 = BitArray(undef, n, p) 
    for j in 1:p
        minor_alleles = 0
        maf = mafs[j]
        while minor_alleles <= min_ma
            for i in 1:n
                A1[i, j] = rand(Bernoulli(maf))
                A2[i, j] = rand(Bernoulli(maf))
            end
            minor_alleles = sum(view(A1, :, j)) + sum(view(A2, :, j))
        end
    end

    #fill the SnpArray with the corresponding x_tmp entry
    return _make_snparray(s, A1, A2)
end


# function simulate_random_snparray(s::Union{String, UndefInitializer}, n::Int64,
#     p::Int64; mafs::Vector{Float64}=zeros(Float64, p), min_ma::Int = 5)

#     #first simulate a random {0, 1, 2} matrix with each SNP drawn from Binomial(2, r[i])
#     A1 = BitArray(undef, n, p) 
#     A2 = BitArray(undef, n, p) 
#     for j in 1:p
#         minor_alleles = 0
#         maf = 0
#         while minor_alleles <= min_ma
#             maf = 0.5rand()
#             for i in 1:n
#                 A1[i, j] = rand(Bernoulli(maf))
#                 A2[i, j] = rand(Bernoulli(maf))
#             end
#             minor_alleles = sum(view(A1, :, j)) + sum(view(A2, :, j))
#         end
#         mafs[j] = maf
#     end

#     #fill the SnpArray with the corresponding x_tmp entry
#     return _make_snparray(s, A1, A2)
# end

function _make_snparray(s::Union{String, UndefInitializer}, A1::BitArray, A2::BitArray)
    n, p = size(A1)
    x = SnpArray(s, n, p)
    for i in 1:(n*p)
        c = A1[i] + A2[i]
        if c == 0
            x[i] = 0x00
        elseif c == 1
            x[i] = 0x02
        elseif c == 2
            x[i] = 0x03
        else
            throw(MissingException("matrix shouldn't have missing values!"))
        end
    end
    return x
end

┌ Info: Precompiling QuasiCopula [c47b6ae2-b804-4668-9957-eb588c99ffbc]
└ @ Base loading.jl:1423


_make_snparray (generic function with 1 method)

In [2]:
function simulate_VC_longitudinal(;
    n = 1000, # sample size
    d_min = 1, # min number of observations per sample
    d_max = 5, # max number of observations per sample
    p = 3, # number of nongenetic covariates, including intercept
    m = 1, # number of variance components
    q = 1000, # number of SNPs
    k = 10, # number of causal SNPs
    maf = 0.5rand(),
    causal_snp_β = 0.5rand(),
    seed = 2022,
    y_distribution = Bernoulli,
    T = Float64,
    )
    Random.seed!(seed)
    m == 1 || m == 2 || error("m (number of VC) must be 1 or 2")
    
    # non-genetic effect sizes
    Random.seed!(seed)
#     βtrue = [1.0; rand(-0.05:0.1:0.05, p-1)]
    βtrue = [1.0; rand(-0.5:1:0.5, p-1)]
#     βtrue = [1.0; rand(-5:10:5, p-1) .* rand(Uniform(0, 5), p-1)]
    dist = y_distribution()
    link = canonicallink(dist)
    Dist = typeof(dist)
    Link = typeof(link)

    # variance components
    θtrue = fill(0.1, m)

    # simulate (nongenetic) design matrices
    Random.seed!(seed)
    X_full = Matrix{Float64}[]
    for i in 1:n
        nobs = rand(d_min:d_max) # number of obs for this sample
        push!(X_full, hcat(ones(nobs), randn(nobs, p - 1)))
    end
    
    # simulate causal alleles
    Random.seed!(seed)
    γtrue = zeros(q)
#     γtrue[1:k] .= rand([-0.2, 0.2], k)
    γtrue[1:k] .= causal_snp_β
    shuffle!(γtrue)
    
    # set minor allele freq
    mafs = fill(maf, q)
    
    # simulate random SnpArray with q SNPs with prespecified maf
    Random.seed!(seed)
    G = simulate_random_snparray(undef, n, q, mafs)
    Gfloat = convert(Matrix{T}, G, center=true, scale=false)
    
    # effect of causal alleles
    η_G = Gfloat * γtrue

    # simulate phenotypes
    if y_distribution == Normal
        τtrue = 10.0
        σ2 = inv(τtrue)
        σ = sqrt(σ2)
        obs = Vector{GaussianCopulaVCObs{T}}(undef, n)
        for i in 1:n
            # data matrix
            X = X_full[i]
            η = X * βtrue
            η .+= η_G[i] # add genetic effects
            μ = GLM.linkinv.(link, η)
            vecd = Vector{ContinuousUnivariateDistribution}(undef, size(X, 1))
            # VC matrices
            V1 = ones(size(X, 1), size(X, 1))
            V2 = Matrix(I, size(X, 1), size(X, 1))
            Γ = m == 1 ? θtrue[1] * V1 : θtrue[1] * V1 + θtrue[2] * V2
            for i in 1:size(X, 1)
                vecd[i] = y_distribution(μ[i], σ)
            end
            nonmixed_multivariate_dist = NonMixedMultivariateDistribution(vecd, Γ)
            # simuate single vector y
            y = Vector{T}(undef, size(X, 1))
            res = Vector{T}(undef, size(X, 1))
            rand(nonmixed_multivariate_dist, y, res)
            V = m == 1 ? [V1] : [V1, V2]
            obs[i] = GaussianCopulaVCObs(y, X, V)
        end
        qc_model = GaussianCopulaVCModel(obs)
    else
        obs = Vector{GLMCopulaVCObs{T, Dist, Link}}(undef, n)
        for i in 1:n
            # data matrix
            X = X_full[i]
            η = X * βtrue
            η .+= η_G[i] # add genetic effects
            μ = GLM.linkinv.(link, η)
            # VC matrices
            V1 = ones(size(X, 1), size(X, 1))
            V2 = Matrix(I, size(X, 1), size(X, 1))
            Γ = m == 1 ? θtrue[1] * V1 : θtrue[1] * V1 + θtrue[2] * V2
            vecd = Vector{DiscreteUnivariateDistribution}(undef, size(X, 1))
            for i in 1:size(X, 1)
                vecd[i] = y_distribution(μ[i])
            end
            nonmixed_multivariate_dist = NonMixedMultivariateDistribution(vecd, Γ)
            # simuate single vector y
            y = Vector{T}(undef, size(X, 1))
            res = Vector{T}(undef, size(X, 1))
            rand(nonmixed_multivariate_dist, y, res)
            V = m == 1 ? [V1] : [V1, V2]
            obs[i] = GLMCopulaVCObs(y, X, V, dist, link)
        end
        qc_model = GLMCopulaVCModel(obs)
    end
    return qc_model, G, βtrue, θtrue, γtrue
end

k = 0 # number of causal SNPs
maf = 0.3

qc_model, G, βtrue, θtrue, γtrue = simulate_VC_longitudinal(
    n = 5000, # sample size
    d_min = 5, # min number of observations per sample
    d_max = 5, # max number of observations per sample
    p = 3, # number of fixed effects, including intercept
    m = 2, # number of variance components
    q = 1000, # number of SNPs
    k = k, # number of causal SNPs
    seed = 123,
    y_distribution = Bernoulli,
    T = Float64,
    maf = maf,
    causal_snp_β = 0.2
)

@show qc_model;

qc_model = Quasi-Copula Variance Component Model
  * base distribution: Bernoulli
  * link function: LogitLink
  * number of clusters: 5000
  * cluster size min, max: 5, 5
  * number of variance components: 2
  * number of fixed effects: 3



In [3]:
@time optm = QuasiCopula.fit!(qc_model,
    Ipopt.IpoptSolver(
        print_level = 5, 
        tol = 10^-6, 
        max_iter = 100,
        accept_after_max_steps = 4,
        warm_start_init_point="yes", 
        limited_memory_max_history = 6, # default value
        hessian_approximation = "limited-memory",
#         derivative_test="second-order"
    )
);


******************************************************************************
This program contains Ipopt, a library for large-scale nonlinear optimization.
 Ipopt is released as open source code under the Eclipse Public License (EPL).
         For more information visit https://github.com/coin-or/Ipopt
******************************************************************************

This is Ipopt version 3.13.4, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Number of nonzeros in equality constraint Jacobian...:        0
Number of nonzeros in inequality constraint Jacobian.:        0
Number of nonzeros in Lagrangian Hessian.............:        0

Total number of variables............................:        5
                     variables with only lower bounds:        2
                variables with lower and upper bounds:        0
                     variables with only upper bounds:        0
Total number of equal

In [4]:
@show βtrue
@show qc_model.β
@show qc_model.∇β

@show θtrue
@show qc_model.θ
@show qc_model.∇θ;

βtrue = [1.0, 0.5, 0.5]
qc_model.β = [1.0091896288240927, 0.49068185517601387, 0.5016364218552014]
qc_model.∇β = [3.3382613090493507e-7, -8.106559831189131e-7, -1.3638758370460025e-6]
θtrue = [0.1, 0.1]
qc_model.θ = [0.10529641840791103, 0.11976101213786954]
qc_model.∇θ = [-6.708739634930794e-7, -1.8532789480829592e-7]


## Is $\nabla_\beta res$ calculated correctly? 

We can check using ForwardDiff

The function is 

$$res_{ij}(\beta) = \frac{y_{ij} - \mu_{ij}}{\sqrt{\sigma_{ij}^2(\beta)}}$$

### Normal

Assumes y, X are given. We calculate the residuals for just 1 sample

In [11]:
# sample data
X = qc_model.data[1].X # d by p
y = qc_model.data[1].y # d by 1

# objective
function resβ(β)
    η = X * β # d by 1
    μ = GLM.linkinv.(IdentityLink(), η)
    varμ = GLM.glmvar.(Normal(), μ)
    return (y - μ) ./ sqrt.(varμ) # d by 1
end

# mathematical gradient
function ∇resβ(β)
    d, p = size(X)
    ∇resβ = zeros(d, p)
    for i in 1:p, j in 1:d
        ∇resβ[j, i] = -X[j, i]
    end
    return ∇resβ # d × p
end

# autodiff gradient
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)

# random beta vector
β = rand(size(qc_model.data[1].X, 2))

# check objective
@show resβ(β)

# compare mathematical and numerical gradient
[vec(∇resβ(β)) vec(∇resβ_autodiff(β))]

resβ(β) = [0.1296459602031359, -1.5158332856428878, 0.19631145529958616, -0.03059313160267374, -0.8880321856615292]


15×2 Matrix{Float64}:
 -1.0        -1.0
 -1.0        -1.0
 -1.0        -1.0
 -1.0        -1.0
 -1.0        -1.0
  2.07458     2.07458
 -1.94686    -1.94686
  0.0808759   0.0808759
  0.154606    0.154606
 -0.931964   -0.931964
 -2.26098    -2.26098
 -1.19819    -1.19819
  0.0763038   0.0763038
 -0.470584   -0.470584
 -1.03039    -1.03039

### Bernoulli

In [20]:
# sample data
X = qc_model.data[1].X # d by p
y = qc_model.data[1].y # d by 1

# objective
function resβ(β)
    η = X * β # d by 1
    μ = GLM.linkinv.(LogitLink(), η)
    varμ = GLM.glmvar.(Bernoulli(), μ)
    return (y - μ) ./ sqrt.(varμ) # d by 1
end

# mathematical gradient
function ∇resβ(β::AbstractVector{T}) where T
    d, p = size(X)
    ∇resβ = zeros(T, d, p)
    η = X * β # d by 1
    μ = GLM.linkinv.(LogitLink(), η) # d by 1
    varμ = GLM.glmvar.(Bernoulli(), μ) # d by 1
    res = (y - μ) ./ sqrt.(varμ) # d by 1
    for i in 1:p, j in 1:d
        varμ_j = varμ[j]
        x_ji = X[j, i]
        res_j = res[j]
        μ_j = μ[j]
        ∇resβ[j, i] = -sqrt(varμ_j) * x_ji - 
            (0.5 * res_j * (1 - 2μ_j) * x_ji)
    end
    return ∇resβ # d × p
end
∇²resβ_autodiff = x -> ForwardDiff.jacobian(∇resβ, x)

# autodiff gradient
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)

# check objective
@show resβ(qc_model.β)

# compare mathematical and numerical gradient
[vec(∇resβ(qc_model.β)) vec(∇resβ_autodiff(qc_model.β))]


resβ(qc_model.β) = [0.5344439504497369, 0.02670454905299698, 0.7098215386604426, 0.44787592964791534, 0.08710902598335912]


15×2 Matrix{Float64}:
 -0.267222   -0.267222
 -0.0133523  -0.0133523
 -0.354911   -0.354911
 -0.223938   -0.223938
 -0.0435545  -0.0435545
  0.554373    0.554373
 -0.025995   -0.025995
  0.0287037   0.0287037
  0.0346223   0.0346223
 -0.0405912  -0.0405912
 -0.604183   -0.604183
 -0.0159986  -0.0159986
  0.0270811   0.0270811
 -0.105382   -0.105382
 -0.0448782  -0.0448782

### Poisson

In [20]:
# sample data
X = qc_model.data[1].X # d by p
y = qc_model.data[1].y # d by 1

# objective
function resβ(β)
    η = X * β # d by 1
    μ = GLM.linkinv.(LogLink(), η)
    varμ = GLM.glmvar.(Poisson(), μ)
    return (y - μ) ./ sqrt.(varμ) # d by 1
end

# mathematical gradient
function ∇resβ(β)
    d, p = size(X)
    ∇resβ = zeros(d, p)
    η = X * β # d by 1
    μ = GLM.linkinv.(LogLink(), η) # d by 1
    varμ = GLM.glmvar.(Poisson(), μ) # d by 1
    res = (y - μ) ./ sqrt.(varμ) # d by 1
    dμ = GLM.mueta.(LogLink(), η) # d by 1
    for i in 1:p, j in 1:d
        varμ_j = varμ[j]
        x_ji = X[j, i]
        res_j = res[j]
        μ_j = μ[j]
        dμ_j = dμ[j]
        ∇resβ[j, i] = x_ji * (-(inv(sqrt(varμ_j)) + (0.5 * inv(varμ_j)) * res_j) * dμ_j)
    end
    return ∇resβ # d × p
end

# autodiff gradient
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)

# random beta vector
β = rand(size(qc_model.data[1].X, 2))

# check objective
@show resβ(β)

# compare mathematical and numerical gradient
[vec(∇resβ(β)) vec(∇resβ_autodiff(β))]

resβ(β) = [-0.20116605438121704, 654.5724663515489, -0.04513275044062135, 3.5718353780954617, 70.68722761611653]


15×2 Matrix{Float64}:
   -1.73497      -1.73497
 -329.553      -329.553
   -1.00025      -1.00025
   -3.03142      -3.03142
  -37.1641      -37.1641
    3.59933       3.59933
 -641.594      -641.594
    0.0808965     0.0808965
    0.468677      0.468677
  -34.6356      -34.6356
   -3.92273      -3.92273
 -394.867      -394.867
    0.0763233     0.0763233
   -1.42654      -1.42654
  -38.2936      -38.2936

## Check $\nabla_\beta L$

In [192]:
function A_mul_b!(c::AbstractVector{T}, A::AbstractMatrix, b::AbstractVector) where T
    n, p = size(A)
    fill!(c, zero(T))
    for j in 1:p, i in 1:n
        c[i] += A[i, j] * b[j]
    end
    return c
end

function loglikelihood(
    β::AbstractVector{T}, 
    qc_model::Union{GLMCopulaVCModel, NBCopulaVCModel}
    ) where T
    θ = qc_model.θ
    # allocate vector of type T
    n, p = size(qc_model.data[1].X)
    η = zeros(T, n)
    μ = zeros(T, n)
    varμ = zeros(T, n)
    res = zeros(T, n)
    storage_n = zeros(T, n)
    q = zeros(T, length(θ))
    logl = zero(T)
    for (i, gc) in enumerate(qc_model.data)
        X = gc.X
        y = gc.y
        n, p = size(X)
        # update_res! step (need to avoid BLAS)
        A_mul_b!(η, X, β)
        for i in 1:gc.n
            μ[i] = GLM.linkinv(gc.link, η[i])
            varμ[i] = GLM.glmvar(gc.d, μ[i]) # Note: for negative binomial, d.r is used
#             dμ[i] = GLM.mueta(gc.link, η[i])
#             w1[i] = dμ[i] / varμ[i]
#             w2[i] = w1[i] * dμ[i]
            res[i] = y[i] - μ[i]
        end
        # standardize_res! step
        for j in eachindex(y)
            res[j] /= sqrt(varμ[j])
        end
        # std_res_differential! step (this will compute ∇resβ)
#         for i in 1:gc.p
#             for j in 1:gc.n
#                 ∇resβ[j, i] = -sqrt(varμ[j]) * X[j, i] - (0.5 * res[j] * (1 - (2 * μ[j])) * X[j, i])
#             end
#         end
        # update Γ
        @inbounds for k in 1:gc.m
            A_mul_b!(storage_n, gc.V[k], res)
            q[k] = dot(res, storage_n) / 2 # q[k] = 0.5 r' * V[k] * r (update variable b for variance component model)
        end
        # component_loglikelihood
        for j in 1:gc.n
            logl += QuasiCopula.loglik_obs(gc.d, y[j], μ[j], 1.0, 1.0)
        end
        tsum = dot(θ, gc.t)
        logl += -log(1 + tsum)
        qsum  = dot(θ, q) # qsum = 0.5 r'Γr
        logl += log(1 + qsum)
    end
    return logl
end

function loglikelihood(
    β::AbstractVector{T}, 
    gcm::GaussianCopulaVCModel
    ) where T
    θ = gcm.θ
    τ = gcm.τ[1]
    # allocate vector of type T
    n, p = size(gcm.data[1].X)
    μ = zeros(T, n)
    res = zeros(T, n)
    storage_n = zeros(T, n)
    q = zeros(T, length(θ))
    logl = zero(T)
    for gc in gcm.data
        X = gc.X
        y = gc.y
        n, p = size(X)
        sqrtτ = sqrt(abs(τ))
        # update_res! step (need to avoid BLAS)
        A_mul_b!(μ, X, β)
        for i in 1:gc.n
            res[i] = y[i] - μ[i]
        end
        # standardize_res! step
        res .*= sqrtτ
        rss  = abs2(norm(res)) # RSS of standardized residual
        tsum = dot(θ, gc.t) # ben: why is there abs here?
        logl += - log(1 + tsum) - (gc.n * log(2π) -  gc.n * log(abs(τ)) + rss) / 2
        # update Γ
        @inbounds for k in 1:gc.m
            A_mul_b!(storage_n, gc.V[k], res)
            q[k] = dot(res, storage_n) / 2 # q[k] = 0.5 r' * V[k] * r (update variable b for variance component model)
        end
        qsum  = dot(θ, q)
        logl += log(1 + qsum)
    end
    return logl
end

# sample data
autodiff_loglikelihood(β) = loglikelihood(β, qc_model)

# function grad_loglikelihood(
#     β::AbstractVector{T}, 
#     qc_model::Union{GLMCopulaVCModel, NBCopulaVCModel}
#     ) where T
#     β = qc_model.β
#     n, p = size(qc_model.data[1].X)
#     η = zeros(T, n)
#     μ = zeros(T, n)
#     varμ = zeros(T, n)
#     res = zeros(T, n)
#     w1 = zeros(T, n)
#     dμ = zeros(T, n)
#     storage_n = zeros(T, n)
#     out = zeros(length(β))
#     for gc in qc_model.data
#         X = gc.X
#         y = gc.y
#         n, p = size(X)
#         # update_res! step
#         A_mul_b!(η, X, β)
#         for i in 1:gc.n
#             μ[i] = GLM.linkinv(gc.link, η[i])
#             varμ[i] = GLM.glmvar(gc.d, μ[i]) # Note: for negative binomial, d.r is used
#             dμ[i] = GLM.mueta(gc.link, η[i])
#             w1[i] = dμ[i] / varμ[i]
#             res[i] = y[i] - μ[i]
#         end
#         # GLM gradient
#         out += X' * Diagonal(w1) * res
#         # 2nd gradient term
#         res ./= sqrt.(varμ)
        
#     end
#     return out
# end

autodiff_loglikelihood (generic function with 1 method)

In [205]:
autodiff_loglikelihood(qc_model.β), loglikelihood!(qc_model, true, false)

(-8635.392759793393, -8635.39275979312)

Beta is $\pm 5$

In [200]:
# autodiff Gradient
∇logl = x -> ForwardDiff.gradient(autodiff_loglikelihood, x)
∇βtrue = ∇logl(qc_model.β)

3-element Vector{Float64}:
 9.659198719758066e-6
 2.194933313059977e-5
 8.53400730504994e-6

In [201]:
# gradient from math
loglikelihood!(qc_model, true, false)
∇βobs = qc_model.∇β

3-element Vector{Float64}:
 3.149173630434543e-5
 7.270955512718447e-5
 6.383652480647373e-5

Beta is $\pm 2$

In [7]:
# autodiff Gradient
∇logl = x -> ForwardDiff.gradient(autodiff_loglikelihood, x)
∇βtrue = ∇logl(qc_model.β)

3-element Vector{Float64}:
 -2.2863324753430447e-6
 -7.904796714974793e-7
  2.0348114821988617e-6

In [8]:
# gradient from math
loglikelihood!(qc_model, true, false)
∇βobs = qc_model.∇β

3-element Vector{Float64}:
 -2.286331556300425e-6
 -7.904736671893176e-7
  2.0348091102906363e-6

In [13]:
# autodiff Gradient
∇logl = x -> ForwardDiff.gradient(autodiff_loglikelihood, x)
∇βtrue = ∇logl(qc_model.β)

3-element Vector{Float64}:
 -2.2863324753430447e-6
 -7.904796714974793e-7
  2.0348114821988617e-6

In [14]:
# gradient from math
loglikelihood!(qc_model, true, false)
∇βobs = qc_model.∇β

3-element Vector{Float64}:
 -2.286331556300425e-6
 -7.904736671893176e-7
  2.0348091102906363e-6

In [None]:
# my naive gradient (does not work)
# function compute_∇resβ(β, X, y, dist, link)
#     d, p = size(X)
#     ∇resβ = zeros(d, p)
#     η = X * β # d by 1
#     μ = GLM.linkinv.(link, η) # d by 1
#     varμ = GLM.glmvar.(dist, μ) # d by 1
#     res = (y - μ) ./ sqrt.(varμ) # d by 1
#     for i in 1:p, j in 1:d
#         varμ_j = varμ[j]
#         x_ji = X[j, i]
#         res_j = res[j]
#         μ_j = μ[j]
#         ∇resβ[j, i] = -sqrt(varμ_j) * x_ji - 
#             (0.5 * res_j * (1 - 2μ_j) * x_ji)
#     end
#     return ∇resβ # d × p
# end
# function grad_logl_sample_i(dist, link, Γ, X, y, β)
#     η = X*β
#     μ = GLM.linkinv.(link, η)
#     varμ = GLM.glmvar.(dist, μ)
#     res = (y .- μ) ./ sqrt.(varμ)
#     denom = 1 + 0.5 * (res' * Γ * res)
#     ∇resβ = compute_∇resβ(β, X, y, dist, link)
#     W1 = GLM.mueta.(link, η) / GLM.glmvar.(dist, μ)
#     return X' * Diagonal(W1) * (y - μ) + ∇resβ'*Γ*res / denom
# end

# ∇β_test = zeros(3)
# for i in 1:length(qc_model.data)
#     Γ = sum(qc_model.θ .* qc_model.data[i].V)
#     X = qc_model.data[i].X
#     y = qc_model.data[i].y
#     ∇β_test += grad_logl_sample_i(Bernoulli(), LogitLink(), Γ, X, y, qc_model.β)
# end
# ∇β_test

In [14]:
# autodiff Gradient
∇logl = x -> ForwardDiff.gradient(autodiff_loglikelihood, x)
∇βtrue = ∇logl(qc_model.β)

3-element Vector{Float64}:
  8.57327120407092e-8
  2.7007899694453386e-8
 -1.1578850189764012e-7

In [15]:
# gradient from math
loglikelihood!(qc_model, true, false)
∇βobs = qc_model.∇β

3-element Vector{Float64}:
  8.528801986873447e-8
  2.6077807413482645e-8
 -1.1694493039574039e-7

## Check $\nabla_\beta^2 L$

Hessians for a single observation seems to differ quite a bit

In [28]:
function two_term_Hessian(gcm::Union{GLMCopulaVCModel, NBCopulaVCModel})
    p = length(gcm.β)
    T = eltype(gcm.β)
    H = zeros(T, p, p)
    for gc in gcm.data
        d = gc.n # number of observations for current sample
        # GLM term
        H -= Transpose(gc.X) * Diagonal(gc.w2) * gc.X
        # trailing terms
        res = gc.res # d × 1 standardized residuals
        ∇resβ = gc.∇resβ # d × p
        Γ = zeros(T, d, d)
        for k in 1:gc.m # loop over variance components
            Γ .+= gcm.θ[k] .* gc.V[k]
        end
        denom = abs2(1 + 0.5 * (res' * Γ * res))
        H -= (∇resβ' * Γ * res) * (∇resβ' * Γ * res)' / denom
    end
    return H
end

function two_term_Hessian(gcm::GaussianCopulaVCModel)
    p = length(gcm.β)
    T = eltype(gcm.β)
    H = zeros(T, p, p)
    for gc in gcm.data
        d = gc.n # number of observations for current sample
        # GLM term
        H -= Transpose(gc.X) * gc.X
        # trailing terms
        res = gc.res # d × 1 standardized residuals
        ∇resβ = -sqrt(gcm.τ[1]) .* gc.X # d × p
        Γ = zeros(T, d, d)
        for k in 1:gc.m # loop over variance components
            Γ .+= gcm.θ[k] .* gc.V[k]
        end
        denom = abs2(1 + 0.5 * (res' * Γ * res))
        H -= (∇resβ' * Γ * res) * (∇resβ' * Γ * res)' / denom
    end
    return H
end

function three_term_hessian(qc_model::Union{GLMCopulaVCModel, NBCopulaVCModel})
#     # sarah's implementation
#     loglikelihood!(qc_model, true, true)
#     return qc_model.Hβ
#     @show qc_model.Hβ
    p = length(qc_model.β)
    T = eltype(qc_model.β)
    H = zeros(T, p, p)
    for gc in qc_model.data
        d = gc.n # number of observations for current sample
        # GLM term
        H -= Transpose(gc.X) * Diagonal(gc.w2) * gc.X
        # 2nd term
        res = gc.res # d × 1 standardized residuals
        ∇resβ = gc.∇resβ # d × p
        Γ = zeros(T, d, d)
        for k in 1:gc.m # loop over variance components
            Γ .+= qc_model.θ[k] .* gc.V[k]
        end
        denom = 1 + 0.5 * (res' * Γ * res)
        H -= (∇resβ' * Γ * res) * (∇resβ' * Γ * res)' / denom^2
        # 3rd term
        H += (∇resβ' * Γ * ∇resβ) / denom
    end
    return H
end

# autodiff ∇²resβ (giving some kind of tensor)
∇²resβ_autodiff = x -> ForwardDiff.jacobian(∇resβ, x)

# this is d²rᵢₖ(β) needed for computing the 4th hessian term
function r_ik(β, k)
    res = resβ(β)
    return res[k]
end
r_ik(β) = r_ik(β, k)
∇²r_ik = x -> ForwardDiff.hessian(r_ik, x)

function full_hessian(qc_model::Union{GLMCopulaVCModel, NBCopulaVCModel})
    p = length(qc_model.β)
    T = eltype(qc_model.β)
    H = zeros(T, p, p)    
    # loop over samples
    for (i, gc) in enumerate(qc_model.data)
        d = gc.n # number of observations for current sample
        # GLM term
        H -= Transpose(gc.X) * Diagonal(gc.w2) * gc.X
        # 2nd term
        res = gc.res # d × 1 standardized residuals
        ∇resβ = gc.∇resβ # d × p
        Γ = zeros(T, d, d)
        for k in 1:gc.m # loop over variance components
            Γ .+= qc_model.θ[k] .* gc.V[k]
        end
        denom = 1 + 0.5 * (res' * Γ * res)
        H -= (∇resβ' * Γ * res) * (∇resβ' * Γ * res)' / denom^2
        # 3rd term
        H += (∇resβ' * Γ * ∇resβ) / denom
        # 4th term
        ek = zeros(d)    
        for k in 1:d
            # somehow need to define autodiff functions here, or else k is treated as fixed
            function resβ(β)
                η = X * β # d by 1
                μ = GLM.linkinv.(LogitLink(), η)
                varμ = GLM.glmvar.(Bernoulli(), μ)
                return (y - μ) ./ sqrt.(varμ) # d by 1
            end
            r_ik(β, k) = resβ(β)[k]
            r_ik(β) = r_ik(β, k)
            ∇²r_ik = x -> ForwardDiff.hessian(r_ik, x) 
            
            
            fill!(ek, 0)
            ek[k] = 1
            X = gc.X
            y = gc.y
#             @show ∇²r_ik(qc_model.β)
            H += (ek' * Γ * res * ∇²r_ik(qc_model.β)) / denom
        end
    end
    return H
end

full_hessian (generic function with 1 method)

In [29]:
# directly evaluating ∇²resβ (giving some kind of tensor)
Hββ = ∇²resβ_autodiff(qc_model.β)

15×3 Matrix{Float64}:
 -0.253406    0.525711    -0.572947
 -0.279741   -0.544617    -0.335183
 -0.263081    0.0212769    0.0200741
 -0.263562    0.0407485   -0.124028
 -0.272238   -0.253716    -0.280512
  0.525711   -1.09063      1.18862
 -0.544617   -1.06029     -0.652555
  0.0212769  -0.00172079  -0.00162351
  0.0407485  -0.00629998   0.0191756
 -0.253716   -0.236454    -0.261427
 -0.572947    1.18862     -1.29542
 -0.335183   -0.652555    -0.401613
  0.0200741  -0.00162351  -0.00153173
 -0.124028    0.0191756   -0.0583658
 -0.280512   -0.261427    -0.289037

In [30]:
# calculating hessian of r_ik for k = 1:p
k = 1
∇²r_ik(qc_model.β)

3×3 Matrix{Float64}:
 -0.253406   0.525711  -0.572947
  0.525711  -1.09063    1.18862
 -0.572947   1.18862   -1.29542

In [31]:
# calculating hessian of r_ik for k = 1:p
k = 2
∇²r_ik(qc_model.β)

3×3 Matrix{Float64}:
 -0.279741  -0.544617  -0.335183
 -0.544617  -1.06029   -0.652555
 -0.335183  -0.652555  -0.401613

In [33]:
# 2 term Hessian from math
two_terms_H = two_term_Hessian(qc_model)

3×3 Matrix{Float64}:
 -5800.35       28.5649     84.1043
    28.5649  -5155.84       59.5103
    84.1043     59.5103  -5164.75

In [34]:
# 3 term Hessian from math
three_term_hessian(qc_model)

3×3 Matrix{Float64}:
 -3684.56       28.4691     61.3107
    28.4691  -4268.18       48.266
    61.3107     48.266   -4284.97

In [35]:
# 3 term Hessian implemented by sarah
loglikelihood!(qc_model, true, true)
qc_model.Hβ

3×3 Matrix{Float64}:
 -3684.56       28.4691     61.3107
    28.4691  -4268.18       48.266
    61.3107     48.266   -4284.97

In [36]:
# 4 term Hessian from math
full_hessian(qc_model)

3×3 Matrix{Float64}:
 -2793.55       27.4235     44.8972
    27.4235  -3385.69       43.6875
    44.8972     43.6875  -3393.24

In [39]:
# autodiff Hessian
∇²logl = x -> ForwardDiff.hessian(
        autodiff_loglikelihood, x)
autodiff_H = ∇²logl(qc_model.β)

3×3 Matrix{Float64}:
 -2793.55       27.4235     44.8972
    27.4235  -3385.69       43.6875
    44.8972     43.6875  -3393.24

# Check $\nabla_{\theta}L$, $\nabla^2_{\theta}L$, and $\nabla_{\theta}\nabla_{\beta} L$ 

In [11]:
# Loglikelihood function friendly to autodiff
autodiff_loglikelihood(β) = QuasiCopula.loglikelihood(β, qc_model, z)

# autodiff Gradient
∇logl = x -> ForwardDiff.gradient(autodiff_loglikelihood, x)

# autodiff Hessian
∇²logl = x -> ForwardDiff.hessian(autodiff_loglikelihood, x)


#22 (generic function with 1 method)

First, check if `autodiff_loglikelihood` returns same answer as `QuasiCopula.loglikelihood!`

In [16]:
i = 1
z = convert(Vector{Float64}, @view(G[:, i]), center=true, scale=false)
fullβ = [qc_model.β; qc_model.θ; 0.0] # poisson or bernoulli
# fullβ = [qc_model.β; qc_model.θ; qc_model.τ; 0.0] # normal

@show autodiff_loglikelihood(fullβ)
@show QuasiCopula.loglikelihood!(qc_model, false, false);

autodiff_loglikelihood(fullβ) = -18033.163626812184
QuasiCopula.loglikelihood!(qc_model, false, false) = -18033.16362681217


### Check $\nabla_\theta L$

In [39]:
# autodiff (the 4th and 5th position stores the 2 gradient terms with respect to θ)
i = 5
z = convert(Vector{Float64}, @view(G[:, i]), center=true, scale=false)
fullβ = [qc_model.β; qc_model.θ; 0.0] # poisson or bernoulli
∇logl(fullβ)[4:5]

2-element Vector{Float64}:
 -1.1370986533992892e-6
  2.3139215938341664e-6

In [40]:
# mathematical formula
Ω = qc_model.data[i].V
m = length(Ω)
grad_math = zeros(m)
for i in 1:length(qc_model.data)
    r = qc_model.data[i].res
    Ω = qc_model.data[i].V
    b = [0.5r' * Ω[k] * r for k in 1:m]
    c = [0.5tr(Ω[k]) for k in 1:m]
    grad_math += b / (1 + qc_model.θ'*b) - c / (1 + qc_model.θ'*c)
end
grad_math

2-element Vector{Float64}:
 -1.137098653399271e-6
  2.3139215938342303e-6

### Check $\nabla_\theta^2 L$

In [37]:
# autodiff (the 4th and 5th position stores the 2 gradient terms with respect to θ)
i = 5
z = convert(Vector{Float64}, @view(G[:, i]), center=true, scale=false)
fullβ = [qc_model.β; qc_model.θ; 0.0] # poisson or bernoulli
∇²logl(fullβ)[4:5, 4:5]


2×2 Matrix{Float64}:
 1.25985e-15   1.0245e-15
 1.0245e-15   -9.38656e-15

In [62]:
# mathematical formula
Ω = qc_model.data[i].V
m = length(Ω)
hess_math = zeros(m, m)
for i in 1:length(qc_model.data)
    r = qc_model.data[i].res
    Ω = qc_model.data[i].V
    b = [0.5r' * Ω[k] * r for k in 1:m]
    c = [0.5tr(Ω[k]) for k in 1:m]
    hess_math += b*b' / (1 + qc_model.θ'*b)^2 - c*c' / (1 + qc_model.θ'*c)^2
end
hess_math

2×2 Matrix{Float64}:
 -1.25985e-15  -1.0245e-15
 -1.0245e-15    9.38656e-15

### Check $\nabla_\theta\nabla_\beta L$

In [44]:
# autodiff (the 4th and 5th position stores the 2 gradient terms with respect to θ)
i = 5
z = convert(Vector{Float64}, @view(G[:, i]), center=true, scale=false)
fullβ = [qc_model.β; qc_model.θ; 0.0] # poisson or bernoulli
∇²logl(fullβ)[1:3, 4:5]

3×2 Matrix{Float64}:
 -1.0073e-7   2.04979e-7
 -4.43245e-8  9.01974e-8
 -5.11492e-8  1.04085e-7

In [68]:
# mathematical formula
Ω = qc_model.data[i].V
m = length(Ω)
p = size(qc_model.data[i].X, 2)
hess_math = zeros(p, m)
for i in 1:length(qc_model.data)
    r = qc_model.data[i].res
    Ω = qc_model.data[i].V
    θ = qc_model.θ
    ∇resβ = qc_model.data[i].∇resβ
    b = [0.5r' * Ω[k] * r for k in 1:m]
    A = hcat([∇resβ' * Ω[k] * r for k in 1:m]...)
    hess_math += A ./ (1 + θ'*b) - (A*θ ./ (1 + θ'*b)^2) * b'
end
hess_math

3×2 Matrix{Float64}:
 -1.0073e-7   2.04979e-7
 -4.43245e-8  9.01974e-8
 -5.11492e-8  1.04085e-7

## Check $\frac{\partial^2\mu}{\partial \eta^2}$

In [73]:
"""
    mueta2(l::Link, η::Real)

Second derivative of the inverse link function `d^2μ/dη^2`, for link `L` at linear predictor value `η`.
I.e. derivative of the mueta function in GLM.jl
"""
function mueta2 end

mueta2(::IdentityLink, η::Real) = zero(η)
function mueta2(::LogitLink, η::Real)
    expabs = exp(-abs(η))
    denom = 1 + expabs
    return -expabs / denom^2 + 2expabs^2 / denom^3
end
mueta2(::LogLink, η::Real) = exp(η)

mueta2 (generic function with 3 methods)

In [74]:
# test mueta2 function
# l = IdentityLink()
# l = LogLink()
l = LogitLink()
η = 0.1234
μ = GLM.linkinv(l, η)

# mathematical hessian
@show mueta2(l, η)

# ForwardDiff Hessian
logit_mueta = η -> GLM.mueta(l, η)
mueta2_autodiff = x -> ForwardDiff.derivative(logit_mueta, x)
@show mueta2_autodiff(η);

mueta2(l, η) = -0.015346957645411913
mueta2_autodiff(η) = -0.015346957645411843


## Check $\frac{\partial\sigma^2}{\partial \mu}$

In [75]:
"""
    sigmaeta(D::Distribution, μ::Real)

Computes dσ²/dμ
"""
function sigmamu end

sigmamu(::Normal, μ::Real) = zero(μ)
sigmamu(::Bernoulli, μ::Real) = one(μ) - 2μ
sigmamu(::Poisson, μ::Real) = one(μ)

sigmamu (generic function with 3 methods)

## Check $\frac{\partial^2(\sigma^2)}{\partial \mu^2}$

In [76]:
"""
    sigmaμ2(D::Distribution, μ::Real)

Computes d²σ²/dμ²
"""
function sigmamu2 end

sigmamu2(::Normal, μ::Real) = zero(μ)
sigmamu2(::Bernoulli, μ::Real) = -2
sigmamu2(::Poisson, μ::Real) = zero(μ)

sigmamu2 (generic function with 3 methods)

## Check $\nabla \mu$, $\nabla^2 \mu$, $\nabla \sigma^2$ and $\nabla^2 \sigma^2$

In [77]:
"""
    ∇²μ_j(l::Link, Xi::Matrix, β::Vector, j)

Computes the Hessian of the mean function with respect to β for sample i (Xi) at time j
"""
function ∇²μ_j(l::Link, Xi::Matrix, β::Vector, j)
    xj = Xi[j, :]
    ηj = dot(xj, β)
    d²μdη² = mueta2(l, ηj)
    return d²μdη² * xj * xj'
end

# objective 
function eval_μj(X, β, link, j)
    η = X*β
    μj = GLM.linkinv(link, η[j])
    return μj
end

# autodiff hessian
eval_μj(β) = eval_μj(X, β, link, j)
∇²μ_j_autodiff = x -> ForwardDiff.hessian(eval_μj, x)

# data
link = LogitLink()
X = qc_model.data[1].X
β = qc_model.β
j = 1
@show eval_μj(X, β, link, j)

# compare autodiff and mathematical result
math_result = ∇²μ_j(link, X, β, j)
autodiff_result = ∇²μ_j_autodiff(β)

[vec(math_result) vec(autodiff_result)]

eval_μj(X, β, link, j) = 0.5614464525554457


9×2 Matrix{Float64}:
 -0.0302592   -0.0302592
  0.033953     0.033953
  0.0127624    0.0127624
  0.033953     0.033953
 -0.0380978   -0.0380978
 -0.0143203   -0.0143203
  0.0127624    0.0127624
 -0.0143203   -0.0143203
 -0.00538278  -0.00538278

In [78]:
"""
    ∇²σ²_j(d::Distribution, l::Link, Xi::Matrix, β::Vector, j)

Computes the Hessian of the σ^2 function with respect to β for sample i (Xi) at time j
"""
function ∇²σ²_j(d::Distribution, l::Link, Xi::Matrix, β::Vector, j)
    xj = Xi[j, :]
    ηj = dot(xj, β)
    μj = GLM.linkinv.(l, ηj)
    c = sigmamu2(d, μj)*GLM.mueta(l, ηj)^2 + sigmamu(d, μj)*mueta2(l, ηj)
    return c * xj * xj'
end

# objective
function eval_σ2j(X, β, dist, link, j)
    η = X*β
    μ = GLM.linkinv.(link, η)
    σ2j = GLM.glmvar(dist, μ[j])
    return σ2j
end

# autodiff hessian
eval_σ2j(β) = eval_σ2j(X, β, dist, link, j)
∇²σ2j_autodiff = x -> ForwardDiff.hessian(eval_σ2j, x)

# data
dist = Bernoulli()
link = LogitLink()
X = qc_model.data[1].X
β = qc_model.β
j = 1
@show eval_σ2j(X, β, dist, link, j)

# compare autodiff and mathematical result
math_result = ∇²σ²_j(dist, link, X, β, j)
autodiff_result = ∇²σ2j_autodiff(β)

[vec(math_result) vec(autodiff_result)]

eval_σ2j(X, β, dist, link, j) = 0.24622433346835138


9×2 Matrix{Float64}:
 -0.117534   -0.117534
  0.131882    0.131882
  0.0495722   0.0495722
  0.131882    0.131882
 -0.147981   -0.147981
 -0.0556237  -0.0556237
  0.0495722   0.0495722
 -0.0556237  -0.0556237
 -0.020908   -0.020908

## Show that we can compute $∇resβ$ generally, although we don't do so in practice

In [90]:
# sample data
X = qc_model.data[1].X # d by p
y = qc_model.data[1].y # d by 1

# objective
function resβ(X, y, β)
    η = X * β # d by 1
    μ = GLM.linkinv.(LogitLink(), η)
    varμ = GLM.glmvar.(Bernoulli(), μ)
    return (y - μ) ./ sqrt.(varμ) # d by 1
end
resβ(β) = resβ(X, y, β)

# mathematical gradient
function ∇resβ(X, y, β::AbstractVector{T}) where T
    dist = Bernoulli()
    link = LogitLink()
    
    d, p = size(X)
    ∇resβ = zeros(T, d, p)
    η = X * β # d by 1
    μ = GLM.linkinv.(link, η) # d by 1
    varμ = GLM.glmvar.(dist, μ) # d by 1
    res = (y - μ) ./ sqrt.(varμ) # d by 1
    dμ = GLM.mueta.(link, η) # d by 1
    for i in 1:p, j in 1:d
        varμ_j = varμ[j]
        x_ji = X[j, i]
        res_j = res[j]
        μ_j = μ[j]
        dμ_j = dμ[j]
        dμdβ = dμ_j * x_ji
        dσ2dβ = sigmamu(dist, μ_j) * dμdβ
        # in practice, we have update_∇resβ fucntions to compute ∇resβ[j, i]
        ∇resβ[j, i] = -inv(sqrt(varμ_j)) * dμdβ - 0.5 * res_j * inv(varμ_j) * dσ2dβ
    end
    return ∇resβ # d × p
end
∇resβ(β) = ∇resβ(X, y, β)
∇²resβ_autodiff = x -> ForwardDiff.jacobian(∇resβ, x)

# autodiff gradient
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)

# compare mathematical and numerical gradient
[vec(∇resβ(qc_model.β)) vec(∇resβ_autodiff(qc_model.β))]

15×2 Matrix{Float64}:
 -0.565735   -0.565735
 -0.449521   -0.449521
 -0.736097   -0.736097
 -0.23579    -0.23579
 -0.27525    -0.27525
  0.634795    0.634795
  0.496558    0.496558
  0.306947    0.306947
 -0.0678103  -0.0678103
 -0.0632576  -0.0632576
  0.238609    0.238609
  0.609367    0.609367
  0.045592    0.045592
 -0.165939   -0.165939
 -0.0394505  -0.0394505

## Compute $∇^2resβ$: Hessian of residual vector of sample $i$ at observation $k$

In [145]:
# mathematical Hessian of residuals wrt β for sample i at time j
# note: need function sigmamu, sigmamu2, mueta2
function ∇²resβ_ij(qc_model, i, j)
    dist = Bernoulli()
    link = LogitLink()
    
    X = qc_model.data[i].X
    y = qc_model.data[i].y
    β = qc_model.β
    xj = X[j, :]
    d, p = size(X)
    
    # intermediate quantities?
    η = X * β # d by 1
    μ = GLM.linkinv.(link, η) # d by 1
    varμ = GLM.glmvar.(dist, μ) # d by 1
    res = (y - μ) ./ sqrt.(varμ) # d by 1
    invσ = inv.(sqrt.(varμ))
    ∇μ_ij  = GLM.mueta(link, η[j]) * xj
    ∇σ²_ij = sigmamu(dist, μ[j]) * GLM.mueta(link, η[j]) * xj

    # assemble 5 terms
    term1 = -invσ[j] * ∇²μ_j(link, X, β, j)
    term2 = 0.5invσ[j]^3 * ∇σ²_ij * ∇μ_ij'
    term3 = -0.5 * res[j] * inv(varμ[j]) * ∇²σ²_j(dist, link, X, β, j)
    term4 = 0.5invσ[j]^3 * ∇μ_ij * ∇σ²_ij'
    term5 = 0.75res[j] * inv(varμ[j]^2) * ∇σ²_ij * ∇σ²_ij'
    ∇²resβ_ik = term1 + term2 + term3 + term4 + term5

    return ∇²resβ_ik # p × p
end
i = 1 # sample id
j = 1 # time point
∇²resβ_ij(qc_model, 1, 1)

3×3 Matrix{Float64}:
 -0.282867   0.317398   0.119305
  0.317398  -0.356143  -0.133868
  0.119305  -0.133868  -0.0503189

In [146]:
# autodiff Hessian of residuals wrt β for sample i at time j
β = qc_model.β
# gc = qc_model.data[1]
T = eltype(β)
X = qc_model.data[i].X
y = qc_model.data[i].y

p = length(β)
d = qc_model.data[i].n
H = zeros(T, p, p)
ek = zeros(T, d)
k = 1

function resβ(X, y, β)
    η = X * β # d by 1
    μ = GLM.linkinv.(LogitLink(), η)
    varμ = GLM.glmvar.(Bernoulli(), μ)
    return (y - μ) ./ sqrt.(varμ) # d by 1
end
r_ik(β, k) = resβ(X, y, β)[k]
r_ik(β) = r_ik(β, k)
∇²r_ik = x -> ForwardDiff.hessian(r_ik, x) 
∇²r_ik(β)


3×3 Matrix{Float64}:
 -0.282867   0.317398   0.119305
  0.317398  -0.356143  -0.133868
  0.119305  -0.133868  -0.0503189

## Compute $d_\gamma d_\gamma  r_{ik}$: Hessian of residual vector of sample $i$ at observation $j$

Because $\gamma$ is a scalar, the resulting Hessian is a vector with length of $\beta$. 

In [125]:
# # math (note: this is same as dβdβ_res_ij but we just plug in z in place of xj)
# function dγdγ_res_ij(dist, link, xj, η_j, μ_j, varμ_j, res_j)
#     invσ_j = inv(sqrt(varμ_j))
#     ∇μ_ij  = GLM.mueta(link, η_j) * xj
#     ∇σ²_ij = sigmamu(dist, μ_j) * GLM.mueta(link, η_j) * xj

#     # assemble 5 terms
#     term1 = -invσ_j * QuasiCopula.∇²μ_j(link, η_j, xj)
#     term2 = 0.5invσ_j^3 * ∇σ²_ij * ∇μ_ij'
#     term3 = -0.5 * res_j * inv(varμ_j) * QuasiCopula.∇²σ²_j(dist, link, xj, μ_j, η_j)
#     term4 = 0.5invσ_j^3 * ∇μ_ij * ∇σ²_ij'
#     term5 = 0.75res_j * inv(varμ_j^2) * ∇σ²_ij * ∇σ²_ij'
#     result = term1 + term2 + term3 + term4 + term5

#     return result # 1 × 1
# end
# z = rand()
# η_j = rand()
# μ_j = rand()
# varμ_j = rand()
# res_j = rand()
# dγdγ_res_ij(Bernoulli(), LogitLink(), z, η_j, μ_j, varμ_j, res_j)

-0.09308160081535394

In [139]:
# # autodiff
# function r_j(y, X, z, β, γ, j)
#     η = X * β + z * γ
#     μ = GLM.linkinv.(LogitLink(), η)
#     varμ = GLM.glmvar.(Bernoulli(), μ)
#     res = (y - μ) ./ sqrt.(varμ)
#     return res[j]
# end
# r_j(γ) = r_j(y, X, z, β, γ, j)
# auto_dγdγ_res_ij = x -> ForwardDiff.hessian(r_j, x) 

# γ = rand()
# z = rand(5)
# r_j(γ)
# auto_dγdγ_res_ij([γ])

LoadError: MethodError: no method matching *(::Vector{Float64}, ::Vector{ForwardDiff.Dual{ForwardDiff.Tag{typeof(r_j), Float64}, ForwardDiff.Dual{ForwardDiff.Tag{typeof(r_j), Float64}, Float64, 1}, 1}})
[0mClosest candidates are:
[0m  *(::Any, ::Any, [91m::Any[39m, [91m::Any...[39m) at /Applications/Julia-1.7.app/Contents/Resources/julia/share/julia/base/operators.jl:655
[0m  *([91m::StridedMatrix{T}[39m, ::StridedVector{S}) where {T<:Union{Float32, Float64, ComplexF32, ComplexF64}, S<:Real} at /Applications/Julia-1.7.app/Contents/Resources/julia/share/julia/stdlib/v1.7/LinearAlgebra/src/matmul.jl:44
[0m  *(::StridedVecOrMat, [91m::Adjoint{<:Any, <:LinearAlgebra.LQPackedQ}[39m) at /Applications/Julia-1.7.app/Contents/Resources/julia/share/julia/stdlib/v1.7/LinearAlgebra/src/lq.jl:266
[0m  ...

## Check Q term

When $\beta$ have large effects, the Hessian with respect to beta no longer match autodiff. Lets try to reproduce this behavior

+ $Q = d_{\gamma}^2 \mathcal{L}$ is the Hessian of the loglikelihood with respect to the effect of the SNP $\gamma$. 

In [31]:

function loglikelihood_i(
    par::AbstractVector{T}, # p+m+1 × 1 where m is number of VCs, 1 is for the SNP
    qc_model::Union{GLMCopulaVCModel, NBCopulaVCModel}, # fitted null model
    z::AbstractVector, # n × 1 genotype vector
    i::Int # sample log
    ) where T
    p = qc_model.p
    m = length(par) - 1 - p
    β = [par[1:end-(m+1)]; par[end]] # nongenetic + genetic beta
    θ = par[end-m:end-1]             # vc parameters
    # allocate storage vectors of type T
    nmax = maximum(size(qc_model.data[i].X, 1) for i in 1:length(qc_model.data))
    η_store = zeros(T, nmax)
    μ_store = zeros(T, nmax)
    varμ_store = zeros(T, nmax)
    res_store = zeros(T, nmax)
    storage_n_store = zeros(T, nmax)
    Xstore = zeros(T, nmax, p+1)
    q = zeros(T, length(θ))
    logl = zero(T)
        
    gc = qc_model.data[i]
        n = size(gc.X, 1)
        X = @view(Xstore[1:n, :])
        η = @view(η_store[1:n])
        μ = @view(μ_store[1:n])
        varμ = @view(varμ_store[1:n])
        res = @view(res_store[1:n])
        storage_n = @view(storage_n_store[1:n])
        # sync nogenetic + genetic covariates
        copyto!(X, gc.X)
        X[:, end] .= z[i]
        y = gc.y
        # update_res! step (need to avoid BLAS)
        QuasiCopula.A_mul_b!(η, X, β)
        for j in 1:gc.n
            μ[j] = GLM.linkinv(gc.link, η[j])
            varμ[j] = GLM.glmvar(gc.d, μ[j]) # Note: for negative binomial, d.r is used
            # dμ[j] = GLM.mueta(gc.link, η[j])
            # w1[j] = dμ[j] / varμ[j]
            # w2[j] = w1[j] * dμ[j]
            res[j] = y[j] - μ[j]
        end
        # standardize_res! step
        for j in eachindex(y)
            res[j] /= sqrt(varμ[j])
        end
        # std_res_differential! step (this will compute ∇resβ)
        # for i in 1:gc.p
        #     for j in 1:gc.n
        #         ∇resβ[j, i] = -sqrt(varμ[j]) * X[j, i] - (0.5 * res[j] * (1 - (2 * μ[j])) * X[j, i])
        #     end
        # end
        # update Γ
        @inbounds for k in 1:gc.m
            QuasiCopula.A_mul_b!(storage_n, gc.V[k], res)
            q[k] = dot(res, storage_n) / 2 # q[k] = 0.5 r' * V[k] * r (update variable b for variance component model)
        end
        # component_loglikelihood
        for j in 1:gc.n
            logl += QuasiCopula.loglik_obs(gc.d, y[j], μ[j], one(T), one(T))
        end
        tsum = dot(θ, gc.t)
        logl += -log(1 + tsum)
        qsum  = dot(θ, q) # qsum = 0.5 r'Γr
        logl += log(1 + qsum)
#     end
    return logl
end

loglikelihood_i (generic function with 1 method)

In [109]:
function calculate_Qi(qc_model, gc::GLMCopulaVCObs, z, Γ, i)
    β = qc_model.β
    X = gc.X
    y = gc.y
    
        dist = gc.d
        d = gc.n # number of observations for current sample
        # the snp
        zi = fill(z[i], gc.n)
        # update ∇resγ
        ∇resγ = zeros(d)
        ∇resβ = gc.∇resβ # d × p
        res = gc.res # d × 1 standardized residuals
        for k in 1:d # loop over each sample's observation
            ∇resγ[k] = QuasiCopula.update_∇resβ(dist, zi[k], res[k], gc.μ[k], gc.dμ[k], gc.varμ[k])
        end
        denom = 1 + 0.5 * (res' * Γ * res)
        denom2 = abs2(denom)
    
    #
    # check dβdβ_res_ij for γ term matches autodiff
    #
    # autodiff
    auto_resβ_ij = Float64[]
    for k in 1:d
        # somehow this has to be inside the loop in order to vary the `k` parameter
        function resγ(X, y, β, z, γ)
            η = X * β + z .* γ # d by 1: z is vector of size == size(X, 1)
            μ = GLM.linkinv.(LogitLink(), η)
            varμ = GLM.glmvar.(Bernoulli(), μ)
            return (y - μ) ./ sqrt.(varμ) # d by 1
        end
        res_ik(γ, k) = resγ(X, y, β, zi, γ)[k] # residual of sample i at measurement k
        res_ik(γ) = res_ik(γ, k) 
        auto_dβdβ_res_ij = x -> ForwardDiff.hessian(res_ik, x) 
        # evaluate autodiff
        push!(auto_resβ_ij, auto_dβdβ_res_ij([0.0])[1])
    end
    @show auto_resβ_ij
    # math 
    math_resβ_ij = Float64[]
    for k in 1:d
        push!(math_resβ_ij, QuasiCopula.dβdβ_res_ij(gc.d, gc.link, z[i], gc.η[k], gc.μ[k], gc.varμ[k], res[k]))
    end
    @show math_resβ_ij
    fdsa
            
    Q = 0.0
    Q += Transpose(zi) * Diagonal(gc.w2) * zi
    Q += (∇resγ' * Γ * res) * (∇resγ' * Γ * res)' / denom2 # 2nd term
    Q -= ∇resγ' * Γ * ∇resγ / denom # 3rd term
    ek = zeros(d)
    for k in 1:d
        fill!(ek, 0)
        ek[k] = 1
        Q -= (ek' * Γ * res * QuasiCopula.dβdβ_res_ij(gc.d, gc.link, z[i], gc.η[k], gc.μ[k], gc.varμ[k], res[k])) / denom
        Q -= numer
    end
    return Q
end

calculate_Qi (generic function with 3 methods)

When $i = 1$ and $k = 2$, dβdβ_res_ij fails to match autodiff, why??

In [127]:
i = 1
k = 1
gc = qc_model.data[i]
y = gc.y
X = gc.X
z = fill(z[i], size(X, 1))
γ = [0.0]

# autodiff
function resγ(X, y, β, z, γ)
    η = X * β + z .* γ # d by 1: z is vector of size == size(X, 1)
    μ = GLM.linkinv.(LogitLink(), η)
    varμ = GLM.glmvar.(Bernoulli(), μ)
    return (y - μ) ./ sqrt.(varμ) # d by 1
end
res_ik(γ, k) = resγ(X, y, β, zi, γ)[k] # residual of sample i at measurement k
res_ik(γ) = res_ik(γ, k) 
auto_dβdβ_res_ij = x -> ForwardDiff.hessian(res_ik, x)
auto = auto_dβdβ_res_ij(γ)[1]
@show auto

# math
math = QuasiCopula.dβdβ_res_ij(gc.d, gc.link, z[i], gc.η[k], gc.μ[k], gc.varμ[k], gc.res[k])
@show math;

auto = -0.09693671578820441
η_j = 0.24703448524051816
GLM.mueta(link, η_j) = 0.24622433346835135
varμ_j = 0.24622433346835138
invσ_j = 2.0152759158475693
∇μ_ij = -0.1441397248123729
∇σ²_ij = 0.017713749524076932
term1 = 0.020897663522326825
term2 = -0.010448831761163407
term3 = -0.09254470722633086
term4 = -0.010448831761163409
term5 = -0.00439200856187351
math = -0.09693671578820436


In [128]:
i = 1
k = 2
gc = qc_model.data[i]
y = gc.y
X = gc.X
z = fill(z[i], size(X, 1))
γ = [0.0]

# autodiff
function resγ(X, y, β, z, γ)
    η = X * β + z .* γ # d by 1: z is vector of size == size(X, 1)
    μ = GLM.linkinv.(LogitLink(), η)
    varμ = GLM.glmvar.(Bernoulli(), μ)
    return (y - μ) ./ sqrt.(varμ) # d by 1
end
res_ik(γ, k) = resγ(X, y, β, zi, γ)[k] # residual of sample i at measurement k
res_ik(γ) = res_ik(γ, k) 
auto_dβdβ_res_ij = x -> ForwardDiff.hessian(res_ik, x)
auto = auto_dβdβ_res_ij(γ)[1]
@show auto

# math
math = QuasiCopula.dβdβ_res_ij(gc.d, gc.link, z[i], gc.η[k], gc.μ[k], gc.varμ[k], gc.res[k])
@show math;

auto = -0.07702396925952908
η_j = -0.21284889183092182
GLM.mueta(link, η_j) = 0.24718970323767067
varμ_j = 0.2471897032376707
invσ_j = 2.011336856888617
∇μ_ij = -0.14470485227533242
∇σ²_ij = -0.015342254457215551
term1 = 0.01806453186341636
term2 = 0.009032265931708191
term3 = -0.07788981010525632
term4 = 0.009032265931708191
term5 = -0.0025975225371817007
math = -0.04435826891560527


In [129]:
# get snp 1
snp = 1
z = convert(Vector{Float64}, @view(G[:, snp]), center=true)

# mathematical Q
math_Q = 0.0
for i in 1:length(qc_model.data)
    gc = qc_model.data[i]
    Γ = zeros(gc.n, gc.n)
    for k in 1:gc.m # loop over variance components
        Γ .+= qc_model.θ[k] .* gc.V[k]
    end
    math_Q += calculate_Qi(qc_model, gc, z, Γ, i)
end
@show math_Q

# autodiff Q
autodiff_loglikelihood(β) = QuasiCopula.loglikelihood(β, qc_model, z)
∇²logl = x -> ForwardDiff.hessian(autodiff_loglikelihood, x)
fullβ = [qc_model.β; qc_model.θ; 0.0]
Hfull = ∇²logl(fullβ)
auto_Q = -Hfull[end, end]
@show auto_Q;

auto_resβ_ij = [-0.09693671578820441, -0.07702396925952908, -0.1261277809575639, 0.04040177775644848, 0.04716318537196372]
math_resβ_ij = [-0.09693671578820436, -0.07702396925952905, -0.1261277809575638, 0.04040177775644851, 0.04716318537196369]


LoadError: UndefVarError: fdsa not defined

In [111]:
for i in 1:10
    # mathematical Q for 1 sample
    gc = qc_model.data[i]
    Γ = zeros(gc.n, gc.n)
    for k in 1:gc.m # loop over variance components
        Γ .+= qc_model.θ[k] .* gc.V[k]
    end
    Qi_math = calculate_Qi(qc_model, gc, z, Γ, i)

    # autodiff Q for 1 sample
    autodiff_loglikelihood_i(β) = loglikelihood_i(β, qc_model, z, i)
    ∇²logl_i = x -> ForwardDiff.hessian(autodiff_loglikelihood_i, x)
    fullβ = [qc_model.β; qc_model.θ; 0.0]
    Hfull_i = ∇²logl_i(fullβ)
    Qi_auto = -Hfull_i[end, end]
    
    @show Qi_math, Qi_auto, Qi_math - Qi_auto
end

auto_resβ_ij = [-0.09693671578820441, -0.07702396925952908, -0.1261277809575639, 0.04040177775644848, 0.04716318537196372]
math_resβ_ij = [-0.09693671578820436, -0.04435826891560527, -0.1261277809575638, 0.04040177775644851, 0.04716318537196369]


LoadError: UndefVarError: fdsa not defined

In [24]:
qc_model.∇β, qc_model.∇θ

([3.3382613090493507e-7, -8.106559831189131e-7, -1.3638758370460025e-6], [-6.708739634930794e-7, -1.8532789480829592e-7])

In [38]:
[qc_model.data[1].y qc_model.data[3].y qc_model.data[7].y  qc_model.data[2].y qc_model.data[4].y]

5×5 Matrix{Float64}:
 0.0  1.0  1.0  1.0  0.0
 0.0  1.0  0.0  0.0  0.0
 0.0  0.0  1.0  0.0  1.0
 1.0  1.0  0.0  1.0  1.0
 1.0  1.0  0.0  1.0  1.0

In [46]:
[qc_model.data[1].q qc_model.data[3].q qc_model.data[7].q    qc_model.data[2].q qc_model.data[4].q]

2×5 Matrix{Float64}:
 3.07676  0.960326  11.9462   0.364994  0.218971
 2.39065  2.47697    6.80771  2.01004   2.17117

In [47]:
[qc_model.data[1].t qc_model.data[3].t qc_model.data[7].t    qc_model.data[2].t qc_model.data[4].t]

2×5 Matrix{Float64}:
 2.5  2.5  2.5  2.5  2.5
 2.5  2.5  2.5  2.5  2.5

In [45]:
[qc_model.data[1].η qc_model.data[3].η qc_model.data[7].η    qc_model.data[2].η qc_model.data[4].η]

5×5 Matrix{Float64}:
  0.247034   1.93406   0.987779  2.28029     0.0868077
 -0.212849   0.658095  1.13795   0.696106    0.743739
  0.773509   0.90897   0.827584  0.00790315  1.46994
  1.50333    0.618312  1.69343   0.265664    0.875086
  1.19385   -0.239743  1.44632   1.98587     0.69093

In [39]:
[qc_model.data[1].μ qc_model.data[3].μ qc_model.data[7].μ    qc_model.data[2].μ qc_model.data[4].μ]

5×5 Matrix{Float64}:
 0.561446  0.873698  0.728649  0.907231  0.521688
 0.446988  0.658832  0.757303  0.667324  0.677813
 0.684279  0.712789  0.695844  0.501976  0.813048
 0.818071  0.649835  0.844675  0.566028  0.705803
 0.76743   0.44035   0.809431  0.879305  0.666174

In [48]:
[qc_model.data[1].res qc_model.data[3].res qc_model.data[7].res    qc_model.data[2].res qc_model.data[4].res]

5×5 Matrix{Float64}:
 -1.13147    0.380211   0.610248   0.319773  -1.04436
 -0.899043   0.719609  -1.76646   -1.41631   -1.45044
 -1.47219   -1.57536    0.661138  -1.00396    0.47952
  0.47158    0.734066  -2.33197    0.875612   0.645621
  0.5505     1.12735   -2.06093    0.370488   0.707891

In [40]:
[qc_model.data[1].dμ qc_model.data[3].dμ qc_model.data[7].dμ    qc_model.data[2].dμ qc_model.data[4].dμ]

5×5 Matrix{Float64}:
 0.246224  0.11035   0.19772   0.0841628  0.24953
 0.24719   0.224772  0.183795  0.222003   0.218383
 0.216041  0.204721  0.211645  0.249996   0.152001
 0.148831  0.22755   0.131199  0.24564    0.207645
 0.178481  0.246442  0.154253  0.106127   0.222386

In [41]:
[qc_model.data[1].varμ qc_model.data[3].varμ qc_model.data[7].varμ    qc_model.data[2].varμ qc_model.data[4].varμ]

5×5 Matrix{Float64}:
 0.246224  0.11035   0.19772   0.0841628  0.24953
 0.24719   0.224772  0.183795  0.222003   0.218383
 0.216041  0.204721  0.211645  0.249996   0.152001
 0.148831  0.22755   0.131199  0.24564    0.207645
 0.178481  0.246442  0.154253  0.106127   0.222386

In [42]:
[qc_model.data[1].w1 qc_model.data[3].w1 qc_model.data[7].w1    qc_model.data[2].w1 qc_model.data[4].w1]

5×5 Matrix{Float64}:
 1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0
 1.0  1.0  1.0  1.0  1.0

In [43]:
[qc_model.data[1].w2 qc_model.data[3].w2 qc_model.data[7].w2    qc_model.data[2].w2 qc_model.data[4].w2]

5×5 Matrix{Float64}:
 0.246224  0.11035   0.19772   0.0841628  0.24953
 0.24719   0.224772  0.183795  0.222003   0.218383
 0.216041  0.204721  0.211645  0.249996   0.152001
 0.148831  0.22755   0.131199  0.24564    0.207645
 0.178481  0.246442  0.154253  0.106127   0.222386