# Using autodiff to check gradients/Hessians

In [40]:
using Revise
using DataFrames, Random, GLM, QuasiCopula
using ForwardDiff, Test, LinearAlgebra
using LinearAlgebra: BlasReal, copytri!
using ToeplitzMatrices
using BenchmarkTools
using SnpArrays
using ForwardDiff
# using MendelPlots

# restrict BLAS to a single thread (presumably so timings below are not affected
# by BLAS multithreading -- confirm intent)
BLAS.set_num_threads(1)
# number of Julia threads in this session; shown as the value of the cell
Threads.nthreads()

# Simulate an `n` × `p` SnpArray of genotypes without missing values.
#
# For every SNP `j` a minor allele frequency is drawn uniformly from [0, 0.5) and two
# allele copies per sample are drawn as independent Bernoulli(maf) variables. The SNP is
# redrawn (with a fresh frequency) until it carries strictly more than `min_ma` minor
# alleles across all samples.
#
# Arguments
#   s      -- forwarded unchanged to `_make_snparray` / `SnpArray(s, n, p)`
#   n, p   -- number of samples and number of SNPs
# Keywords
#   mafs   -- output vector; entry `j` is overwritten with the accepted frequency of SNP `j`
#   min_ma -- each SNP must have more than this many minor alleles
#
# Throws `ArgumentError` when the `min_ma` requirement cannot be met with `n` samples.
function simulate_random_snparray(s::Union{String, UndefInitializer}, n::Int64,
    p::Int64; mafs::Vector{Float64}=zeros(Float64, p), min_ma::Int = 5)

    # A SNP has at most 2n minor alleles, so the rejection loop below could never
    # terminate if 2n <= min_ma. Fail loudly instead of spinning forever.
    if p > 0 && 2n <= min_ma
        throw(ArgumentError(
            "n = $n samples give at most $(2n) minor alleles per SNP, " *
            "which can never exceed min_ma = $min_ma"))
    end

    # one bit matrix per allele copy; genotype = A1 + A2 ∈ {0, 1, 2}
    A1 = BitArray(undef, n, p)
    A2 = BitArray(undef, n, p)
    for j in 1:p
        minor_alleles = 0
        maf = 0.0 # Float64 from the start keeps `maf` type-stable
        while minor_alleles <= min_ma
            maf = 0.5rand()
            allele = Bernoulli(maf)
            for i in 1:n
                A1[i, j] = rand(allele)
                A2[i, j] = rand(allele)
            end
            minor_alleles = sum(view(A1, :, j)) + sum(view(A2, :, j))
        end
        mafs[j] = maf
    end

    # encode the allele counts into a SnpArray
    return _make_snparray(s, A1, A2)
end

# Pack two allele-indicator bit arrays into a SnpArray.
#
# `A1[i] + A2[i]` is the minor allele count of entry `i` and is stored as
#   0 -> 0x00,  1 -> 0x02,  2 -> 0x03
# The code 0x01 is never written (presumably the missing-genotype code in
# SnpArrays -- see its documentation); sums of two Bools cannot leave {0, 1, 2},
# so no "missing" branch is needed.
#
# `s` is forwarded to `SnpArray(s, n, p)`.
# Throws `DimensionMismatch` when `A1` and `A2` differ in size, because pairing
# their entries by linear index would otherwise silently combine the wrong alleles.
function _make_snparray(s::Union{String, UndefInitializer}, A1::BitArray, A2::BitArray)
    size(A1) == size(A2) || throw(DimensionMismatch(
        "A1 has size $(size(A1)) but A2 has size $(size(A2))"))
    n, p = size(A1)
    x = SnpArray(s, n, p)
    for i in eachindex(A1, A2)
        c = A1[i] + A2[i]
        x[i] = c == 0 ? 0x00 : (c == 1 ? 0x02 : 0x03)
    end
    return x
end

_make_snparray (generic function with 1 method)

In [70]:
# Simulate clustered (longitudinal) responses with a variance-component dependence
# structure, plus a random genotype matrix, and wrap the data in a quasi-copula model.
#
# Keyword arguments: see the inline comment on each default. Additionally
#   seed           -- passed to `Random.seed!` before drawing `βtrue` and again before
#                     drawing the design matrices
#   y_distribution -- response distribution type; `Normal` takes the Gaussian branch,
#                     anything else takes the GLM branch with the canonical link
#   T              -- element type of the genotype matrix, responses and residuals
#
# Returns the tuple `(qc_model, Γ, G, βtrue, θtrue, γtrue)`.
# Errors (via `error`) unless `m` is 1 or 2.
function simulate_VC_longitudinal(;
    n = 1000, # sample size
    d = 5, # number of observations per sample
    p = 3, # number of nongenetic covariates, including intercept
    m = 2, # number of variance components
    q = 1000, # number of SNPs
    k = 10, # number of causal SNPs
    seed = 2022,
    y_distribution = Bernoulli,
    T = Float64,
    )
    m == 1 || m == 2 || error("m (number of VC) must be 1 or 2")
    
    # non-genetic effect sizes, drawn uniformly from (-0.2, 0.2)
    Random.seed!(seed)
    βtrue = rand(Uniform(-0.2, 0.2), p)
    dist = y_distribution()
    link = canonicallink(dist)
    Dist = typeof(dist)
    Link = typeof(link)

    # variance components: Γ = θ₁ * (all-ones matrix) [+ θ₂ * identity when m == 2]
    θtrue = fill(0.1, m)
    V1 = ones(d, d)
    V2 = Matrix(I, d, d)
    Γ = m == 1 ? θtrue[1] * V1 : θtrue[1] * V1 + θtrue[2] * V2

    # simulate design matrices: one d × p matrix per sample, first column is the intercept
    Random.seed!(seed)
    X_full = [hcat(ones(d), randn(d, p - 1)) for i in 1:n]

    # simulate random SnpArray with q SNPs and randomly choose k SNPs to be causal
    # NOTE(review): the seed here is hard-coded to 2022 and ignores the `seed` keyword;
    # confirm this is intentional (e.g. to keep genotypes fixed across runs).
    Random.seed!(2022)
    G = simulate_random_snparray(undef, n, q)
    Gfloat = convert(Matrix{T}, G, center=true, scale=false)
    # NOTE(review): `γtrue` is always Float64, regardless of `T`
    γtrue = zeros(q)
    γtrue[1:k] .= rand([-0.2, 0.2], k) # causal effect sizes are ±0.2
    shuffle!(γtrue) # scatter the k causal SNPs over random positions
    η_G = Gfloat * γtrue # genetic contribution to the linear predictor, one value per sample

    # simulate phenotypes
    if y_distribution == Normal
        τtrue = 10.0 # precision of the Normal responses
        σ2 = inv(τtrue)
        σ = sqrt(σ2)
        obs = Vector{GaussianCopulaVCObs{T}}(undef, n)
        for i in 1:n
            X = X_full[i]
            η = X * βtrue
            η .+= η_G[i] # add genetic effects
            μ = GLM.linkinv.(link, η)
            vecd = Vector{ContinuousUnivariateDistribution}(undef, d)
            # the inner loop reuses the name `i`; Julia gives it a fresh loop variable,
            # so the outer sample index `i` is intact again after this loop
            for i in 1:d
                vecd[i] = y_distribution(μ[i], σ)
            end
            nonmixed_multivariate_dist = NonMixedMultivariateDistribution(vecd, Γ)
            # simulate single vector y
            y = Vector{T}(undef, d)
            res = Vector{T}(undef, d)
            rand(nonmixed_multivariate_dist, y, res)
            V = m == 1 ? [V1] : [V1, V2]
            obs[i] = GaussianCopulaVCObs(y, X, V)
        end
        qc_model = GaussianCopulaVCModel(obs)
    else
        obs = Vector{GLMCopulaVCObs{T, Dist, Link}}(undef, n)
        for i in 1:n
            X = X_full[i]
            η = X * βtrue
            η .+= η_G[i] # add genetic effects
            μ = GLM.linkinv.(link, η)
            # this branch stores the marginals as discrete distributions
            vecd = Vector{DiscreteUnivariateDistribution}(undef, d)
            # inner `i` is a fresh loop variable (see the Normal branch)
            for i in 1:d
                vecd[i] = y_distribution(μ[i])
            end
            nonmixed_multivariate_dist = NonMixedMultivariateDistribution(vecd, Γ)
            # simulate single vector y
            y = Vector{T}(undef, d)
            res = Vector{T}(undef, d)
            rand(nonmixed_multivariate_dist, y, res)
            V = m == 1 ? [V1] : [V1, V2]
            obs[i] = GLMCopulaVCObs(y, X, V, dist, link)
        end
        qc_model = GLMCopulaVCModel(obs)
    end
    return qc_model, Γ, G, βtrue, θtrue, γtrue
end

k = 0 # number of causal SNPs (0 leaves γtrue all zeros, i.e. no genetic effect)

# simulate a Bernoulli model with a single variance component
qc_model, Γ, G, βtrue, θtrue, γtrue = simulate_VC_longitudinal(
    n = 5000, # sample size
    d = 5, # number of observations per sample
    p = 3, # number of fixed effects, including intercept
    m = 1, # number of variance components
    q = 1000, # number of SNPs
    k = k, # number of causal SNPs
    seed = 1000,
    y_distribution = Bernoulli,
    T = Float64,
)

# print the model summary
@show qc_model;

qc_model = Quasi-Copula Variance Component Model
  * base distribution: Bernoulli
  * link function: LogitLink
  * number of clusters: 5000
  * cluster size min, max: 5, 5
  * number of variance components: 1
  * number of fixed effects: 3



In [71]:
# Fit the simulated model with Ipopt and time the call.
# NOTE(review): `Ipopt` does not appear in the `using` lines at the top of the notebook;
# presumably it is re-exported by QuasiCopula -- confirm.
@time optm = QuasiCopula.fit!(qc_model,
    Ipopt.IpoptSolver(
        print_level = 5, 
        tol = 10^-6, # evaluates to (approximately) 1e-6 as a Float64
        max_iter = 1000,
        accept_after_max_steps = 4,
        warm_start_init_point="yes", 
        limited_memory_max_history = 6, # default value
        hessian_approximation = "limited-memory", # quasi-Newton Hessian, no exact Hessian supplied
#         derivative_test="second-order"
    )
);

initializing β using Newton's Algorithm under Independence Assumption
gcm.β = [0.012856084500578055, 0.004855553143028062, 0.049012922077609024]
initializing variance components using MM-Algorithm
gcm.θ = [0.0903852936011474]
This is Ipopt version 3.13.4, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Number of nonzeros in equality constraint Jacobian...:        0
Number of nonzeros in inequality constraint Jacobian.:        0
Number of nonzeros in Lagrangian Hessian.............:        0

Total number of variables............................:        4
                     variables with only lower bounds:        1
                variables with lower and upper bounds:        0
                     variables with only upper bounds:        0
Total number of equality constraints.................:        0
Total number of inequality constraints...............:        0
        inequality constraints with only lower bounds:

In [72]:
# fixed effects: truth, estimate, and gradient at the estimate
# (the gradient should be close to zero at a stationary point)
@show βtrue
@show qc_model.β
@show qc_model.∇β

# variance components: truth, estimate, and gradient at the estimate
@show θtrue
@show qc_model.θ
@show qc_model.∇θ;

βtrue = [0.019926508247760877, 0.010309257172965214, 0.05956897900629837]
qc_model.β = [0.015567602562558885, 0.004040267621651801, 0.05181521692625306]
qc_model.∇β = [-1.1793769072454552e-8, 1.724461429208901e-8, -4.6281332875472425e-8]
θtrue = [0.1]
qc_model.θ = [0.09042550920170454]
qc_model.∇θ = [4.175548773410753e-8]


## Is $\nabla_\beta res$ calculated correctly? 

We can check using ForwardDiff

The function is 

$$res_{ij}(\beta) = \frac{y_{ij} - \mu_{ij}(\beta)}{\sqrt{\sigma_{ij}^2(\beta)}}$$

### Normal

Assume `y` and `X` are given. We calculate the residuals for just one sample.

In [2]:
# sample data: first cluster of the model
X = qc_model.data[1].X # d by p
y = qc_model.data[1].y # d by 1

# objective: standardized residuals as a function of β
# (identity link, Normal variance function)
function resβ(β)
    μ = map(η -> GLM.linkinv(IdentityLink(), η), X * β) # d by 1
    σ = map(m -> sqrt(GLM.glmvar(Normal(), m)), μ) # d by 1
    return (y - μ) ./ σ # d by 1
end

# analytic Jacobian of the standardized residuals for the Normal / identity-link
# case: every entry is simply -X[j, i] (β itself is not needed)
function ∇resβ(β)
    nobs, ncov = size(X)
    jac = zeros(nobs, ncov)
    jac .= .-X
    return jac # d × p
end

# autodiff Jacobian of the residual vector with respect to β
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)

# random beta vector (one entry per column of X)
β = rand(size(qc_model.data[1].X, 2))

# check objective
@show resβ(β)

# compare the analytic Jacobian (left column) with the autodiff Jacobian (right column);
# both are flattened in column-major order by `vec`
[vec(∇resβ(β)) vec(∇resβ_autodiff(β))]

resβ(β) = [-1.5263015405222384, -2.6945001310537258, -1.9847678519577736, -0.900074590336336]


12×2 Matrix{Float64}:
 -1.0        -1.0
 -1.0        -1.0
 -1.0        -1.0
 -1.0        -1.0
  2.07458     2.07458
 -1.94686    -1.94686
  0.0808759   0.0808759
  0.154606    0.154606
 -0.931964   -0.931964
 -2.26098    -2.26098
 -1.19819    -1.19819
  0.0763038   0.0763038

### Bernoulli

In [3]:
# sample data: first cluster of the model
X = qc_model.data[1].X # d by p
y = qc_model.data[1].y # d by 1

# objective: standardized residuals as a function of β
# (logit link, Bernoulli variance function)
function resβ(β)
    μ = map(η -> GLM.linkinv(LogitLink(), η), X * β) # d by 1
    σ = map(m -> sqrt(GLM.glmvar(Bernoulli(), m)), μ) # d by 1
    return (y - μ) ./ σ # d by 1
end

# analytic Jacobian of the standardized residuals for the Bernoulli / logit-link case
function ∇resβ(β)
    nobs, ncov = size(X)
    jac = zeros(nobs, ncov)
    η = X * β # d by 1
    μ = GLM.linkinv.(LogitLink(), η) # d by 1
    varμ = GLM.glmvar.(Bernoulli(), μ) # d by 1
    res = (y - μ) ./ sqrt.(varμ) # d by 1
    for col in 1:ncov
        for row in 1:nobs
            x = X[row, col]
            jac[row, col] = -sqrt(varμ[row]) * x - (0.5 * res[row] * (1 - 2μ[row]) * x)
        end
    end
    return jac # d × p
end

# autodiff Jacobian of the residual vector with respect to β
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)

# random beta vector (one entry per column of X)
β = rand(size(qc_model.data[1].X, 2))

# check objective
@show resβ(β)

# compare the analytic Jacobian (left column) with the autodiff Jacobian (right column);
# both are flattened in column-major order by `vec`
[vec(∇resβ(β)) vec(∇resβ_autodiff(β))]

resβ(β) = [1.3943163199970943, 0.24515130649692646, 0.6580775661179072, 0.6577805352369949, 0.390446316792847]


15×2 Matrix{Float64}:
 -0.697158   -0.697158
 -0.122576   -0.122576
 -0.329039   -0.329039
 -0.32889    -0.32889
 -0.195223   -0.195223
  1.44631     1.44631
 -0.238638   -0.238638
  0.0266113   0.0266113
  0.0508486   0.0508486
 -0.181941   -0.181941
 -1.57626    -1.57626
 -0.146869   -0.146869
  0.0251069   0.0251069
 -0.154771   -0.154771
 -0.201156   -0.201156

### Poisson

In [61]:
# sample data: first cluster of the model
X = qc_model.data[1].X # d by p
y = qc_model.data[1].y # d by 1

# objective: standardized residuals as a function of β
# (log link, Poisson variance function)
function resβ(β)
    μ = map(η -> GLM.linkinv(LogLink(), η), X * β) # d by 1
    σ = map(m -> sqrt(GLM.glmvar(Poisson(), m)), μ) # d by 1
    return (y - μ) ./ σ # d by 1
end

# analytic Jacobian of the standardized residuals for the Poisson / log-link case
function ∇resβ(β)
    nobs, ncov = size(X)
    jac = zeros(nobs, ncov)
    η = X * β # d by 1
    μ = GLM.linkinv.(LogLink(), η) # d by 1
    varμ = GLM.glmvar.(Poisson(), μ) # d by 1
    res = (y - μ) ./ sqrt.(varμ) # d by 1
    dμ = GLM.mueta.(LogLink(), η) # d by 1, derivative of μ with respect to η
    for col in 1:ncov
        for row in 1:nobs
            v = varμ[row]
            jac[row, col] = X[row, col] * (-(inv(sqrt(v)) + (0.5 * inv(v)) * res[row]) * dμ[row])
        end
    end
    return jac # d × p
end

# autodiff Jacobian of the residual vector with respect to β
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)

# random beta vector (one entry per column of X)
β = rand(size(qc_model.data[1].X, 2))

# check objective
@show resβ(β)

# compare the analytic Jacobian (left column) with the autodiff Jacobian (right column);
# both are flattened in column-major order by `vec`
[vec(∇resβ(β)) vec(∇resβ_autodiff(β))]

resβ(β) = [0.8012638765796852, -6.734952547066679, -1.3994071698413866, -0.48023808695797]


12×2 Matrix{Float64}:
 -1.07727    -1.07727
 -3.65238    -3.65238
 -1.22049    -1.22049
 -1.02842    -1.02842
  2.23488     2.23488
 -7.11068    -7.11068
  0.0987079   0.0987079
  0.159001    0.159001
 -1.00397    -1.00397
 -8.25796    -8.25796
 -1.46237    -1.46237
  0.0784727   0.0784727

## Is the gradient of the loglikelihood correct?

In [44]:
# Generic (BLAS-free) matrix-vector product: overwrite `c` with `A * b` and return `c`.
# All of `c` is zeroed first; only the first `size(A, 1)` entries are then accumulated,
# using the first `size(A, 2)` entries of `b`. Works for any element type, including
# the dual numbers used by ForwardDiff.
function A_mul_b!(c::AbstractVector{T}, A::AbstractMatrix, b::AbstractVector) where T
    fill!(c, zero(T))
    nrows, ncols = size(A)
    # column-major traversal: the row index is the innermost loop
    for col in 1:ncols
        for row in 1:nrows
            c[row] += A[row, col] * b[col]
        end
    end
    return c
end

# Evaluate the quasi-copula loglikelihood as a function of the fixed effects `β`,
# with the variance components taken from `qc_model.θ`.
#
# Every work buffer has element type `T = eltype(β)` and the linear algebra goes through
# the generic `A_mul_b!` rather than BLAS, so the function can be differentiated with
# ForwardDiff (dual numbers). No gradient or weight buffers are filled here; only
# the objective value is needed for autodiff.
#
# Buffers are sized for the largest cluster and each cluster works on views of its own
# length, so clusters of unequal size are handled and no stale entries from a previous
# cluster enter the quadratic forms.
#
# Returns the loglikelihood summed over all clusters, as a value of type `T`.
function loglikelihood(
    β::AbstractVector{T},
    qc_model::Union{GLMCopulaVCModel, NBCopulaVCModel}
    ) where T
    θ = qc_model.θ
    # allocate work vectors of type T, long enough for the largest cluster
    maxd = mapreduce(gc -> size(gc.X, 1), max, qc_model.data; init = 0)
    η_buf = zeros(T, maxd)
    μ_buf = zeros(T, maxd)
    varμ_buf = zeros(T, maxd)
    res_buf = zeros(T, maxd)
    storage_buf = zeros(T, maxd)
    q = zeros(T, length(θ))
    logl = zero(T)
    for gc in qc_model.data
        X = gc.X
        y = gc.y
        d = gc.n # number of observations in this cluster
        η = view(η_buf, 1:d)
        μ = view(μ_buf, 1:d)
        varμ = view(varμ_buf, 1:d)
        res = view(res_buf, 1:d)
        storage_n = view(storage_buf, 1:d)
        # update_res! step (need to avoid BLAS)
        A_mul_b!(η, X, β)
        for i in 1:d
            μ[i] = GLM.linkinv(gc.link, η[i])
            varμ[i] = GLM.glmvar(gc.d, μ[i]) # Note: for negative binomial, d.r is used
            res[i] = y[i] - μ[i]
        end
        # standardize_res! step
        for j in eachindex(y)
            res[j] /= sqrt(varμ[j])
        end
        # update Γ: q[k] = 0.5 r' * V[k] * r for each variance component
        for k in 1:gc.m
            A_mul_b!(storage_n, gc.V[k], res)
            q[k] = dot(res, storage_n) / 2
        end
        # component_loglikelihood
        for j in 1:d
            logl += QuasiCopula.loglik_obs(gc.d, y[j], μ[j], one(T), one(T))
        end
        tsum = dot(θ, gc.t)
        logl += -log(1 + tsum)
        qsum = dot(θ, q) # qsum = 0.5 r'Γr
        logl += log(1 + qsum)
    end
    return logl
end

# single-argument wrapper around `loglikelihood` that closes over the global `qc_model`,
# so it can be handed to ForwardDiff as a function of β only
autodiff_loglikelihood(β) = loglikelihood(β, qc_model)


autodiff_loglikelihood (generic function with 1 method)

In [45]:
# autodiff gradient of the loglikelihood with respect to β, evaluated at the fitted β
∇logl = x -> ForwardDiff.gradient(autodiff_loglikelihood, x)
∇βtrue = ∇logl(qc_model.β)

3-element Vector{Float64}:
 -1.1793326093467726e-8
  1.724427295402009e-8
 -4.6281334979692e-8

In [46]:
# gradient from math: the package's own analytic gradient, read from the model after
# re-evaluating the loglikelihood
# NOTE(review): the two Bool flags presumably mean (need gradient, need Hessian) -- confirm
loglikelihood!(qc_model, true, false)
∇βobs = qc_model.∇β

3-element Vector{Float64}:
 -1.1793769072454552e-8
  1.724461429208901e-8
 -4.6281332875472425e-8

# Is the Hessian of the loglikelihood correct?

Hessians for a single observation seem to differ quite a bit

In [75]:
# Two-term analytic approximation to the Hessian of the loglikelihood in β,
# accumulated over every cluster in `gcm.data`:
#   GLM term:       -Xᵀ * Diagonal(w2) * X
#   trailing term:  -(∇resβᵀ Γ res)(∇resβᵀ Γ res)ᵀ / (1 + 0.5 * resᵀ Γ res)²
# with Γ = Σₖ θ[k] * V[k]. The per-cluster fields `w2`, `res` and `∇resβ` are read as
# they are, so they must already correspond to the current `gcm.β`.
function two_term_Hessian(gcm)
    nβ = length(gcm.β)
    S = eltype(gcm.β)
    hess = zeros(S, nβ, nβ)
    for cluster in gcm.data
        # GLM term
        hess = hess - Transpose(cluster.X) * Diagonal(cluster.w2) * cluster.X
        # covariance of this cluster, summed over its variance components
        nobs = cluster.n
        Γ = zeros(S, nobs, nobs)
        for k in 1:cluster.m
            Γ .+= gcm.θ[k] .* cluster.V[k]
        end
        # trailing term
        r = cluster.res # d × 1 standardized residuals
        J = cluster.∇resβ # d × p
        scale = abs2(1 + 0.5 * (r' * Γ * r))
        JtΓr = J' * Γ * r
        hess = hess - JtΓr * JtΓr' / scale
    end
    return hess
end

two_term_Hessian (generic function with 1 method)

In [76]:
# 2 term Hessian from math, evaluated with the quantities currently stored in `qc_model`
two_terms_H = two_term_Hessian(qc_model)

3×3 Matrix{Float64}:
 -6553.88      -24.8886     72.158
   -24.8886  -6294.53       73.4028
    72.158      73.4028  -6294.07

In [77]:
# 3 term Hessian from math: the package's own Hessian, read from the model after
# re-evaluating the loglikelihood
# NOTE(review): the matrix printed below is not symmetric (e.g. -15.3138 vs -18.6025),
# which a true Hessian must be -- worth investigating in the package code
loglikelihood!(qc_model, true, true)
three_terms_H = qc_model.Hβ

3×3 Matrix{Float64}:
 -4634.79      -15.3138     39.9192
   -18.6025  -5903.51       64.686
    40.9515     65.6278  -5921.69

In [78]:
# autodiff Hessian of the loglikelihood in β, evaluated at the fitted β;
# this serves as the reference the analytic Hessians are compared against
∇²logl = x -> ForwardDiff.hessian(autodiff_loglikelihood, x)
autodiff_H = ∇²logl(qc_model.β)

3×3 Matrix{Float64}:
 -4173.84      -21.2542     31.7545
   -21.2542  -5444.94       58.3133
    31.7545     58.3133  -5466.57

In [79]:
# inverse of the 2 term Hessian from math
inv(two_terms_H)

3×3 Matrix{Float64}:
 -0.000152603   5.8307e-7    -1.74271e-6
  5.8307e-7    -0.000158892  -1.84635e-6
 -1.74271e-6   -1.84635e-6   -0.000158921

In [80]:
# inverse of the 3 term Hessian from math
inv(three_terms_H)

3×3 Matrix{Float64}:
 -0.000215774   5.43617e-7   -1.44864e-6
  6.63655e-7   -0.000169413  -1.84612e-6
 -1.48484e-6   -1.87378e-6   -0.000168901

In [81]:
# inverse of the autodiff Hessian
inv(autodiff_H)

3×3 Matrix{Float64}:
 -0.000239603   9.20482e-7   -1.382e-6
  9.20482e-7   -0.000183681  -1.95403e-6
 -1.382e-6     -1.95403e-6   -0.000182959

In [82]:
# side-by-side Hessian entries (column-major); columns: autodiff | 2 term | 3 term
[vec(autodiff_H) vec(two_terms_H) vec(three_terms_H)]

9×3 Matrix{Float64}:
 -4173.84    -6553.88    -4634.79
   -21.2542    -24.8886    -18.6025
    31.7545     72.158      40.9515
   -21.2542    -24.8886    -15.3138
 -5444.94    -6294.53    -5903.51
    58.3133     73.4028     65.6278
    31.7545     72.158      39.9192
    58.3133     73.4028     64.686
 -5466.57    -6294.07    -5921.69

In [83]:
# side-by-side inverse-Hessian entries (column-major); columns: autodiff | 2 term | 3 term
[vec(inv(autodiff_H)) vec(inv(two_terms_H)) vec(inv(three_terms_H))]

9×3 Matrix{Float64}:
 -0.000239603  -0.000152603  -0.000215774
  9.20482e-7    5.8307e-7     6.63655e-7
 -1.382e-6     -1.74271e-6   -1.48484e-6
  9.20482e-7    5.8307e-7     5.43617e-7
 -0.000183681  -0.000158892  -0.000169413
 -1.95403e-6   -1.84635e-6   -1.87378e-6
 -1.382e-6     -1.74271e-6   -1.44864e-6
 -1.95403e-6   -1.84635e-6   -1.84612e-6
 -0.000182959  -0.000158921  -0.000168901