# Using autodiff to check gradient/Hessians

In [1]:
using Revise
using DataFrames, Random, GLM, QuasiCopula
using ForwardDiff, LinearAlgebra
using ToeplitzMatrices
using BenchmarkTools
using SnpArrays
using ForwardDiff
# using MendelPlots
ENV["COLUMNS"] = 240

BLAS.set_num_threads(1)
Threads.nthreads()

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling QuasiCopula [c47b6ae2-b804-4668-9957-eb588c99ffbc]


1

## simulate data

In [2]:
k = 0 # number of causal SNPs
p = 3
d = 4

qc_model, G, Btrue, θtrue, γtrue, τtrue = simulate_multivariate_traits(
    n = 5000, # sample size
    p = p, # number of fixed effects, including intercept
    m = 1, # number of variance components
    d = d, # number of phenotypes
    q = 1000, # number of SNPs
    k = k, # number of causal SNPs
    seed = 1,
    τtrue = 0.01,
    possible_distributions = [Bernoulli, Normal]
)

qc_model.Y

5000×4 Matrix{Float64}:
 1.0  0.0    4.50974   -13.2746
 0.0  1.0   -3.98109    -0.0887145
 0.0  0.0   -5.50458     2.07538
 1.0  1.0    4.26426   -10.6924
 0.0  1.0   24.5941      8.02716
 1.0  1.0   -1.9564     -9.11461
 0.0  0.0   -2.24566     2.20784
 1.0  0.0    4.09903     2.56163
 1.0  1.0  -10.0673     15.4169
 0.0  1.0    8.80747    -9.22851
 0.0  0.0    3.62388   -12.995
 1.0  0.0    4.78672     5.71522
 0.0  0.0    0.592567   -9.55532
 ⋮                     
 0.0  0.0    4.96956     2.98844
 0.0  1.0   -6.87342     3.39349
 0.0  0.0    8.20093     7.38732
 0.0  1.0    9.62711     0.159582
 1.0  0.0    6.9844      0.196057
 1.0  0.0    7.41004     7.18247
 0.0  0.0  -12.6771     -9.44202
 0.0  0.0   -9.14128    -8.51743
 0.0  1.0    0.15375     2.9925
 0.0  0.0   26.5356      6.72687
 0.0  1.0  -12.5527      2.99434
 1.0  1.0   -3.4743      7.59344

## Compare loglikelihood/gradient with longitudinal (single observation)

To test this, we simulate a single multivariate gaussian Copula, and compare its loglikelihood with longitudinal gaussian copula with only 1 observation per sample

In [9]:
n = 5000
dist = Normal

qc_model1, G, Btrue, θtrue, γtrue, τtrue = simulate_multivariate_traits(
    n = n, # sample size
    p = 3, # number of fixed effects, including intercept
    m = 1, # number of variance components
    d = 1, # number of phenotypes
    q = 1000, # number of SNPs
    k = k, # number of causal SNPs
    seed = 123,
    τtrue = 0.01,
    possible_distributions = [dist]
)

qc_model2, G, Btrue, θtrue, γtrue, τtrue = simulate_longitudinal_traits(
    n = n, # sample size
    p = 3, # number of fixed effects, including intercept
    m = 1, # number of variance components
    d_max = 1, # number of observations per sample
    q = 1000, # number of SNPs
    k = k, # number of causal SNPs
    seed = 123,
    τtrue = 0.01,
    y_distribution = dist,
)

# force longitudinal qc_model to share the same y/X/θ/τ as multivariate case
qc_model2.β .= qc_model1.B
for i in 1:n
    copyto!(qc_model2.data[i].y, qc_model1.Y[i, :])
    copyto!(qc_model2.data[i].X, qc_model1.X[i, :])
#     copyto!(qc_model2.data[i].η, qc_model1.data[i].η)
#     copyto!(qc_model2.data[i].μ, qc_model1.data[i].μ)
#     copyto!(qc_model2.data[i].res, qc_model1.data[i].res)
end
qc_model2.θ .= qc_model1.θ
qc_model1.ϕ .= qc_model2.τ
# qc_model2.τ .= qc_model1.ϕ

1-element Vector{Float64}:
 0.009775278594465796

In [10]:
@show loglikelihood!(qc_model1, true, false)
@show loglikelihood!(qc_model2, true, false);

loglikelihood!(qc_model1, true, false) = -18930.965914285898
loglikelihood!(qc_model2, true, false) = -18930.965914285898


In [11]:
@show qc_model1.∇vecB
@show qc_model2.∇β;

qc_model1.∇vecB = [-0.9563351570448801, -0.39865893560309557, 0.006400758476562237]
qc_model2.∇β = [-0.9563351570448801, -0.3986589356030947, 0.006400758476562223]


In [12]:
@show qc_model1.∇θ
@show qc_model2.∇θ;

qc_model1.∇θ = [-354.56438108432167]
qc_model2.∇θ = [-354.56438108432167]


In [13]:
@show qc_model1.∇ϕ
@show qc_model2.∇τ;

qc_model1.∇ϕ = [66371.51597804665]
qc_model2.∇τ = [66371.51597804665]


## Is $\nabla_\beta res$ calculated correctly? 

We can check using ForwardDiff

The function is 

$$res_{ij}(\beta) = \frac{y_{ij} - \mu_{ij}}{\sqrt{\sigma_{ij}^2(\beta)}}$$

### Normal

Assumes y, X are given. We calculate the (standardized) residuals for just 1 sample

In [6]:
# sample data
xi = qc_model.X[1, :] # p by 1
yi = qc_model.Y[1, :] # d by 1

# objective
function resβ(y, x, vecB::AbstractVector)
    p = length(x)
    d = length(y)
    B = reshape(vecB, p, d)
    η = B' * x
    μ = GLM.linkinv.(IdentityLink(), η)
    varμ = GLM.glmvar.(Normal(), μ)
    return (y - μ) ./ sqrt.(varμ)
end
resβ(B::AbstractMatrix) = resβ(yi, xi, vec(B))
resβ(B::AbstractVector) = resβ(yi, xi, B)

B = randn(p, d)
@show resβ(yi, xi, vec(B))

# autodiff gradient
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)
correct = ∇resβ_autodiff(vec(B))'

# mathematical gradient
function ∇resβ(x, y, vecB::AbstractVector{T}) where T
    p = length(x)
    d = length(y)
    B = reshape(vecB, p, d)
    η = B' * x
    μ = GLM.linkinv.(IdentityLink(), η)
    varμ = GLM.glmvar.(Normal(), μ)
    res = y - μ
    std_res = res ./ sqrt.(varμ)
    dμ = GLM.mueta.(IdentityLink(), η)
    out = zeros(T, d*p, d)
    @inbounds for j in 1:d
        for k in 1:p
            out[(j-1)*p + k, j] = QuasiCopula.update_∇res_ij(Normal(), x[k], 
                std_res[j], μ[j], dμ[j], varμ[j])
        end
    end
    return out
end
math_result = ∇resβ(xi, yi, vec(B))

# compare results
[vec(math_result) vec(correct)]

resβ(yi, xi, vec(B)) = [1.3884090998930962, -0.16822204638920013, 2.241041968786522, 0.04559962991400112]


48×2 Matrix{Float64}:
 -1.0        -1.0
 -0.100418   -0.100418
  0.0470154   0.0470154
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0         0.0
  ⋮          
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
  0.0        -0.0
 -1.0        -1.0
 -0.100418   -0.100418
  0.0470154   0.0470154

### Bernoulli

In [6]:
# sample data
xi = qc_model.X[1, :] # p by 1
yi = qc_model.Y[1, :] # d by 1

# objective
function resβ(y, x, vecB::AbstractVector)
    p = length(x)
    d = length(y)
    B = reshape(vecB, p, d)
    η = B' * x
    μ = GLM.linkinv.(LogitLink(), η)
    varμ = GLM.glmvar.(Bernoulli(), μ)
    return (y - μ) ./ sqrt.(varμ)
end
resβ(B::AbstractMatrix) = resβ(yi, xi, vec(B))
resβ(B::AbstractVector) = resβ(yi, xi, B)

B = randn(p, d)
@show resβ(yi, xi, vec(B))

# autodiff gradient
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)
correct = ∇resβ_autodiff(vec(B))'

# mathematical gradient
function ∇resβ(x, y, vecB::AbstractVector{T}) where T
    p = length(x)
    d = length(y)
    B = reshape(vecB, p, d)
    η = B' * x
    μ = GLM.linkinv.(LogitLink(), η)
    varμ = GLM.glmvar.(Bernoulli(), μ)
    res = y - μ
    std_res = res ./ sqrt.(varμ)
    dμ = GLM.mueta.(LogitLink(), η)
    out = zeros(T, d*p, d)
    @inbounds for j in 1:d
        for k in 1:p
            out[(j-1)*p + k, j] = QuasiCopula.update_∇res_ij(Bernoulli(), x[k], 
                std_res[j], μ[j], dμ[j], varμ[j])
        end
    end
    return out
end
math_result = ∇resβ(xi, yi, vec(B))

# compare results
[vec(math_result) vec(correct)]

resβ(yi, xi, vec(B)) = [-1.699553635820773, -2.9658266436544203, 2.621786668608018]


45×2 Matrix{Float64}:
 -0.849777  -0.849777
  0.262282   0.262282
 -1.92764   -1.92764
 -0.268584  -0.268584
 -0.564345  -0.564345
  0.0        0.0
  0.0        0.0
  0.0        0.0
  0.0        0.0
  0.0        0.0
  0.0        0.0
  0.0        0.0
  0.0        0.0
  ⋮         
  0.0       -0.0
  0.0       -0.0
  0.0       -0.0
  0.0       -0.0
  0.0       -0.0
  0.0       -0.0
  0.0       -0.0
 -1.31089   -1.31089
  0.404604   0.404604
 -2.97365   -2.97365
 -0.414327  -0.414327
 -0.870577  -0.870577

### Poisson

In [7]:
# sample data
xi = qc_model.X[1, :] # p by 1
yi = qc_model.Y[1, :] # d by 1

# objective
function resβ(y, x, vecB::AbstractVector)
    p = length(x)
    d = length(y)
    B = reshape(vecB, p, d)
    η = B' * x
    μ = GLM.linkinv.(LogLink(), η)
    varμ = GLM.glmvar.(Poisson(), μ)
    return (y - μ) ./ sqrt.(varμ)
end
resβ(B::AbstractMatrix) = resβ(yi, xi, vec(B))
resβ(B::AbstractVector) = resβ(yi, xi, B)

B = randn(p, d)
@show resβ(yi, xi, vec(B))

# autodiff gradient
∇resβ_autodiff = x -> ForwardDiff.jacobian(resβ, x)
correct = ∇resβ_autodiff(vec(B))'

# mathematical gradient
function ∇resβ(x, y, vecB::AbstractVector{T}) where T
    p = length(x)
    d = length(y)
    B = reshape(vecB, p, d)
    η = B' * x
    μ = GLM.linkinv.(LogLink(), η)
    varμ = GLM.glmvar.(Poisson(), μ)
    res = y - μ
    std_res = res ./ sqrt.(varμ)
    dμ = GLM.mueta.(LogLink(), η)
    out = zeros(T, d*p, d)
    @inbounds for j in 1:d
        for k in 1:p
            out[(j-1)*p + k, j] = QuasiCopula.update_∇res_ij(Poisson(), x[k], 
                std_res[j], μ[j], dμ[j], varμ[j])
        end
    end
    return out
end
math_result = ∇resβ(xi, yi, vec(B))

# compare results
[vec(math_result) vec(correct)]

resβ(yi, xi, vec(B)) = [-0.6311261621867231, -2.784614111314093, 18.689464943875688]


45×2 Matrix{Float64}:
  -0.315563    -0.315563
   0.0973979    0.0973979
  -0.715827    -0.715827
  -0.0997383   -0.0997383
  -0.209568    -0.209568
   0.0          0.0
   0.0          0.0
   0.0          0.0
   0.0          0.0
   0.0          0.0
   0.0          0.0
   0.0          0.0
   0.0          0.0
   ⋮          
   0.0         -0.0
   0.0         -0.0
   0.0         -0.0
   0.0         -0.0
   0.0         -0.0
   0.0         -0.0
   0.0         -0.0
  -9.39809     -9.39809
   2.9007       2.9007
 -21.3187     -21.3187
  -2.9704      -2.9704
  -6.24136     -6.24136

## Check $\nabla_{\beta} L$

In [15]:
# sample data
i = 10
xi = qc_model.X[i, :] # p by 1
yi = qc_model.Y[i, :] # d by 1

function A_mul_b!(c::AbstractVector{T}, A::AbstractMatrix, b::AbstractVector) where T
    n, p = size(A)
    fill!(c, zero(T))
    for j in 1:p, i in 1:n
        c[i] += A[i, j] * b[j]
    end
    return c
end

# loglikelihood for a single sample
function loglikelihood(y, x, vecB::AbstractVector{T}, qc_model) where T
    p = length(x)
    d = length(y)
    m = qc_model.m
    B = reshape(vecB, p, d)
    η = B' * x
    μ = GLM.linkinv.(qc_model.veclink, η)
    varμ = GLM.glmvar.(qc_model.vecdist, μ)
    res = y - μ
    std_res = zeros(T, length(res))
    nuisance_counter = 1
    for j in eachindex(res)
        if typeof(qc_model.vecdist[j]) <: Normal
            τ = abs(qc_model.ϕ[nuisance_counter])
            std_res[j] = res[j] * sqrt(τ)
            nuisance_counter += 1
        else
            std_res[j] = res[j] / sqrt(varμ[j])
        end
    end
    storage_d = zeros(T, d)
    q = zeros(T, m)
    # GLM loglikelihood (term 2)
    nuisance_counter = 1
    logl = zero(T)
    @inbounds for j in eachindex(y)
        dist = qc_model.vecdist[j]
        if typeof(dist) <: Normal
            τ = inv(qc_model.ϕ[nuisance_counter])
            logl += QuasiCopula.loglik_obs(dist, y[j], μ[j], one(T), τ)
            nuisance_counter += 1
        else
            logl += QuasiCopula.loglik_obs(dist, y[j], μ[j], one(T), one(T))
        end
    end
    # loglikelihood term 1 i.e. -sum ln(1 + 0.5tr(Γ(θ)))
    tsum = dot(qc_model.θ, qc_model.t) # tsum = 0.5tr(Γ)
    logl += -log(1 + tsum)
    # loglikelihood term 3 i.e. sum ln(1 + 0.5 r*Γ*r)
    @inbounds for k in 1:qc_model.m # loop over m variance components
        mul!(storage_d, qc_model.V[k], std_res) # storage_d = V[k] * r
        q[k] = dot(std_res, storage_d) / 2 # q[k] = 0.5 r * V[k] * r
    end
    qsum = dot(qc_model.θ, q) # qsum = 0.5 r*Γ*r
    logl += log(1 + qsum)
    return logl
end
loglikelihood(B::AbstractMatrix) = loglikelihood(yi, xi, vec(B), qc_model)
loglikelihood(B::AbstractVector) = loglikelihood(yi, xi, B, qc_model)

qc_model.ϕ .= 1.1

# autodiff gradient
Random.seed!(2023)
B = randn(p, d)
logl_autodiff = x -> ForwardDiff.gradient(loglikelihood, x)
correct = logl_autodiff(vec(B))

# gradient from math
qc_model.B .= B
loglikelihood!(qc_model, true, false)

[correct qc_model.data[i].∇vecB]

12×2 Matrix{Float64}:
  -0.125531    -0.125531
   0.237507     0.237507
   0.0716134    0.0716134
   0.554527     0.554527
  -1.04917     -1.04917
  -0.316348    -0.316348
   7.83048      7.83048
 -14.8154     -14.8154
  -4.46715     -4.46715
  -9.23224     -9.23224
  17.4675      17.4675
   5.26683      5.26683

## Check $\nabla_{\theta} L$

In [19]:
# sample data
i = 5
xi = qc_model.X[i, :] # p by 1
yi = qc_model.Y[i, :] # d by 1

function A_mul_b!(c::AbstractVector{T}, A::AbstractMatrix, b::AbstractVector) where T
    n, p = size(A)
    fill!(c, zero(T))
    for j in 1:p, i in 1:n
        c[i] += A[i, j] * b[j]
    end
    return c
end

# loglikelihood for a single sample
function loglikelihood(y, x, θ::AbstractVector{T}, qc_model) where T
    p = length(x)
    d = length(y)
    m = qc_model.m
    B = qc_model.B
    η = B' * x
    μ = GLM.linkinv.(qc_model.veclink, η)
    varμ = GLM.glmvar.(qc_model.vecdist, μ)
    res = y - μ
    std_res = zeros(T, length(res))
    nuisance_counter = 1
    for j in eachindex(res)
        if typeof(qc_model.vecdist[j]) <: Normal
            τ = abs(qc_model.ϕ[nuisance_counter])
            std_res[j] = res[j] * sqrt(τ)
            nuisance_counter += 1
        else
            std_res[j] = res[j] / sqrt(varμ[j])
        end
    end
    storage_d = zeros(T, d)
    q = zeros(T, m)
    # GLM loglikelihood (term 2)
    logl = zero(T)
    @inbounds for j in eachindex(y)
        logl += QuasiCopula.loglik_obs(qc_model.vecdist[j], y[j], μ[j], one(T), one(T))
    end
    # loglikelihood term 1 i.e. -sum ln(1 + 0.5tr(Γ(θ)))
    tsum = dot(θ, qc_model.t) # tsum = 0.5tr(Γ)
    logl += -log(1 + tsum)
    # loglikelihood term 3 i.e. sum ln(1 + 0.5 r*Γ*r)
    @inbounds for k in 1:qc_model.m # loop over m variance components
        mul!(storage_d, qc_model.V[k], std_res) # storage_d = V[k] * r
        q[k] = dot(std_res, storage_d) / 2 # q[k] = 0.5 r * V[k] * r
    end
    qsum = dot(θ, q) # qsum = 0.5 r*Γ*r
    logl += log(1 + qsum)
    return logl
end
loglikelihood(θ::AbstractVector) = loglikelihood(yi, xi, θ, qc_model)

# autodiff gradient
logl_autodiff = x -> ForwardDiff.gradient(loglikelihood, x)

θ = [rand()]
correct = logl_autodiff(θ)

# gradient from math
qc_model.θ .= θ
loglikelihood!(qc_model, true, false)

[correct qc_model.data[i].∇θ]

1×2 Matrix{Float64}:
 3.55748  3.55748

## Check $\nabla_{\phi} L$

In [20]:
# sample data
i = 5
xi = qc_model.X[i, :] # p by 1
yi = qc_model.Y[i, :] # d by 1

function A_mul_b!(c::AbstractVector{T}, A::AbstractMatrix, b::AbstractVector) where T
    n, p = size(A)
    fill!(c, zero(T))
    for j in 1:p, i in 1:n
        c[i] += A[i, j] * b[j]
    end
    return c
end

# loglikelihood for a single sample
function loglikelihood(y, x, ϕ::AbstractVector{T}, qc_model) where T
    p = length(x)
    d = length(y)
    m = qc_model.m
    B = qc_model.B
    η = B' * x
    μ = GLM.linkinv.(qc_model.veclink, η)
    varμ = GLM.glmvar.(qc_model.vecdist, μ)
    res = y - μ
    std_res = zeros(T, length(res))
    nuisance_counter = 1
    for j in eachindex(res)
        if typeof(qc_model.vecdist[j]) <: Normal
            τ = abs(ϕ[nuisance_counter])
            std_res[j] = res[j] * sqrt(τ)
            nuisance_counter += 1
        else
            std_res[j] = res[j] / sqrt(varμ[j])
        end
    end
    storage_d = zeros(T, d)
    q = zeros(T, m)
    # GLM loglikelihood (term 2)
    logl = zero(T)
    @inbounds for j in eachindex(y)
        logl += QuasiCopula.loglik_obs(qc_model.vecdist[j], y[j], μ[j], one(T), one(T))
    end
    # loglikelihood term 1 i.e. -sum ln(1 + 0.5tr(Γ(θ)))
    tsum = dot(qc_model.θ, qc_model.t) # tsum = 0.5tr(Γ)
    logl += -log(1 + tsum)
    # loglikelihood term 3 i.e. sum ln(1 + 0.5 r*Γ*r)
    @inbounds for k in 1:qc_model.m # loop over m variance components
        mul!(storage_d, qc_model.V[k], std_res) # storage_d = V[k] * r
        q[k] = dot(std_res, storage_d) / 2 # q[k] = 0.5 r * V[k] * r
    end
    qsum = dot(qc_model.θ, q) # qsum = 0.5 r*Γ*r
    logl += log(1 + qsum)
    return logl
end
loglikelihood(ϕ::AbstractVector) = loglikelihood(yi, xi, ϕ, qc_model)

# autodiff gradient
logl_autodiff = x -> ForwardDiff.gradient(loglikelihood, x)

ϕ = rand(2)
correct = logl_autodiff(ϕ)

# gradient from math
qc_model.ϕ .= ϕ
loglikelihood!(qc_model, true, false)

[correct qc_model.data[i].∇ϕ]

2×2 Matrix{Float64}:
 1.39017   2.41363
 0.585787  2.41363