# Multivariate QuasiCopula GWAS with Mixed Marginals

If there is only 1 phenotype, then the result should be equivalent to longitudinal GWAS with 1 observation per sample. 

In [1]:
using Revise
using DataFrames, Random, GLM, QuasiCopula
using ForwardDiff, Test, LinearAlgebra
using LinearAlgebra: BlasReal, copytri!
using ToeplitzMatrices
using BenchmarkTools
using SnpArrays
using Statistics
using StatsBase

BLAS.set_num_threads(1)
Threads.nthreads()

"""
    simulate_random_snparray(s, n, p; mafs=zeros(Float64, p), min_ma=5)

Simulate an `n × p` `SnpArray` of random genotypes.

For every SNP `j` an allele frequency is drawn as `0.5rand()`, and each of the
two allele copies of every sample is drawn as an independent `Bernoulli` with
that frequency. A SNP is redrawn (with a fresh frequency) until the total number
of minor alleles over both copies is strictly greater than `min_ma`.

# Arguments
- `s`: forwarded unchanged to `SnpArray(s, n, p)` (a `String` or `undef`).
- `n`: number of samples (rows).
- `p`: number of SNPs (columns).

# Keywords
- `mafs`: vector with at least `p` entries; entry `j` is overwritten in place
  with the frequency used to *sample* SNP `j` (not the empirical frequency).
- `min_ma`: a SNP is accepted only if it carries more than `min_ma` minor alleles.

# Throws
- `ArgumentError` if `mafs` has fewer than `p` entries, or if `min_ma` can never
  be exceeded with `n` samples (which would otherwise loop forever).
"""
function simulate_random_snparray(s::Union{String, UndefInitializer}, n::Int64,
    p::Int64; mafs::Vector{Float64}=zeros(Float64, p), min_ma::Int = 5)

    # Fail early instead of hitting a BoundsError after SNPs were already simulated.
    length(mafs) >= p || throw(ArgumentError(
        "mafs must have at least p = $p entries, got $(length(mafs))"))
    # A SNP has at most 2n minor alleles, so the rejection loop below can only
    # terminate when 2n > min_ma.
    (p == 0 || 2n > min_ma) || throw(ArgumentError(
        "min_ma = $min_ma can never be exceeded with n = $n samples " *
        "(at most $(2n) minor alleles per SNP)"))

    # A1 and A2 hold the two allele copies; their sum is the {0, 1, 2} genotype.
    A1 = BitArray(undef, n, p)
    A2 = BitArray(undef, n, p)
    for j in 1:p
        # Always draw at least once so the `undef` storage is never read
        # uninitialized, even when min_ma is negative.
        while true
            maf = 0.5rand()
            for i in 1:n
                A1[i, j] = rand(Bernoulli(maf))
                A2[i, j] = rand(Bernoulli(maf))
            end
            minor_alleles = sum(view(A1, :, j)) + sum(view(A2, :, j))
            if minor_alleles > min_ma
                mafs[j] = maf
                break
            end
        end
    end

    # Encode the two allele-copy matrices as a SnpArray.
    return _make_snparray(s, A1, A2)
end

"""
    _make_snparray(s, A1, A2)

Combine two allele-copy bit matrices into a `SnpArray`.

`A1` and `A2` must have the same size; the genotype of entry `(i, j)` is the
allele count `A1[i, j] + A2[i, j]`. Counts 0, 1 and 2 are stored as the bytes
`0x00`, `0x02` and `0x03` respectively (`0x01` is never written).

# Arguments
- `s`: forwarded unchanged to `SnpArray(s, n, p)` (a `String` or `undef`).
- `A1`, `A2`: bit arrays holding the first and second allele copy.

# Throws
- `DimensionMismatch` if `A1` and `A2` differ in size.
"""
function _make_snparray(s::Union{String, UndefInitializer}, A1::BitArray, A2::BitArray)
    # Without this check a differently shaped A2 would be paired with A1 by
    # linear index and silently yield wrong genotypes.
    size(A1) == size(A2) || throw(DimensionMismatch(
        "A1 has size $(size(A1)) but A2 has size $(size(A2))"))

    n, p = size(A1)
    x = SnpArray(s, n, p)
    # Byte stored for an allele count of 0, 1, 2 (looked up at count + 1).
    # Bool + Bool is always in 0:2, so no other case can occur.
    genotype_code = (0x00, 0x02, 0x03)
    for j in 1:p, i in 1:n
        x[i, j] = genotype_code[A1[i, j] + A2[i, j] + 1]
    end
    return x
end

┌ Info: Precompiling QuasiCopula [c47b6ae2-b804-4668-9957-eb588c99ffbc]
└ @ Base loading.jl:1423


_make_snparray (generic function with 1 method)

## Simulate data

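Given $n$ independent samples, we simulate the $d$ phenotypes of sample $i$ from the quasi-copula distribution
$$\mathbf{y}_i \sim QC(\mathbf{\Gamma}, f_1, ..., f_d),$$
where $f_j$ is the marginal distribution of phenotype $j$ and $\mathbf{\Gamma} = \sum_k \theta_k \mathbf{V}_k$ is built from the variance components.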

In [58]:
# simulate data
# This cell simulates covariates X, genotypes G and an n × d phenotype matrix Y
# with mixed marginals, then wraps them in a MultivariateCopulaVCModel.
p = 3    # number of fixed effects, including intercept
m = 2    # number of variance components
n = 5000 # number of samples
d = 3    # number of phenotypes per sample
q = 1000 # number of SNPs
k = 0   # number of causal SNPs (0 leaves γtrue all zero, i.e. no SNP has an effect)
seed = 2023

# NOTE(review): the RNG is reset to the same `seed` before every simulation step
# below, so each step starts from the same random stream — confirm this is
# intended rather than seeding once at the top.

# sample d marginal distributions for each phenotype within samples
Random.seed!(seed)
possible_distributions = [Bernoulli, Poisson, Normal]
vecdist = rand(possible_distributions, d)
# vecdist = [Poisson, Bernoulli, Bernoulli] # this derivative test is fine
# vecdist = [Bernoulli, Bernoulli, Poisson] # this derivative test is wrong everywhere
# one canonical link per phenotype, matching the sampled marginal
veclink = [canonicallink(vecdist[j]()) for j in 1:d]

# simulate nongenetic coefficient and variance component params
Random.seed!(seed)
Btrue = rand(Uniform(-0.5, 0.5), p, d) # p × d: one coefficient column per phenotype
θtrue = fill(0.4, m)
V1 = ones(d, d)
V2 = Matrix(I, d, d)
# Γ = Σ_k θtrue[k] * V_k; only V1 is used when m == 1
Γ = m == 1 ? θtrue[1] * V1 : θtrue[1] * V1 + θtrue[2] * V2

# simulate non-genetic design matrix
Random.seed!(seed)
X = [ones(n) randn(n, p - 1)] # first column is the intercept

# simulate random SnpArray with q SNPs and randomly choose k SNPs to be causal
Random.seed!(seed)
G = simulate_random_snparray(undef, n, q)
# dense Float64 copy of the genotypes, requesting centering and scaling
Gfloat = convert(Matrix{Float64}, G, center=true, scale=true)
γtrue = zeros(q, d)
causal_snps = sample(1:q, k, replace=false) |> sort
for j in 1:d
    # each causal SNP gets effect -1 or +1, drawn independently per phenotype
    γtrue[causal_snps, j] .= rand([-1, 1], k)
end

# sample phenotypes
Y = zeros(n, d)
y = Vector{Float64}(undef, d) # buffer for one sample's d phenotypes
for i in 1:n
    Xi = X[i, :]
    Gi = Gfloat[i, :]
    # length-d linear predictor: nongenetic part plus genetic part
    η = Btrue' * Xi + γtrue' * Gi
    vecd_tmp = Vector{UnivariateDistribution}(undef, d)
    for j in 1:d
        dist = vecdist[j]
        # mean of phenotype j via the inverse canonical link
        μj = GLM.linkinv(canonicallink(dist()), η[j])
        # single-argument constructor; any other parameter keeps its default
        vecd_tmp[j] = dist(μj)
    end
    multivariate_dist = MultivariateMix(vecd_tmp, Γ)
    res = Vector{Float64}(undef, d)
    # presumably fills `y` in place with one draw, using `res` as workspace —
    # TODO confirm against QuasiCopula's `rand` method
    rand(multivariate_dist, y, res)
    Y[i, :] .= y
end

# form model
V = m == 1 ? [V1] : [V1, V2]
qc_model = MultivariateCopulaVCModel(Y, X, V, vecdist, veclink);

In [51]:
# Display the simulated n × d phenotype matrix (one column per phenotype).
Y

5000×3 Matrix{Float64}:
 1.0  2.0   0.436519
 1.0  2.0  -0.819839
 0.0  0.0   1.5982
 1.0  0.0  -1.20721
 1.0  0.0   0.28193
 1.0  2.0  -0.839879
 1.0  4.0  -1.9317
 1.0  3.0   1.62387
 1.0  6.0   0.136927
 1.0  0.0   1.20368
 0.0  2.0  -0.691747
 0.0  2.0  -0.31042
 0.0  1.0  -0.228761
 ⋮         
 0.0  3.0  -0.349699
 0.0  2.0   1.21203
 0.0  1.0  -0.946448
 1.0  4.0  -0.511367
 0.0  0.0  -1.21434
 1.0  1.0   0.649446
 1.0  1.0   1.05457
 0.0  0.0  -1.84551
 0.0  1.0  -1.21778
 0.0  0.0   2.03475
 1.0  6.0  -0.66195
 1.0  2.0   0.629577

In [52]:
# Display the n × p nongenetic design matrix (first column is the intercept).
X

5000×3 Matrix{Float64}:
 1.0   0.100418   -0.0470154
 1.0   0.539314    1.34338
 1.0   0.032493   -0.304856
 1.0  -1.82519    -0.217143
 1.0  -1.1939      1.15006
 1.0  -0.205944    0.0662676
 1.0   0.910159    1.02386
 1.0  -0.7272     -0.164025
 1.0  -0.211406   -1.75684
 1.0  -0.0310205   0.83961
 1.0   0.174579   -0.555913
 1.0   0.599171   -0.248304
 1.0  -0.219329   -0.910624
 ⋮                
 1.0  -1.44813    -0.185781
 1.0  -1.22394    -0.794701
 1.0   1.4347      1.01114
 1.0  -0.700221   -0.94818
 1.0   0.286581    1.25612
 1.0  -1.05336     1.27283
 1.0  -0.0611384   0.794235
 1.0  -1.53319     0.194859
 1.0  -0.482952    0.545294
 1.0  -0.0636979   0.346252
 1.0  -0.661712    0.284675
 1.0   0.256958    0.272786

Construct the regular (longitudinal) model, treating each sample as a cluster with a single observation.

In [53]:
# Build the longitudinal quasi-copula model with a Bernoulli marginal and its
# canonical link, using one cluster per sample.
dist = Bernoulli()
link = canonicallink(dist)
myDist = typeof(dist)
myLink = typeof(link)

obs = Vector{GLMCopulaVCObs{Float64, myDist, myLink}}(undef, n)
for i in 1:n
    # NOTE(review): `Y[i]` is a linear index into the n × d matrix Y, so it
    # reads Y[i, 1] (the first phenotype only) — confirm that column is the
    # Bernoulli phenotype intended here.
    # NOTE(review): `V` is reused from the multivariate model (d × d matrices);
    # confirm its size matches a cluster with a single observation.
    obs[i] = GLMCopulaVCObs([Y[i]], Matrix(X[i, :]'), V, dist, link)
end
qc_longitudinal_model = GLMCopulaVCModel(obs)

Quasi-Copula Variance Component Model
  * base distribution: Bernoulli
  * link function: LogitLink
  * number of clusters: 5000
  * cluster size min, max: 1, 1
  * number of variance components: 1
  * number of fixed effects: 3


## Fit Null multivariate model

In [54]:
# Fit the null multivariate model with Ipopt and time the call.
# `derivative_test="first-order"` makes Ipopt run its first-derivative checker
# before optimizing; `hessian_approximation = "limited-memory"` selects a
# quasi-Newton Hessian approximation instead of an exact Hessian.
@time optm = QuasiCopula.fit!(qc_model,
    Ipopt.IpoptSolver(
        print_level = 5, 
        tol = 10^-6, 
        max_iter = 100,
        accept_after_max_steps = 10,
        warm_start_init_point="yes", 
        limited_memory_max_history = 6, # default value
        hessian_approximation = "limited-memory",
        derivative_test="first-order"
    )
);

This is Ipopt version 3.13.4, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Starting derivative checker for first derivatives.

* grad_f[          1] = -4.4175349269337408e+02    ~  0.0000000000000000e+00  [ 4.418e+02]
* grad_f[          2] = -1.5666791506998365e+02    ~  0.0000000000000000e+00  [ 1.567e+02]
* grad_f[          3] =  5.6546424302037201e+02    ~  0.0000000000000000e+00  [ 5.655e+02]
* grad_f[          7] = -3.0865523269591875e+04    ~  0.0000000000000000e+00  [ 3.087e+04]
* grad_f[          8] =  4.9466705672967633e+04    ~  0.0000000000000000e+00  [ 4.947e+04]
* grad_f[          9] =  2.3419343459556178e+04    ~  0.0000000000000000e+00  [ 2.342e+04]
* grad_f[         10] = -1.7542687808452879e+02    ~  0.0000000000000000e+00  [ 1.754e+02]

Derivative checker detected 7 error(s).

Number of nonzeros in equality constraint Jacobian...:        0
Number of nonzeros in inequality constraint Jacobian.:        

In [55]:
# Print the gradients stored on the fitted multivariate model
# (with respect to vec(B) and θ).
@show qc_model.∇vecB
@show qc_model.∇θ;

qc_model.∇vecB = [-1.0457653854489823e-5, -7.426068464322366e-8, -8.935443147742306e-6, 2.0432625412514227e-5, -6.7775386540791516e-6, -3.0343477559452436e-5, -1.5457391627671235e-5, 1.4430696336720228e-5, -1.0774524806106411e-5]
qc_model.∇θ = [1.430453396289444e-6]


In [56]:
# Side-by-side comparison: column 1 is the fitted vec(B), column 2 the simulated truth.
[vec(qc_model.B) vec(Btrue)]

9×2 Matrix{Float64}:
 -0.434256  -0.442015
  0.020669   0.0655548
 -0.439151  -0.477028
  0.240869   0.247196
  0.321899   0.324582
 -0.383326  -0.361607
 -0.173956  -0.166936
 -0.192975  -0.174676
 -0.304402  -0.298439

In [57]:
# Side-by-side comparison: column 1 is the fitted θ, column 2 the simulated truth.
[qc_model.θ θtrue]

1×2 Matrix{Float64}:
 0.381813  0.4

## Fit Null longitudinal model

In [None]:
qc_model.∇vecB = [1.3315837346569335, -0.5532699222780756, -0.8825223230877026]


In [None]:
gcm.∇β = [1.3315837346179868, -0.5532699222930468, -0.8825223232111048]


In [24]:
# Fit the null longitudinal model with the same Ipopt options as the
# multivariate fit, and time the call.
# `derivative_test="first-order"` makes Ipopt run its first-derivative checker
# before optimizing; `hessian_approximation = "limited-memory"` selects a
# quasi-Newton Hessian approximation instead of an exact Hessian.
@time optm = QuasiCopula.fit!(qc_longitudinal_model,
    Ipopt.IpoptSolver(
        print_level = 5, 
        tol = 10^-6, 
        max_iter = 100,
        accept_after_max_steps = 10,
        warm_start_init_point="yes", 
        limited_memory_max_history = 6, # default value
        hessian_approximation = "limited-memory",
        derivative_test="first-order"
    )
);

gcm.∇β = [1.3315837346179868, -0.5532699222930468, -0.8825223232111048]
This is Ipopt version 3.13.4, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).



LoadError: UndefVarError: fdsa not defined

In [98]:
# Print the simulated truth next to the longitudinal model's fitted parameters
# and the gradients stored on the model.

# fixed effects
@show Btrue
@show qc_longitudinal_model.β
@show qc_longitudinal_model.∇β

# variance components
@show θtrue
@show qc_longitudinal_model.θ
@show qc_longitudinal_model.∇θ;

# τ parameter and its gradient
@show qc_longitudinal_model.τ
@show qc_longitudinal_model.∇τ;

Btrue = [-0.3678580518515241; -0.12820979298252866; 0.40051323184217924;;]
qc_longitudinal_model.β = [-0.3814378383187703, -0.15781136267580856, 0.41656677817845617]
qc_longitudinal_model.∇β = [7.687295244807046e-12, 3.780864510360971e-12, -6.5225047585215634e-12]
θtrue = [0.4]
qc_longitudinal_model.θ = [0.39536351578917867]
qc_longitudinal_model.∇θ = [-1.9557244712586908e-11]
qc_longitudinal_model.τ = [1.0]
qc_longitudinal_model.∇τ = [0.0]
