In [1]:
using Distributed
addprocs(4)

@everywhere begin
    using Revise
    using MendelIHT
    using SnpArrays
    using Random
    using GLM
    using DelimitedFiles
    using Test
    using Distributions
    using LinearAlgebra
    using CSV
    using DataFrames
    using StatsBase
#     using TraitSimulation, OrdinalMultinomialModels, VarianceComponentModels
end

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1278


# Simple multivariate Gaussian traits

With $r$ traits, each sample's phenotype $\mathbf{y}_{i} \in \mathbb{R}^{n \times 1}$ is simulated under

$$\mathbf{y}_{i}^{r \times 1} \sim N(\mathbf{B}^{r \times p}\mathbf{x}_{i}^{p \times 1}, \ \ \Sigma_{r \times r})$$

This model assumes each sample is independent.

In [60]:
n = 1000  # number of samples
p = 10000 # number of SNPs
k = 10    # number of causal SNPs
r = 2     # number of traits

# set random seed for reproducibility
Random.seed!(2021)

# simulate `.bed` file with no missing data
x = simulate_random_snparray("multivariate_$(r)traits.bed", n, p)
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# intercept is the only nongenetic covariate
z = ones(n, 1)
intercepts = [10.0 1.0] # each trait have different intercept

# simulate response y, true model b, and the correct non-0 positions of b
Y, true_Σ, true_b, correct_position = simulate_random_response(xla, k, r, Zu=z*intercepts, overlap=2);

In [3]:
# save true SNP's position and effect size
open("multivariate_$(r)traits_true_beta.txt", "w") do io
    println(io, "snpID,effectsize")
    for pos in correct_position
        println(io, "snp$pos,", true_b[pos])
    end
end

# create `.bim` and `.bam` files using phenotype
make_bim_fam_files(x, Y, "multivariate_$(r)traits")

# save phenotypes in separate file
open("multivariate_$(r)traits.phen", "w") do io
    println(io, "FID\tIID\tT1\tT2")
    for i in 1:n
        println(io, "$i\t1\t", Y[i, 1], "\t", Y[i, 2])
    end
end

## Run IHT

In [61]:
Yt = Matrix(Y')
Zt = Matrix(z')
ktrue = k + count(!iszero, intercepts)
@time result = fit_iht(Yt, Transpose(xla), Zt, k=12, verbose=true)

****                   MendelIHT Version 1.3.3                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse Multivariate Gaussian regression
Link functin = IdentityLink()
Sparsity parameter (k) = 12
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 200
Converging when tol < 0.0001:

Iteration 1: loglikelihood = 215.4892687838203, backtracks = 0, tol = 0.1243455904380314
Iteration 2: loglikelihood = 1382.37242885485, backtracks = 0, tol = 0.027019894208789006
Iteration 3: loglikelihood = 1477.7383135165255, backtracks = 0, tol = 0.014225517157910452
Iteration 4: loglikelihood = 1511.714843337414, backtracks = 0, tol = 0.004456457


Compute time (sec):     1.4830679893493652
Final loglikelihood:    1521.6067421001371
Iterations:             15
Trait 1's SNP PVE:      0.554527358091918
Trait 2's SNP PVE:      0.6195879626449299

Trait 1: IHT estimated 4 nonzero SNP predictors
[1m4×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │     1197     0.121446
   2 │     5651    -0.200705
   3 │     5797    -1.09767
   4 │     8087     1.2791

Trait 1: IHT estimated 1 non-genetic predictors
[1m1×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │        1       10.027

Trait 2: IHT estimated 6 nonzero SNP predictors
[1m6×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │      326     0.3318




## Check answer

In [62]:
# first beta
β1 = result.beta[1, :]
true_b1_idx = findall(!iszero, true_b[:, 1])
[β1[true_b1_idx] true_b[true_b1_idx, 1]]

4×2 Array{Float64,2}:
 -0.200705  -0.224675
 -1.09767   -1.14044
  0.0       -0.14698
  1.2791     1.25668

In [63]:
# second beta
β2 = result.beta[2, :]
true_b2_idx = findall(!iszero, true_b[:, 2])
[β2[true_b2_idx] true_b[true_b2_idx, 2]]

6×2 Array{Float64,2}:
 0.331882  0.315219
 0.575645  0.609812
 1.19357   1.20121
 0.502072  0.531549
 0.81844   0.808327
 1.36932   1.43455

In [64]:
# non genetic covariates
[result.c intercepts']

2×2 Array{Float64,2}:
 10.027    10.0
  1.03625   1.0

In [65]:
# covariance matrix
[vec(result.Σ) vec(true_Σ)]

4×2 Array{Float64,2}:
  2.4681    2.53934
 -1.83681  -1.85399
 -1.83681  -1.85399
  2.42177   2.41416

In [66]:
# number of causal SNPs recovered
correct_snps = [x[1] for x in correct_position]             # truely causal snps
signif_snps = findall(!iszero, β1) ∪ findall(!iszero, β2)   # IHT's selected snps
signif_snps ∩ correct_snps

8-element Array{Int64,1}:
 5651
 5797
 8087
  326
 2110
 5375
 6015
 6813

## Test Cross validation

In [11]:
Random.seed!(2020)
Yt = Matrix(Y')
Zt = Matrix(z')
@time mses = cv_iht(Yt, Transpose(xla), Zt);

[32mCross validating...100%|████████████████████████████████| Time: 0:01:03[39m




Crossvalidation Results:
	k	MSE
	1	2888.7160633632484
	2	2560.135862053543
	3	2067.943067029389
	4	1812.0395079444284
	5	1554.3120367449362
	6	1277.3237598020087
	7	1154.9320629872832
	8	1098.5910963871872
	9	1019.4597637296985
	10	1030.1412464715597
	11	1023.5545874904792
	12	1007.9022110997687
	13	1012.6193656356761
	14	1019.1491606608182
	15	1024.6877890077092
	16	1022.9300595671257
	17	1040.0286509856787
	18	1033.345570850089
	19	1039.8828186471897
	20	1036.274158344765

Best k = 12

 63.787494 seconds (10.28 M allocations: 497.667 MiB, 0.24% gc time)


**Conclusion:** Using $k = 12$ (which achieves minimum MSE) IHT finds 8/10 causal SNPs + 2 intercept terms. Beta estimates and cross-trait covariances are very precise. 

# GEMMA multivariate results

In [84]:
gemma_df = CSV.read("gemma.result.assoc.txt", DataFrame)

# pvalues
pval_wald = gemma_df[!, :p_wald]
pval_lrt = gemma_df[!, :p_lrt]
pval_score = gemma_df[!, :p_score]

# estimated beta
estim_β1 = gemma_df[!, :beta_1]
estim_β2 = gemma_df[!, :beta_2]

# estimated covariance matrix
estim_σ11 = gemma_df[!, :Vbeta_1_1]
estim_σ12 = gemma_df[!, :Vbeta_1_2]
estim_σ22 = gemma_df[!, :Vbeta_2_2];

correct_snps = [x[1] for x in correct_position]  # truely causal snps
signif_snps = findall(x -> x ≤ 0.05/p, pval_lrt) # gemma's selected snps
signif_snps ∩ correct_snps

6-element Array{Int64,1}:
 2110
 5375
 5797
 6015
 6813
 8087

**Conclusion:** GEMMA finds 6/10 causal SNPs

# MV-PLINK

In [25]:
mvplink_df = CSV.read("plink.mqfam.total", DataFrame, delim=' ', ignorerepeated=true)

# pvalues
pval = mvplink_df[!, :P]

# SNPs passing threshold
signif_snps = findall(x -> x ≤ 0.05 / p, pval)
signif_snps ∩ correct_snps

6-element Array{Int64,1}:
 2110
 5375
 5797
 6015
 6813
 8087

**Conclusion:** MV-PLINK finds 6/10 causal SNPs

# More complicated simulations (multivariate traits)

Let us simulate:
+ Non-independent samples
+ Polygenic traits where every SNP contributes to the phenotype, but $k$ SNPs have large effect

For $r$ traits, our model is:

$$\mathbf{Y}_{r \times n} \sim \text{MatrixNormal}(\mathbf{B}_{r \times p}\mathbf{X}_{p \times n}, \ \ \Sigma_{r \times r} , \ \ \sigma_g^2\Phi_{n \times n} + \sigma_e^2 \mathbf{I}_{n \times n})$$

where
+ $\mathbf{X}_{p \times n}$ contains *all* predictors (genetic + non-genetic)
+ $\mathbf{B}_{r \times p}$ contains (true) regression coefficients. Each row is $k$-sparse.
+ $\Sigma_{r \times r}$ is the row (trait) covariance matrix
+ $\sigma_g^2\Phi_{n \times n} + \sigma_e^2 \mathbf{I}_{n \times n}$ is the column (sample) covariance matrix
+ $\Phi$ is the GRM estimated from the genotypes
+ $\sigma_g^2 = 0.6$ and $\sigma_e^2 = 0.4$ (thus heritability is 60%)

In [2]:
"""
k = Number of causal SNPs
p = Total number of SNPs
traits = Number of traits (phenotypes)
overlap = number of causal SNPs shared in each trait
"""
function simulate_random_beta(k::Int, p::Int, traits::Int; overlap::Int=0)
    true_b = zeros(p, traits)
    if overlap == 0
        causal_snps = sample(1:(traits * p), k, replace=false)
        true_b[causal_snps] = randn(k)
    else
        shared_snps = sample(1:p, overlap, replace=false)
        weight_vector = aweights(1 / (traits * (p - overlap)) * ones(traits * p))
        for i in 1:traits
            weight_vector[i*shared_snps] .= 0.0 # avoid sampling from shared snps
        end
        @assert sum(weight_vector) ≈ 1.0
        # simulate β for shared predictors
        for i in 1:traits
            true_b[shared_snps, i] = randn(overlap)
        end
        # simulate β for none shared predictors
        nonshared_snps = sample(1:(traits * p), weight_vector, k - traits * overlap, replace=false)
        true_b[nonshared_snps] = randn(k - traits * overlap)
    end

    return true_b
end

simulate_random_beta

In [37]:
n = 1000  # number of samples
p = 10000 # number of SNPs
k = 10    # number of causal SNPs
r = 2     # number of traits

# set random seed for reproducibility
Random.seed!(111)

# simulate `.bed` file with no missing data
x = simulate_random_snparray("multivariate_$(r)traits.bed", n, p)
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# intercept is the only nongenetic covariate
z = ones(n, 1)
intercepts = [0.5 1.0] # each trait have different intercept

# simulate β
B = simulate_random_beta(k, p, r, overlap=2)

# between trait covariance matrix
Σ = random_covariance_matrix(r)

# between sample covariance is identity + GRM (2 times because in SnpArrays grm is halved)
Φ = 2grm(x)
σg = 0.6
σe = 0.4
V = σg * Φ + σe * I

# simulate y using TraitSimulations.jl
# VCM_model = VCMTrait(z, intercepts, x, B, [Σ], [V]) #https://github.com/OpenMendel/TraitSimulation.jl/blob/6d1f09c7332471a74b4dd6c8ef2d2b95a96c585c/src/modelframework.jl#L159
# Y = simulate(VCM_model)

# simulate using naive model
μ = z * intercepts + xla * B
Yt = rand(MatrixNormal(μ', Σ, V));

In [38]:
# create `.bim` and `.bam` files using phenotype
make_bim_fam_files(x, Transpose(Yt), "multivariate_$(r)traits")

# save phenotypes in separate file
open("multivariate_$(r)traits.phen", "w") do io
    println(io, "FID\tIID\tT1\tT2")
    for i in 1:n
        println(io, "$i\t1\t", Yt[1, i], "\t", Yt[2, i])
    end
end

└ @ Base.Docs docs/Docs.jl:227
└ @ Base.Docs docs/Docs.jl:227
└ @ Base.Docs docs/Docs.jl:227
└ @ Base.Docs docs/Docs.jl:227


## Run IHT 

In [26]:
Zt = Matrix(z')
ktrue = k + count(!iszero, intercepts)
@time result = fit_iht(Yt, Transpose(xla), Zt, k=17, verbose=true)

****                   MendelIHT Version 1.3.3                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse Multivariate Gaussian regression
Link functin = IdentityLink()
Sparsity parameter (k) = 17
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 200
Converging when tol < 0.0001:

Iteration 1: loglikelihood = -208.1303677652943, backtracks = 0, tol = 0.2187665316250511
Iteration 2: loglikelihood = 1730.5686443633824, backtracks = 0, tol = 0.027954213730783353
Iteration 3: loglikelihood = 2141.9436184153865, backtracks = 0, tol = 0.018380547307851085
Iteration 4: loglikelihood = 2458.699098945602, backtracks = 0, tol = 0.002480


Compute time (sec):     2.7989869117736816
Final loglikelihood:    3356.964031382751
Iterations:             33
Trait 1's SNP PVE:      0.4078279521313628
Trait 2's SNP PVE:      0.8925271478138327

Trait 1: IHT estimated 7 nonzero SNP predictors
[1m7×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │     2351   -0.0173457
   2 │     2529   -0.254579
   3 │     2850   -0.0151446
   4 │     2872    0.820841
   5 │     2986   -0.0125593
   6 │     4248   -1.403
   7 │     8921    1.24517

Trait 1: IHT estimated 1 non-genetic predictors
[1m1×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │        1       10.006

Trait 2: IHT estimated 8 nonzero SNP predictors
[1m8×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    




In [27]:
# first beta
β1 = result.beta[1, :]
true_b1_idx = findall(!iszero, B[:, 1])
[β1[true_b1_idx] B[true_b1_idx, 1]]

5×2 Array{Float64,2}:
 -0.254579  -0.413412
  0.820841   0.811243
  0.0        0.0244199
 -1.403     -1.42827
  1.24517    1.24969

In [28]:
# second beta
β2 = result.beta[2, :]
true_b2_idx = findall(!iszero, B[:, 2])
[β2[true_b2_idx] B[true_b2_idx, 2]]

5×2 Array{Float64,2}:
  0.0       -0.0597678
 -0.114557  -0.11578
 -2.70478   -2.70844
 -0.218876  -0.212269
  0.312726   0.310274

In [29]:
# non genetic covariates
[result.c intercepts']

2×2 Array{Float64,2}:
 10.006    10.0
  1.00318   1.0

In [30]:
# covariance matrix
[vec(result.Σ) vec(Σ)]

4×2 Array{Float64,2}:
 5.9949    5.65059
 2.0924    1.97568
 2.0924    1.97568
 0.741367  0.702678

In [31]:
# number of causal SNPs recovered
correct_position = findall(!iszero, B)
correct_snps = [x[1] for x in correct_position]             # truely causal snps
signif_snps = findall(!iszero, β1) ∪ findall(!iszero, β2)   # IHT's selected snps
signif_snps ∩ correct_snps

7-element Array{Int64,1}:
 2529
 2872
 4248
 8921
 4710
 6991
 8964

In [25]:
# 4 cores
Random.seed!(2020)
@time mses = cv_iht(Yt, Transpose(xla), Zt, path=1:20, parallel=true);

[32mCross validating...100%|████████████████████████████████| Time: 0:02:08[39m




Crossvalidation Results:
	k	MSE
	1	3998.273828055534
	2	9578.120753914623
	3	3016.1704306898628
	4	3093.1156906190977
	5	1647.3951330746731
	6	1493.6148958073015
	7	1478.4155902821421
	8	1440.2002531421394
	9	1445.9719136655942
	10	1444.1614121170446
	11	1442.354562056365
	12	1440.6469910540297
	13	1396.036495492588
	14	1387.9503968396707
	15	1376.6781831779936
	16	1376.3548266854364
	17	1364.316584848776
	18	1364.4357316403978
	19	1364.6056874298563
	20	1365.1390118890417

Best k = 17

128.249256 seconds (76.04 k allocations: 10.089 MiB)


## GEMMA multivariate results

In [12]:
gemma_df = CSV.read("gemma.polygenic.result.assoc.txt", DataFrame)

# pvalues
pval_wald = gemma_df[!, :p_wald]
pval_lrt = gemma_df[!, :p_lrt]
pval_score = gemma_df[!, :p_score]

# estimated beta
estim_β1 = gemma_df[!, :beta_1]
estim_β2 = gemma_df[!, :beta_2]

# estimated covariance matrix
estim_σ11 = gemma_df[!, :Vbeta_1_1]
estim_σ12 = gemma_df[!, :Vbeta_1_2]
estim_σ22 = gemma_df[!, :Vbeta_2_2];

# check how many real SNPs were recovered
correct_snps = [x[1] for x in correct_position]  # truely causal snps
signif_snps = findall(x -> x ≤ 0.05/p, pval_lrt) # gemma's selected snps
signif_snps ∩ correct_snps

Int64[]

## MV-PLINK

In [35]:
mvplink_df = CSV.read("plink.mqfam.polygenic.total", DataFrame, delim=' ', ignorerepeated=true)

# pvalues
pval = mvplink_df[!, :P]

# SNPs passing threshold
signif_snps = findall(x -> x ≤ 0.05 / p, pval)
@show length(signif_snps)
signif_snps ∩ correct_snps

length(signif_snps) = 5


5-element Array{Int64,1}:
 2529
 2872
 4248
 6991
 8921

# Conclusion
+ Multivariate IHT's beta estimates are precise, under both truly sparse or polygenic model.
+ In a separate simulation with 5k samples and 300k SNPs, cross validation takes roughly 1.5h on 8 cores. 