In [1]:
using Distributed
# addprocs(4)

@everywhere begin
    using Revise
    using MendelIHT
    using SnpArrays
    using Random
    using GLM
    using DelimitedFiles
    using Test
    using Distributions
    using LinearAlgebra
    using CSV
    using DataFrames
end

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1278


# First simulate multivariate Gaussian traits

In [2]:
# Problem dimensions for the multivariate simulation.
n, p = 1000, 10000  # number of samples, number of SNPs
k, r = 10, 2        # number of causal SNPs per trait, number of traits

# Seed the global RNG first so that everything simulated below is reproducible.
Random.seed!(2021)

# Genotypes: simulate a `.bed` file with no missing data, then wrap it for
# linear algebra under the additive model with centered and scaled columns.
x = simulate_random_snparray(string("multivariate_", r, "traits.bed"), n, p)
xla = SnpLinAlg{Float64}(x; model=ADDITIVE_MODEL, center=true, scale=true)

# The only non-genetic covariate is the intercept.
z = fill(1.0, n)

# Response Y, true covariance Σ, true model b, and the nonzero positions of b.
Y, true_Σ, true_b, correct_position = simulate_random_response(xla, k, r);

In [3]:
# Save each causal SNP's position and true effect size as a two-column CSV.
# `true_b` is a matrix (one column per trait), and the entries of
# `correct_position` are presumably `CartesianIndex` (SNP, trait) pairs — the
# notebook later extracts `x[1]` from them. Interpolating `pos` directly would
# write `snpCartesianIndex(i, j)`, whose embedded comma breaks the two-field
# layout announced by the header, so only the SNP index `pos[1]` is written.
open("multivariate_$(r)traits_true_beta.txt", "w") do io
    println(io, "snpID,effectsize")
    for pos in correct_position
        println(io, "snp", pos[1], ',', true_b[pos])
    end
end

# create `.bim` and `.fam` files using phenotype
make_bim_fam_files(x, Y, "multivariate_$(r)traits")

# Run IHT with simple gradient update

In [24]:
# The multivariate fit is called on transposed inputs: the response and the
# covariate are materialized as dense transposes, while the genotype matrix is
# only wrapped in a lazy `Transpose`.
Yt = permutedims(Y)
Zt = permutedims(z)
@time result = fit_iht(Yt, Transpose(xla), Zt; k=10, verbose=true)

****                   MendelIHT Version 1.4.0                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse unknown regression
Link functin = IdentityLink()
Sparsity parameter (k) = 10
Prior weight scaling = off
Doubly sparse projection = off
Debias = off

Converging when tol < 0.0001:
Iteration 1: loglikelihood = -190.25516921334543, backtracks = 0, tol = 0.7758118456211052
Iteration 2: loglikelihood = 1819.187992792398, backtracks = 0, tol = 0.1428898155012873
Iteration 3: loglikelihood = 2410.0073888161824, backtracks = 0, tol = 0.04651754346934129
Iteration 4: loglikelihood = 2567.74886336667, backtracks = 0, tol = 0.011846769005166678
Iteration 5: loglikelihood = 2


Compute time (sec):     1.504953145980835
Final loglikelihood:    2635.989391876837
Iterations:             14

Trait 1: IHT estimated 10 nonzero SNP predictors
[1m10×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │      782   -0.387253
   2 │      901    0.776228
   3 │     1204    0.73234
   4 │     1306   -1.45682
   5 │     1427    0.0570532
   6 │     1655   -0.186513
   7 │     3160   -0.845163
   8 │     4201    0.309019
   9 │     6047    0.152959
  10 │     6879   -1.23302

Trait 1: IHT estimated 0 non-genetic predictors
[1m0×2 DataFrame[0m

Trait 2: IHT estimated 10 nonzero SNP predictors
[1m10×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │       46    1.73289
   2 │     4797   -1.20165
   3 │     5616   -0.962282
   4 │     6072    0.536906
   

# Run IHT with full gradient update

In [50]:
# Same transposed inputs as the simple-gradient run, but with `fullIHT=true`
# and explicit limits on the iteration count and step count.
Yt = collect(transpose(Y))
Zt = collect(transpose(z))
@time result = fit_iht(
    Yt, Transpose(xla), Zt;
    k=10, fullIHT=true, max_iter=1000, max_step=2, verbose=true,
)

****                   MendelIHT Version 1.4.0                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse unknown regression
Link functin = IdentityLink()
Sparsity parameter (k) = 10
Prior weight scaling = off
Doubly sparse projection = off
Debias = off

Converging when tol < 0.0001:
Iteration 1: loglikelihood = -4420.403505790376, backtracks = 0, tol = 6.905572866316025
Iteration 2: loglikelihood = 3718.5167619096255, backtracks = 0, tol = 0.7752051150640281
Iteration 3: loglikelihood = 237557.72740802474, backtracks = 0, tol = 0.6019493841414093
Iteration 4: loglikelihood = 47671.523775718604, backtracks = 5, tol = 0.011171888948061348
Iteration 5: loglikelihood = 4


Compute time (sec):     75.24118995666504
Final loglikelihood:    1779.413597588743
Iterations:             424

Trait 1: IHT estimated 10 nonzero SNP predictors
[1m10×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │      782    -0.392369
   2 │      901     0.77888
   3 │     1204     0.736398
   4 │     1306    -1.45906
   5 │     1427     0.060341
   6 │     1655    -0.182698
   7 │     3160    -0.846335
   8 │     4201     0.312977
   9 │     6047     0.151031
  10 │     6879    -1.23321

Trait 1: IHT estimated 0 non-genetic predictors
[1m0×2 DataFrame[0m

Trait 2: IHT estimated 10 nonzero SNP predictors
[1m10×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │       46    1.7353
   2 │     4797   -1.20672
   3 │     5616   -0.966976
   4 │     6072    0.5

## Check answer

In [51]:
# Trait 1: row 1 of `result.beta` holds the estimates; show them next to the
# true effects at every position where column 1 of `true_b` is nonzero.
β1 = result.beta[1, :]
true_b1_idx = findall(!iszero, view(true_b, :, 1))
hcat(β1[true_b1_idx], true_b[true_b1_idx, 1])

10×2 Array{Float64,2}:
 -0.392369  -0.402269
  0.77888    0.758756
  0.736398   0.729135
 -1.45906   -1.47163
 -0.182698  -0.172668
 -0.846335  -0.847906
  0.312977   0.296183
  0.0       -0.0034339
  0.151031   0.125965
 -1.23321   -1.24972

In [52]:
# Trait 2: row 2 of `result.beta` holds the estimates; show them next to the
# true effects at every position where column 2 of `true_b` is nonzero.
β2 = result.beta[2, :]
true_b2_idx = findall(!iszero, view(true_b, :, 2))
hcat(β2[true_b2_idx], true_b[true_b2_idx, 2])

10×2 Array{Float64,2}:
  1.7353      1.73729
 -1.20672    -1.19911
  0.0         0.0121193
 -0.966976   -0.969569
  0.534409    0.540525
 -0.586309   -0.609556
  0.503907    0.481189
 -0.0727545  -0.0524866
  0.340336    0.31182
  1.28035     1.29813

In [54]:
# Covariance matrix: flatten the estimate and the truth and show them as two columns.
hcat(vec(result.Σ), vec(true_Σ))

4×2 Array{Float64,2}:
 1.23959  1.22512
 1.14345  1.23674
 1.14345  1.23674
 1.51206  1.494

# Test cross-validation

In [9]:
# Reseed the global RNG before cross-validating so the run is repeatable.
Random.seed!(2020)

# Dense transposes of the response and the covariate, as in the fitting cells.
Yt = permutedims(Y)
Zt = permutedims(z)

# Time the multivariate cross-validation; the trailing `;` hides the return value.
@time cv_iht(Yt, x, Zt);



Crossvalidation Results:
	k	MSE
	1	2506.1361026992413
	2	1707.4609062019433
	3	1297.253354519584
	4	985.2280061442002
	5	819.8080675324194
	6	739.6359401670024
	7	641.5504839534303
	8	612.7042220588955
	9	608.3277089584982
	10	610.9984342539259
	11	614.807751686425
	12	616.615453582629
	13	621.4468351156702
	14	628.0553031281725
	15	628.5181881547544
	16	626.4682026915706
	17	636.4936893268582
	18	635.9073694771134
	19	630.7374247665347
	20	633.3757710953382
284.723952 seconds (151.20 k allocations: 823.455 MiB, 0.05% gc time)


# Compare with univariate IHT

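Multivariate IHT (2 traits) is slower than univariate IHT with the same simulation settings:

+ `fit_iht`: ~20 times slower (about 1.50 s vs. 0.066 s)
+ `cv_iht`: ~70 times slower (about 285 s vs. 4.3 s)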

In [40]:
# Same problem size as the multivariate run, but for a single Gaussian trait.
n, p, k = 1000, 10000, 10  # number of samples, SNPs, and causal SNPs
d = Normal                 # trait distribution
l = IdentityLink()         # link function

# Reseed the global RNG so the simulation is reproducible.
Random.seed!(2021)

# Genotypes with no missing data; `undef` is passed where the multivariate
# cell passed a `.bed` filename.
x = simulate_random_snparray(undef, n, p)
xla = SnpLinAlg{Float64}(x; model=ADDITIVE_MODEL, center=true, scale=true)

# The only non-genetic covariate is the intercept.
z = fill(1.0, n)

# Response y, true model b, and the correct nonzero positions of b.
Y, true_b, correct_position = simulate_random_response(xla, k, d, l);

In [42]:
# Univariate fit at the same sparsity level (k = 10), timed for comparison
# with the multivariate runs above.
@time result = fit_iht(Y, xla, z; k=10)

****                   MendelIHT Version 1.4.0                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse linear regression
Link functin = IdentityLink()
Sparsity parameter (k) = 10
Prior weight scaling = off
Doubly sparse projection = off
Debias = off

Converging when tol < 0.0001:
Iteration 1: loglikelihood = -1568.8072402506227, tol = 1.2369352935298255
Iteration 2: loglikelihood = -1485.2388920745977, tol = 0.1246509194436555
Iteration 3: loglikelihood = -1473.4688017685708, tol = 0.060408468355759436
Iteration 4: loglikelihood = -1472.7990029487871, tol = 0.0051541599786159185
Iteration 5: loglikelihood = -1472.6215787426543, tol = 0.0027074009598392432
Iteration


IHT estimated 10 nonzero SNP predictors and 0 non-genetic predictors.

Compute time (sec):     0.06568217277526855
Final loglikelihood:    -1472.5590426182382
Iterations:             10

Selected genetic predictors:
[1m10×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │      782    -0.438598
   2 │      901     0.747536
   3 │     1204     0.691351
   4 │     1306    -1.42482
   5 │     1655    -0.194714
   6 │     3160    -0.86084
   7 │     3936    -0.147349
   8 │     4201     0.338177
   9 │     4402    -0.126286
  10 │     6879    -1.21081

Selected nongenetic predictors:
[1m0×2 DataFrame[0m

In [43]:
# Estimated (left) vs. true (right) effect sizes at the truly causal positions.
hcat(result.beta[correct_position], true_b[correct_position])

10×2 Array{Float64,2}:
 -0.438598  -0.402269
  0.747536   0.758756
  0.691351   0.729135
 -1.42482   -1.47163
 -0.194714  -0.172668
 -0.86084   -0.847906
  0.338177   0.296183
  0.0       -0.0034339
  0.0        0.125965
 -1.21081   -1.24972

### Cross validation timing

In [8]:
# 1 core
# Reseed before cross-validating so this run can be compared with the 4-core run.
Random.seed!(2020)
@time cv_iht(Y, x, z);



Crossvalidation Results:
	k	MSE
	1	1431.385325867502
	2	1431.569108502913
	3	1429.2348660984221
	4	1429.5730864011514
	5	1429.4077307081652
	6	1429.2323485244572
	7	1429.5277693297523
	8	1429.4669590879946
	9	1429.4376255311238
	10	1429.526751070809
	11	1429.5205225689476
	12	1429.6007792570374
	13	1430.324407382292
	14	1430.2241106412075
	15	1430.2935769756937
	16	1430.0160991249024
	17	1430.2798769215433
	18	1430.7721172663867
	19	1430.8725535864496
	20	1429.7969640813615
  4.344964 seconds (10.10 M allocations: 343.536 MiB, 0.70% gc time)


In [7]:
# 4 cores
# NOTE(review): `addprocs(4)` is commented out in the first cell, and the timing
# recorded below is nearly identical to the 1-core run — confirm that worker
# processes were actually added before this cell was executed.
Random.seed!(2020)
@time cv_iht(Y, x, z);



Crossvalidation Results:
	k	MSE
	1	1431.385325867502
	2	1431.569108502913
	3	1429.2348660984221
	4	1429.5730864011514
	5	1429.4077307081652
	6	1429.2323485244572
	7	1429.5277693297523
	8	1429.4669590879946
	9	1429.4376255311238
	10	1429.526751070809
	11	1429.5205225689476
	12	1429.6007792570374
	13	1430.324407382292
	14	1430.2241106412075
	15	1430.2935769756937
	16	1430.0160991249024
	17	1430.2798769215433
	18	1430.7721172663867
	19	1430.8725535864496
	20	1429.7969640813615
  4.223226 seconds (10.10 M allocations: 343.629 MiB, 0.88% gc time)


# GEMMA multivariate results

In [37]:
# Load GEMMA's multivariate association output into a DataFrame.
gemma_df = CSV.read("gemma.result.assoc.txt", DataFrame)

# p-values from the Wald, likelihood-ratio, and score tests
pval_wald = gemma_df.p_wald
pval_lrt = gemma_df.p_lrt
pval_score = gemma_df.p_score

# estimated effect sizes, one column per trait
estim_β1 = gemma_df.beta_1
estim_β2 = gemma_df.beta_2

# `Vbeta_*` columns: entries of the estimated covariance matrix
estim_σ11 = gemma_df.Vbeta_1_1
estim_σ12 = gemma_df.Vbeta_1_2
estim_σ22 = gemma_df.Vbeta_2_2;

In [39]:
# Truly causal SNPs: keep the first index component of each true position.
correct_snps = map(pos -> pos[1], correct_position)

# SNPs GEMMA selects: LRT p-value at or below the Bonferroni threshold 0.05 / p.
signif_snps = findall(<=(0.05 / p), pval_lrt)

# Selected SNPs that are truly causal.
intersect(signif_snps, correct_snps)

12-element Array{Int64,1}:
   46
  901
 1204
 1306
 3160
 4797
 5616
 6072
 6573
 6879
 7236
 9763