In [1]:
using Distributed
# addprocs(4)

@everywhere begin
    using Revise
    using MendelIHT
    using SnpArrays
    using Random
    using GLM
    using DelimitedFiles
    using Test
    using Distributions
    using LinearAlgebra
    using CSV
    using DataFrames
end

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1278


# First simulate multivariate Gaussian traits

In [2]:
# Problem dimensions for the simulated multivariate Gaussian data set
n, p = 1000, 10000  # samples × SNPs
k = 10              # causal SNPs per trait
r = 2               # traits

# Fix the seed so the simulated genotypes and phenotypes are reproducible
Random.seed!(2021)

# Genotypes: a `.bed` file with no missing data, plus a centered and scaled
# additive-model linear-algebra wrapper around it
x = simulate_random_snparray("multivariate_$(r)traits.bed", n, p)
xla = SnpLinAlg{Float64}(x; model=ADDITIVE_MODEL, center=true, scale=true)

# Non-genetic covariates: only an intercept, with a different value for each trait
z = fill(1.0, n, 1)
intercepts = [100.0 -101.0]

# Response Y, true covariance Σ, true model b, and the nonzero positions of b
Y, true_Σ, true_b, correct_position =
    simulate_random_response(xla, k, r; Zu=z * intercepts);

In [3]:
# Save each causal SNP's position and true effect size.
# `correct_position` is indexed with `[1]` elsewhere in this notebook and `true_b`
# is a 2-D array, so its elements are presumably matrix indices (SNP row, trait
# column) — TODO confirm. Interpolating the whole index would write IDs such as
# `snpCartesianIndex(782, 1)`; `pos[1]` keeps only the SNP row (`snp782`) and is a
# no-op when `pos` is a plain integer.
open("multivariate_$(r)traits_true_beta.txt", "w") do io
    println(io, "snpID,effectsize")
    for pos in correct_position
        println(io, "snp$(pos[1]),", true_b[pos])
    end
end

# create `.bim` and `.fam` files using phenotype
make_bim_fam_files(x, Y, "multivariate_$(r)traits")

# Run IHT with simple gradient update

In [5]:
# The multivariate fit is called with transposed inputs: Yt and Zt are
# materialized transposes, and the genotypes are wrapped lazily in `Transpose`.
# NOTE(review): k = 22 appears to count nonzero entries across both traits,
# intercepts included (9 + 1 + 11 + 1 in the printed summary) — confirm.
Yt = Matrix(transpose(Y))
Zt = Matrix(transpose(z))
@time result = fit_iht(Yt, Transpose(xla), Zt; k=22, verbose=true)

****                   MendelIHT Version 1.4.0                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse Multivariate Gaussian regression
Link functin = IdentityLink()
Sparsity parameter (k) = 22
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Debias = off
Max IHT iterations = 100
Converging when tol < 0.0001:

Iteration 1: loglikelihood = -118.11809635085024, backtracks = 0, tol = 0.012019767029716565
Iteration 2: loglikelihood = 1573.2429442728142, backtracks = 0, tol = 0.004550375318943792
Iteration 3: loglikelihood = 2355.4487593015833, backtracks = 0, tol = 0.0016793596007401404
Iteration 4: loglikelihood = 2553.3630550433586, backtracks =


Compute time (sec):     1.5595719814300537
Final loglikelihood:    2637.102522166727
Iterations:             8

Trait 1: IHT estimated 9 nonzero SNP predictors
[1m9×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │      782    -0.38656
   2 │      901     0.775385
   3 │     1204     0.734345
   4 │     1306    -1.45903
   5 │     1655    -0.184918
   6 │     3160    -0.845601
   7 │     4201     0.311749
   8 │     6047     0.150917
   9 │     6879    -1.22952

Trait 1: IHT estimated 1 non-genetic predictors
[1m1×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │        1      100.128

Trait 2: IHT estimated 11 nonzero SNP predictors
[1m11×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     

## Check answer

In [5]:
# Trait 1: estimated effects (left column) next to the true effects (right
# column), restricted to the positions where the true effect is nonzero
β1 = result.beta[1, :]
true_b1_idx = findall(!iszero, true_b[:, 1])
hcat(β1[true_b1_idx], true_b[true_b1_idx, 1])

10×2 Array{Float64,2}:
 -0.38711   -0.402269
  0.773282   0.758756
  0.732931   0.729135
 -1.45665   -1.47163
 -0.186112  -0.172668
 -0.8451    -0.847906
  0.306102   0.296183
  0.0       -0.0034339
  0.152059   0.125965
 -1.23006   -1.24972

In [6]:
# Trait 2: estimated effects (left column) next to the true effects (right
# column), restricted to the positions where the true effect is nonzero
β2 = result.beta[2, :]
true_b2_idx = findall(!iszero, true_b[:, 2])
hcat(β2[true_b2_idx], true_b[true_b2_idx, 2])

10×2 Array{Float64,2}:
  1.73499     1.73729
 -1.20023    -1.19911
  0.0         0.0121193
 -0.966437   -0.969569
  0.53347     0.540525
 -0.591593   -0.609556
  0.507787    0.481189
 -0.0706648  -0.0524866
  0.33863     0.31182
  1.28155     1.29813

In [7]:
# Non-genetic covariates: estimated intercepts (left) vs. simulated intercepts (right)
hcat(result.c, intercepts')

2×2 Array{Float64,2}:
  100.128   100.0
 -100.853  -101.0

In [8]:
# Covariance matrix: estimated entries (left) vs. true entries (right), each
# flattened to a column with `vec`
hcat(vec(result.Σ), vec(true_Σ))

4×2 Array{Float64,2}:
 1.15024  1.22512
 1.17493  1.23674
 1.17493  1.23674
 1.43489  1.494

# Test Cross validation

In [38]:
# Transposed copies of the response and covariates, as in the fit above
Yt = Matrix(transpose(Y))
Zt = Matrix(transpose(z))

# Seed immediately before cross validation so the run is reproducible, then
# evaluate every sparsity level from 15 to 25
Random.seed!(2020)
@time mses = cv_iht(Yt, x, Zt; path=15:25);



Crossvalidation Results:
	k	MSE
	15	680.8312394187362
	16	654.1440855866089
	17	631.2314087955
	18	625.4281658724203
	19	621.7835698511656
	20	622.0887666330024
	21	622.0797588884208
	22	623.7460645901206
	23	624.740126376586
	24	627.6406541125037
	25	627.6441757767382
 44.832151 seconds (4.65 M allocations: 371.081 MiB, 0.23% gc time)


In [9]:
# Index of the smallest cross-validation error. This is a position within `mses`,
# not a sparsity level; presumably the best k is the path entry at this index —
# verify against how `mses` was produced.
last(findmin(mses))

9

# GEMMA multivariate results

In [17]:
# Load GEMMA's multivariate association output into a DataFrame
gemma_df = CSV.read("gemma.result.assoc.txt", DataFrame)

# p-values from the three tests GEMMA reports
pval_wald = gemma_df.p_wald
pval_lrt = gemma_df.p_lrt
pval_score = gemma_df.p_score

# per-SNP effect estimates for trait 1 and trait 2
estim_β1 = gemma_df.beta_1
estim_β2 = gemma_df.beta_2

# entries of the `Vbeta_*` columns
# NOTE(review): these look like per-SNP (co)variances of the beta estimates — confirm
estim_σ11 = gemma_df.Vbeta_1_1
estim_σ12 = gemma_df.Vbeta_1_2
estim_σ22 = gemma_df.Vbeta_2_2;

In [18]:
# truly causal SNPs: first component of each causal position
correct_snps = map(pos -> pos[1], correct_position)

# GEMMA's selected SNPs: LRT p-value at or below the threshold 0.05 / p
signif_snps = findall(≤(0.05 / p), pval_lrt)

# causal SNPs that GEMMA recovered
intersect(signif_snps, correct_snps)

12-element Array{Int64,1}:
   46
  901
 1204
 1306
 3160
 4797
 5616
 6072
 6573
 6879
 7236
 9763

**Conclusion:** IHT finds 18/20 SNPs, while GEMMA finds 12/20. 

# Compare with univariate IHT

+ `fit_iht`: multivariate is ~20 times slower than univariate
+ `cv_iht`: multivariate is ~70 times slower than univariate

In [49]:
# Problem dimensions and model for the single-trait (univariate) comparison
n, p = 1000, 10000  # samples × SNPs
k = 10              # causal SNPs
d = Normal
l = canonicallink(d())

# Fix the seed so the simulated genotypes and phenotype are reproducible
Random.seed!(2021)

# Genotypes with no missing data (`undef` is passed where the multivariate run
# gave a `.bed` file name), plus a centered and scaled additive-model wrapper
x = simulate_random_snparray(undef, n, p)
xla = SnpLinAlg{Float64}(x; model=ADDITIVE_MODEL, center=true, scale=true)

# Non-genetic covariates: only an intercept
z = fill(1.0, n)
intercept = 1.0

# Response Y, true model b, and the nonzero positions of b
Y, true_b, correct_position =
    simulate_random_response(xla, k, d, l; Zu=z * intercept);

In [50]:
# Univariate fit with the distribution and link chosen above.
# NOTE(review): k = 11 appears to cover the 10 causal SNPs plus the intercept — confirm.
@time result = fit_iht(Y, xla, z; d=d(), l=l, k=11)

****                   MendelIHT Version 1.4.0                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse linear regression
Link functin = IdentityLink()
Sparsity parameter (k) = 11
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Debias = off
Max IHT iterations = 100
Converging when tol < 0.0001:

Iteration 1: loglikelihood = -1577.170794759688, backtracks = 0, tol = 0.609864675283163
Iteration 2: loglikelihood = -1484.8568136206177, backtracks = 0, tol = 0.1269955771967065
Iteration 3: loglikelihood = -1472.9529635904933, backtracks = 0, tol = 0.05823372413927707
Iteration 4: loglikelihood = -1472.5366421393842, backtracks = 1, tol = 0.00450895


IHT estimated 10 nonzero SNP predictors and 1 non-genetic predictors.

Compute time (sec):     0.06683206558227539
Final loglikelihood:    -1472.3905989669602
Iterations:             10

Selected genetic predictors:
[1m10×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │      782    -0.437828
   2 │      901     0.747956
   3 │     1204     0.691327
   4 │     1306    -1.42505
   5 │     1655    -0.19456
   6 │     3160    -0.861591
   7 │     3936    -0.147235
   8 │     4201     0.338606
   9 │     4402    -0.126472
  10 │     6879    -1.21895

Selected nongenetic predictors:
[1m1×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │        1      1.02016

In [41]:
# compare estimated vs true beta values
# estimated (left) vs. true (right) effect sizes at the truly causal positions
hcat(result.beta[correct_position], true_b[correct_position])

10×2 Array{Float64,2}:
 -0.437828  -0.402269
  0.747956   0.758756
  0.691327   0.729135
 -1.42505   -1.47163
 -0.19456   -0.172668
 -0.861591  -0.847906
  0.338606   0.296183
  0.0       -0.0034339
  0.0        0.125965
 -1.21895   -1.24972

### Cross validation timing

In [92]:
# Timing on 1 core. Seed first so the cross-validation run is reproducible;
# no `path` is given here, so `cv_iht` uses its default set of sparsity levels.
Random.seed!(2020)
@time cv_iht(Y, x, z);



Crossvalidation Results:
	k	MSE
	1	1221.686764265696
	2	864.1553085156108
	3	661.6430376327284
	4	516.2679723147669
	5	418.2721191658746
	6	342.7931325462592
	7	303.15938053956916
	8	276.6241892508097
	9	278.29180795455045
	10	280.31878745464377
	11	282.10757732881564
	12	285.1294156734012
	13	293.16875008847467
	14	297.43739905389987
	15	296.17300065041906
	16	307.7814278347698
	17	302.0113818893564
	18	312.70262374274137
	19	309.87367962700966
	20	315.6750311115386
  4.315330 seconds (9.35 M allocations: 332.277 MiB, 1.12% gc time)
