In [1]:
using Revise
using MendelIHT
using SnpArrays
using Random
using GLM
using DelimitedFiles
using Test
using Distributions
using LinearAlgebra
using CSV
using DataFrames
using StatsBase
# Restrict BLAS to a single thread for the whole session.
# NOTE(review): presumably this avoids oversubscribing cores when MendelIHT runs on
# several Julia threads — confirm against the MendelIHT documentation.
BLAS.set_num_threads(1) # remember to set BLAS threads to 1 !!!
# Optional packages, currently disabled:
#     using TraitSimulation, OrdinalMultinomialModels, VarianceComponentModels

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1317


# Univariate Gaussian trait

In [2]:
# Simulation settings for the univariate Gaussian example. These globals are
# reused by the cells that follow.
n = 1000  # number of samples
p = 10000 # number of SNPs
q = 5     # number of non-genetic covariates
k = 10    # number of causal SNPs per trait
d = Normal                # trait distribution type; instantiated as `d()` where needed
l = canonicallink(d())    # canonical link function of `d`

# set random seed for reproducibility
# NOTE(review): the draws below presumably all consume the global RNG, so the
# statement order must be kept to reproduce the recorded outputs — confirm.
Random.seed!(2021)

# simulate `.bed` file with no missing data
x = simulate_random_snparray(undef, n, p)
# Float64 linear-algebra wrapper around `x`: additive model, centered and scaled
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# non-genetic covariates, 1st column is intercept
z = randn(n, q)
z[:, 1] .= 1
c = randn(q) # true effects of the non-genetic covariates (enter the model via `Zu=z*c`)

# simulate response y, true model b, and the correct non-0 positions of b
y, true_b, correct_position = simulate_random_response(xla, k, d, l, Zu=z*c);

## Run IHT

In [4]:
# Fit a k-sparse model with IHT. The sparsity level `k`, the trait distribution `d`
# and the link `l` come from the simulation cell, so the fit stays in sync with the
# simulated model if those settings change (currently k = 10, Normal and its
# canonical link, i.e. the same values the call used before).
# `init_beta=true` initializes β to univariate regression values.
@time result = fit_iht(y, xla, z, k=k, d=d(), l=l, init_beta=true)

****                   MendelIHT Version 1.4.1                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Initializing β to univariate regression values...
...completed in 0.1 seconds.

Running sparse linear regression
Number of threads = 8
Link functin = IdentityLink()
Sparsity parameter (k) = 10
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 200
Converging when tol < 0.0001 and iteration ≥ 5:

Iteration 1: loglikelihood = -3002.723211083534, backtracks = 0, tol = 0.6926932469992776
Iteration 2: loglikelihood = -1805.3237453585161, backtracks = 0, tol = 0.34637684087692966
Iteration 3: loglikelihood = -1460.2586248070177, backtracks = 0,


IHT estimated 10 nonzero SNP predictors and 5 non-genetic predictors.

Compute time (sec):     0.06963992118835449
Final loglikelihood:    -1422.4040762615862
SNP PVE:                0.7545490924593975
Iterations:             8

Selected genetic predictors:
[1m10×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │     1487    -0.678565
   2 │     1734    -0.169403
   3 │     2097    -2.13101
   4 │     2266    -0.235182
   5 │     2551     0.16789
   6 │     4775     2.29512
   7 │     4791     0.763704
   8 │     4955    -0.417309
   9 │     7933    -1.72017
  10 │     8710    -0.565201

Selected nongenetic predictors:
[1m5×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │        1    0.125532
   2 │        2    1.15239
   3 │        3   -0.58176
   4 │        4

## Check answer

In [5]:
# Side-by-side comparison at the causal positions:
# column 1 = true effect sizes, column 2 = IHT estimates.
hcat(true_b[correct_position], result.beta[correct_position])

10×2 Matrix{Float64}:
 -0.674202  -0.678565
 -0.212237  -0.169403
 -2.16656   -2.13101
 -0.203392  -0.235182
  0.165819   0.16789
  2.30263    2.29512
  0.687439   0.763704
 -0.405677  -0.417309
 -1.66149   -1.72017
 -0.546303  -0.565201

In [6]:
# Non-genetic covariates: column 1 = estimated effects, column 2 = true effects `c`.
hcat(result.c, c)

5×2 Matrix{Float64}:
  0.125532    0.0284927
  1.15239     1.13825
 -0.58176    -0.566457
  1.42219     1.35203
 -0.0582245  -0.0531031

## Test Cross validation

In [7]:
# Show the number of Julia threads available in this session before cross validating.
Threads.nthreads()

8

In [8]:
# Seed the global RNG before cross validation.
# NOTE(review): presumably `cv_iht` draws its fold assignment from this RNG — confirm.
Random.seed!(2020)
# Cross validate over the sparsity levels k = 0, 1, ..., 20 (`path=0:20`).
# The trailing `;` suppresses echoing the returned value, which is kept in `mses`.
@time mses = cv_iht(y, xla, z, path=0:20, init_beta=true);

****                   MendelIHT Version 1.4.1                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****



[32mCross validating...100%|████████████████████████████████| Time: 0:00:06[39m




Crossvalidation Results:
	k	MSE
	0	3046.1068850324336
	1	2014.9680141393687
	2	1153.520612853073
	3	527.4303875427266
	4	421.56539803705937
	5	334.607065931551
	6	267.76045903864633
	7	231.18007562734212
	8	218.7115381296477
	9	219.93024184511523
	10	212.56326397610883
	11	216.42591815912317
	12	218.52008060795322
	13	225.76746161959892
	14	231.14431195338898
	15	234.36915696941708
	16	233.17497749362144
	17	240.68670944533042
	18	248.3512559330554
	19	245.9563361500128
	20	246.6022429280416

Best k = 10

  7.188606 seconds (35.71 M allocations: 7.338 GiB, 24.04% gc time)


# Logistic (binary) traits

In [25]:
# Simulation settings for the logistic (binary trait) example. These overwrite the
# globals of the Gaussian example and are reused by the cells that follow.
n = 1000  # number of samples
p = 10000 # number of SNPs
q = 3     # number of non-genetic covariates
k = 10    # number of causal SNPs per trait
d = Bernoulli             # trait distribution type; instantiated as `d()` where needed
l = canonicallink(d())    # canonical link function of `d`

# set random seed for reproducibility
# NOTE(review): the draws below presumably all consume the global RNG, so the
# statement order must be kept to reproduce the recorded outputs — confirm.
Random.seed!(2021)

# simulate `.bed` file with no missing data
x = simulate_random_snparray(undef, n, p)
# Float64 linear-algebra wrapper around `x`: additive model, centered and scaled
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# non-genetic covariates, 1st column is intercept
z = randn(n, q)
z[:, 1] .= 1
c = randn(q) # true effects of the non-genetic covariates (enter the model via `Zu=z*c`)

# simulate response y, true model b, and the correct non-0 positions of b
y, true_b, correct_position = simulate_random_response(xla, k, d, l, Zu=z*c);

In [26]:
# Fit a sparse logistic regression with IHT. The sparsity level `k`, the trait
# distribution `d` and the link `l` all come from the simulation cell (k is
# currently 10, the value previously hard-coded here), so the fit stays in sync
# with the simulated model.
# `init_beta=false` skips the univariate-regression initialization of β, and
# `max_iter=1000` raises the cap on IHT iterations.
@time result = fit_iht(y, xla, z, k=k, d=d(), l=l, init_beta=false, max_iter=1000)

****                   MendelIHT Version 1.4.1                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse logistic regression
Number of threads = 8
Link functin = LogitLink()
Sparsity parameter (k) = 10
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 1000
Converging when tol < 0.0001 and iteration ≥ 5:

Iteration 1: loglikelihood = -395.77014413725806, backtracks = 0, tol = 0.5303310300461843
Iteration 2: loglikelihood = -346.3417924804989, backtracks = 0, tol = 0.29597732231115287
Iteration 3: loglikelihood = -326.09265897099965, backtracks = 0, tol = 0.26341642913501495
Iteration 4: loglikelihood = -320.75740042076006, bac


IHT estimated 10 nonzero SNP predictors and 3 non-genetic predictors.

Compute time (sec):     0.26929783821105957
Final loglikelihood:    -313.01641704112535
SNP PVE:                0.5791181046176805
Iterations:             44

Selected genetic predictors:
[1m10×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │      181    -0.889481
   2 │     1007     1.00471
   3 │     2417     0.891502
   4 │     3242    -0.641796
   5 │     3264    -0.925003
   6 │     3488    -1.61155
   7 │     6083    -0.84155
   8 │     6934     1.63939
   9 │     7118    -1.06178
  10 │     8119    -0.406123

Selected nongenetic predictors:
[1m3×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │        1    -0.521723
   2 │        2     0.453126
   3 │        3    -1.63495

## Check answers

In [27]:
# Side-by-side comparison at the causal positions:
# column 1 = true effect sizes, column 2 = IHT estimates (0.0 means the SNP was not selected).
hcat(true_b[correct_position], result.beta[correct_position])

10×2 Matrix{Float64}:
 -0.852107  -0.889481
 -0.388558   0.0
  0.989745   1.00471
  0.647143   0.891502
 -0.616519  -0.641796
 -0.992854  -0.925003
 -1.59093   -1.61155
 -0.725757  -0.84155
  1.5154     1.63939
 -1.15028   -1.06178

In [28]:
# Non-genetic covariates: column 1 = estimated effects, column 2 = true effects `c`.
hcat(result.c, c)

3×2 Matrix{Float64}:
 -0.521723  -0.550508
  0.453126   0.38738
 -1.63495   -1.61836