In [1]:
using Distributed
# addprocs(4)

@everywhere begin
    using Revise
    using MendelIHT
    using SnpArrays
    using Random
    using GLM
    using DelimitedFiles
    using Test
    using Distributions
    using LinearAlgebra
    using CSV
    using DataFrames
end

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1278


# Simulate data

There are 20 true SNPs. 

In [77]:
# Simulation settings. Every name defined in this cell is a notebook global that
# later cells read (`Y`, `x`, `xla`, `z`, `true_b`, `correct_position`).
n = 1000  # number of samples
p = 10000 # number of SNPs
k = 20    # number of causal SNPs per trait
d = Normal             # response distribution, kept as a type and instantiated with `d()` below
l = canonicallink(d()) # canonical link for `d` (the fit logs further down print IdentityLink())

# set random seed for reproducibility
# (must run before the simulation calls below so that they draw from the seeded RNG)
Random.seed!(2021)

# simulate `.bed` file with no missing data
x = simulate_random_snparray(undef, n, p)
# Float64 linear-algebra wrapper around `x`, requesting the additive model with
# centering and scaling of the genotypes
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# intercept is the only nongenetic covariate
z = ones(n)
intercept = 0.0 # with a zero intercept, `z*intercept` below is a vector of zeros

# simulate response y, true model b, and the correct non-0 positions of b
# (the trailing `;` suppresses the notebook's display of the returned tuple)
Y, true_b, correct_position = simulate_random_response(xla, k, d, l, Zu=z*intercept);

# If $k$ is unknown, try clustering by absolute value

Here is a simple projection based on k-means clustering.

In [14]:
"""
    project_by_clustering!(x::AbstractVector; maxiter = 10)

Project `x` to sparsity in place by clustering its entries on absolute value.

Run `maxiter` iterations of k-means with 2 clusters on `abs.(x)`, starting from the
centers 0 and 1, and then set to zero every entry of `x` assigned to the cluster with
the smaller center. If both centers are equal, the second cluster is the one zeroed.

A cluster that receives no members in an iteration keeps its previous center instead
of being assigned the undefined mean `0/0 = NaN`. This keeps the function well defined
for empty input, all-zero input, and inputs whose magnitudes all start out closer to
one of the two initial centers.

# Arguments
- `x`: real vector to sparsify; modified in place.

# Keywords
- `maxiter`: number of k-means iterations to run (default `10`).

# Returns
The tuple `(center1, center2, members1, members2)`: the final cluster centers and the
indices of `x` assigned to each cluster in the last iteration.

# Throws
- `ArgumentError` if `maxiter` is negative.
- `ErrorException` if a final center is not `≥ 0`, which can only happen when `x`
  contains `NaN`.
"""
function project_by_clustering!(
    x::AbstractVector{T};
    maxiter::Integer = 10,
) where T <: Real
    maxiter ≥ 0 || throw(ArgumentError("maxiter must be nonnegative, got $maxiter"))

    # Centers live in the type produced by `sum / count`, so they never change type
    # inside the loop (e.g. Float64 for integer input).
    S = typeof(zero(T) / 1)
    center1, center2 = zero(S), one(S)
    members1, members2 = Int[], Int[]

    for _ in 1:maxiter
        # assign each |x[i]| to the nearest center; ties go to cluster 2
        empty!(members1)
        empty!(members2)
        for i in eachindex(x)
            xi = abs(x[i])
            if abs2(xi - center1) < abs2(xi - center2)
                push!(members1, i)
            else
                push!(members2, i)
            end
        end

        # update cluster centers; an empty cluster keeps its previous center
        center1 = _mean_abs_or(x, members1, center1)
        center2 = _mean_abs_or(x, members2, center2)
    end
    center1 ≥ 0 || error("center 1 is negative! Shouldn't happen!")
    center2 ≥ 0 || error("center 2 is negative! Shouldn't happen!")

    # project cluster with smaller mean
    for i in (center1 < center2 ? members1 : members2)
        x[i] = zero(T)
    end

    return center1, center2, members1, members2
end

# Mean of `abs(x[i])` over `i in members`, or `fallback` when `members` is empty.
function _mean_abs_or(x::AbstractVector{T}, members, fallback) where T <: Real
    isempty(members) && return fallback
    total = zero(T)
    for i in members # sequential accumulation, same summation order as a plain loop
        total += abs(x[i])
    end
    return total / length(members)
end

project_by_clustering!

# Run full IHT with clustering

In [78]:
# Fit IHT on the simulated data without passing a sparsity level; `@time` reports
# the elapsed time of the call and `result` is reused by the comparison cell below.
# NOTE(review): the log printed below reports "Sparsity parameter (k) = 10" —
# confirm whether that is a package default or the output of the clustering projection.
@time result = fit_iht(Y, xla, z) # doesn't need to specify k!

****                   MendelIHT Version 1.3.3                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse linear regression
Link functin = IdentityLink()
Sparsity parameter (k) = 10
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 100
Converging when tol < 0.0001:

number of nonzero entries in full_grad = 12
Iteration 1: loglikelihood = -1652.4728974141558, backtracks = 0, tol = 0.690994930063011
number of nonzero entries in full_grad = 12
number of nonzero entries in full_grad = 10
Iteration 2: loglikelihood = -1643.3185656114504, backtracks = 1, tol = 0.03897139371315082
number of nonzero entries in full_grad = 12
number of


IHT estimated 10 nonzero SNP predictors and 0 non-genetic predictors.

Compute time (sec):     0.0810849666595459
Final loglikelihood:    -1640.0333203979542
Iterations:             11

Selected genetic predictors:
10×2 DataFrame
 Row │ Position  Estimated_β 
     │ Int64     Float64     
─────┼───────────────────────
   1 │      921     0.371189
   2 │     1079    -0.770944
   3 │     1320    -0.748608
   4 │     1733    -0.438681
   5 │     2767    -0.409249
   6 │     2942     0.595825
   7 │     3174    -0.445867
   8 │     6889    -0.611849
   9 │     7036    -0.641182
  10 │     7960     0.620884

Selected nongenetic predictors:
0×2 DataFrame

In [79]:
# compare estimated vs true beta values
# Builds a two-column matrix restricted to the true causal positions:
# column 1 holds `result.beta`, column 2 holds the simulated `true_b`.
# A 0.0 in column 1 presumably means that causal SNP was not selected.
[result.beta[correct_position] true_b[correct_position]]

20×2 Array{Float64,2}:
  0.0       -0.0808294
  0.0       -0.201134
  0.371189   0.379378
 -0.770944  -0.769173
  0.0        0.364568
 -0.748608  -0.735814
  0.0       -0.0863341
 -0.438681  -0.48582
 -0.409249  -0.367884
  0.595825   0.608448
 -0.445867  -0.423953
  0.0        0.32746
  0.0        0.148092
  0.0       -0.194033
  0.0       -0.00171695
  0.0        0.0203646
  0.0        0.0629827
 -0.611849  -0.624858
 -0.641182  -0.622936
  0.620884   0.639766

**Conclusion:** Dynamically updating $k$ finds 10/20 predictors.

### Try cross validation

In [50]:
# Cross-validate IHT over the candidate sparsity levels k = 1, ..., 30; the table
# printed below lists one MSE per k.
# NOTE(review): this call passes the raw `x`, whereas the fits in this notebook use
# `xla` — confirm that is intended.
mses = cv_iht(Y, x, z, path=1:30)
# `argmin` returns an index into `mses`; it coincides with the best k only because
# `path` starts at 1 and steps by 1.
argmin(mses)



Crossvalidation Results:
	k	MSE
	1	2922.582428341611
	2	2497.143700280606
	3	2026.205124704225
	4	1685.342493595061
	5	1495.6359176179424
	6	1115.6698005623014
	7	981.9198742270855
	8	797.7348367435474
	9	695.4183093345118
	10	566.2847691448469
	11	427.7883447891499
	12	337.7022232893082
	13	301.3347158784928
	14	277.7846947695629
	15	261.65651932009206
	16	256.83821523426735
	17	255.95744550292497
	18	261.75862693331305
	19	261.99788681127836
	20	269.7223615993907
	21	279.37025534346765
	22	273.81620031246956
	23	282.0862375147658
	24	279.812271764295
	25	289.1082388485988
	26	295.81662961085283
	27	297.94289347572
	28	297.50114325139856
	29	313.0682987518701
	30	311.39374427185305


17

### Run regular IHT with $k = 17$

In [67]:
# Refit IHT with the sparsity level chosen by cross validation.
# NOTE: `argmin(mses)` is an index into `mses`; it is a valid `k` here only because
# the cross-validation `path` was 1:30 (index i corresponds to k = i).
@time result = fit_iht(Y, xla, z, k=argmin(mses))
# Column 1: estimated effects at the true causal positions; column 2: simulated effects.
[result.beta[correct_position] true_b[correct_position]]

****                   MendelIHT Version 1.3.3                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse linear regression
Link functin = IdentityLink()
Sparsity parameter (k) = 17
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 100
Converging when tol < 0.0001:

Iteration 1: loglikelihood = -1884.7728490421966, backtracks = 0, tol = 1.2248089246902358
Iteration 2: loglikelihood = -1550.1401891555495, backtracks = 0, tol = 0.2181789842425944
Iteration 3: loglikelihood = -1492.2623013446416, backtracks = 0, tol = 0.09161758927788029
Iteration 4: loglikelihood = -1490.6825325158013, backtracks = 0, tol = 0.010866354423797207


20×2 Array{Float64,2}:
 -0.16437   -0.161659
 -0.426789  -0.402269
  0.744609   0.758756
 -1.53655   -1.53835
  0.774703   0.729135
 -1.48857   -1.47163
 -0.201474  -0.172668
 -0.917505  -0.971641
 -0.76298   -0.735768
  1.23738    1.2169
 -0.85381   -0.847906
  0.684522   0.654921
  0.285958   0.296183
 -0.379468  -0.388067
  0.0       -0.0034339
  0.0        0.0407291
  0.0        0.125965
 -1.2369    -1.24972
 -1.2681    -1.24587
  1.2425     1.27953

**Conclusion:** Dynamically updating $k$ finds 10/20 predictors, while cross validation finds 17/20.