In [1]:
using Distributed
# addprocs(4)

@everywhere begin
    using Revise
    using MendelIHT
    using SnpArrays
    using Random
    using GLM
    using DelimitedFiles
    using Test
    using Distributions
    using LinearAlgebra
    using CSV
    using DataFrames
end

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1278


# Simulate data

There are 20 true SNPs. 

In [33]:
# Problem dimensions and response model for the simulation. These are plain
# globals; the cells that follow read them by name.
n = 1000  # number of samples
p = 10000 # number of SNPs
k = 20    # number of causal SNPs per trait
# response distribution (passed as a type; instantiated with `d()` where needed)
d = Normal
# canonical link function of the response distribution
l = canonicallink(d())

# set random seed for reproducibility
# (must run before the simulation calls below, which draw random numbers)
Random.seed!(2021)

# simulate `.bed` file with no missing data
x = simulate_random_snparray(undef, n, p)
# wrap `x` for Float64 linear algebra: additive model, columns centered and scaled
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# intercept is the only nongenetic covariate
z = ones(n)
intercept = 0.0

# simulate response y, true model b, and the correct non-0 positions of b
# `Zu = z*intercept` is the nongenetic contribution; it is all zeros here because
# `intercept = 0.0`. The trailing `;` suppresses the notebook's display of the result.
Y, true_b, correct_position = simulate_random_response(xla, k, d, l, Zu=z*intercept);

# If $k$ is unknown, try clustering by absolute value

Here is a simple projection based on k-means clustering. 

In [34]:
"""
    project_by_clustering!(x::AbstractVector, groups::Int=3)

Project `x` to sparsity by clustering its entries on absolute value.

Run 10 iterations of k-means on `abs.(x)` with `groups` clusters whose centers
start evenly spaced on `[0, 1]`, then set to zero every entry of `x` that belongs
to the non-empty cluster with the smallest mean.

# Arguments
- `x`: vector to sparsify; modified in place.
- `groups`: number of clusters, must be at least 1. With `groups == 1` every
  entry falls in the only cluster and is therefore zeroed.

# Returns
A tuple `(centers, members)`: `centers[j]` is the mean absolute value of cluster
`j` after the last iteration (`0` for a cluster that ended up empty), and
`members[j]` holds the indices of `x` assigned to cluster `j`.

# Throws
- `ArgumentError` if `groups < 1`.
- `DomainError` if `x` contains a non-finite entry (`NaN` or `±Inf`).
"""
function project_by_clustering!(x::AbstractVector{T}, groups::Int=3) where T <: Real
    groups ≥ 1 || throw(ArgumentError("groups must be at least 1, got $groups"))

    # initialize cluster centers; a single cluster is special-cased because the
    # spacing 1/(groups-1) would divide by zero
    centers = groups == 1 ? zeros(T, 1) : collect(T, 0:1/(groups-1):1)
    members = [Int[] for _ in 1:groups]

    # run 10 iterations of k-means
    for _ in 1:10
        # assign xᵢ to the nearest cluster (ties go to the lowest cluster index)
        foreach(empty!, members) # refresh cluster members
        for i in eachindex(x)
            xi = abs(x[i])
            # a NaN/Inf entry has no nearest center; fail with a clear message
            isfinite(xi) ||
                throw(DomainError(x[i], "cannot cluster non-finite entry at index $i"))
            best_dist, best_group = abs2(xi - centers[1]), 1
            for j in 2:groups
                dist = abs2(xi - centers[j])
                if dist < best_dist
                    best_dist = dist
                    best_group = j
                end
            end
            push!(members[best_group], i)
        end

        # update cluster centers; an empty cluster keeps the placeholder center 0
        fill!(centers, zero(T))
        for j in 1:groups
            isempty(members[j]) && continue
            for i in members[j]
                centers[j] += abs(x[i])
            end
            centers[j] /= length(members[j])
            centers[j] ≥ 0 || error("center $j = $(centers[j]) is negative! Shouldn't happen!")
        end
    end

    # Pick the non-empty cluster with the smallest mean. Empty clusters are skipped:
    # their center of 0 is a placeholder, and choosing one would project nothing.
    mincluster = 0
    for j in 1:groups
        isempty(members[j]) && continue
        if mincluster == 0 || centers[j] < centers[mincluster]
            mincluster = j
        end
    end

    # mincluster stays 0 only when `x` is empty, in which case there is nothing to do
    if mincluster != 0
        for i in members[mincluster]
            x[i] = zero(T)
        end
    end

    return centers, members
end

project_by_clustering!

# Run full IHT with clustering

Here $k$ represents the number of clusters

In [37]:
# Fit IHT on the simulated data without passing a `k` keyword, and time the call.
# NOTE(review): the captured output below prints per-cluster means, so this was
# presumably run against a locally modified `fit_iht` that projects by clustering —
# confirm before relying on these numbers.
@time result = fit_iht(Y, xla, z)

****                   MendelIHT Version 1.3.3                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse linear regression
Link functin = IdentityLink()
Sparsity parameter (k) = 10
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 100
Converging when tol < 0.0001:

number of nonzero entries kept = 7346
cluster 1 has mean 0.06610620795135885 with 2560 members
cluster 2 has mean 0.11445487120929407 with 2188 members
cluster 3 has mean 0.17133336683747977 with 1542 members
cluster 4 has mean 0.23524812117658506 with 762 members
cluster 5 has mean 0.3089214956644556 with 242 members
cluster 6 has mean 0.4056247851157432 with 40 m


IHT estimated 10000 nonzero SNP predictors and 1 non-genetic predictors.

Compute time (sec):     3.1001930236816406
Final loglikelihood:    5537.880760119945
Iterations:             31

Selected genetic predictors:
[1m10000×2 DataFrame[0m
[1m   Row [0m│[1m Position [0m[1m Estimated_β  [0m
[1m       [0m│[90m Int64    [0m[90m Float64      [0m
───────┼────────────────────────
     1 │        1  -0.000321246
     2 │        2  -0.00877949
     3 │        3   0.0211276
     4 │        4   0.00522768
     5 │        5   0.0134751
     6 │        6  -0.0174565
     7 │        7   0.0140031
     8 │        8   0.00726093
     9 │        9   0.0173385
    10 │       10  -0.00427415
    11 │       11  -0.00786451
   ⋮   │    ⋮           ⋮
  9991 │     9991  -0.00661741
  9992 │     9992  -0.00596259
  9993 │     9993   0.00671866
  9994 │     9994  -0.0226632
  9995 │     9995  -0.00982646
  9996 │     9996  -0.0160549
  9997 │     9997   0.0133445
  9998 │     9998  -0.00301655


In [31]:
# compare estimated vs true beta values
# Two-column matrix restricted to the causal positions:
# column 1 = estimated beta, column 2 = true beta.
[result.beta[correct_position] true_b[correct_position]]

10×2 Array{Float64,2}:
  0.0       -0.402269
  0.733425   0.758756
  0.670327   0.729135
 -1.42341   -1.47163
  0.0       -0.172668
 -0.856152  -0.847906
  0.0        0.296183
  0.0       -0.0034339
  0.0        0.125965
 -1.22742   -1.24972

**Conclusion:** Dynamically updating $k$ finds 12/20 predictors.

### Try cross validation

In [19]:
# Cross validate over the candidate sparsity levels k = 1, ..., 30.
# NOTE(review): this call passes the raw `x`, whereas the `fit_iht` calls use the
# centered/scaled `xla` — confirm this is intended (the MSEs in the captured output
# are nearly identical across all k).
mses = cv_iht(Y, x, z, path=1:30)
# Index of the smallest entry of `mses`. It equals the chosen k only because
# `path` starts at 1 with step 1.
argmin(mses)



Crossvalidation Results:
	k	MSE
	1	340.3993737718125
	2	340.3995331984139
	3	340.39937856046635
	4	340.3991680355956
	5	340.39926036316706
	6	340.3999458878631
	7	340.40009806128955
	8	340.3993093482102
	9	340.39929641773284
	10	340.39926481141316
	11	340.399352373388
	12	340.39943174709276
	13	340.3992900106386
	14	340.3993026117015
	15	340.3993971804856
	16	340.39940488080003
	17	340.39953623177587
	18	340.3997596183156
	19	340.3999596299992
	20	340.39983515333694
	21	340.4001399544
	22	340.399291637834
	23	340.39883966540816
	24	340.3988293009556
	25	340.3988182744421
	26	340.3988070687493
	27	340.39880633108544
	28	340.3989986231407
	29	340.39901293296293
	30	340.39902169578255


27

### Run regular IHT with the cross-validated $k$ (here $k = 27$)

In [20]:
# Refit IHT with the sparsity level selected by cross validation and time the call.
# `argmin(mses)` is used directly as k, which relies on the CV path having been 1:30.
@time result = fit_iht(Y, xla, z, k=argmin(mses))
# Side-by-side comparison at the causal positions:
# column 1 = estimated beta, column 2 = true beta.
[result.beta[correct_position] true_b[correct_position]]

****                   MendelIHT Version 1.3.3                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse linear regression
Link functin = IdentityLink()
Sparsity parameter (k) = 27
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 100
Converging when tol < 0.0001:

Iteration 1: loglikelihood = -1920.3446678703415, backtracks = 0, tol = 1.0382143858583603
Iteration 2: loglikelihood = -1661.3157264955626, backtracks = 0, tol = 0.22015859364984378
Iteration 3: loglikelihood = -1656.7577011746137, backtracks = 0, tol = 0.028960415766784334
Iteration 4: loglikelihood = -1656.668060843034, backtracks = 0, tol = 0.004586605809942993

20×2 Array{Float64,2}:
  0.0       -0.161659
  0.0       -0.402269
  0.75665    0.758756
 -1.53159   -1.53835
  0.776944   0.729135
 -1.48858   -1.47163
  0.0       -0.172668
 -0.95268   -0.971641
 -0.782955  -0.735768
  1.24029    1.2169
 -0.856995  -0.847906
  0.677345   0.654921
  0.0        0.296183
  0.0       -0.388067
  0.0       -0.0034339
  0.0        0.0407291
  0.0        0.125965
 -1.23779   -1.24972
 -1.21665   -1.24587
  1.25041    1.27953

**Conclusion:** Dynamically updating $k$ finds 12/20 predictors, while cross validation finds 17/20.