In [1]:
using Distributed
# addprocs(4)

@everywhere begin
    using Revise
    using MendelIHT
    using SnpArrays
    using Random
    using GLM
    using DelimitedFiles
    using Test
    using Distributions
    using LinearAlgebra
    using CSV
    using DataFrames
end

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1278


# Simulate data

There are 20 true SNPs. 

In [82]:
# Simulation settings. Every name defined in this cell is a global that later
# cells read (`x`, `xla`, `z`, `Y`, `true_b`, `correct_position`).
n = 1000  # number of samples
p = 10000 # number of SNPs
k = 20    # number of causal SNPs per trait
# `d` is the response distribution type; `l` is the canonical link of an instance of `d`
d = Normal
l = canonicallink(d())

# set random seed for reproducibility
# (must run before any simulation call below, since they draw random numbers)
Random.seed!(2021)

# simulate `.bed` file with no missing data
x = simulate_random_snparray(undef, n, p)
# wrap `x` for Float64 linear algebra: additive model, centering and scaling enabled
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# intercept is the only nongenetic covariate
# (with `intercept = 0.0`, the `z*intercept` term passed as `Zu` below is all zeros)
z = ones(n)
intercept = 0.0

# simulate response y, true model b, and the correct non-0 positions of b
Y, true_b, correct_position = simulate_random_response(xla, k, d, l, Zu=z*intercept);

# If $k$ is unknown, try clustering by absolute value

Here is a simple projection based on k-means clustering.

In [58]:
"""
    project_by_clustering!(x::AbstractVector, groups::Int=3; maxiter::Int=10)

Project `x` to sparsity by clustering its entries by absolute value.

Runs `maxiter` iterations of one-dimensional k-means on `abs.(x)` with `groups`
clusters, then sets to zero every entry of `x` that belongs to the non-empty
cluster with the smallest mean.

# Arguments
- `x`: vector to sparsify; modified in place.
- `groups`: number of clusters (at least 2). Initial cluster centers are equally
  spaced on `[0, 1]`.

# Keywords
- `maxiter`: number of k-means iterations (at least 1).

# Returns
`(centers, members)`: `centers[j]` is the mean absolute value of cluster `j`
(zero if the cluster is empty) and `members[j]` holds the indices of `x`
assigned to cluster `j` in the last iteration.

# Throws
- `ArgumentError` if `groups < 2` or `maxiter < 1`.
"""
function project_by_clustering!(
    x::AbstractVector{T}, groups::Int=3; maxiter::Int=10
) where T <: Real
    groups ≥ 2 || throw(ArgumentError("groups must be at least 2, got $groups"))
    maxiter ≥ 1 || throw(ArgumentError("maxiter must be at least 1, got $maxiter"))

    # Cluster means are fractional even for integer `x`, so keep them in a
    # floating-point type (identical to `T` when `T` is already a float).
    F = float(T)
    # `range(...; length)` guarantees exactly `groups` initial centers
    centers = collect(F, range(0.0, 1.0; length=groups))
    members = [Int[] for _ in 1:groups]

    for _ in 1:maxiter
        # assign xᵢ to the nearest cluster (ties go to the lowest cluster index)
        foreach(empty!, members) # refresh cluster members
        for i in eachindex(x)
            xi, best_dist, best_group = abs(x[i]), typemax(F), 0
            for j in 1:groups
                dist = abs2(xi - centers[j])
                if dist < best_dist
                    best_dist = dist
                    best_group = j
                end
            end
            push!(members[best_group], i)
        end

        # update cluster centers; an empty cluster keeps a center of zero
        fill!(centers, zero(F))
        for j in 1:groups
            for i in members[j]
                centers[j] += abs(x[i])
            end
            isempty(members[j]) || (centers[j] /= length(members[j]))
            centers[j] ≥ 0 || error("center $j = $(centers[j]) is negative! Shouldn't happen!")
        end
    end

    # Pick the smallest mean among NON-EMPTY clusters. An empty cluster has its
    # center reset to zero, so a plain `findmin(centers)` could select it and
    # leave `x` untouched.
    mincluster = 0
    for j in 1:groups
        isempty(members[j]) && continue
        if mincluster == 0 || centers[j] < centers[mincluster]
            mincluster = j
        end
    end

    # project cluster with smallest mean (no-op when `x` is empty)
    if mincluster != 0
        for i in members[mincluster]
            x[i] = zero(T)
        end
    end

    return centers, members
end

project_by_clustering!

# Run full IHT with clustering

Here $k$ represents the number of clusters

In [83]:
# Fit IHT on the simulated data and time the call.
# NOTE(review): `k=2` is meant as the number of clusters for the clustering-based
# projection rather than a fixed sparsity level — confirm against the MendelIHT build in use.
@time result = fit_iht(Y, xla, z, k=2)

****                   MendelIHT Version 1.3.3                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse linear regression
Link functin = IdentityLink()
Sparsity parameter (k) = 2
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 100
Converging when tol < 0.0001:

cluster 1 has mean 0.09897039900604343 with 9989 members
cluster 2 has mean 1.0296528284922977 with 12 members
Iteration 1: loglikelihood = -1706.7140423805665, backtracks = 0, tol = 1.3804772210599874
cluster 1 has mean 0.03785743987018223 with 9989 members
cluster 2 has mean 1.0544671540047375 with 12 members
Iteration 2: loglikelihood = -1657.4869549672399, backt


IHT estimated 12 nonzero SNP predictors and 0 non-genetic predictors.

Compute time (sec):     0.03905916213989258
Final loglikelihood:    -1656.666069606865
Iterations:             6

Selected genetic predictors:
[1m12×2 DataFrame[0m
[1m Row [0m│[1m Position [0m[1m Estimated_β [0m
[1m     [0m│[90m Int64    [0m[90m Float64     [0m
─────┼───────────────────────
   1 │      921     0.756641
   2 │     1079    -1.53159
   3 │     1216     0.776927
   4 │     1320    -1.48857
   5 │     1733    -0.952702
   6 │     2767    -0.782935
   7 │     2942     1.24028
   8 │     3174    -0.856962
   9 │     4189     0.677355
  10 │     6889    -1.23779
  11 │     7036    -1.21663
  12 │     7960     1.25045

Selected nongenetic predictors:
[1m0×2 DataFrame[0m

In [84]:
# compare estimated vs true beta values at the truly causal positions:
# column 1 = IHT estimate, column 2 = simulated true effect
[result.beta[correct_position] true_b[correct_position]]

20×2 Array{Float64,2}:
  0.0       -0.161659
  0.0       -0.402269
  0.756641   0.758756
 -1.53159   -1.53835
  0.776927   0.729135
 -1.48857   -1.47163
  0.0       -0.172668
 -0.952702  -0.971641
 -0.782935  -0.735768
  1.24028    1.2169
 -0.856962  -0.847906
  0.677355   0.654921
  0.0        0.296183
  0.0       -0.388067
  0.0       -0.0034339
  0.0        0.0407291
  0.0        0.125965
 -1.23779   -1.24972
 -1.21663   -1.24587
  1.25045    1.27953

**Conclusion:** Dynamically updating $k$ finds 12/20 predictors.

### Try cross validation

In [68]:
# cross-validate the candidate sparsity levels k = 1, ..., 30
mses = cv_iht(Y, x, z, path=1:30)
# index of the smallest entry of `mses`; because `path` starts at 1, the index
# equals the corresponding k (assumes `mses` has one entry per element of `path`)
argmin(mses)



Crossvalidation Results:
	k	MSE
	1	923.6053388280155
	2	804.8252069069749
	3	687.5250032949984
	4	620.3530486649759
	5	542.5895524210691
	6	461.471154918797
	7	428.616158410335
	8	406.9939489195609
	9	358.77150329876383
	10	330.3450937480381
	11	290.8726741109584
	12	266.40215317589787
	13	259.2410565620497
	14	255.37915945348996
	15	258.5614813137921
	16	260.02234179360454
	17	265.7868181363281
	18	275.5541989543569
	19	270.23654365359585
	20	276.87296250332327
	21	282.4759933773313
	22	283.83153151906345
	23	287.782773630965
	24	292.13477841288005
	25	301.14212108013083
	26	310.86180286316676
	27	314.30599770082796
	28	312.5334824080938
	29	312.63561339052256
	30	327.04909178245344


14

### Run regular IHT with the cross-validated $k$

In [20]:
# refit with the sparsity level selected by cross validation (`argmin(mses)`),
# then compare estimated vs true effects at the truly causal positions
@time result = fit_iht(Y, xla, z, k=argmin(mses))
[result.beta[correct_position] true_b[correct_position]]

****                   MendelIHT Version 1.3.3                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

Running sparse linear regression
Link functin = IdentityLink()
Sparsity parameter (k) = 27
Prior weight scaling = off
Doubly sparse projection = off
Debias = off
Max IHT iterations = 100
Converging when tol < 0.0001:

Iteration 1: loglikelihood = -1920.3446678703415, backtracks = 0, tol = 1.0382143858583603
Iteration 2: loglikelihood = -1661.3157264955626, backtracks = 0, tol = 0.22015859364984378
Iteration 3: loglikelihood = -1656.7577011746137, backtracks = 0, tol = 0.028960415766784334
Iteration 4: loglikelihood = -1656.668060843034, backtracks = 0, tol = 0.004586605809942993

20×2 Array{Float64,2}:
  0.0       -0.161659
  0.0       -0.402269
  0.75665    0.758756
 -1.53159   -1.53835
  0.776944   0.729135
 -1.48858   -1.47163
  0.0       -0.172668
 -0.95268   -0.971641
 -0.782955  -0.735768
  1.24029    1.2169
 -0.856995  -0.847906
  0.677345   0.654921
  0.0        0.296183
  0.0       -0.388067
  0.0       -0.0034339
  0.0        0.0407291
  0.0        0.125965
 -1.23779   -1.24972
 -1.21665   -1.24587
  1.25041    1.27953

**Conclusion:** Dynamically updating $k$ finds 12/20 predictors, while cross validation finds 17/20.