# Cross-model selection and averaging

In [1]:
using Revise
using MendelIHT
using GLM
using Random

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1423


Simulate some data

In [2]:
# Simulate genotypes, covariates, effect sizes, and a phenotype vector `y`.
# NOTE: statement order matters here; every random draw after `Random.seed!(0)`
# consumes the global RNG, so reordering changes the simulated data.
n = 1000            # number of samples
p = 10000           # number of SNPs
k = 300              # number of causal SNPs (nonzero entries of `true_b`)
d = Normal           # phenotype distribution; `d(i)` is sampled once per entry of `prob` below
l = IdentityLink()   # link whose inverse maps the linear predictor to `prob`

# set random seed
Random.seed!(0)

# simulate `sim.bed` file with no missing data
x = simulate_random_snparray("sim.bed", n, p)
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true, impute=true) 

# 2 nongenetic covariates: first column is the intercept, second column is sex: 0 = male 1 = female
z = ones(n, 2) 
z[:, 2] .= rand(0:1, n)
# only columns 2:end (here just the sex column) are passed in; the intercept column is left untouched
standardize!(@view(z[:, 2:end])) 

# randomly set genetic predictors where causal βᵢ ~ N(0, 1)
# (the first k entries are drawn, then shuffled to random positions)
true_b = zeros(p) 
true_b[1:k] = randn(k)
shuffle!(true_b)

# find correct position of genetic predictors
correct_position = findall(!iszero, true_b)

# define effect size of non-genetic predictors: intercept & sex
true_c = [1.0; 1.5] 

# simulate phenotype using genetic and nongenetic predictors
# (`prob` is the inverse-link-transformed linear predictor; each entry is passed to `d`)
prob = GLM.linkinv.(l, xla * true_b .+ z * true_c) # note genotype-vector multiplication is done with `xla`
y = [rand(d(i)) for i in prob]
# make sure the phenotype vector has Float64 elements
y = Float64.(y)

1000-element Vector{Float64}:
  15.546878145794178
  13.430059613061733
  13.549838322889817
   2.2333363971040927
   8.495235552896775
  21.316935424043617
  -1.8997550146997513
 -10.412386353451382
   8.28555104993998
  -7.026292086447713
 -22.733087527450937
  35.17377137504211
  -1.8785769515507358
   ⋮
  -6.340996992434085
   4.168478362302482
  -5.568149556009154
 -12.725883878906997
  10.603329737660747
 -19.20751188053586
  -3.160014307253282
  29.612180059347605
 -18.193357604991654
  -0.05076501222061902
  23.454274255593486
  15.475226721877949

## Test CMSA procedure

In [15]:
# Run the CMSA procedure from a fixed seed with warm starts disabled, timing the call.
Random.seed!(11)
@time result = cmsa_iht(y, xla, z; kmin=1, kmax=1000, warmstart=false, nabort=100)
# number of nonzero coefficients in the model with the smallest loss
@show count(!iszero, result.betas[argmin(result.loss)])
# two-column table: each tested `k` next to its loss
hcat(result.k, result.loss)

[32mCross validating... 100%|████████████████████████████████| Time: 0:01:34[39m


****                   MendelIHT Version 1.4.8                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

 94.542848 seconds (499.05 M allocations: 9.609 GiB, 2.05% gc time)
count(!iszero, result.betas[argmin(result.loss)]) = 292


75×2 Matrix{Float64}:
    1.0  57453.9
    2.0  56440.0
    3.0  56603.0
    4.0  54662.5
    5.0  55075.3
    6.0  54788.9
    7.0  53240.9
    8.0  52739.6
    9.0  50645.3
   10.0  50171.3
   11.0  49684.3
   12.0  49897.5
   13.0  48138.2
    ⋮    
  464.0  50207.2
  498.0  47905.0
  534.0  51045.5
  572.0  48990.5
  614.0  49408.0
  658.0  51072.8
  705.0  49482.8
  756.0  50644.3
  811.0  50523.5
  870.0  50071.5
  933.0  51102.7
 1000.0  49855.5

In [16]:
# Predictors selected both by the loss-minimizing model and by the model stored
# immediately before it in `result.betas`.
let best = argmin(result.loss)
    support(i) = findall(!iszero, result.betas[i])
    intersect(support(best), support(best - 1))
end

192-element Vector{Int64}:
   27
   33
   98
  149
  179
  209
  310
  317
  350
  381
  397
  508
  521
    ⋮
 9307
 9429
 9649
 9714
 9724
 9729
 9847
 9927
 9928
 9948
 9971
 9995

In [17]:
# indices of the true causal SNPs that the loss-minimizing model selected
# (the length of the displayed vector is the number of causal SNPs found)
let selected = findall(!iszero, result.betas[argmin(result.loss)])
    intersect(selected, findall(!iszero, true_b))
end

98-element Vector{Int64}:
   27
  317
  350
  523
  591
  681
  684
  807
  863
  985
 1024
 1084
 1216
    ⋮
 8737
 8791
 8835
 8892
 9143
 9220
 9714
 9724
 9729
 9927
 9928
 9971

## Test warmstart

In [18]:
# Repeat the CMSA run from the same seed, this time with warm starts enabled.
Random.seed!(11)
@time result = cmsa_iht(y, xla, z; kmin=1, kmax=1000, warmstart=true, nabort=100)
# number of nonzero coefficients in the model with the smallest loss
@show count(!iszero, result.betas[argmin(result.loss)])
# two-column table: each tested `k` next to its loss
hcat(result.k, result.loss)

[32mCross validating... 100%|████████████████████████████████| Time: 0:01:41[39m                                      


****                   MendelIHT Version 1.4.8                  ****
****     Benjamin Chu, Kevin Keys, Chris German, Hua Zhou       ****
****   Jin Zhou, Eric Sobel, Janet Sinsheimer, Kenneth Lange    ****
****                                                            ****
****                 Please cite our paper!                     ****
****         https://doi.org/10.1093/gigascience/giaa044        ****

101.232452 seconds (500.87 M allocations: 9.636 GiB, 2.25% gc time)
count(!iszero, result.betas[argmin(result.loss)]) = 351


75×2 Matrix{Float64}:
    1.0  57453.9
    2.0  56439.9
    3.0  56602.9
    4.0  54662.6
    5.0  55175.9
    6.0  54465.6
    7.0  53240.9
    8.0  51806.2
    9.0  50645.4
   10.0  50171.3
   11.0  49494.3
   12.0  48916.7
   13.0  48147.4
    ⋮    
  464.0  51254.7
  498.0  51691.3
  534.0  50627.6
  572.0  50931.0
  614.0  50041.8
  658.0  49723.9
  705.0  50859.2
  756.0  50149.6
  811.0  52037.3
  870.0  51626.8
  933.0  50418.9
 1000.0  51181.1

In [19]:
# Overlap between the support of the loss-minimizing model and the support of
# the model stored just before it in `result.betas` (warm-start run).
let best = argmin(result.loss)
    nonzero_at(i) = findall(!iszero, result.betas[i])
    intersect(nonzero_at(best), nonzero_at(best - 1))
end

218-element Vector{Int64}:
   27
   33
   94
   98
  149
  179
  202
  221
  310
  317
  350
  508
  521
    ⋮
 9500
 9611
 9649
 9714
 9724
 9729
 9847
 9927
 9928
 9971
 9992
 9995

In [20]:
# indices of the true causal SNPs recovered by the loss-minimizing model of the
# warm-start run (the vector's length is the number of causal SNPs found)
let chosen = findall(!iszero, result.betas[argmin(result.loss)])
    intersect(chosen, findall(!iszero, true_b))
end

106-element Vector{Int64}:
   27
  317
  350
  472
  523
  591
  681
  684
  807
  863
  985
 1024
 1084
    ⋮
 8791
 8892
 9143
 9220
 9447
 9714
 9724
 9729
 9852
 9927
 9928
 9971