# Normal - Compare coefficient estimates for IHT and lasso

In [1]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using DelimitedFiles
using GLM
using RCall
R"library(glmnet)"

│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /u/home/b/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [2]:
"""
    run_repeat(n, p, repeats, z, true_b, cor_pos, d, l; r = 10, α = 1)

Simulate `repeats` data sets that share the coefficient vector `true_b` and fit each one
with IHT (`L0_reg`) and with a cross-validated lasso (R's `glmnet`, called through RCall).

A new `n × p` genotype matrix and a new response vector are drawn on every repeat.
The lasso is always fitted with `family='gaussian'`, whatever `d` is.

# Arguments
- `n`, `p`: number of samples and number of SNPs to simulate.
- `repeats`: number of simulated data sets.
- `z`: matrix of non-genetic covariates, passed unchanged to `L0_reg`.
- `true_b`: length-`p` coefficient vector used to generate the response.
- `cor_pos`: indices of the true predictors; its length is also the sparsity level
  handed to `L0_reg`.
- `d`: response distribution (`Normal`, `Poisson`, `Bernoulli`, `NegativeBinomial`
  or `Gamma`).
- `l`: link whose inverse is applied to `x * true_b`.

# Keywords
- `r`: first parameter of the `NegativeBinomial` response (previously a hard-coded
  local equal to 10).
- `α`: shape parameter of the `Gamma` response. The original body used `α` without
  defining it, so the `Gamma` branch failed with `UndefVarError` unless a global `α`
  happened to exist.

# Returns
Two `length(cor_pos) × repeats` matrices holding the IHT and the lasso estimates at
`cor_pos`, one column per repeat.

# Throws
- `ArgumentError` if `d` is not one of the distributions listed above.
"""
function run_repeat(
    n        :: Int,
    p        :: Int,
    repeats  :: Int,
    z        :: AbstractMatrix,
    true_b   :: Vector,
    cor_pos  :: Vector, #correct position of the true model
    d        :: UnionAll,
    l        :: GLM.Link;
    r        :: Integer = 10,
    α        :: Real = 1
    )

    # Fail early with a clear message instead of an UndefVarError on `y` later on.
    d in (Normal, Poisson, Bernoulli, NegativeBinomial, Gamma) ||
        throw(ArgumentError("unsupported response distribution: $d"))

    k = length(cor_pos)
    IHT_estimated_β = zeros(k, repeats)
    lasso_estimated_β = zeros(k, repeats)

    for i in 1:repeats
        # simulate SNP data: every entry is a Binomial(2, maf) draw.
        # Entries are drawn one at a time in the original order so that a given
        # seed keeps producing the same genotypes.
        mafs = rand(Uniform(0.05, 0.5), p)
        x = zeros(Float32, n, p)
        for j in 1:p
            dist = Binomial(2, mafs[j])
            for ii in 1:n
                x[ii, j] = Float32(rand(dist))
            end
        end
        standardize!(x)

        # simulate phenotypes (the vector y) from the inverse-linked linear predictor
        μ = GLM.linkinv.(l, x * true_b)
        if d == Normal || d == Poisson || d == Bernoulli
            clamp!(μ, -20, 20)
            y = [rand(d(m)) for m in μ]
        elseif d == NegativeBinomial
            prob = 1 ./ (1 .+ μ ./ r)
            y = [rand(d(r, q)) for q in prob] # number of failures before r successes occur
        elseif d == Gamma
            β = 1 ./ μ # labelled as the rate parameter in the original code
            # NOTE(review): Distributions.jl documents Gamma(α, θ) with θ a *scale*;
            # the value passed here is kept as in the original -- confirm the
            # intended parameterization.
            y = [rand(d(α, b)) for b in β]
        end
        y = Float32.(y)

        # IHT fit with sparsity level k
        # NOTE(review): the literal 1 is presumably the number of groups -- confirm
        # against the L0_reg signature.
        IHT_result = L0_reg(x, z, y, 1, k, d(), l)

        # lasso fit: choose λ via cross validation, then refit at lambda.min
        num_folds = 3
        folds = rand(1:num_folds, size(x, 1))
        @rput x y folds num_folds #make variables visible to R
        R"lasso_cv_result = cv.glmnet(x, y, nfolds = num_folds, foldid = folds, family='gaussian')"
        R"lasso_beta_tmp = glmnet(x, y, lambda=lasso_cv_result$lambda.min, family='gaussian')$beta"
        R"lasso_beta = as.vector(lasso_beta_tmp)"
        # Only the coefficient vector is needed on the Julia side; the cv.glmnet
        # object stays in R instead of being converted and discarded.
        @rget lasso_beta

        # store the estimates at the true predictor positions
        IHT_estimated_β[:, i] .= IHT_result.beta[cor_pos]
        lasso_estimated_β[:, i] .= lasso_beta[cor_pos]
    end

    return IHT_estimated_β, lasso_estimated_β
end

run_repeat (generic function with 1 method)

## Run 100 models

Here $\beta_{true}$ is the same in all models, but the design matrix and phenotype are re-simulated for each model. To run lasso, we first cross-validate for the tuning parameter $\lambda$ and then refit the standard lasso at the selected $\lambda$. For each true predictor we compute the mean and standard deviation of its estimate over the models in which it was correctly identified (i.e. estimated $\beta_i \neq 0$). Note that here we are not comparing false positives or false negatives.

In [3]:
# Simulation size and response model
repeats, n, p = 100, 5000, 10000
d = Normal
l = canonicallink(d())

# Seed the global RNG before anything random happens so the run is reproducible
Random.seed!(2019)

# Non-genetic covariates: a single column of ones (the intercept)
z = fill(one(Float32), n, 1)

# True model: six non-zero effects followed by zeros, then shuffled into random positions
true_b = vcat([0.01, 0.03, 0.05, 0.1, 0.25, 0.5], zeros(p - 6))
shuffle!(true_b)
correct_position = findall(!iszero, true_b)

# Run all repeats and time the whole comparison
@time iht_result, lasso_result = run_repeat(n, p, repeats, z, true_b, correct_position, d, l)

4947.355525 seconds (40.81 M allocations: 20.803 GiB, 0.05% gc time)


([0.0618633 0.0503844 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.516486 0.488924 … 0.517375 0.483652], [0.0103852 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.468399 0.439719 … 0.46912 0.432281])

In [4]:
# Display the IHT estimates: one row per true predictor, one column per repeat
iht_result

6×100 Array{Float64,2}:
 0.0618633  0.0503844  0.0        …  0.0       0.0        0.0      
 0.0        0.0        0.0629937     0.0       0.0        0.0      
 0.23741    0.257444   0.244949      0.251235  0.246026   0.255436 
 0.120814   0.100397   0.103847      0.113403  0.0998012  0.0903106
 0.0        0.0        0.0           0.0       0.0        0.0      
 0.516486   0.488924   0.478518   …  0.4914    0.517375   0.483652 

In [5]:
# Display the lasso estimates: one row per true predictor, one column per repeat
lasso_result

6×100 Array{Float64,2}:
 0.0103852  0.0        0.00865305  …  0.0        0.0        0.0      
 0.0        0.0        0.0136544      0.0        0.0        0.0      
 0.188443   0.207823   0.195532       0.200539   0.197764   0.204044 
 0.0688395  0.0523231  0.0589289      0.0625958  0.0491319  0.0403328
 0.0        0.0        0.0            0.0        0.0        0.0      
 0.468399   0.439719   0.43065     …  0.439339   0.46912    0.432281 

In [6]:
# For every true predictor (row), keep only the repeats in which it was selected,
# i.e. its estimate is non-zero. Rows come from the matrices themselves rather than
# a hard-coded 1:6, so this keeps working if the number of true predictors changes.
iht_non_zero = [filter(!iszero, iht_result[i, :]) for i in axes(iht_result, 1)]
lasso_non_zero = [filter(!iszero, lasso_result[i, :]) for i in axes(lasso_result, 1)];

In [7]:
# Mean IHT estimate per true predictor over the repeats where it was selected
# (an empty set of repeats gives NaN)
iht_non_zero_mean = map(mean, iht_non_zero)

6-element Array{Float64,1}:
   0.05727936699986458 
   0.061893903227014976
   0.25041748002171516 
   0.09566341534256935 
 NaN                   
   0.49891154795885084 

In [8]:
# Mean lasso estimate per true predictor over the repeats where it was selected
# (an empty set of repeats gives NaN)
lasso_non_zero_mean = map(mean, lasso_non_zero)

6-element Array{Float64,1}:
   0.008019509256396114
   0.010934411580758346
   0.1994199443091757  
   0.04480031486780809 
 NaN                   
   0.44805405740613075 

In [9]:
# Standard deviation of the non-zero IHT estimates, one value per true predictor
iht_non_zero_std = map(std, iht_non_zero)

6-element Array{Float64,1}:
   0.005809003889608191 
   0.0072014211418802215
   0.011929937836438325 
   0.013838700709846294 
 NaN                    
   0.015007375146615595 

In [10]:
# Standard deviation of the non-zero lasso estimates, one value per true predictor
lasso_non_zero_std = map(std, lasso_non_zero)

6-element Array{Float64,1}:
   0.004689104307433112
   0.008169470313789758
   0.012020559335369201
   0.014348575994290534
 NaN                   
   0.015624323548375101

In [11]:
# Summary table: one row per true predictor, in the order given by correct_position.
# The mean/std columns only use repeats where the predictor was selected; a predictor
# that was never selected has NaN in those columns.
final = DataFrame(
    true_β = true_b[correct_position],   # true effect sizes
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std, 
    β_lasso_mean = lasso_non_zero_mean,
    β_lasso_std = lasso_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_lasso_mean,β_lasso_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.03,0.0572794,0.005809,0.00801951,0.0046891
2,0.05,0.0618939,0.00720142,0.0109344,0.00816947
3,0.25,0.250417,0.0119299,0.19942,0.0120206
4,0.1,0.0956634,0.0138387,0.0448003,0.0143486
5,0.01,,,,
6,0.5,0.498912,0.0150074,0.448054,0.0156243
