# Normal - Compare coefficient estimates for IHT and lasso

In [1]:
using MendelIHT
using MendelGWAS
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using CSV
using GLM
using RCall
R"library(glmnet)"

│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /Users/biona001/.julia/packages/RCall/g7dhB/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [2]:
"""
    run_repeat(n, p, repeats, z, true_b, cor_pos, d, l; r = 10, α = 1)

Simulate `repeats` data sets from the same `true_b` and estimate the true effects with
three methods.

Every repeat draws a fresh `n × p` SNP matrix and a fresh phenotype vector, then runs

1. IHT via `L0_reg`, called with `k = length(cor_pos)`,
2. lasso via R's `glmnet` (through RCall), with `λ` chosen by 3-fold cross validation,
3. a marginal analysis via `GWAS` from MendelGWAS.

# Arguments
- `n`, `p`: number of samples and SNPs to simulate.
- `repeats`: number of simulated data sets.
- `z`: matrix of non-genetic covariates (e.g. a column of ones for the intercept).
- `true_b`: true effect-size vector used to generate every phenotype.
- `cor_pos`: indices of the entries of `true_b` whose estimates are recorded.
- `d`: response distribution; one of `Normal`, `Poisson`, `Bernoulli`,
  `NegativeBinomial` or `Gamma`.
- `l`: GLM link function used to map the linear predictor to the mean.

# Keywords
- `r`: number of successes for `NegativeBinomial` phenotypes.
- `α`: shape parameter for `Gamma` phenotypes.

# Returns
A tuple `(IHT, lasso, marginal)` of `length(cor_pos) × repeats` matrices; column `i`
holds the estimates at `cor_pos` obtained in repeat `i`.

# Throws
- `ArgumentError` if `d` is not one of the supported distributions.

# Notes
- The files `tmp.bed`, `tmp.bim`, `tmp.fam`, `tmp_control.txt` and `tmp_table.txt` are
  (re)written in the current working directory on every repeat and are not removed.
- NOTE(review): the lasso fit always uses `family='gaussian'` and the marginal analysis
  always uses `regression = linear`, regardless of `d`. Only `d = Normal` gives a
  like-for-like comparison between the three methods.
"""
function run_repeat(
    n        :: Int,
    p        :: Int,
    repeats  :: Int,
    z        :: AbstractMatrix,
    true_b   :: Vector,
    cor_pos  :: Vector, #correct position of the true model
    d        :: UnionAll,
    l        :: GLM.Link;
    r        :: Int = 10, #number of successes for NegativeBinomial phenotypes
    α        :: Real = 1  #shape parameter for Gamma phenotypes
    )

    # Fail early with a clear message instead of hitting an undefined `y` in the loop.
    d in (Normal, Poisson, Bernoulli, NegativeBinomial, Gamma) || throw(ArgumentError(
        "unsupported distribution $d; expected Normal, Poisson, Bernoulli, " *
        "NegativeBinomial or Gamma"))

    k = length(cor_pos)
    IHT_estimated_β = zeros(k, repeats)
    lasso_estimated_β = zeros(k, repeats)
    marginal_estimated_β = zeros(k, repeats)

    for rep in 1:repeats
        # simulate SNP data
        x = simulate_random_snparray(n, p, "tmp.bed")
        xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
        # dense Float64 copy of x with the covariates appended; this is what glmnet sees
        x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z]

        # simulate phenotypes (e.g. vector y) whose mean is the inverse link of x * b
        μ = GLM.linkinv.(l, xbm * true_b)
        if d == NegativeBinomial
            # success probability chosen so that the mean of NegativeBinomial(r, q) is μ
            success_prob = 1 ./ (1 .+ μ ./ r)
            draws = [rand(d(r, q)) for q in success_prob] #failures before r successes
        elseif d == Gamma
            # Distributions.Gamma takes (shape, scale); scale = μ / α gives mean μ
            draws = [rand(d(α, m / α)) for m in μ]
        else # Normal, Poisson or Bernoulli: a single parameter
            clamp!(μ, -20, 20)
            draws = [rand(d(m)) for m in μ]
        end
        y = Float64.(draws)

        #compute results for data
        IHT_result = L0_reg(x, xbm, z, y, 1, k, d(), l)

        #compute lasso results. choose λ via cross validation
        num_folds = 3
        folds = rand(1:num_folds, size(x, 1))
        @rput x_float y folds num_folds #make variables visible to R
        R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds, family='gaussian')"
        R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min, family='gaussian')$beta"
        R"lasso_beta = as.vector(lasso_beta_tmp)"
        @rget lasso_cv_result lasso_beta #pull result from R to Julia

        # Now run MendelGWAS (marginal analysis)
        make_bim_fam_files(x, y, "tmp") #create .bim and .fam files for MendelGWAS
        open("tmp_control.txt", "w") do f
            write(f, "plink_input_basename = tmp \n")
            write(f, "plink_field_separator = '\t' \n")
            write(f, "output_table = tmp_table.txt \n")
            write(f, "regression = linear \n")
            write(f, "regression_formula = Trait ~ 1\n")
        end
        GWAS("tmp_control.txt")
        result = CSV.read("tmp_table.txt")

        #store the correct position in estimated model
        IHT_estimated_β[:, rep] .= IHT_result.beta[cor_pos]
        lasso_estimated_β[:, rep] .= lasso_beta[cor_pos]
        # NOTE(review): column 6 is presumably the per-SNP effect estimate in the
        # MendelGWAS output table -- confirm against the table header.
        marginal_estimated_β[:, rep] .= result[cor_pos, 6]
    end

    return IHT_estimated_β, lasso_estimated_β, marginal_estimated_β
end

run_repeat (generic function with 1 method)

## Run 100 models

Here $\beta_{true}$ is the same in all models, but the design matrix and phenotype are simulated anew for each model. To run lasso, we first cross-validate the tuning parameter $\lambda$ and then fit the standard lasso at the selected value. For each true predictor we compute the mean and standard deviation of its estimate over the models in which it was identified (i.e. estimated $\beta_i \neq 0$). Note that here we are not comparing false positives or false negatives.

In [3]:
# Simulation settings: number of replicates, samples and SNPs, and a Gaussian response
# with its canonical link.
repeats, n, p = 100, 5000, 10000
d = Normal
l = canonicallink(d())

# Seed the global RNG so the shuffle below and the simulated data are reproducible.
Random.seed!(2020)

# Non-genetic covariates: a single column of ones (the intercept).
z = ones(Float64, n, 1)

# True model: six non-zero effects moved to random positions among the p SNPs.
true_b = zeros(p)
true_b[1:6] .= [0.01, 0.03, 0.05, 0.1, 0.25, 0.5]
shuffle!(true_b)
correct_position = findall(!iszero, true_b)

# Run every replicate and time the whole experiment.
@time iht_result, lasso_result, marginal_result = run_repeat(n, p, repeats, z, true_b, correct_position, d, l)

 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 2
  control_file = tmp_control.txt
  output_table = tmp_table.txt
  pedigree_file = tmp.fam
  plink_field_separator = 	
  plink_input_basename = tmp
  regression = linear
  regression_formula = Trait ~ 1
  snpdata_file = tmp.bed
  snpdefinition_file = tmp.bim
 
 
Analyzing the data.

 
Table of p-values for all SNPs is in file: tmp_table.txt
 
 
Mendel's analysis is finished.

 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 2
  control_file = tmp_control.txt
  output_table = tmp_table.txt
  pedigree_file = tmp.fam
  plink_field_separator = 	
  plink_input_basename = tmp
  regression 

([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.062683925577611; … ; 0.5099030272153777 0.5075610984870925 … 0.5232093416681559 0.4822863507643918; 0.0 0.0 … 0.0 0.0], [0.0 0.0 … 0.0 0.0; 0.0 0.0016181587579655768 … 0.0 0.006810572880516208; … ; 0.46131477226439205 0.46027584480068084 … 0.4714901377722354 0.4279425266210463; 0.0 0.0 … 0.0 0.0], [NaN NaN … NaN NaN; NaN NaN … NaN NaN; … ; NaN 0.7277129141990171 … 1.1720346671646016 0.686060221994585; NaN NaN … NaN NaN])

In [4]:
# IHT estimates: one row per entry of `correct_position`, one column per repeat.
iht_result

6×100 Array{Float64,2}:
 0.0        0.0       0.0        0.0       …  0.0        0.0        0.0      
 0.0        0.0       0.0608723  0.0          0.0761425  0.0        0.0626839
 0.231465   0.242373  0.238931   0.248917     0.231719   0.220016   0.256395 
 0.0633284  0.110042  0.109548   0.106149     0.112734   0.0987938  0.118163 
 0.509903   0.507561  0.492106   0.511625     0.481867   0.523209   0.482286 
 0.0        0.0       0.054331   0.0       …  0.0        0.0        0.0      

In [5]:
# Lasso estimates: one row per entry of `correct_position`, one column per repeat.
lasso_result

6×100 Array{Float64,2}:
 0.0       0.0         0.0         …  0.0        0.0        0.0       
 0.0       0.00161816  0.0134162      0.0256119  0.0        0.00681057
 0.180581  0.193908    0.191388       0.183754   0.171414   0.201177  
 0.011222  0.0624045   0.0623008      0.0637263  0.0464837  0.0653551 
 0.461315  0.460276    0.445935       0.434694   0.47149    0.427943  
 0.0       0.0         0.00638247  …  0.0        0.0        0.0       

In [17]:
# Overwrite NaN marginal estimates with 0, in place, then display the matrix.
marginal_result[isnan.(marginal_result)] .= 0.0
marginal_result

6×100 Array{Float64,2}:
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     
 0.0       0.0       0.0       0.0          0.0       0.0       0.0     
 0.332426  0.653539  0.346435  0.364699     0.772158  0.607633  0.69596 
 0.0       0.152662  0.449026  0.161742     0.223137  0.171571  0.237281
 0.0       0.727713  0.708197  1.02837      0.684022  1.17203   0.68606 
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     

In [18]:
# For every row (one per true predictor) keep only the repeats in which the method
# returned a non-zero estimate. Rows are taken from the matrices themselves rather than a
# hard-coded 1:6, so this keeps working when the number of true predictors changes.
iht_non_zero = [filter(!iszero, iht_result[i, :]) for i in axes(iht_result, 1)]
lasso_non_zero = [filter(!iszero, lasso_result[i, :]) for i in axes(lasso_result, 1)]
marginal_non_zero = [filter(!iszero, marginal_result[i, :]) for i in axes(marginal_result, 1)];

In [19]:
# Mean of the non-zero IHT estimates for each true predictor.
iht_non_zero_mean = map(mean, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.06266402442562875 
   0.24925197937784543 
   0.0967742590114098  
   0.501563859958689   
   0.058203598175762004

In [20]:
# Mean of the non-zero lasso estimates for each true predictor.
lasso_non_zero_mean = map(mean, lasso_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.011622150842919572
   0.198845161375783   
   0.046366143828693855
   0.4508608357267142  
   0.009827471806669909

In [21]:
# Mean of the non-zero marginal estimates for each true predictor.
marginal_non_zero_mean = map(mean, marginal_non_zero)

6-element Array{Float64,1}:
 NaN                  
 NaN                  
   0.4925166545935277 
   0.20364635462727537
   0.9900134028303681 
 NaN                  

In [22]:
# Sample standard deviation of the non-zero IHT estimates for each true predictor.
iht_non_zero_std = map(std, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.007191156231314294
   0.012879223614127399
   0.013695232022409215
   0.015063215959963008
   0.007085069747381762

In [23]:
# Sample standard deviation of the non-zero lasso estimates for each true predictor.
lasso_non_zero_std = map(std, lasso_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.007739014986190035
   0.013194099577398113
   0.014413271472265855
   0.015467719028289929
   0.00706745267477641 

In [24]:
# Sample standard deviation of the non-zero marginal estimates for each true predictor.
marginal_non_zero_std = map(std, marginal_non_zero)

6-element Array{Float64,1}:
 NaN                  
 NaN                  
   0.1891043595224298 
   0.07761390391810027
   0.4970233514415856 
 NaN                  

In [25]:
# Summary table with one row per true predictor, in the order of `correct_position`
# (i.e. not sorted by effect size). For each method, the mean and standard deviation are
# taken over the repeats in which that method's estimate was non-zero; an entry is NaN
# when there were too few non-zero estimates to compute the statistic.
final = DataFrame(
    true_β = true_b[correct_position],
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std, 
    β_lasso_mean = lasso_non_zero_mean,
    β_lasso_std = lasso_non_zero_std,
    β_marginal_mean = marginal_non_zero_mean,
    β_marginal_std = marginal_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_lasso_mean,β_lasso_std,β_marginal_mean,β_marginal_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.01,,,,,,
2,0.05,0.062664,0.00719116,0.0116222,0.00773901,,
3,0.25,0.249252,0.0128792,0.198845,0.0131941,0.492517,0.189104
4,0.1,0.0967743,0.0136952,0.0463661,0.0144133,0.203646,0.0776139
5,0.5,0.501564,0.0150632,0.450861,0.0154677,0.990013,0.497023
6,0.03,0.0582036,0.00708507,0.00982747,0.00706745,,
