# Bernoulli: Compare false positive/negatives with LASSO

LASSO is currently the de facto penalized least squares method for [feature selection](https://en.wikipedia.org/wiki/Feature_selection). Here we compare the performance (in terms of the number of false positives/negatives) of LASSO with IHT for GWAS data, using the `glmnet` implementation of cyclic coordinate descent for LASSO. Since the focus here is not scalability, we test our sample problems on moderately sized genotype matrices of 1000 samples with 10,000 SNPs.

In [1]:
# Load the distributed-computing standard library and add 4 worker processes;
# the cross-validation call later in this notebook is run with `parallel=true`.
using Distributed
addprocs(4)
# Report the total process count (the recorded output of 5 is the 4 workers plus this session).
nprocs()

5

In [2]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using Random
using LinearAlgebra
using DelimitedFiles
using GLM
using RCall
R"library(glmnet)"

│ Setting LC_CTYPE failed, using "C" 
└ @ RCall /u/home/b/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113
│ Setting LC_CTYPE failed, using "C" 
└ @ RCall ~/.julia/packages/RCall/ffM0W/src/io.jl:113
│ Setting LC_CTYPE failed, using "C" 
└ @ RCall ~/.julia/packages/RCall/ffM0W/src/io.jl:113
│ Setting LC_CTYPE failed, using "C" 
└ @ RCall ~/.julia/packages/RCall/ffM0W/src/io.jl:113
│ Setting LC_CTYPE failed, using "C" 
└ @ RCall ~/.julia/packages/RCall/ffM0W/src/io.jl:113
│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /u/home/b/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


# Bernoulli response (i.e. logistic regression)

In [3]:
"""
    iht_lasso_bernoulli(n, p, k, d, l)

Simulate one `n`-by-`p` SNP data set with `k` true predictors, fit it with both
cross-validated LASSO (`cv.glmnet` called through RCall, `family='binomial'`) and
cross-validated IHT, and count each method's false positives and false negatives.

# Arguments
- `n`: number of samples.
- `p`: number of SNPs.
- `k`: number of truly non-zero coefficients in the simulated model.
- `d`: response distribution type; it is instantiated as `d()` when calling the IHT routines.
- `l`: link function passed to the simulation and IHT routines.

# Returns
The tuple `(iht_fp, iht_fn, lasso_fp, lasso_fn)`.

# Side effects
Prints a table comparing the true and estimated coefficients at the true
positions, followed by the four counts. Both methods use the same `num_folds = 3`
random fold assignment.
"""
function iht_lasso_bernoulli(n::Int64, p::Int64, k::Int64, d::UnionAll, l::Link)
    # construct snpmatrix, covariate files, and true model b
    x, = simulate_random_snparray(n, p, undef)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    z = ones(n, 1) # the intercept
    # Float64 version of x, with the intercept appended as the last column
    x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z]

    # simulate response, true model b, and the correct non-0 positions of b
    y, true_b, correct_position = simulate_random_response(x, xbm, k, d, l)

    # specify folds (shared by LASSO and IHT so both see the same splits)
    num_folds = 3
    folds = rand(1:num_folds, size(x, 1))

    # run glmnet via RCall
    @rput x_float y folds num_folds # make variables visible to R
    R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds, family='binomial')"
    R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min, family='binomial')$beta"
    R"lasso_beta = as.vector(lasso_beta_tmp)"
    # Only the coefficient vector is needed on the Julia side; the full
    # cross-validation object stays in R.
    @rget lasso_beta

    # number of non-zero entries selected by the best lasso model
    lasso_k_est = count(!iszero, lasso_beta)

    # candidate model sizes for IHT's cross validation
    path = collect(1:50)

    # run IHT's cross validation routine
    mses = cv_iht_distributed(d(), l, x, z, y, 1, path, folds, num_folds,
        use_maf=false, debias=false, showinfo=false, parallel=true)
    # Map the position of the smallest error back to the model size it stands
    # for, so the estimate stays right even if `path` does not start at 1.
    iht_k_est = path[argmin(mses)]
    iht_result = L0_reg(x, xbm, z, y, 1, iht_k_est, d(), l,
        debias=false, init=false, use_maf=false)
    iht_beta = iht_result.beta

    # show lasso and IHT's reconstruction result
    compare_model = DataFrame(
        true_β  = true_b[correct_position],
        IHT_β   = iht_beta[correct_position],
        lasso_β = lasso_beta[correct_position])
    @show compare_model

    # compute true/false positives/negatives for IHT and lasso
    iht_tp = count(!iszero, iht_beta[correct_position])
    iht_fp = iht_k_est - iht_tp
    iht_fn = k - iht_tp
    lasso_tp = count(!iszero, lasso_beta[correct_position])
    lasso_fp = lasso_k_est - lasso_tp
    lasso_fn = k - lasso_tp

    println("IHT false positives = $iht_fp")
    println("IHT false negatives = $iht_fn")
    println("LASSO false positives = $lasso_fp")
    # This line reports false negatives; it was previously mislabelled as positives.
    println("LASSO false negatives = $lasso_fn\n")

    return iht_fp, iht_fn, lasso_fp, lasso_fn
end

iht_lasso_bernoulli (generic function with 1 method)

In [4]:
# Simulation settings: sample size, number of SNPs, number of true predictors,
# response distribution, and its canonical link.
n = 1000
p = 10000
k = 10
d = Bernoulli
l = canonicallink(d())

# Fix the seed so the whole batch of runs is reproducible.
Random.seed!(2019)

# One slot per run for each of the four error counts.
total_runs = 50
iht_false_positives = fill(0.0, total_runs)
iht_false_negatives = fill(0.0, total_runs)
lasso_false_positives = fill(0.0, total_runs)
lasso_false_negatives = fill(0.0, total_runs)

# Run the comparison repeatedly, storing each run's counts directly in place.
for run in 1:total_runs
    println("current run = $run")
    (iht_false_positives[run], iht_false_negatives[run],
        lasso_false_positives[run], lasso_false_negatives[run]) =
        iht_lasso_bernoulli(n, p, k, d, l)
end

current run = 1
compare_model = 10×3 DataFrame
│ Row │ true_β   │ IHT_β     │ lasso_β   │
│     │ Float64  │ Float64   │ Float64   │
├─────┼──────────┼───────────┼───────────┤
│ 1   │ -1.29964 │ -1.20594  │ -0.696055 │
│ 2   │ -0.2177  │ 0.0       │ -0.032045 │
│ 3   │ 0.786217 │ 0.74778   │ 0.35188   │
│ 4   │ 0.599233 │ 0.500377  │ 0.144367  │
│ 5   │ 0.283711 │ 0.0       │ 0.101466  │
│ 6   │ -1.12537 │ -1.09734  │ -0.63629  │
│ 7   │ 0.693374 │ 0.58001   │ 0.261347  │
│ 8   │ -0.67709 │ -0.718464 │ -0.337016 │
│ 9   │ 0.14727  │ 0.0       │ 0.0       │
│ 10  │ 1.03477  │ 1.02787   │ 0.591364  │
IHT false positives = 0
IHT false negatives = 3
LASSO false positives = 61
LASSO false positives = 1

current run = 2
compare_model = 10×3 DataFrame
│ Row │ true_β    │ IHT_β     │ lasso_β   │
│     │ Float64   │ Float64   │ Float64   │
├─────┼───────────┼───────────┼───────────┤
│ 1   │ -0.872089 │ -0.771944 │ -0.30663  │
│ 2   │ -0.511788 │ -0.586261 │ -0.239912 │
│ 3   │ 0.639028  │ 0.668

In [5]:
# Average each per-run count over the number of runs actually recorded, instead of
# a hard-coded 50, so the summary stays correct if the run count is changed.
bernoulli_iht_false_positives = sum(iht_false_positives) / length(iht_false_positives)
bernoulli_iht_false_negatives = sum(iht_false_negatives) / length(iht_false_negatives)
bernoulli_lasso_false_positives = sum(lasso_false_positives) / length(lasso_false_positives)
bernoulli_lasso_false_negatives = sum(lasso_false_negatives) / length(lasso_false_negatives)
# Order: IHT false positives, IHT false negatives, LASSO false positives, LASSO false negatives.
result = [bernoulli_iht_false_positives; bernoulli_iht_false_negatives;
          bernoulli_lasso_false_positives; bernoulli_lasso_false_negatives]

4-element Array{Float64,1}:
  0.06
  3.68
 93.66
  2.0 

In [3]:
# NOTE(review): these look like hand-recorded tallies of runs in which each method
# failed to converge; nothing in this notebook computes them — confirm the source.
IHT_did_not_converge = 1
LASSO_did_not_converge = 0

0