# Compare false positive/negatives with LASSO

LASSO is currently the de facto penalized least squares method for [feature selection](https://en.wikipedia.org/wiki/Feature_selection). Here we compare the performance (in terms of the number of false positives/negatives) of LASSO with IHT for GWAS data, using the `glmnet` implementation of cyclic coordinate descent for LASSO. Since the focus here is not scalability, we test our sample problems on moderately sized genotype matrices of 1000 samples with 10,000 SNPs.

In [1]:
# Load the Distributed standard library and start 4 additional worker processes.
# NOTE(review): presumably these workers are what `cv_iht_distributed(...; parallel=true)`
# uses further down — confirm against MendelIHT.
using Distributed
addprocs(4)
nprocs() # cell value: total number of processes after adding the workers

5

In [2]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using Random
using LinearAlgebra
using DelimitedFiles
using GLM
using RCall
R"library(glmnet)"

│  要求されたパッケージ foreach をロード中です 
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /Users/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


# Normal responses

We use the R library `glmnet` (implemented in Fortran) to run LASSO. The [documentation of RCall.jl](http://juliainterop.github.io/RCall.jl/stable/gettingstarted.html) explains how to transfer variables between R and Julia. Since glmnet does not operate on genotype files, we need to convert `x` to a Float64 matrix before running glmnet.

In [3]:
# Run one simulated IHT-vs-LASSO comparison and report each method's error counts.
#
# Arguments
#   n, p : dimensions handed to `simulate_random_snparray`
#   k    : number of true predictors handed to `simulate_random_response`
#   d    : distribution type; instantiated as `d()` for the IHT calls
#   l    : link function
#
# Returns the tuple `(iht_fp, iht_fn, lasso_fp, lasso_fn)`.
#
# The local names `x_float`, `y`, `folds`, `num_folds`, `lasso_cv_result`,
# `lasso_beta` and `compare_model` must not be renamed: `@rput`/`@rget` and the
# R snippets share them by name, and `@show` prints the variable name.
function iht_lasso(n::Int64, p::Int64, k::Int64, d::UnionAll, l::Link)
    # simulated genotypes, their centered/scaled bit-matrix wrapper, and the intercept column
    x, = simulate_random_snparray(n, p, undef)
    snp_bm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    intercept = ones(n, 1)

    # dense centered/scaled Float64 copy for glmnet, intercept appended as last column
    x_float = hcat(convert(Matrix{Float64}, x, center=true, scale=true), intercept)

    # response, true coefficient vector, and the positions of its non-zero entries
    y, beta_true, causal_idx = simulate_random_response(x, snp_bm, k, d, l)

    # one random fold assignment, handed to both glmnet and IHT
    num_folds = 3
    folds = rand(1:num_folds, size(x, 1))

    # LASSO: cross-validate in R, refit at lambda.min, pull the coefficients back
    @rput x_float y folds num_folds
    R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds)"
    R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min)$beta"
    R"lasso_beta = as.vector(lasso_beta_tmp)"
    @rget lasso_cv_result lasso_beta
    lasso_k_est = count(!iszero, lasso_beta)

    # IHT: cross-validate over sparsity levels 1..50; because the grid is 1:50,
    # the index of the smallest error is the chosen sparsity level itself
    k_grid = collect(1:50)
    cv_errors = cv_iht_distributed(d(), l, x, intercept, y, 1, k_grid, folds, num_folds,
        use_maf=false, debias=false, parallel=true)
    iht_k_est = argmin(cv_errors)
    iht_fit = L0_reg(x, snp_bm, intercept, y, 1, iht_k_est, d(), l,
        debias=false, init=false, use_maf=false)
    iht_beta = iht_fit.beta

    # side-by-side view of the estimates at the truly non-zero positions
    compare_model = DataFrame(
        true_β  = beta_true[causal_idx],
        IHT_β   = iht_beta[causal_idx],
        lasso_β = lasso_beta[causal_idx])
    @show compare_model

    # true positives = true predictors that received a non-zero estimate;
    # false positives = everything else selected; false negatives = true predictors missed
    iht_tp = count(!iszero, iht_beta[causal_idx])
    lasso_tp = count(!iszero, lasso_beta[causal_idx])
    iht_fp, iht_fn = iht_k_est - iht_tp, k - iht_tp
    lasso_fp, lasso_fn = lasso_k_est - lasso_tp, k - lasso_tp

    return iht_fp, iht_fn, lasso_fp, lasso_fn
end

iht_lasso (generic function with 1 method)

In [4]:
# Simulate data with k true predictors, from distribution d and with link l.
n, p, k = 1000, 10000, 10
d = Normal
l = canonicallink(d())

# fix the seed so the simulation below is reproducible
Random.seed!(2019)

# one slot per run for each of the four error counts returned by iht_lasso
total_runs = 1
iht_false_positives = zeros(total_runs)
iht_false_negatives = zeros(total_runs)
lasso_false_positives = zeros(total_runs)
lasso_false_negatives = zeros(total_runs)
for run_idx in 1:total_runs
    println("current run = $run_idx")
    counts = iht_lasso(n, p, k, d, l) # (iht_fp, iht_fn, lasso_fp, lasso_fn)
    iht_false_positives[run_idx] = counts[1]
    iht_false_negatives[run_idx] = counts[2]
    lasso_false_positives[run_idx] = counts[3]
    lasso_false_negatives[run_idx] = counts[4]
end

current run = 1


Crossvalidation Results:
	k	MSE
	1	1927.0765190526674
	2	1443.8787480187539
	3	1080.0409263841343
	4	862.2411688907335
	5	705.1020809314726
	6	507.3949184828667
	7	391.96845175311284
	8	368.4542118816205
	9	350.6425973055326
	10	345.8446309064211
	11	348.2788551316564
	12	359.4105897752303
	13	364.17867254237615
	14	377.24829700896595
	15	381.3368331696641
	16	386.202101887892
	17	392.2275869571673
	18	404.25529490899805
	19	399.9942176486576
	20	411.42781811189747
	21	401.31373054942696
	22	423.91470453781744
	23	435.12255208049766
	24	440.6975707755194
	25	441.3406002521729
	26	453.38035540718
	27	444.533023686396
	28	453.04214083470407
	29	458.0104051341299
	30	461.26479637719643
	31	465.3330448088224
	32	482.1175924752104
	33	493.80952198457334
	34	489.667276502509
	35	494.7318551588827
	36	496.1268166725306
	37	499.88322706191354
	38	502.7061704504717
	39	543.5283786436145
	40	511.0882072725324
	41	503.9142937469825
	42	548.3613173011001
	43	508.7260449653686
	44

In [31]:
# Simulate one data set with a Gaussian response, fit it with glmnet's LASSO and
# with IHT, and score both fits against the known true predictors.
#
# Arguments
#   n, p : dimensions handed to `simulate_random_snparray`
#   k    : number of non-zero entries placed in the true coefficient vector
#
# Returns `(iht_precision, iht_recall, lasso_precision, lasso_recall)`.
#
# The local names `x_float`, `y`, `folds`, `num_folds`, `lasso_cv_result`,
# `lasso_beta` and `compare_model` must not be renamed: `@rput`/`@rget` and the
# R snippets share them by name, and `@show` prints the variable name.
function iht_lasso_normal(n::Int64, p::Int64, k::Int64)
    # simulated genotypes (second return value is not needed here), their
    # centered/scaled bit-matrix wrapper, and the intercept as the only covariate
    x, _ = simulate_random_snparray(n, p, "./data/normal.bed")
    snp_bm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    intercept = ones(n, 1)

    # true model: k standard-normal effects scattered at random positions
    beta_true = zeros(p)
    beta_true[1:k] = randn(k)
    shuffle!(beta_true)
    causal_idx = findall(!iszero, beta_true)

    # standard-normal noise, one draw per sample
    noise = rand(Normal(0, 1), n)

    # dense centered/scaled Float64 copy for glmnet, intercept appended as last column
    x_float = hcat(convert(Matrix{Float64}, x, center=true, scale=true), intercept)

    # phenotype: y = Xb + noise
    y = snp_bm * beta_true + noise

    # one random fold assignment, handed to both glmnet and IHT
    num_folds = 3
    folds = rand(1:num_folds, size(x, 1))

    # LASSO: cross-validate in R, refit at lambda.min, pull the coefficients back
    @rput x_float y folds num_folds
    R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds)"
    R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min)$beta"
    R"lasso_beta = as.vector(lasso_beta_tmp)"
    @rget lasso_cv_result lasso_beta
    lasso_k_est = count(!iszero, lasso_beta)

    # IHT: cross-validate over sparsity levels 1..50; because the grid is 1:50,
    # the index of the smallest error is the chosen sparsity level itself
    k_grid = collect(1:50)
    cv_errors = cv_iht_distributed(x, intercept, y, 1, k_grid, folds, num_folds, "normal",
        use_maf = false, debias=false, showinfo=false, parallel=true)
    iht_k_est = argmin(cv_errors)
    iht_fit = L0_normal_reg(x, snp_bm, intercept, y, 1, iht_k_est, debias=false)
    iht_beta = iht_fit.beta

    # side-by-side view of the estimates at the truly non-zero positions
    compare_model = DataFrame(
        true_β  = beta_true[causal_idx],
        IHT_β   = iht_beta[causal_idx],
        lasso_β = lasso_beta[causal_idx])
    @show compare_model

    # precision = tp / (tp + fp), recall = tp / (tp + fn), for each method
    iht_tp = count(!iszero, iht_beta[causal_idx])
    iht_fp, iht_fn = iht_k_est - iht_tp, k - iht_tp
    iht_precision = iht_tp / (iht_tp + iht_fp)
    iht_recall = iht_tp / (iht_tp + iht_fn)

    lasso_tp = count(!iszero, lasso_beta[causal_idx])
    lasso_fp, lasso_fn = lasso_k_est - lasso_tp, k - lasso_tp
    lasso_precision = lasso_tp / (lasso_tp + lasso_fp)
    lasso_recall = lasso_tp / (lasso_tp + lasso_fn)

    return iht_precision, iht_recall, lasso_precision, lasso_recall
end

iht_lasso_normal (generic function with 2 methods)

In [32]:
# fix the seed so the whole batch of simulations is reproducible
Random.seed!(2019)

# one slot per run for each metric returned by iht_lasso_normal
total_runs = 30
iht_precision = zeros(total_runs)
iht_recall = zeros(total_runs)
lasso_precision = zeros(total_runs)
lasso_recall = zeros(total_runs)
for run_idx in 1:total_runs
    println("current run = $run_idx")
    n, p, k = 1000, 10000, 10
    metrics = iht_lasso_normal(n, p, k) # (iht precision, iht recall, lasso precision, lasso recall)
    iht_precision[run_idx] = metrics[1]
    iht_recall[run_idx] = metrics[2]
    lasso_precision[run_idx] = metrics[3]
    lasso_recall[run_idx] = metrics[4]
end

[2.45987, 1.97982, 1.59474, 1.25621, 0.860642, 0.679042, 0.590805, 0.55617, 0.532006, 0.546, 0.551486, 0.563301, 0.571866, 0.571855, 0.576869, 0.609254, 0.614412, 0.623018, 0.621612, 0.624477, 0.644398, 0.656862, 0.671392, 0.66338, 0.658227, 0.696714, 0.706316, 0.710069, 0.694266, 0.709308, 0.724558, 0.731155, 0.744513, 0.753307, 0.733599, 0.752503, 0.754134, 0.799798, 0.800942, 0.809273, 0.795998, 0.797934, 0.797377, 0.808903, 0.868147, 0.851316, 0.825684, 0.847262, 0.880105, 0.886974]
compare_model = 10×3 DataFrame
│ Row │ true_β    │ IHT_β     │ lasso_β   │
│     │ Float64   │ Float64   │ Float64   │
├─────┼───────────┼───────────┼───────────┤
│ 1   │ 0.0366837 │ 0.0       │ 0.0       │
│ 2   │ 1.00474   │ 1.01942   │ 0.917629  │
│ 3   │ 0.9989    │ 0.930238  │ 0.814742  │
│ 4   │ 0.778596  │ 0.818491  │ 0.723922  │
│ 5   │ 0.787098  │ 0.854464  │ 0.74252   │
│ 6   │ -1.07958  │ -1.13027  │ -1.02692  │
│ 7   │ -0.279566 │ -0.274712 │ -0.178466 │
│ 8   │ -0.404658 │ -0.428998 │ -0.31

In [39]:
# Display the metrics side by side: one row per run, columns are
# IHT precision, IHT recall, LASSO precision, LASSO recall.
[iht_precision iht_recall lasso_precision lasso_recall]

1×4 Array{Float64,2}:
 1.0  0.9  0.310345  0.9

# Logistic response

In [3]:
# Simulate one data set with a Bernoulli (logistic) response, fit it with
# glmnet's binomial LASSO and with IHT, and score both fits against the known
# true predictors.
#
# Arguments
#   n, p : dimensions handed to `simulate_random_snparray`
#   k    : number of non-zero entries placed in the true coefficient vector
#
# Returns `(iht_precision, iht_recall, lasso_precision, lasso_recall)`.
#
# The local names `x_float`, `y`, `folds`, `num_folds`, `lasso_cv_result`,
# `lasso_beta` and `compare_model` must not be renamed: `@rput`/`@rget` and the
# R snippets share them by name, and `@show` prints the variable name.
function iht_lasso_logistic(n::Int64, p::Int64, k::Int64)
    glm_name = "logistic"

    # simulated genotypes (second return value is not needed here), their
    # centered/scaled bit-matrix wrapper, and the intercept as the only covariate
    x, _ = simulate_random_snparray(n, p, "logistic.bed")
    snp_bm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    intercept = ones(n, 1)

    # true model: k standard-normal effects scattered at random positions
    beta_true = zeros(p)
    beta_true[1:k] = randn(k)
    shuffle!(beta_true)
    causal_idx = findall(!iszero, beta_true)

    # dense centered/scaled Float64 copy for glmnet, intercept appended as last column
    x_float = hcat(convert(Matrix{Float64}, x, center=true, scale=true), intercept)

    # Bernoulli response: success probability is the inverse logit of Xb
    linear_pred = snp_bm * beta_true
    prob = logistic.(linear_pred)
    y = Float64[rand(Bernoulli(pr)) for pr in prob]

    # one random fold assignment, handed to both glmnet and IHT
    num_folds = 3
    folds = rand(1:num_folds, size(x, 1))

    # LASSO: cross-validate in R, refit at lambda.min, pull the coefficients back
    @rput x_float y folds num_folds
    R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds, family='binomial')"
    R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min, family='binomial')$beta"
    R"lasso_beta = as.vector(lasso_beta_tmp)"
    @rget lasso_cv_result lasso_beta
    lasso_k_est = count(!iszero, lasso_beta)

    # IHT: cross-validate over sparsity levels 1..50; because the grid is 1:50,
    # the index of the smallest error is the chosen sparsity level itself
    k_grid = collect(1:50)
    cv_errors = cv_iht_distributed(x, intercept, y, 1, k_grid, folds, num_folds, glm_name,
        use_maf = false, debias=true)
    iht_k_est = argmin(cv_errors)
    iht_fit = L0_logistic_reg(x, snp_bm, intercept, y, 1, iht_k_est, glm = "logistic",
        debias=true, show_info=false, convg=true, init=false)
    iht_beta = iht_fit.beta

    # side-by-side view of the estimates at the truly non-zero positions
    compare_model = DataFrame(
        true_β  = beta_true[causal_idx],
        IHT_β   = iht_beta[causal_idx],
        lasso_β = lasso_beta[causal_idx])
    @show compare_model

    # precision = tp / (tp + fp), recall = tp / (tp + fn), for each method
    iht_tp = count(!iszero, iht_beta[causal_idx])
    iht_fp, iht_fn = iht_k_est - iht_tp, k - iht_tp
    iht_precision = iht_tp / (iht_tp + iht_fp)
    iht_recall = iht_tp / (iht_tp + iht_fn)

    lasso_tp = count(!iszero, lasso_beta[causal_idx])
    lasso_fp, lasso_fn = lasso_k_est - lasso_tp, k - lasso_tp
    lasso_precision = lasso_tp / (lasso_tp + lasso_fp)
    lasso_recall = lasso_tp / (lasso_tp + lasso_fn)

    return iht_precision, iht_recall, lasso_precision, lasso_recall
end

iht_lasso_logistic (generic function with 1 method)

In [4]:
# Repeat the logistic IHT-vs-LASSO simulation `total_runs` times with a fixed
# seed and write each metric to its own file under ./logistic_results/.
#
# Takes no arguments. The result vectors are local to this function; they are
# only available afterwards through the files written at the end.
function run()
    Random.seed!(2019)

    # problem size is the same for every run
    total_runs = 30
    n = 1000
    p = 10000
    k = 10

    iht_precision = zeros(total_runs)
    iht_recall = zeros(total_runs)
    lasso_precision = zeros(total_runs)
    lasso_recall = zeros(total_runs)
    for i in 1:total_runs
        println("current run = $i")
        ihtp, ihtr, lassop, lassor = iht_lasso_logistic(n, p, k)
        iht_precision[i] = ihtp
        iht_recall[i] = ihtr
        lasso_precision[i] = lassop
        lasso_recall[i] = lassor
    end

    # make sure the output directory exists so the writes below cannot fail
    # after all simulations have already been run
    mkpath("./logistic_results")
    writedlm("./logistic_results/iht_precision", iht_precision)
    writedlm("./logistic_results/iht_recall", iht_recall)
    writedlm("./logistic_results/lasso_precision", lasso_precision)
    # previously lasso_precision was written a second time here and
    # lasso_recall was never saved
    writedlm("./logistic_results/lasso_recall", lasso_recall)
end

current run = 1
compare_model = 10×3 DataFrame
│ Row │ true_β   │ IHT_β     │ lasso_β   │
│     │ Float64  │ Float64   │ Float64   │
├─────┼──────────┼───────────┼───────────┤
│ 1   │ -1.29964 │ -1.20647  │ -0.696055 │
│ 2   │ -0.2177  │ 0.0       │ -0.032045 │
│ 3   │ 0.786217 │ 0.748135  │ 0.35188   │
│ 4   │ 0.599233 │ 0.500641  │ 0.144367  │
│ 5   │ 0.283711 │ 0.0       │ 0.101466  │
│ 6   │ -1.12537 │ -1.09784  │ -0.63629  │
│ 7   │ 0.693374 │ 0.580258  │ 0.261347  │
│ 8   │ -0.67709 │ -0.718785 │ -0.337016 │
│ 9   │ 0.14727  │ 0.0       │ 0.0       │
│ 10  │ 1.03477  │ 1.02831   │ 0.591364  │


In [5]:
# Display the metrics side by side: one row per run, columns are
# IHT precision, IHT recall, LASSO precision, LASSO recall.
# NOTE(review): `run()` above keeps its result vectors local and only writes
# them to disk, so this expression reads whatever global vectors of these names
# exist in the session — confirm they hold the logistic results.
[iht_precision iht_recall lasso_precision lasso_recall]

1×4 Array{Float64,2}:
 1.0  0.7  0.128571  0.9

# Poisson responses

In [11]:
# Simulate one data set with a Poisson response, fit it with glmnet's Poisson
# LASSO and with IHT, and score both fits against the known true predictors.
#
# Arguments
#   n, p : dimensions handed to `simulate_random_snparray`
#   k    : number of non-zero entries placed in the true coefficient vector
#
# Returns `(iht_precision, iht_recall, lasso_precision, lasso_recall)`.
#
# The local names `x_float`, `y`, `folds`, `num_folds`, `lasso_cv_result`,
# `lasso_beta` and `compare_model` must not be renamed: `@rput`/`@rget` and the
# R snippets share them by name, and `@show` prints the variable name.
function iht_lasso_poisson(n::Int64, p::Int64, k::Int64)
    glm_name = "poisson"

    # simulated genotypes (second return value is not needed here), their
    # centered/scaled bit-matrix wrapper, and the intercept as the only covariate
    x, _ = simulate_random_snparray(n, p, "poisson.bed")
    snp_bm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    intercept = ones(n, 1)

    # true model: k effects drawn from Normal(0, 0.3), scattered at random positions
    beta_true = zeros(p)
    beta_true[1:k] = rand(Normal(0, 0.3), k)
    shuffle!(beta_true)
    causal_idx = findall(!iszero, beta_true)

    # dense centered/scaled Float64 copy for glmnet, intercept appended as last column
    x_float = hcat(convert(Matrix{Float64}, x, center=true, scale=true), intercept)

    # Poisson response: the rate is the inverse log link (exp) of Xb
    linear_pred = snp_bm * beta_true
    λ = exp.(linear_pred)
    y = Float64[rand(Poisson(rate)) for rate in λ]

    # one random fold assignment, handed to both glmnet and IHT
    num_folds = 3
    folds = rand(1:num_folds, size(x, 1))

    # LASSO: cross-validate in R, refit at lambda.min, pull the coefficients back
    @rput x_float y folds num_folds
    R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds, family='poisson')"
    R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min, family='poisson')$beta"
    R"lasso_beta = as.vector(lasso_beta_tmp)"
    @rget lasso_cv_result lasso_beta
    lasso_k_est = count(!iszero, lasso_beta)

    # IHT: cross-validate over sparsity levels 1..50; because the grid is 1:50,
    # the index of the smallest error is the chosen sparsity level itself
    k_grid = collect(1:50)
    cv_errors = cv_iht_distributed(x, intercept, y, 1, k_grid, folds, num_folds, glm_name,
        use_maf=false, debias=true)
    iht_k_est = argmin(cv_errors)
    iht_fit = L0_poisson_reg(x, snp_bm, intercept, y, 1, iht_k_est, glm = "poisson",
        debias=true, convg=false, show_info=false, true_beta=beta_true, scale=false,
        init=false)
    iht_beta = iht_fit.beta

    # side-by-side view of the estimates at the truly non-zero positions
    compare_model = DataFrame(
        true_β  = beta_true[causal_idx],
        IHT_β   = iht_beta[causal_idx],
        lasso_β = lasso_beta[causal_idx])
    @show compare_model

    # precision = tp / (tp + fp), recall = tp / (tp + fn), for each method
    iht_tp = count(!iszero, iht_beta[causal_idx])
    iht_fp, iht_fn = iht_k_est - iht_tp, k - iht_tp
    iht_precision = iht_tp / (iht_tp + iht_fp)
    iht_recall = iht_tp / (iht_tp + iht_fn)

    lasso_tp = count(!iszero, lasso_beta[causal_idx])
    lasso_fp, lasso_fn = lasso_k_est - lasso_tp, k - lasso_tp
    lasso_precision = lasso_tp / (lasso_tp + lasso_fp)
    lasso_recall = lasso_tp / (lasso_tp + lasso_fn)

    return iht_precision, iht_recall, lasso_precision, lasso_recall
end

iht_lasso_poisson (generic function with 1 method)

In [12]:
# Repeat the Poisson IHT-vs-LASSO simulation `total_runs` times with a fixed
# seed and write each metric to its own file under ./poisson_results/.
#
# Takes no arguments. The result vectors are local to this function; they are
# only available afterwards through the files written at the end.
function run()
    Random.seed!(2019)

    # problem size is the same for every run
    total_runs = 30
    n = 1000
    p = 10000
    k = 10

    iht_precision = zeros(total_runs)
    iht_recall = zeros(total_runs)
    lasso_precision = zeros(total_runs)
    lasso_recall = zeros(total_runs)
    for i in 1:total_runs
        println("current run = $i")
        ihtp, ihtr, lassop, lassor = iht_lasso_poisson(n, p, k)
        iht_precision[i] = ihtp
        iht_recall[i] = ihtr
        lasso_precision[i] = lassop
        lasso_recall[i] = lassor
    end

    # make sure the output directory exists so the writes below cannot fail
    # after all simulations have already been run
    mkpath("./poisson_results")
    writedlm("./poisson_results/iht_precision", iht_precision)
    writedlm("./poisson_results/iht_recall", iht_recall)
    writedlm("./poisson_results/lasso_precision", lasso_precision)
    # previously lasso_precision was written a second time here and
    # lasso_recall was never saved
    writedlm("./poisson_results/lasso_recall", lasso_recall)
end

current run = 1
compare_model = 10×3 DataFrame
│ Row │ true_β     │ IHT_β     │ lasso_β   │
│     │ Float64    │ Float64   │ Float64   │
├─────┼────────────┼───────────┼───────────┤
│ 1   │ -0.389892  │ -0.395049 │ -0.273347 │
│ 2   │ -0.0653099 │ 0.0       │ 0.0       │
│ 3   │ 0.235865   │ 0.265943  │ 0.177224  │
│ 4   │ 0.17977    │ 0.235471  │ 0.156896  │
│ 5   │ 0.0851134  │ 0.0       │ 0.0187873 │
│ 6   │ -0.33761   │ -0.306962 │ -0.18046  │
│ 7   │ 0.208012   │ 0.2307    │ 0.177488  │
│ 8   │ -0.203127  │ -0.216157 │ -0.139893 │
│ 9   │ 0.0441809  │ 0.0       │ 0.0       │
│ 10  │ 0.310431   │ 0.294472  │ 0.219388  │
lasso estimated 50
IHT estimated 7
7
0
3
8
42
2


In [13]:
# Display the metrics side by side: one row per run, columns are
# IHT precision, IHT recall, LASSO precision, LASSO recall.
# NOTE(review): `run()` above keeps its result vectors local and only writes
# them to disk, so this expression reads whatever global vectors of these names
# exist in the session — confirm they hold the Poisson results.
[iht_precision iht_recall lasso_precision lasso_recall]

1×4 Array{Float64,2}:
 1.0  0.7  0.16  0.8