# Poisson: compare false positive/negatives with LASSO

LASSO is currently the de facto penalized least squares method for [feature selection](https://en.wikipedia.org/wiki/Feature_selection). Here we compare the performance (in terms of the number of false positives/negative) of LASSO with IHT for GWAS data, using the `glmnet` implementation of cyclic coordinate descent for LASSO. Since the focus here is not scalability, we test our sample problems on moderately sized genotype matrces of 1000 samples with 10,000 SNPs.

In [1]:
using Distributed
addprocs(4)
nprocs()

5

In [35]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using Random
using LinearAlgebra
using DelimitedFiles
using GLM
using MendelGWAS
using CSV
using RCall
R"library(glmnet)"

┌ Info: Recompiling stale cache file /Users/biona001/.julia/compiled/v1.0/MendelGWAS/y6a7g.ji for MendelGWAS [c1bad3ca-268b-5bf6-b82e-de8a361f94db]
└ @ Base loading.jl:1190
┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /Users/biona001/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228
┌ Info: Recompiling stale cache file /Users/biona001/.julia/compiled/v1.0/RCall/8GFyb.ji for RCall [6f49c342-dc21-5d91-9882-a32aef131414]
└ @ Base loading.jl:1190
│  要求されたパッケージ foreach をロード中です 
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /Users/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [3]:
function iht_lasso_marginal(n::Int64, p::Int64, k::Int64, d::UnionAll, l::Link)
    #construct snpmatrix, covariate files, and true model b
    x, = simulate_random_snparray(n, p, "poisson.bed")
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true); 
    z = ones(n, 1) # the intercept
    x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z] #Float64 version of x

    # simulate response, true model b, and the correct non-0 positions of b
    y, true_b, correct_position = simulate_random_response(x, xbm, k, d, l)
 
    #specify path and folds
    num_folds = 3
    folds = rand(1:num_folds, size(x, 1));

    #run glmnet via Rcall
    @rput x_float y folds num_folds #make variables visible to R
    R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds, family='poisson')"
    R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min, family='poisson')$beta"
    R"lasso_beta = as.vector(lasso_beta_tmp)"
    @rget lasso_cv_result lasso_beta #pull result from R to Julia
    lasso_k_est = count(!iszero, lasso_beta)
    
    #find non-zero entries returned by best lasso model as largest k estimate
    path = collect(1:50);
    
    #run IHT's cross validation routine 
    mses = cv_iht(d(), l, x, z, y, 1, path, folds, num_folds, use_maf=false, debias=false, showinfo=false, parallel=true);
    iht_k_est = argmin(mses)
    iht_result = L0_reg(x, xbm, z, y, 1, iht_k_est, d(), l, debias=false, init=false, use_maf=false)
    iht_beta = iht_result.beta
    
    #Now run MendelGWAS
    make_bim_fam_files(x, y, "poisson") #create .bim and .bed files for MendelGWAS

    #create a control file for MendelGWAS
    open("poisson_control.txt", "w") do f
        write(f, "plink_input_basename = poisson \n")
        write(f, "plink_field_separator = '\t' \n")
        write(f, "output_table = poisson_table.txt \n")
        write(f, "regression = Poisson \n")
        write(f, "regression_formula = Trait ~ \n")
    end

    #run marginal analysis
    GWAS("poisson_control.txt");

    # calculate false positive/negatives based on bonferroni correction
    println("reached here")
    p_values = CSV.read("poisson_table.txt", delim=',', header=true)[:Pvalue]
    println("reached here too")
    significance = 0.05 / Float64(p)
    println("reached here 3")
    passing_snps_position = findall(p_values .<= significance)
    println("reached here 4")
    marginal_found = [correct_position[snp] in passing_snps_position for snp in 1:k]
    println("reached here 5")

    #clean up
    rm("poisson.bed", force=true)
    rm("poisson.bim", force=true)
    rm("poisson.fam", force=true)
    rm("poisson_table.txt", force=true)
    rm("poisson_control.txt", force=true)
    rm("Mendel_Output.txt", force=true)
    println("reached here 6")

    #show lasso and IHT's reconstruction result
    compare_model = DataFrame(
        true_β  = true_b[correct_position], 
        IHT_β   = iht_beta[correct_position],
        lasso_β = lasso_beta[correct_position],
        marginal_found = marginal_found)
    @show compare_model
    println("reached here 7")

    
    #compute true/false positives/negatives for IHT and lasso
    iht_tp = count(!iszero, iht_beta[correct_position])
    iht_fp = iht_k_est - iht_tp
    iht_fn = k - iht_tp
    lasso_tp = count(!iszero, lasso_beta[correct_position])
    lasso_fp = lasso_k_est - lasso_tp
    lasso_fn = k - lasso_tp
    marginal_tp = count(!iszero, true_b[passing_snps_position])
    marginal_fp = length(passing_snps_position) - marginal_tp
    marginal_fn = k - marginal_tp
    
    println("IHT false positives = $iht_fp")
    println("IHT false negatives = $iht_fn")
    println("LASSO false positives = $lasso_fp")
    println("LASSO false negatives = $lasso_fn")
    println("marginal false positives = $marginal_fp")
    println("marginal false negatives = $marginal_fn" * "\n")
    
    return iht_fp, iht_fn, lasso_fp, lasso_fn, marginal_fp, marginal_fn
end

iht_lasso_marginal (generic function with 1 method)

In [7]:
function run()
    #simulat data with k true predictors, from distribution d and with link l.
    n = 1000
    p = 10000
    k = 10
    d = Poisson
    l = canonicallink(d())

    #run function above, saving results in 4 vectors
    total_runs = 50
    iht_false_positives = zeros(total_runs)
    iht_false_negatives = zeros(total_runs)
    lasso_false_positives = zeros(total_runs)
    lasso_false_negatives = zeros(total_runs)
    marginal_false_positives = zeros(total_runs)
    marginal_false_negatives = zeros(total_runs)
    for i in 1:total_runs
        println("current run = $i")
        
        #set random seed
        Random.seed!(i)
        
        iht_fp, iht_fn, lasso_fp, lasso_fn, m_fp, m_fn = iht_lasso_marginal(n, p, k, d, l)
        iht_false_positives[i] = iht_fp
        iht_false_negatives[i] = iht_fn
        lasso_false_positives[i] = lasso_fp
        lasso_false_negatives[i] = lasso_fn
        marginal_false_positives[i] = m_fp
        marginal_false_negatives[i] = m_fn
    end
    
    return iht_false_positives, iht_false_negatives, lasso_false_positives, 
            lasso_false_negatives, marginal_false_positives, marginal_false_negatives
end

run (generic function with 1 method)

# Note: this code was ran on cluster

The code above was run via the command below on cluster machine. The result was saved, and will be imported later.

In [None]:
iht_false_positives, iht_false_negatives, lasso_false_positives, 
    lasso_false_negatives, marginal_false_positives, marginal_false_negatives = run()

current run = 1
 
 
     Welcome to OpenMendel's
      GWAS analysis option
        version 0.5.0
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/precision_recall".

Keywords modified by the user:

  affected_designator = 2
  control_file = poisson_control.txt
  output_table = poisson_table.txt
  pedigree_file = poisson.fam
  plink_field_separator = 	
  plink_input_basename = poisson
  regression = Poisson
  regression_formula = Trait ~
  snpdata_file = poisson.bed
  snpdefinition_file = poisson.bim
 
 
Analyzing the data.

 
 
Mendel's analysis is finished.

reached here
reached here too
reached here 3
reached here 4
reached here 5
reached here 6
compare_model = 10×4 DataFrame
│ Row │ true_β     │ IHT_β     │ lasso_β   │ marginal_found │
│     │ Float64    │ Float64   │ Float64   │ Bool           │
├─────┼────────────┼───────────┼───────────┼────────────────┤
│ 1   │ 0.182262   │ 0.152774  │ 0.0966563 │ true           │
│ 2   │ -0.

│  getcoef(fit, nvars, nx, vnames) で警告がありました: 
│   an empty model has been returned; probably a convergence issue
└ @ RCall /Users/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113


[31mDid not converge!!!!! The run time for IHT was 10.322705030441284 seconds and model size was13[39m
 
 
     Welcome to OpenMendel's
      GWAS analysis option
        version 0.5.0
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/precision_recall".

Keywords modified by the user:

  affected_designator = 2
  control_file = poisson_control.txt
  output_table = poisson_table.txt
  pedigree_file = poisson.fam
  plink_field_separator = 	
  plink_input_basename = poisson
  regression = Poisson
  regression_formula = Trait ~
  snpdata_file = poisson.bed
  snpdefinition_file = poisson.bim
 
 
Analyzing the data.

 
 
Mendel's analysis is finished.

reached here
reached here too
reached here 3
reached here 4
reached here 5
reached here 6
compare_model = 10×4 DataFrame
│ Row │ true_β    │ IHT_β   │ lasso_β │ marginal_found │
│     │ Float64   │ Float64 │ Float64 │ Bool           │
├─────┼───────────┼─────────┼─────────┼────────────────┤

│  getcoef(fit, nvars, nx, vnames) で警告がありました: 
│   an empty model has been returned; probably a convergence issue
└ @ RCall /Users/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113


 
 
     Welcome to OpenMendel's
      GWAS analysis option
        version 0.5.0
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/precision_recall".

Keywords modified by the user:

  affected_designator = 2
  control_file = poisson_control.txt
  output_table = poisson_table.txt
  pedigree_file = poisson.fam
  plink_field_separator = 	
  plink_input_basename = poisson
  regression = Poisson
  regression_formula = Trait ~
  snpdata_file = poisson.bed
  snpdefinition_file = poisson.bim
 
 
Analyzing the data.

 
 
Mendel's analysis is finished.

reached here
reached here too
reached here 3
reached here 4
reached here 5
reached here 6
compare_model = 10×4 DataFrame
│ Row │ true_β    │ IHT_β     │ lasso_β │ marginal_found │
│     │ Float64   │ Float64   │ Float64 │ Bool           │
├─────┼───────────┼───────────┼─────────┼────────────────┤
│ 1   │ 0.353633  │ 0.339164  │ 0.0     │ true           │
│ 2   │ -0.133638 │ 0.0       │ 0.0    

## Importing results.... 

In [8]:
#this code was ran on cluster
using DelimitedFiles
poisson_iht_false_positives = readdlm("poisson_results/iht_false_positives")
poisson_iht_false_negatives = readdlm("poisson_results/lasso_false_negatives")
poisson_lasso_false_positives = readdlm("poisson_results/lasso_false_positives")
poisson_lasso_false_negatives = readdlm("poisson_results/lasso_false_negatives")
poisson_marginal_false_positives = readdlm("poisson_results/marginal_false_positives")
poisson_marginal_false_negatives = readdlm("poisson_results/marginal_false_negatives")

50×1 Array{Float64,2}:
 2.0
 1.0
 2.0
 4.0
 5.0
 6.0
 3.0
 2.0
 3.0
 2.0
 3.0
 2.0
 5.0
 ⋮  
 3.0
 5.0
 3.0
 3.0
 3.0
 2.0
 2.0
 6.0
 3.0
 3.0
 2.0
 2.0

In [50]:
IHT_dnc = 1
LASSO_dnc = 8
marginal_dnc = 0
lasso_dnc_index = [2; 10; 17; 24; 27; 37; 38; 40; ]

result = [sum(poisson_iht_false_positives)/50; sum(poisson_iht_false_negatives)/50; 
        sum(poisson_lasso_false_positives)/50; sum(poisson_lasso_false_negatives)/50;
        sum(poisson_marginal_false_positives)/50; sum(poisson_marginal_false_negatives)/50]

6-element Array{Float64,1}:
   0.56
   3.64
  27.74
   3.64
 866.68
   3.14

In [14]:
poisson_iht_false_positives = readdlm("poisson_results/iht_false_positives")
poisson_iht_false_negatives = readdlm("poisson_results/lasso_false_negatives")
poisson_lasso_false_positives = readdlm("poisson_results/lasso_false_positives")
poisson_lasso_false_negatives = readdlm("poisson_results/lasso_false_negatives")
poisson_marginal_false_positives = readdlm("poisson_results/marginal_false_positives")
poisson_marginal_false_negatives = readdlm("poisson_results/marginal_false_negatives")

50×1 Array{Float64,2}:
 2.0
 1.0
 2.0
 4.0
 5.0
 6.0
 3.0
 2.0
 3.0
 2.0
 3.0
 2.0
 5.0
 ⋮  
 3.0
 5.0
 3.0
 3.0
 3.0
 2.0
 2.0
 6.0
 3.0
 3.0
 2.0
 2.0

In [53]:
iht_false_positive_exclude_nonconvergent = [getindex(poisson_iht_false_positives, i) for i = 1:50 if !(i in lasso_dnc_index)]
iht_false_negative_exclude_nonconvergent = [getindex(poisson_iht_false_negatives, i) for i = 1:50 if !(i in lasso_dnc_index)]
lasso_false_positive_exclude_nonconvergent = [getindex(poisson_lasso_false_positives, i) for i = 1:50 if !(i in lasso_dnc_index)]
lasso_false_negative_exclude_nonconvergent = [getindex(poisson_lasso_false_negatives, i) for i = 1:50 if !(i in lasso_dnc_index)]
marginal_false_positive_exclude_nonconvergent = [getindex(poisson_marginal_false_positives, i) for i = 1:50 if !(i in lasso_dnc_index)]
marginal_false_negative_exclude_nonconvergent = [getindex(poisson_marginal_false_negatives, i) for i = 1:50 if !(i in lasso_dnc_index)]

42-element Array{Float64,1}:
 2.0
 2.0
 4.0
 5.0
 6.0
 3.0
 2.0
 3.0
 3.0
 2.0
 5.0
 1.0
 5.0
 ⋮  
 3.0
 3.0
 3.0
 3.0
 3.0
 2.0
 2.0
 6.0
 3.0
 3.0
 2.0
 2.0

In [54]:
@show sum(iht_false_positive_exclude_nonconvergent) / 42
@show sum(iht_false_negative_exclude_nonconvergent) / 42
@show sum(lasso_false_positive_exclude_nonconvergent) / 42
@show sum(lasso_false_negative_exclude_nonconvergent) / 42
@show sum(marginal_false_positive_exclude_nonconvergent) / 42 #still pretty bad
@show sum(marginal_false_negative_exclude_nonconvergent) / 42 

sum(iht_false_positive_exclude_nonconvergent) / 42 = 0.38095238095238093
sum(iht_false_negative_exclude_nonconvergent) / 42 = 2.4285714285714284
sum(lasso_false_positive_exclude_nonconvergent) / 42 = 33.023809523809526
sum(lasso_false_negative_exclude_nonconvergent) / 42 = 2.4285714285714284
sum(marginal_false_positive_exclude_nonconvergent) / 42 = 142.23809523809524
sum(marginal_false_negative_exclude_nonconvergent) / 42 = 3.3095238095238093


3.3095238095238093

In [33]:
lasso_dnc_index

8-element Array{Int64,1}:
  2
 10
 17
 24
 27
 37
 38
 40

# 9000+ hits are a consequence of extreme y values

In [38]:
n = 1000
p = 10000
k = 10
d = Poisson
l = canonicallink(d())

Random.seed!(37)

x, = simulate_random_snparray(n, p, "poisson.bed")
xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true); 
z = ones(n, 1) # the intercept
x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z] #Float64 version of x
y, true_b, correct_position = simulate_random_response(x, xbm, k, d, l)

make_bim_fam_files(x, y, "poisson")

#create a control file for MendelGWAS
open("poisson_control.txt", "w") do f
    write(f, "plink_input_basename = poisson \n")
    write(f, "plink_field_separator = '\t' \n")
    write(f, "output_table = poisson_table.txt \n")
    write(f, "regression = Poisson \n")
    write(f, "regression_formula = Trait ~ \n")
end

GWAS("poisson_control.txt");

# calculate false positive/negatives based on bonferroni correction
p_values = CSV.read("poisson_table.txt", delim=',', header=true)[:Pvalue]
significance = 0.05 / Float64(p)
passing_snps_position = findall(p_values .<= significance)
marginal_found = [correct_position[snp] in passing_snps_position for snp in 1:k]


 
 
     Welcome to OpenMendel's
      GWAS analysis option
        version 0.5.0
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/precision_recall".

Keywords modified by the user:

  affected_designator = 2
  control_file = poisson_control.txt
  output_table = poisson_table.txt
  pedigree_file = poisson.fam
  plink_field_separator = 	
  plink_input_basename = poisson
  regression = Poisson
  regression_formula = Trait ~
  snpdata_file = poisson.bed
  snpdefinition_file = poisson.bim
 
 
Analyzing the data.

 
 
Mendel's analysis is finished.



10-element Array{Bool,1}:
  true
 false
  true
  true
  true
  true
  true
  true
  true
  true

In [39]:
p_values

10000-element Array{Union{Missing, Float64},1}:
 1.3451240680413934e-75 
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 ⋮                      
 0.0                    
 0.0                    
 1.6597422626641304e-238
 0.0                    
 0.0008455539546774351  
 2.0e-323               
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    
 0.0                    

In [45]:
# result = CSV.read("poisson_table.txt", delim=',', header=true)
maximum(y), var(y)

(24217.0, 980310.9098058061)

In [42]:
rm("poisson.bed", force=true)
rm("poisson.bim", force=true)
rm("poisson.fam", force=true)
rm("poisson_table.txt", force=true)
rm("poisson_control.txt", force=true)
rm("Mendel_Output.txt", force=true)