# Bernoulli: IHT vs LASSO vs Marginal regression: comparing false positive/negatives

Marginal regression is the most prevalent method used to analyze GWAS data. LASSO is currently the de facto penalized least squares method for [feature selection](https://en.wikipedia.org/wiki/Feature_selection). Here we compare the performance (in terms of the number of false positives/negatives) of LASSO and marginal regression with IHT for GWAS data. We use the `glmnet` implementation of cyclic coordinate descent for LASSO, and the `MendelGWAS.jl` module of [OpenMendel](https://openmendel.github.io/MendelGWAS.jl/) for marginal analysis. Since the focus here is not scalability, we test our sample problems on moderately sized genotype matrices of 1000 samples with 10,000 SNPs.

In [1]:
using Distributed
# Launch 25 additional worker processes (presumably what the later
# `cv_iht(..., parallel=true)` call distributes its folds over -- confirm).
addprocs(25)
# Report the total number of processes: the workers plus this master process.
nprocs()

26

In [2]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using Random
using LinearAlgebra
using DelimitedFiles
using GLM
using MendelGWAS
using CSV
using RCall
R"library(glmnet)"

┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /u/home/b/biona001/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFrames support into Gadfly.jl
[ Info: Loading DataFr

RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


# Bernoulli response (i.e. logistic regression)

In [3]:
"""
    iht_lasso_marginal(n, p, k, d, l)

Simulate one data set with `n` samples, `p` SNPs and `k` true predictors whose
response is drawn from distribution `d` with link `l`, then analyze it with three
methods: cross-validated LASSO (`glmnet`, called through RCall), cross-validated IHT,
and marginal regression (MendelGWAS) with a Bonferroni-corrected 0.05 threshold.

A table comparing the estimates at the true positions and the false positive/negative
counts of every method are printed.

# Returns
The tuple `(iht_fp, iht_fn, lasso_fp, lasso_fn, marginal_fp, marginal_fn)`.

# Side effects
Scratch files with fixed names (`tmp.bed`, `tmp.bim`, `tmp.fam`, `tmp_control.txt`,
`tmp_table.txt`, `Mendel_Output.txt`) are used in the current working directory.
They are removed before the function exits, including when an intermediate step
throws.
"""
function iht_lasso_marginal(n::Int64, p::Int64, k::Int64, d::UnionAll, l::Link)
    # every scratch file removed once the analysis is over
    scratch_files = ("tmp.bed", "tmp.bim", "tmp.fam", "tmp_table.txt",
                     "tmp_control.txt", "Mendel_Output.txt")

    # Everything runs inside `try` so that the `finally` clause cleans up the
    # scratch files even if glmnet, IHT or MendelGWAS raises an error.
    try
        #construct snpmatrix, covariate files, and true model b
        x, = simulate_random_snparray(n, p, "tmp.bed")
        xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
        z = ones(n, 1) # the intercept
        # Float64 version of x with the intercept appended as column p + 1
        x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z]

        # simulate response, true model b, and the correct non-0 positions of b
        y, true_b, correct_position = simulate_random_response(x, xbm, k, d, l)

        #specify folds shared by the LASSO and IHT cross validation
        num_folds = 3
        folds = rand(1:num_folds, size(x, 1))

        #run glmnet via Rcall
        @rput x_float y folds num_folds #make variables visible to R
        R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds, family='binomial')"
        R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min, family='binomial')$beta"
        R"lasso_beta = as.vector(lasso_beta_tmp)"
        @rget lasso_beta #only the coefficient vector is needed on the Julia side
        # count selected SNPs only: entry p + 1 belongs to the appended intercept column
        lasso_k_est = count(!iszero, lasso_beta[1:p])

        #candidate sparsity levels tried by IHT's cross validation
        path = collect(1:50)

        #run IHT's cross validation routine
        mses = cv_iht(d(), l, x, z, y, 1, path, folds, num_folds, use_maf=false, debias=false, showinfo=false, parallel=true)
        # argmin returns an index into `path`; translate it to the sparsity level itself
        iht_k_est = path[argmin(mses)]
        iht_result = L0_reg(x, xbm, z, y, 1, iht_k_est, d(), l, debias=false, init=false, use_maf=false)
        iht_beta = iht_result.beta

        #Now run MendelGWAS
        make_bim_fam_files(x, y, "tmp") #create .bim and .fam files for MendelGWAS

        #create a control file for MendelGWAS
        open("tmp_control.txt", "w") do f
            write(f, "plink_input_basename = tmp \n")
            write(f, "plink_field_separator = '\t' \n")
            write(f, "output_table = tmp_table.txt \n")
            write(f, "regression = logistic \n")
            write(f, "affected_designator = 1.0 \n")
            write(f, "regression_formula = Trait ~ \n")
        end

        #run marginal analysis
        GWAS("tmp_control.txt")

        # calculate false positive/negatives based on bonferroni correction
        p_values = CSV.read("tmp_table.txt", delim=',', header=true).Pvalue
        significance = 0.05 / Float64(p)
        passing_snps_position = findall(p_values .<= significance)
        marginal_found = [correct_position[snp] in passing_snps_position for snp in 1:k]

        #show lasso and IHT's reconstruction result
        compare_model = DataFrame(
            true_β  = true_b[correct_position],
            IHT_β   = iht_beta[correct_position],
            lasso_β = lasso_beta[correct_position],
            marginal_found = marginal_found)
        @show compare_model

        #compute true/false positives/negatives for IHT, lasso and marginal regression
        iht_tp = count(!iszero, iht_beta[correct_position])
        # count what was actually selected among the SNPs rather than assuming that
        # all `iht_k_est` requested non-zero entries ended up in `iht_beta`
        iht_fp = count(!iszero, iht_beta) - iht_tp
        iht_fn = k - iht_tp
        lasso_tp = count(!iszero, lasso_beta[correct_position])
        lasso_fp = lasso_k_est - lasso_tp
        lasso_fn = k - lasso_tp
        marginal_tp = count(!iszero, true_b[passing_snps_position])
        marginal_fp = length(passing_snps_position) - marginal_tp
        marginal_fn = k - marginal_tp

        println("IHT false positives = $iht_fp")
        println("IHT false negatives = $iht_fn")
        println("LASSO false positives = $lasso_fp")
        println("LASSO false negatives = $lasso_fn")
        println("marginal false positives = $marginal_fp")
        println("marginal false negatives = $marginal_fn" * "\n")

        return iht_fp, iht_fn, lasso_fp, lasso_fn, marginal_fp, marginal_fn
    finally
        #clean up, whether or not the analysis succeeded
        for file in scratch_files
            rm(file, force=true)
        end
    end
end

iht_lasso_marginal (generic function with 1 method)

In [4]:
"""
    run(; n=1000, p=10000, k=10, total_runs=50)

Repeat the Bernoulli (canonical link) simulation `total_runs` times by calling
`iht_lasso_marginal(n, p, k, Bernoulli, link)`, seeding the global random number
generator with the run index before every repetition.

# Keywords
- `n`, `p`, `k`: forwarded unchanged to `iht_lasso_marginal`.
- `total_runs`: number of repetitions; run `i` uses `Random.seed!(i)`.

# Returns
Six `Vector{Float64}` of length `total_runs`, in this order: IHT false positives,
IHT false negatives, LASSO false positives, LASSO false negatives, marginal false
positives, marginal false negatives.
"""
function run(; n::Int64=1000, p::Int64=10000, k::Int64=10, total_runs::Int64=50)
    #simulate data from distribution d and with link l.
    d = Bernoulli
    l = canonicallink(d())

    #one result vector per (method, error type) pair
    iht_false_positives = zeros(total_runs)
    iht_false_negatives = zeros(total_runs)
    lasso_false_positives = zeros(total_runs)
    lasso_false_negatives = zeros(total_runs)
    marginal_false_positives = zeros(total_runs)
    marginal_false_negatives = zeros(total_runs)

    for i in 1:total_runs
        println("current run = $i")

        #set random seed so that every run is reproducible on its own
        Random.seed!(i)

        iht_fp, iht_fn, lasso_fp, lasso_fn, m_fp, m_fn = iht_lasso_marginal(n, p, k, d, l)
        iht_false_positives[i] = iht_fp
        iht_false_negatives[i] = iht_fn
        lasso_false_positives[i] = lasso_fp
        lasso_false_negatives[i] = lasso_fn
        marginal_false_positives[i] = m_fp
        marginal_false_negatives[i] = m_fn
    end

    return iht_false_positives, iht_false_negatives, lasso_false_positives,
            lasso_false_negatives, marginal_false_positives, marginal_false_negatives
end

run (generic function with 1 method)

In [5]:
# Run the whole simulation study and unpack the six per-run count vectors
# returned by `run()` (false positives/negatives for IHT, LASSO and marginal
# regression, in that order).
iht_false_positives, iht_false_negatives, lasso_false_positives, 
    lasso_false_negatives, marginal_false_positives, marginal_false_negatives = run()

current run = 1
 
 
     Welcome to OpenMendel's
      GWAS analysis option
        version 0.5.0
 
 
Reading the data.

The current working directory is "/u/home/b/biona001/precision_recall".

Keywords modified by the user:

  affected_designator = 1.0
  control_file = tmp_control.txt
  output_table = tmp_table.txt
  pedigree_file = tmp.fam
  plink_field_separator = 	
  plink_input_basename = tmp
  regression = logistic
  regression_formula = Trait ~
  snpdata_file = tmp.bed
  snpdefinition_file = tmp.bim
 
 
Analyzing the data.

 
 
Mendel's analysis is finished.

compare_model = 10×4 DataFrame
│ Row │ true_β    │ IHT_β    │ lasso_β   │ marginal_found │
│     │ Float64   │ Float64  │ Float64   │ Bool           │
├─────┼───────────┼──────────┼───────────┼────────────────┤
│ 1   │ 0.607538  │ 0.0      │ 0.226727  │ false          │
│ 2   │ -1.53004  │ -1.68693 │ -0.951184 │ true           │
│ 3   │ 1.44416   │ 1.77934  │ 1.04999   │ true           │
│ 4   │ 1.75303   │ 1.90754  │ 1.109

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], [3.0, 2.0, 3.0, 6.0, 5.0, 5.0, 2.0, 2.0, 3.0, 2.0  …  3.0, 5.0, 5.0, 4.0, 1.0, 7.0, 3.0, 2.0, 5.0, 3.0], [167.0, 85.0, 221.0, 19.0, 26.0, 44.0, 15.0, 57.0, 79.0, 276.0  …  57.0, 106.0, 22.0, 62.0, 85.0, 31.0, 143.0, 184.0, 132.0, 67.0], [2.0, 1.0, 2.0, 4.0, 5.0, 4.0, 1.0, 2.0, 0.0, 1.0  …  2.0, 2.0, 4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0, 3.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0], [5.0, 5.0, 3.0, 5.0, 5.0, 6.0, 4.0, 3.0, 4.0, 4.0  …  4.0, 5.0, 6.0, 4.0, 3.0, 7.0, 4.0, 4.0, 5.0, 6.0])

In [6]:
# Average every error count over the number of runs that were actually recorded.
# Dividing by the vector length (rather than a literal 50) keeps the averages
# correct if the number of simulation runs is ever changed.
bernoulli_iht_false_positives = sum(iht_false_positives) / length(iht_false_positives)
bernoulli_iht_false_negatives = sum(iht_false_negatives) / length(iht_false_negatives)
bernoulli_lasso_false_positives = sum(lasso_false_positives) / length(lasso_false_positives)
bernoulli_lasso_false_negatives = sum(lasso_false_negatives) / length(lasso_false_negatives)
bernoulli_marginal_false_positives = sum(marginal_false_positives) / length(marginal_false_positives)
bernoulli_marginal_false_negatives = sum(marginal_false_negatives) / length(marginal_false_negatives)

# Column of averages, ordered: IHT fp, IHT fn, LASSO fp, LASSO fn, marginal fp, marginal fn
result = [bernoulli_iht_false_positives; bernoulli_iht_false_negatives; 
        bernoulli_lasso_false_positives; bernoulli_lasso_false_negatives;
        bernoulli_marginal_false_positives; bernoulli_marginal_false_negatives]

6-element Array{Float64,1}:
  0.1 
  3.84
 88.54
  2.38
  0.06
  4.78