# Precision Recall curves

LASSO is currently the de facto penalized least squares method for [feature selection](https://en.wikipedia.org/wiki/Feature_selection). Here we compare precision/recall curves of LASSO with IHT for GWAS data. Since the focus here is not scalability, we test our sample problems on moderately sized genotype matrices of 5000 samples with 100,000 SNPs (covariates).

In [2]:
using GLMNet
using GLM
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using StatsFuns: logistic
using Random
using LinearAlgebra
using DelimitedFiles
using Distributed

In [4]:
# Launch 4 additional worker processes next to the master process.
addprocs(4)
# Report the total number of processes (master + workers); the cell output is 5.
nprocs()

5

# Poisson IHT vs Poisson LASSO

In [5]:
"""
    iht_lasso_poisson(n::Int64, p::Int64, sim::Int64)

Simulate one Poisson-response data set with `n` samples and `p` SNPs and compare
cross-validated IHT against cross-validated LASSO on it.

The true model has `k = 10` nonzero effects drawn from `Normal(0, 0.4)` and placed at
random positions. Both methods use the same 3 cross-validation folds. A Poisson GLM
fitted on the true predictors only is shown alongside for reference. A comparison
table and a one-line summary per method are printed.

# Arguments
- `n`: number of samples.
- `p`: number of SNPs.
- `sim`: index of the simulation. Currently unused; it is only referenced by the
  disabled `writedlm` call near the end of the function.

# Returns
An 8-tuple
`(lasso_found, lasso_false_pos, lasso_false_neg, lasso_time,
  iht_found, iht_false_pos, iht_false_neg, iht_time)`
where the `*_found` entries count true predictors with a nonzero estimate and the
`*_time` entries are wall-clock seconds.

# Side effects
Reseeds the global RNG with `Random.seed!(1111)` on every call, overriding any seed
set by the caller.
"""
function iht_lasso_poisson(n::Int64, p::Int64, sim::Int64)
    # true model size (number of nonzero effects)
    k = 10

    # set random seed
    # NOTE(review): a constant seed here resets the caller's RNG stream on every call;
    # confirm this is intended before running several replicates.
    Random.seed!(1111)

    # construct SNP matrix, covariates, and the true model vector
    x, maf = simulate_random_snparray(n, p)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    z           = ones(n, 1)                   # non-genetic covariates, just the intercept
    true_b      = zeros(p)                     # model vector
    true_b[1:k] = rand(Normal(0, 0.4), k)      # k true effects
    shuffle!(true_b)                           # scatter them over random positions
    correct_position = findall(!iszero, true_b) # indices of the true predictors

    # simulate Poisson responses through the inverse log link
    λ = exp.(xbm * true_b)
    y = Float64[rand(Poisson(μ)) for μ in λ]

    # cross-validated Poisson IHT, followed by a fit at the selected model size
    cur_time = time()
    path = collect(1:20)
    num_folds = 3
    folds = rand(1:num_folds, size(x, 1))
    k_est_iht = cv_iht(x, z, y, 1, path, folds, num_folds, use_maf=false, glm="poisson", debias=true)
    iht_result = L0_poisson_reg(x, z, y, 1, k_est_iht, glm = "poisson", debias=true, convg=false, show_info=false, true_beta=true_b, init=false)
    iht_time = time() - cur_time

    # cross-validated Poisson LASSO on the same folds; the dense design matrix is
    # built before the timer starts, as in the IHT comparison above
    x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z]
    cur_time = time()
    cv = glmnetcv(x_float, y, Poisson(), pmax=20, nfolds=3, folds=folds)
    best = argmin(cv.meanloss)
    lasso_result = cv.path.betas[:, best]
    k_est_lasso = count(!iszero, lasso_result)
    lasso_time = time() - cur_time

    # ordinary Poisson regression using only the true predictors plus the intercept
    x_true = [x_float[:, correct_position] z]
    regular_fit = glm(x_true, y, Poisson(), LogLink())
    regular_result = coef(regular_fit)         # public accessor instead of `.pp.beta0`

    # compare estimates at the true positions
    IHT_model = iht_result.beta[correct_position]
    lasso_model = lasso_result[correct_position]
    true_model = true_b[correct_position]
    compare_model = DataFrame(
        true_β  = true_model,
        iht_β   = IHT_model,
        lasso_β = lasso_model,
        regular_β = regular_result[1:k])       # first k entries; the last one is the intercept
    @show compare_model

    # summary statistics; false positives are taken relative to each method's
    # estimated model size
    lasso_num_correct_predictors = count(!iszero, lasso_model)
    lasso_false_positives = k_est_lasso - lasso_num_correct_predictors
    lasso_false_negatives = k - lasso_num_correct_predictors
    iht_num_correct_predictors = count(!iszero, IHT_model)
    iht_false_positives = k_est_iht - iht_num_correct_predictors
    iht_false_negatives = k - iht_num_correct_predictors
    println("IHT cv found $iht_false_positives false positives and " *
            "$iht_false_negatives false negatives, and used $iht_time seconds")
    println("lasso cv found $lasso_false_positives false positives and " *
            "$lasso_false_negatives false negatives, and used $lasso_time seconds \n\n")

    #write y to file to view distribution later
    #writedlm("./IHT_poisson_simulations_mean/data_simulation_$sim.txt", y)

    return lasso_num_correct_predictors, lasso_false_positives, lasso_false_negatives, lasso_time, iht_num_correct_predictors, iht_false_positives, iht_false_negatives, iht_time
end

iht_lasso_poisson (generic function with 1 method)

In [6]:
"""
    run_iht_lasso_poisson()

Run `iht_lasso_poisson` on 10 simulated data sets and print aggregate results.

For each replicate the sample size `n` is drawn uniformly from `1000:3000` and the
number of SNPs `p` is `n` times a uniform draw from `1:10`. The counts of correctly
recovered predictors, false positives and false negatives are summed over all
replicates, and the run time is averaged, separately for IHT and LASSO.

Returns `nothing`; all results are written to standard output.
"""
function run_iht_lasso_poisson()
    iter = 10   # number of simulated data sets
    k = 10      # true model size per data set; mirrors `k` inside `iht_lasso_poisson`

    lasso_total_found = 0
    lasso_false_positives = 0
    lasso_false_negatives = 0
    lasso_total_time = 0.0      # Float64 from the start so the accumulator keeps one type
    iht_total_found = 0
    iht_false_positives = 0
    iht_false_negatives = 0
    iht_total_time = 0.0

    for i in 1:iter
        n = rand(1000:3000)
        p = rand(1:10) * n
        println("Running the $i th model where n, p = $n, $p")
        ltf, lfp, lfn, ltt, itf, ifp, ifn, itt = iht_lasso_poisson(n, p, i)

        lasso_total_found += ltf
        lasso_false_positives += lfp
        lasso_false_negatives += lfn
        lasso_total_time += ltt
        iht_total_found += itf
        iht_false_positives += ifp
        iht_false_negatives += ifn
        iht_total_time += itt
    end

    total_true = k * iter       # number of true predictors over all replicates
    println("IHT  : Found $iht_total_found correct predictors, out of $total_true")
    println("IHT  : False positives = $iht_false_positives")
    println("IHT  : False negatives = $iht_false_negatives")
    println("IHT  : Average time = $(iht_total_time / iter)")
    println("Lasso: Found $lasso_total_found correct predictors, out of $total_true")
    println("Lasso: False positives = $lasso_false_positives")
    println("Lasso: False negatives = $lasso_false_negatives")
    println("Lasso: Average time = $(lasso_total_time / iter)")
    return nothing
end

run_iht_lasso_poisson (generic function with 1 method)

In [None]:
# Seed the global RNG, then run the IHT vs. LASSO Poisson comparison.
# NOTE(review): iht_lasso_poisson calls Random.seed!(1111) on every invocation, so
# this seed only governs the draws made before the first call (the first n and p);
# confirm that this is intended.
Random.seed!(2019)
run_iht_lasso_poisson()