# Logistic - Compare coefficient estimates for IHT and lasso

In [1]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using DelimitedFiles
using GLM
using RCall
R"library(glmnet)"

│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /u/home/b/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [2]:
"""
    compare(d::UnionAll, l::Link)

Simulate one data set with correlated SNPs and compare how many truly causal SNPs are
recovered by IHT without group information versus IHT with group information.

The simulation uses `n = 1000` samples and `p = 10000` SNPs split into consecutive blocks
of 20 SNPs; each block is one group. Five groups are drawn at random and three SNPs inside
each of them receive an effect of `+0.2` or `-0.2`, giving 15 causal SNPs in total. The
phenotype is drawn from distribution `d` with the mean obtained through link `l`.

# Arguments
- `d`: distribution type; one of `Normal`, `Bernoulli`, `Poisson` or `NegativeBinomial`.
- `l`: link function passed to `linkinv` and to `L0_reg`.

# Returns
A tuple `(group, ungroup)`: the number of causal positions whose estimated coefficient is
nonzero in the grouped and in the ungrouped fit, respectively.

# Throws
- `ArgumentError` if `d` is not one of the supported distributions.

# Side effects
Prints a per-position comparison table and the iteration counts of both fits. The file
"tmp.bed" handed to `simulate_correlated_snparray` is removed before returning, also when
an error occurs.
"""
function compare(d::UnionAll, l::Link)
    # Fail before doing any work; otherwise the phenotype would be left undefined below.
    d in (Normal, Bernoulli, Poisson, NegativeBinomial) ||
        throw(ArgumentError("unsupported distribution $d"))

    # define problem size
    n = 1000
    p = 10000
    block_size = 20
    num_blocks = p ÷ block_size
    num_true_groups = 5
    snps_per_group = 3
    num_causal = num_true_groups * snps_per_group

    # Group labels: SNPs (i - 1) * block_size + 1 : i * block_size belong to group i.
    # The extra trailing entry reuses the last group's label; presumably it belongs to the
    # intercept column `z` -- confirm against L0_reg.
    g = vcat(repeat(1:num_blocks, inner=block_size), num_blocks)

    try
        # simulate correlated snparray
        x = simulate_correlated_snparray(n, p, "tmp.bed")
        z = ones(n, 1) # the intercept
        x_float = convert(Matrix{Float64}, x,
            model=ADDITIVE_MODEL, center=true, scale=true)

        # simulate true model, where 5 groups each with 3 snps contribute
        true_b = zeros(p)
        true_groups = randperm(num_blocks)[1:num_true_groups]
        # Column i holds the within-block offsets of the causal SNPs of the i-th true group.
        within_group = reduce(hcat,
            [randperm(block_size)[1:snps_per_group] for _ in 1:num_true_groups])
        correct_position = zeros(Int64, num_causal)
        for i in 1:num_true_groups
            group_offset = block_size * (true_groups[i] - 1)
            slots = snps_per_group * (i - 1) + 1:snps_per_group * i
            correct_position[slots] .= group_offset .+ within_group[:, i]
        end
        # Effects are assigned before sorting so the random draws keep their original order.
        for pos in correct_position
            true_b[pos] = rand(-1:2:1) * 0.2
        end
        sort!(correct_position)

        # simulate phenotype
        # NOTE(review): the clamp is applied after `linkinv`, i.e. to the mean rather than
        # to the linear predictor -- confirm this is intended.
        if d == NegativeBinomial
            r = 10
            μ = linkinv.(l, x_float * true_b)
            clamp!(μ, -20, 20)
            prob = 1 ./ (1 .+ μ ./ r)
            y = [rand(d(r, q)) for q in prob] # number of failures before r successes occur
        else
            prob = linkinv.(l, x_float * true_b)
            clamp!(prob, -20, 20)
            y = [rand(d(q)) for q in prob]
        end
        y = Float64.(y)

        # run IHT without groups: a single group holding all 15 predictors
        ungrouped = L0_reg(x_float, z, y, 1, num_causal, d(), l, debias=false)

        # run IHT with groups: 5 groups with 3 predictors each
        grouped = L0_reg(x_float, z, y, num_true_groups, snps_per_group, d(), l,
            debias=false, group=g)

        # check result
        compare_model = DataFrame(
            position   = correct_position,
            correct    = true_b[correct_position],
            ungrouped  = ungrouped.beta[correct_position],
            grouped    = grouped.beta[correct_position])
        @show compare_model
        println("Grouped ran: ", grouped.iter, " iters and Ungrouped ran: ", ungrouped.iter, " iters")
        println("\n")

        group = count(!iszero, grouped.beta[correct_position])
        ungroup = count(!iszero, ungrouped.beta[correct_position])
        return group, ungroup
    finally
        # clean up, even if the simulation or one of the fits threw
        rm("tmp.bed", force=true)
    end
end

run_repeat (generic function with 1 method)

## Run 100 models

Here $\beta_{true}$ is the same in all models, but the design matrix and phenotype differ between runs. To run lasso, we first cross-validate to choose the tuning parameter $\lambda$ and then fit the standard lasso with that value. For each true predictor we compute the mean and standard deviation of its estimates over the runs in which it was selected (i.e. the estimate $\hat{\beta}_i \neq 0$). Note that here we are not comparing false positives or false negatives. 

In [3]:
# Experiment configuration: number of replicates, problem size, and response model
# (Bernoulli with its canonical link).
repeats = 100
n, p = 5000, 10000
d = Bernoulli
l = canonicallink(d())

# Fix the seed so the shuffled effect positions below are reproducible.
Random.seed!(2019)

# Single-column design for the intercept.
z = ones(Float32, n, 1)

# True coefficient vector: six nonzero effects placed at random positions.
true_b = zeros(p)
true_b[1:6] .= [0.01, 0.03, 0.05, 0.1, 0.25, 0.5]
shuffle!(true_b)
correct_position = findall(!iszero, true_b)

# Run all replicates and time them.
@time iht_result, lasso_result = run_repeat(n, p, repeats, z, true_b, correct_position, d, l)

7522.213603 seconds (40.62 M allocations: 20.844 GiB, 0.03% gc time)


([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.474713 0.559795 … 0.518652 0.533072], [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.356865 0.425148 … 0.381897 0.414738])

In [4]:
# Display the IHT estimates (6×100 in this run); presumably one row per true predictor
# and one column per replicate -- confirm against run_repeat.
iht_result

6×100 Array{Float64,2}:
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     
 0.0       0.0       0.0       0.0          0.0       0.0       0.0     
 0.299919  0.239312  0.232653  0.222768     0.280829  0.242682  0.279235
 0.0       0.13667   0.0       0.14567      0.0       0.0       0.0     
 0.0       0.0       0.0       0.0          0.0       0.0       0.0     
 0.474713  0.559795  0.474136  0.487957  …  0.510568  0.518652  0.533072

In [5]:
# Display the lasso estimates (6×100 in this run); presumably one row per true predictor
# and one column per replicate -- confirm against run_repeat.
lasso_result

6×100 Array{Float64,2}:
 0.0          0.0        0.0       0.0        …  0.0       0.0       0.0     
 0.0          0.0        0.0       0.0           0.0       0.0       0.0     
 0.188276     0.117103   0.119253  0.110791      0.157104  0.120708  0.168432
 0.000621037  0.0158941  0.0       0.0327681     0.0       0.0       0.0     
 0.0          0.0        0.0       0.0           0.0       0.0       0.0     
 0.356865     0.425148   0.350042  0.366872   …  0.386655  0.381897  0.414738

In [6]:
# For each of the 6 rows keep only the nonzero estimates, giving one vector per row.
# A row that is zero everywhere yields an empty vector.
iht_non_zero = [filter(!iszero, iht_result[row, :]) for row in 1:6]
lasso_non_zero = [filter(!iszero, lasso_result[row, :]) for row in 1:6];

In [7]:
# Mean of the nonzero IHT estimates per row; NaN where a row has no nonzero estimate.
iht_non_zero_mean = map(mean, iht_non_zero)

6-element Array{Float64,1}:
   0.1335183084011078 
   0.11446361243724823
   0.25645575910806656
   0.12844786211484815
 NaN                  
   0.5040046945214272 

In [8]:
# Mean of the nonzero lasso estimates per row; NaN where a row has no nonzero estimate.
lasso_non_zero_mean = map(mean, lasso_non_zero)

6-element Array{Float64,1}:
   0.026024788555032277
   0.010405771468260074
   0.14167074811788927 
   0.020725702478585614
 NaN                   
   0.37921347950538864 

In [9]:
# Standard deviation of the nonzero IHT estimates per row; NaN where a row has fewer
# than two nonzero estimates.
iht_non_zero_std = map(std, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.01092739233999524 
   0.028641150297164263
   0.0162601857072553  
 NaN                   
   0.028991719145716056

In [10]:
# Standard deviation of the nonzero lasso estimates per row; NaN where a row has fewer
# than two nonzero estimates.
lasso_non_zero_std = map(std, lasso_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.011147662835270798
   0.027798884257837206
   0.015303219549302327
 NaN                   
   0.029624209719069428

In [11]:
# Summary table: one row per true nonzero coefficient, listing its true value next to the
# mean and standard deviation of its nonzero IHT and lasso estimates. NaN entries come
# from rows with too few nonzero estimates to compute the statistic.
final = DataFrame(
    true_β = true_b[correct_position],
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std, 
    β_lasso_mean = lasso_non_zero_mean,
    β_lasso_std = lasso_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_lasso_mean,β_lasso_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.03,0.133518,,0.0260248,
2,0.05,0.114464,0.0109274,0.0104058,0.0111477
3,0.25,0.256456,0.0286412,0.141671,0.0277989
4,0.1,0.128448,0.0162602,0.0207257,0.0153032
5,0.01,,,,
6,0.5,0.504005,0.0289917,0.379213,0.0296242


# 5000 by 10000 for 100 runs 
7522.213603 seconds

In [12]:
# Summary table: one row per true nonzero coefficient, listing its true value next to the
# mean and standard deviation of its nonzero IHT and lasso estimates. NaN entries come
# from rows with too few nonzero estimates to compute the statistic.
final = DataFrame(
    true_β = true_b[correct_position],
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std, 
    β_lasso_mean = lasso_non_zero_mean,
    β_lasso_std = lasso_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_lasso_mean,β_lasso_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.03,0.133518,,0.0260248,
2,0.05,0.114464,0.0109274,0.0104058,0.0111477
3,0.25,0.256456,0.0286412,0.141671,0.0277989
4,0.1,0.128448,0.0162602,0.0207257,0.0153032
5,0.01,,,,
6,0.5,0.504005,0.0289917,0.379213,0.0296242


# 10000 by 20000 for 3 runs

In [29]:
# Summary table: one row per true nonzero coefficient, listing its true value next to the
# mean and standard deviation of its nonzero IHT and lasso estimates. NaN entries come
# from rows with too few nonzero estimates to compute the statistic.
final = DataFrame(
    true_β = true_b[correct_position],
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std, 
    β_lasso_mean = lasso_non_zero_mean,
    β_lasso_std = lasso_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_lasso_mean,β_lasso_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.25,0.253357,0.0123672,0.171736,0.0132097
2,0.05,0.107912,,0.0152292,0.0109794
3,0.03,,,,
4,0.5,0.508404,0.0154591,0.420122,0.00909588
5,0.1,0.124637,0.044301,0.0482534,0.0372184
6,0.01,,,,
