# Poisson - Compare coefficient estimates for IHT and lasso

In [1]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using DelimitedFiles
using GLM
using RCall
R"library(glmnet)"

│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /u/home/b/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [2]:
# Draw an n × p genotype matrix: column j holds Binomial(2, maf_j) minor-allele counts with
# maf_j ~ Uniform(0.05, 0.5); the matrix is then passed through `standardize!`.
function _simulate_genotypes(n::Int, p::Int)
    mafs = rand(Uniform(0.05, 0.5), p)
    x = zeros(Float32, n, p)
    for j in 1:p
        dist = Binomial(2, mafs[j])
        for ii in 1:n
            x[ii, j] = Float32(rand(dist))
        end
    end
    standardize!(x)
    return x
end

# Draw one response per row of `x` from family `d`, with mean linkinv(l, x * true_b).
function _simulate_response(x::AbstractMatrix, true_b::AbstractVector, d::UnionAll,
                            l::GLM.Link, gamma_shape::Real)
    μ = GLM.linkinv.(l, x * true_b)
    if d == Normal || d == Poisson || d == Bernoulli
        # the means are clamped to [-20, 20] before sampling
        # (presumably a guard against extreme values -- confirm the intended bounds)
        clamp!(μ, -20, 20)
        return [rand(d(m)) for m in μ]
    elseif d == NegativeBinomial
        r = 10
        prob = 1 ./ (1 .+ μ ./ r)
        return [rand(d(r, q)) for q in prob] # number of failures before r successes occur
    elseif d == Gamma
        # Distributions' Gamma(α, θ) takes shape α and *scale* θ (mean = α * θ), so the
        # scale that gives mean μ is μ / α. The previous code referenced an undefined `α`
        # and passed the rate 1/μ in the scale slot.
        θ = μ ./ gamma_shape
        return [rand(d(gamma_shape, s)) for s in θ]
    else
        throw(ArgumentError("unsupported distribution family: $d"))
    end
end

"""
    run_repeat(n, p, repeats, z, true_b, cor_pos, d, l; gamma_shape = 1)

Simulate `repeats` independent data sets and fit each one with IHT (`L0_reg`) and with a
cross-validated lasso (R's `glmnet`, called through RCall), recording the estimates at the
positions of the true predictors.

Every repeat draws a fresh `n × p` genotype matrix and a fresh response vector; only
`true_b` and `z` are shared across repeats.

# Arguments
- `n`: number of samples (rows of the simulated genotype matrix).
- `p`: number of SNPs (columns of the simulated genotype matrix).
- `repeats`: number of simulated data sets.
- `z`: matrix of non-genetic covariates, passed unchanged to `L0_reg`.
- `true_b`: true effect-size vector; multiplied by the `n × p` genotype matrix.
- `cor_pos`: indices of the true predictors; its length is also the sparsity level `k`
  given to `L0_reg`.
- `d`: distribution family as an unparameterized type (`Normal`, `Poisson`, `Bernoulli`,
  `NegativeBinomial` or `Gamma`).
- `l`: GLM link used to map the linear predictor to the mean.

# Keywords
- `gamma_shape`: shape parameter used when `d == Gamma` (ignored otherwise).

# Returns
A tuple `(IHT_estimated_β, lasso_estimated_β)` of `length(cor_pos) × repeats` matrices;
column `i` holds the estimates from repeat `i` at the indices in `cor_pos`.

# Throws
- `ArgumentError` if `d` is not one of the supported families.
"""
function run_repeat(
    n        :: Int,
    p        :: Int,
    repeats  :: Int, 
    z        :: AbstractMatrix,
    true_b   :: Vector,
    cor_pos  :: Vector, #correct position of the true model
    d        :: UnionAll,
    l        :: GLM.Link;
    gamma_shape :: Real = 1
    )
    
    k = length(cor_pos)
    IHT_estimated_β = zeros(k, repeats)
    lasso_estimated_β = zeros(k, repeats)
    
    for i in 1:repeats
        # simulate SNP data and phenotypes (i.e. the vector y)
        x = _simulate_genotypes(n, p)
        y = Float32.(_simulate_response(x, true_b, d, l, gamma_shape))

        # compute IHT results; the literal 1 and k are presumably the number of groups and
        # the per-group sparsity expected by MendelIHT -- TODO confirm against L0_reg
        IHT_result = L0_reg(x, z, y, 1, k, d(), l)

        # compute lasso results. choose λ via cross validation
        # NOTE: the lasso fit always uses family='poisson', whatever `d` is
        num_folds = 3
        folds = rand(1:num_folds, size(x, 1))
        @rput x y folds num_folds #make variables visible to R
        R"lasso_cv_result = cv.glmnet(x, y, nfolds = num_folds, foldid = folds, family='poisson')"
        R"lasso_beta_tmp = glmnet(x, y, lambda=lasso_cv_result$lambda.min, family='poisson')$beta"
        R"lasso_beta = as.vector(lasso_beta_tmp)"
        @rget lasso_beta #only the coefficient vector is needed on the Julia side

        #store the estimates at the true predictors' positions
        IHT_estimated_β[:, i] .= IHT_result.beta[cor_pos]
        lasso_estimated_β[:, i] .= lasso_beta[cor_pos]
    end
    
    return IHT_estimated_β, lasso_estimated_β
end

run_repeat (generic function with 1 method)

## Run 100 models

Here $\beta_{true}$ is the same in all models, but the design matrix and phenotype are re-simulated for every model. To run lasso, we first choose the tuning parameter $\lambda$ by cross-validation and then fit the standard lasso with that $\lambda$. For each true predictor, we compute the mean and standard deviation of its estimate over the models in which it was selected (i.e. estimated $\beta_i \neq 0$). Note that here we are not comparing false positives or false negatives. 

In [3]:
# problem size: samples, SNPs, and number of simulated data sets
n, p, repeats = 5000, 10000, 100

# response family and its canonical link
d = Poisson
l = canonicallink(d())

# fix the seed so the shuffle below and the simulations are reproducible
Random.seed!(2019)

# a single column of ones serves as the intercept
z = fill(one(Float32), n, 1)

# true model: six non-zero effects placed at random positions
true_b = zeros(p)
true_b[1:6] .= [0.01, 0.03, 0.05, 0.1, 0.25, 0.5]
shuffle!(true_b)
correct_position = findall(!iszero, true_b)

# run (and time) all repeats
iht_result, lasso_result = @time run_repeat(
    n, p, repeats, z, true_b, correct_position, d, l)

│   caller = poislogpdf at pois.jl:18 [inlined]
└ @ Core /u/home/b/biona001/.julia/packages/StatsFuns/hg4pG/src/distrs/pois.jl:18


4874.291185 seconds (553.63 M allocations: 75.746 GiB, 0.56% gc time)


([0.0 0.0 … 0.0528748 0.0; 0.0497643 0.0 … 0.0574094 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.509794 0.477069 … 0.512654 0.494566], [0.0 0.0 … 0.00224377 0.0; 0.00662803 0.0 … 0.0112765 0.003919; … ; 0.0 0.0 … 0.0 0.0; 0.470077 0.429881 … 0.487958 0.450758])

In [4]:
# IHT estimates at the true predictors: one row per entry of correct_position,
# one column per repeat
iht_result

6×100 Array{Float64,2}:
 0.0        0.0        0.0       0.0        …  0.0        0.0528748  0.0     
 0.0497643  0.0        0.047106  0.0628911     0.0570181  0.0574094  0.0     
 0.228839   0.262226   0.262653  0.250997      0.245492   0.250475   0.246462
 0.0965199  0.0919223  0.103844  0.0846764     0.0895321  0.0964624  0.130771
 0.0        0.0        0.0       0.0           0.0        0.0        0.0     
 0.509794   0.477069   0.489537  0.507455   …  0.513434   0.512654   0.494566

In [5]:
# lasso estimates at the true predictors: one row per entry of correct_position,
# one column per repeat
lasso_result

6×100 Array{Float64,2}:
 0.0         0.0        0.0        …  0.0        0.00224377  0.0      
 0.00662803  0.0        0.0           0.011202   0.0112765   0.003919 
 0.186557    0.23129    0.212933      0.199296   0.220281    0.213068 
 0.0520362   0.0485685  0.0605761     0.0413027  0.0576767   0.0850391
 0.0         0.0        0.0           0.0        0.0         0.0      
 0.470077    0.429881   0.450512   …  0.470328   0.487958    0.450758 

In [6]:
# For every true predictor (row), keep only the repeats in which the method gave it a
# non-zero estimate. Rows are enumerated from each matrix itself instead of a hard-coded
# 1:6, so this keeps working when the number of true predictors changes.
iht_non_zero = [filter(!iszero, iht_result[i, :]) for i in axes(iht_result, 1)]
lasso_non_zero = [filter(!iszero, lasso_result[i, :]) for i in axes(lasso_result, 1)];

In [7]:
# mean of the non-zero IHT estimates, one value per true predictor
# (NaN when a predictor has no non-zero estimate at all)
iht_non_zero_mean = map(mean, iht_non_zero)

6-element Array{Float64,1}:
   0.04967247943083445
   0.05674616127244888
   0.24937559753656388
   0.0993067517131567 
 NaN                  
   0.4980309695005417 

In [8]:
# mean of the non-zero lasso estimates, one value per true predictor
lasso_non_zero_mean = map(mean, lasso_non_zero)

6-element Array{Float64,1}:
 0.004334018501960412 
 0.01301941904766544  
 0.2114382319224645   
 0.05523909447775663  
 0.0003998595945450759
 0.4629356518901696   

In [9]:
# standard deviation of the non-zero IHT estimates, one value per true predictor
# (NaN when fewer than two non-zero estimates are available)
iht_non_zero_std = map(std, iht_non_zero)

6-element Array{Float64,1}:
   0.004515144008110586
   0.008123108521738082
   0.011414869590965474
   0.012735053383877036
 NaN                   
   0.01123355719152502 

In [10]:
# standard deviation of the non-zero lasso estimates, one value per true predictor
# (NaN when fewer than two non-zero estimates are available)
lasso_non_zero_std = map(std, lasso_non_zero)

6-element Array{Float64,1}:
   0.0035055165569766503
   0.008947507944566835 
   0.012848143900630762 
   0.013938785318554999 
 NaN                    
   0.016330620387666514 

In [11]:
# Summary table with one row per true predictor (in the order of correct_position):
# the true effect size next to the mean and standard deviation of the non-zero
# IHT and lasso estimates computed above.
final = DataFrame(
    true_β = true_b[correct_position],
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std, 
    β_lasso_mean = lasso_non_zero_mean,
    β_lasso_std = lasso_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_lasso_mean,β_lasso_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.03,0.0496725,0.00451514,0.00433402,0.00350552
2,0.05,0.0567462,0.00812311,0.0130194,0.00894751
3,0.25,0.249376,0.0114149,0.211438,0.0128481
4,0.1,0.0993068,0.0127351,0.0552391,0.0139388
5,0.01,,,0.00039986,
6,0.5,0.498031,0.0112336,0.462936,0.0163306
