# Poisson - Compare coefficient estimates for IHT and lasso

In [1]:
using Revise
using MendelIHT
using MendelGWAS
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using CSV
using GLM
using RCall
R"library(glmnet)"

│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /Users/biona001/.julia/packages/RCall/g7dhB/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [2]:
"""
    run_repeat(n, p, repeats, z, true_b, cor_pos, d, l)

Simulate `repeats` independent data sets that share the same true effect vector
`true_b`, fit each one with IHT (`L0_reg`), cross-validated lasso (R's `glmnet`
through RCall) and a marginal analysis (MendelGWAS), and collect the estimates at
the true predictor positions `cor_pos`.

# Arguments
- `n`, `p`: number of samples and SNPs passed to `simulate_random_snparray`.
- `repeats`: number of simulated data sets.
- `z`: matrix of non-genetic covariates; it is appended to the genotype matrix
  for lasso and passed to `L0_reg`.
- `true_b`: true effect vector that is multiplied with the `n × p` genotype matrix.
- `cor_pos`: indices of the true predictors in `true_b`.
- `d`: response distribution type (`Normal`, `Poisson`, `Bernoulli`,
  `NegativeBinomial` or `Gamma`).
- `l`: GLM link whose inverse maps the linear predictor to the mean.

# Returns
Three `length(cor_pos) × repeats` matrices `(IHT, lasso, marginal)`; column `i`
holds the estimates from the `i`-th simulated data set.

# Notes
- The lasso fit and the MendelGWAS control file are hard-coded to Poisson
  regression, regardless of `d`.
- Files `tmp.bed`, `tmp_control.txt` and `tmp_table.txt` (plus whatever
  `make_bim_fam_files` writes) are created in the working directory, overwritten
  on every repeat, and not removed afterwards.
"""
function run_repeat(
    n        :: Int,
    p        :: Int,
    repeats  :: Int, 
    z        :: AbstractMatrix,
    true_b   :: Vector,
    cor_pos  :: Vector, #correct position of the true model
    d        :: UnionAll,
    l        :: GLM.Link
    )
    
    k = length(cor_pos)
    IHT_estimated_β = zeros(k, repeats)
    lasso_estimated_β = zeros(k, repeats)
    marginal_estimated_β = zeros(k, repeats)

    for rep in 1:repeats
        # simulate SNP data
        x = simulate_random_snparray(n, p, "tmp.bed")
        xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true); 
        x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z] #Float64 version of x

        #simulate phenotypes (e.g. vector y)
        if d == Normal || d == Poisson || d == Bernoulli
            prob = GLM.linkinv.(l, xbm * true_b)
            clamp!(prob, -20, 20)
            y = [rand(d(μi)) for μi in prob]
        elseif d == NegativeBinomial
            r = 10
            μ = GLM.linkinv.(l, xbm * true_b)
            prob = 1 ./ (1 .+ μ ./ r)
            y = [rand(d(r, pi_)) for pi_ in prob] #number of failures before r successes occur
        elseif d == Gamma
            α = 1.0 # shape parameter (was previously undefined inside this function)
            μ = GLM.linkinv.(l, xbm * true_b)
            # Distributions.jl parameterizes Gamma(shape, scale) with mean shape * scale,
            # so a scale of μ / α yields a response with mean μ.
            y = [rand(d(α, μi / α)) for μi in μ]
        else
            throw(ArgumentError("unsupported distribution $d"))
        end
        y = Float64.(y)

        #compute results for data
        IHT_result = L0_reg(x, xbm, z, y, 1, k, d(), l)

        #compute lasso results. choose λ via cross validation
        num_folds = 3
        folds = rand(1:num_folds, size(x, 1))
        @rput x_float y folds num_folds #make variables visible to R
        R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds, family='poisson')"
        R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min, family='poisson')$beta"
        R"lasso_beta = as.vector(lasso_beta_tmp)"
        @rget lasso_beta #only the coefficient vector is needed on the Julia side

        # Now run MendelGWAS (marginal analysis)
        make_bim_fam_files(x, y, "tmp") #create .bim and .fam files for MendelGWAS
        open("tmp_control.txt", "w") do f
            write(f, "plink_input_basename = tmp \n")
            write(f, "plink_field_separator = '\t' \n")
            write(f, "output_table = tmp_table.txt \n")
            write(f, "regression = Poisson \n")
            write(f, "regression_formula = Trait ~ 1\n")
        end
        GWAS("tmp_control.txt")
        result = CSV.read("tmp_table.txt")
        
        #store the estimates at the true positions
        IHT_estimated_β[:, rep] .= IHT_result.beta[cor_pos]
        lasso_estimated_β[:, rep] .= lasso_beta[cor_pos]
        # column 6 of the MendelGWAS table; presumably the effect estimate — TODO confirm
        marginal_estimated_β[:, rep] .= result[cor_pos, 6]
    end
    
    return IHT_estimated_β, lasso_estimated_β, marginal_estimated_β
end

run_repeat (generic function with 1 method)

## Run 100 models

Here $\beta_{true}$ is the same in all models, but the design matrix and phenotype are simulated anew for each model. To run lasso, we first cross-validate for the tuning parameter $\lambda$ and then refit the standard lasso at the selected $\lambda$. For each true predictor, we compute the mean and standard deviation of its estimate over the repeats in which it was identified (i.e. estimated $\beta_i \neq 0$). Note that here we are not comparing false positives or false negatives. 

In [3]:
# Simulation settings: number of replicate data sets, samples and SNPs
repeats, n, p = 100, 5000, 10000

# Poisson response with its canonical link
d = Poisson
l = canonicallink(d())

# Seed the global RNG so the shuffle below and the simulations are reproducible
Random.seed!(2020)

# Non-genetic covariates: a single column of ones (intercept)
z = ones(n, 1)

# True model b: six non-zero effects moved to random positions among the p SNPs
true_b = zeros(p)
true_b[1:6] .= [0.01, 0.03, 0.05, 0.1, 0.25, 0.5]
shuffle!(true_b)
correct_position = findall(!iszero, true_b)

# Run all repeats and time them
@time iht_result, lasso_result, marginal_result = run_repeat(n, p, repeats, z, true_b, correct_position, d, l)

 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 2
  control_file = tmp_control.txt
  output_table = tmp_table.txt
  pedigree_file = tmp.fam
  plink_field_separator = 	
  plink_input_basename = tmp
  regression = Poisson
  regression_formula = Trait ~ 1
  snpdata_file = tmp.bed
  snpdefinition_file = tmp.bim
 
 
Analyzing the data.

 
Table of p-values for all SNPs is in file: tmp_table.txt
 
 
Mendel's analysis is finished.

 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 2
  control_file = tmp_control.txt
  output_table = tmp_table.txt
  pedigree_file = tmp.fam
  plink_field_separator = 	
  plink_input_basename = tmp
  regression

([0.0 0.0 … 0.0 0.0; 0.0 0.054221624807373324 … 0.0 0.0; … ; 0.35906241459028904 0.5003901925850526 … 0.5050030530118692 0.49831617620734603; 0.0 0.0 … 0.0 0.0], [0.0 0.0 … 0.0 0.0; 0.0 0.010298963351037567 … 0.0 0.0; … ; 0.35322360342242265 0.4714628721429879 … 0.4621034530335884 0.450285118394372; 0.0 0.0 … 0.0 0.0], [NaN NaN … NaN NaN; NaN NaN … NaN NaN; … ; NaN 1.0016300557057696 … 0.75919500038641 0.7150168851197319; NaN NaN … NaN NaN])

In [4]:
# IHT estimates at the true predictor positions: one row per entry of
# correct_position (in that order), one column per repeat.
iht_result

6×100 Array{Float64,2}:
 0.0        0.0        0.0        0.0       …  0.0       0.0       0.0     
 0.0        0.0542216  0.0        0.0          0.0       0.0       0.0     
 0.195348   0.233275   0.239347   0.256797     0.238692  0.253888  0.240465
 0.0587599  0.106387   0.0859585  0.124953     0.100811  0.108789  0.103818
 0.359062   0.50039    0.505846   0.506734     0.492391  0.505003  0.498316
 0.0        0.0        0.0        0.0       …  0.0       0.0       0.0     

In [5]:
# Lasso estimates at the true predictor positions: one row per entry of
# correct_position (in that order), one column per repeat.
lasso_result

6×100 Array{Float64,2}:
 0.0         0.0        0.0        …  0.0        0.0        0.0      
 0.0         0.010299   0.0           0.0        0.0        0.0      
 0.142006    0.193656   0.2036        0.195637   0.22351    0.198884 
 0.00510234  0.0613356  0.0441151     0.0728919  0.0653809  0.0519284
 0.353224    0.471463   0.46983       0.451719   0.462103   0.450285 
 0.0         0.0        0.0        …  0.0        0.0        0.0      

In [16]:
# Overwrite NaN marginal estimates with 0.0 (in place) so that the non-zero
# filtering further down skips them.
marginal_result[isnan.(marginal_result)] .= 0.0
marginal_result

6×100 Array{Float64,2}:
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     
 0.0       0.0       0.0       0.0          0.0       0.0       0.0     
 0.282548  0.421735  0.400131  0.379903     0.371328  0.564182  0.412516
 0.0       0.214653  0.146086  0.188062     0.609007  0.203008  0.133465
 0.0       1.00163   0.796422  1.09415      0.744162  0.759195  0.715017
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     

In [17]:
# For every true predictor (one row of each result matrix), keep only the repeats
# in which the method returned a non-zero estimate. Rows are taken from the matrix
# itself rather than a hard-coded count, so this works for any number of predictors.
# A predictor that was never selected yields an empty vector.
iht_non_zero = [filter(!iszero, iht_result[i, :]) for i in axes(iht_result, 1)]
lasso_non_zero = [filter(!iszero, lasso_result[i, :]) for i in axes(lasso_result, 1)]
marginal_non_zero = [filter(!iszero, marginal_result[i, :]) for i in axes(marginal_result, 1)];

In [18]:
# Mean non-zero IHT estimate per true predictor; NaN when the predictor was
# never selected (mean of an empty vector).
iht_non_zero_mean = map(mean, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.056552321864968794
   0.24700950049912454 
   0.10020388358125838 
   0.49213070652681823 
   0.05030758450523561 

In [19]:
# Mean non-zero lasso estimate per true predictor; NaN when the predictor was
# never selected (mean of an empty vector).
lasso_non_zero_mean = map(mean, lasso_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.012269619984479669
   0.20847464182049402 
   0.05815246909981811 
   0.4584211737788706  
   0.007665359512413977

In [20]:
# Mean non-zero marginal estimate per true predictor; NaN when no non-zero
# estimate exists for that predictor (mean of an empty vector).
marginal_non_zero_mean = map(mean, marginal_non_zero)

6-element Array{Float64,1}:
 NaN                  
   0.16467929420122857
   0.4524797152742854 
   0.19761177931200846
   0.9426858915061739 
 NaN                  

In [21]:
# Standard deviation of the non-zero IHT estimates per true predictor; NaN when
# fewer than two non-zero estimates are available.
iht_non_zero_std = map(std, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.007828852485881867
   0.011892103250521185
   0.01354509718041563 
   0.03928006737640751 
   0.004915768216286467

In [22]:
# Standard deviation of the non-zero lasso estimates per true predictor; NaN when
# fewer than two non-zero estimates are available.
lasso_non_zero_std = map(std, lasso_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.00916152964832935 
   0.015340437147771748
   0.016053259821774177
   0.03709926770629462 
   0.005472543451330883

In [23]:
# Standard deviation of the non-zero marginal estimates per true predictor; NaN
# when fewer than two non-zero estimates are available.
marginal_non_zero_std = map(std, marginal_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.049018549122905465
   0.18415951295324715 
   0.09744887837477016 
   0.33117053961134363 
 NaN                   

In [24]:
# Summary table with one row per true predictor (ordered as in correct_position):
# the true effect size next to the mean and standard deviation of the non-zero
# estimates from IHT, lasso and the marginal analysis. NaN entries come from
# predictors with no (or, for the std columns, fewer than two) non-zero estimates.
final = DataFrame(
    true_β = true_b[correct_position],
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std, 
    β_lasso_mean = lasso_non_zero_mean,
    β_lasso_std = lasso_non_zero_std,
    β_marginal_mean = marginal_non_zero_mean,
    β_marginal_std = marginal_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_lasso_mean,β_lasso_std,β_marginal_mean,β_marginal_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.01,,,,,,
2,0.05,0.0565523,0.00782885,0.0122696,0.00916153,0.164679,0.0490185
3,0.25,0.24701,0.0118921,0.208475,0.0153404,0.45248,0.18416
4,0.1,0.100204,0.0135451,0.0581525,0.0160533,0.197612,0.0974489
5,0.5,0.492131,0.0392801,0.458421,0.0370993,0.942686,0.331171
6,0.03,0.0503076,0.00491577,0.00766536,0.00547254,,
