# Poisson - Compare coefficient estimates for IHT and lasso

In [1]:
using Revise
using MendelIHT
using MendelGWAS
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using CSV
using GLM
using RCall
R"library(glmnet)"

┌ Info: Recompiling stale cache file /Users/biona001/.julia/compiled/v1.2/MendelGWAS/y6a7g.ji for MendelGWAS [c1bad3ca-268b-5bf6-b82e-de8a361f94db]
└ @ Base loading.jl:1240
  ** incremental compilation may be fatally broken for this module **

│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /Users/biona001/.julia/packages/RCall/g7dhB/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [2]:
# Run `repeats` independent simulations and, for each one, record the coefficient
# estimates at the true-model positions from three methods: IHT (`L0_reg`),
# cross-validated lasso (R's glmnet through RCall) and marginal regression (MendelGWAS).
#
# A new SNP matrix and a new phenotype vector are simulated in every repeat; only
# `true_b` is shared across repeats.
#
# Arguments:
#   n, p        - number of samples and SNPs passed to `simulate_random_snparray`
#   repeats     - number of simulated data sets
#   z           - non-genetic covariates passed to `L0_reg`
#   true_b      - true effect sizes, multiplied against the centered/scaled SNP matrix
#   cor_pos     - indices of the nonzero entries of `true_b`
#   d, l        - response distribution (e.g. `Poisson`) and GLM link function
#   gamma_shape - shape parameter, used only when `d == Gamma`
#
# Returns three `length(cor_pos) × repeats` matrices: IHT, lasso and marginal estimates.
#
# NOTE: the lasso fit (family='poisson') and the MendelGWAS control file
# (regression = Poisson) are hard-coded to Poisson regardless of `d`.
# NOTE: files named tmp.*, tmp_control.txt and tmp_table.txt in the current working
# directory are reused by every repeat.
function run_repeat(
    n        :: Int,
    p        :: Int,
    repeats  :: Int, 
    z        :: AbstractMatrix,
    true_b   :: Vector,
    cor_pos  :: Vector, #correct position of the true model
    d        :: UnionAll,
    l        :: GLM.Link;
    gamma_shape :: Real = 1.0
    )
    
    k = size(cor_pos, 1)
    IHT_estimated_β = zeros(k, repeats)
    lasso_estimated_β = zeros(k, repeats)
    marginal_estimated_β = zeros(k, repeats)

    for i in 1:repeats
        # simulate SNP data
        x = simulate_random_snparray(n, p, "tmp.bed")
        xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
        # NOTE(review): x_float (standardized genotypes plus z) is never used below;
        # glmnet receives the raw `x` instead. Confirm which design matrix is intended.
        x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z] #Float64 version of x

        #simulate phenotypes (e.g. vector y)
        if d == Normal || d == Poisson || d == Bernoulli
            prob = GLM.linkinv.(l, xbm * true_b)
            clamp!(prob, -20, 20)
            y = [rand(d(μi)) for μi in prob]
        elseif d == NegativeBinomial
            r = 10
            μ = GLM.linkinv.(l, xbm * true_b)
            prob = 1 ./ (1 .+ μ ./ r)
            y = [rand(d(r, pi)) for pi in prob] #number of failures before r successes occur
        elseif d == Gamma
            μ = GLM.linkinv.(l, xbm * true_b)
            # Distributions' Gamma(α, θ) takes shape α and *scale* θ (mean = α * θ),
            # so the scale that gives mean μ is μ / α.
            θ = μ ./ gamma_shape
            y = [rand(d(gamma_shape, θi)) for θi in θ]
        else
            throw(ArgumentError("unsupported response distribution: $d"))
        end
        y = Float64.(y)

        #compute results for data
        IHT_result = L0_reg(x, xbm, z, y, 1, k, d(), l)

        #compute lasso results. choose λ via cross validation
        num_folds = 3
        folds = rand(1:num_folds, size(x, 1))
        @rput x y folds num_folds #make variables visible to R
        R"lasso_cv_result = cv.glmnet(x, y, nfolds = num_folds, foldid = folds, family='poisson')"
        R"lasso_beta_tmp = glmnet(x, y, lambda=lasso_cv_result$lambda.min, family='poisson')$beta"
        R"lasso_beta = as.vector(lasso_beta_tmp)"
        @rget lasso_cv_result lasso_beta #pull result from R to Julia

        # Now run MendelGWAS (marginal analysis)
        make_bim_fam_files(x, y, "tmp") #create .bim and .fam files for MendelGWAS
        open("tmp_control.txt", "w") do f
            write(f, "plink_input_basename = tmp \n")
            write(f, "plink_field_separator = '\t' \n")
            write(f, "output_table = tmp_table.txt \n")
            write(f, "regression = Poisson \n")
            write(f, "regression_formula = Trait ~ 1\n")
        end
        GWAS("tmp_control.txt")
        result = CSV.read("tmp_table.txt")
        
        #store the estimates at the true-model positions (column 6 of the GWAS table
        #is taken as the marginal estimate)
        IHT_estimated_β[:, i] .= IHT_result.beta[cor_pos]
        lasso_estimated_β[:, i] .= lasso_beta[cor_pos]
        marginal_estimated_β[:, i] .= result[cor_pos, 6]
    end
    
    return IHT_estimated_β, lasso_estimated_β, marginal_estimated_β
end

run_repeat (generic function with 1 method)

## Run 100 models

Here $\beta_{true}$ is the same in all models, but the design matrix and phenotype are re-simulated for each one. To run lasso, we first cross-validate to choose the tuning parameter $\lambda$ and then fit the standard lasso at that value. For each true predictor, we compute the mean and standard deviation of its estimate over the repeats in which it was correctly identified (i.e. estimated $\beta_i \neq 0$). Note that here we are not comparing false positives or false negatives. 

In [3]:
# Simulation settings: number of repeats, samples and SNPs, plus the response
# distribution and its canonical link.
repeats = 100
n = 5000
p = 10000
d = Poisson
l = canonicallink(d())

# Seed Julia's global RNG before anything random happens below.
Random.seed!(2020)

# Non-genetic covariates: a single column of ones (the intercept).
z = ones(n, 1)

# True model: six nonzero effect sizes scattered to random positions.
true_b = zeros(p)
true_b[1:6] .= [0.01, 0.03, 0.05, 0.1, 0.25, 0.5]
shuffle!(true_b)
correct_position = findall(!iszero, true_b)

# Run all repeats and time the whole thing.
@time iht_result, lasso_result, marginal_result =
    run_repeat(n, p, repeats, z, true_b, correct_position, d, l)

 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 2
  control_file = tmp_control.txt
  output_table = tmp_table.txt
  pedigree_file = tmp.fam
  plink_field_separator = 	
  plink_input_basename = tmp
  regression = Poisson
  regression_formula = Trait ~ 1
  snpdata_file = tmp.bed
  snpdefinition_file = tmp.bim
 
 
Analyzing the data.

 
Table of p-values for all SNPs is in file: tmp_table.txt
 
 
Mendel's analysis is finished.

 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 2
  control_file = tmp_control.txt
  output_table = tmp_table.txt
  pedigree_file = tmp.fam
  plink_field_separator = 	
  plink_input_basename = tmp
  regression

([0.0 0.0 … 0.0 0.0; 0.0 0.05832343868602918 … 0.05431371535700101 0.055213596848378656; … ; 0.35906241459028904 0.4894019227592323 … 0.48596633572089004 0.5052469779198554; 0.0 0.0 … 0.0 0.0], [0.0 0.0 … 0.0 0.0; 0.0 0.006794247470393869 … 0.03029190532367507 0.07830409150890993; … ; 1.489423895194209 0.6826379466776341 … 0.4086783952768785 0.48952506092892906; 0.0 0.0 … 0.0 0.0], [0.0 0.0 … 0.0 0.0; 0.0 0.12943931789894425 … 0.24585923044514038 0.0; … ; 0.0 1.3062636962499337 … 0.6881276197807473 0.9201330793185503; 0.0 0.0 … 0.0 0.0])

In [4]:
# IHT estimates: one row per true predictor (in correct_position order), one column per repeat.
iht_result

6×100 Array{Float64,2}:
 0.0        0.0        0.0        …  0.0        0.0        0.0      
 0.0        0.0583234  0.0           0.0        0.0543137  0.0552136
 0.195348   0.246422   0.250602      0.245703   0.252682   0.248557 
 0.0587599  0.11175    0.0909621     0.0859438  0.0873269  0.0772075
 0.359062   0.489402   0.495944      0.49478    0.485966   0.505247 
 0.0        0.0        0.0        …  0.0        0.0        0.0      

In [5]:
# Lasso estimates: one row per true predictor (in correct_position order), one column per repeat.
lasso_result

6×100 Array{Float64,2}:
 0.0         0.0         0.0       …  0.0        0.0        0.0      
 0.0         0.00679425  0.0          0.0        0.0302919  0.0783041
 0.127801    0.335882    0.521553     0.174189   0.253626   0.179629 
 0.00447473  0.14719     0.106143     0.0800553  0.0353601  0.0231276
 1.48942     0.682638    0.654212     0.418547   0.408678   0.489525 
 0.0         0.0         0.0       …  0.0        0.0        0.0      

In [6]:
# Marginal (MendelGWAS) estimates: one row per true predictor, one column per repeat.
marginal_result

6×100 Array{Float64,2}:
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     
 0.0       0.129439  0.0       0.0          0.0       0.245859  0.0     
 0.282548  0.692139  1.02022   0.441847     0.350734  0.542527  0.394751
 0.0       0.474781  0.311654  0.172583     0.301113  0.141198  0.0     
 0.0       1.30626   1.24169   1.31633      0.698953  0.688128  0.920133
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     

In [7]:
# For every true predictor (one row of each result matrix), keep only the estimates
# from repeats in which the method selected it, i.e. the nonzero entries of that row.
# Rows are enumerated from the matrices themselves instead of a hard-coded count.
iht_non_zero = [filter(!iszero, iht_result[i, :]) for i in axes(iht_result, 1)]
lasso_non_zero = [filter(!iszero, lasso_result[i, :]) for i in axes(lasso_result, 1)]
marginal_non_zero = [filter(!iszero, marginal_result[i, :]) for i in axes(marginal_result, 1)];

In [8]:
# Mean IHT estimate per true predictor; a predictor that was never selected yields NaN.
iht_non_zero_mean = map(mean, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.05858668683293008 
   0.24555736105153778 
   0.09827424326530226 
   0.4933780679962815  
   0.050587183951516485

In [9]:
# Mean lasso estimate per true predictor; a predictor that was never selected yields NaN.
lasso_non_zero_mean = map(mean, lasso_non_zero)

6-element Array{Float64,1}:
 NaN                    
   0.016918479913159683 
   0.2319068622689472   
   0.06732022490749663  
   0.5565802894007392   
   0.0034734154260624346

In [10]:
# Mean marginal estimate per true predictor; a predictor with no nonzero estimate yields NaN.
marginal_non_zero_mean = map(mean, marginal_non_zero)

6-element Array{Float64,1}:
 NaN                  
   0.1559938565515263 
   0.4725155796962604 
   0.20095538669142823
   0.9892909642829543 
 NaN                  

In [11]:
# Sample standard deviation of the IHT estimates per true predictor
# (NaN when fewer than two nonzero estimates exist).
iht_non_zero_std = map(std, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.00784598147020808 
   0.014126238556842793
   0.014132379063096669
   0.02780853565768039 
   0.004841897434712639

In [12]:
# Sample standard deviation of the lasso estimates per true predictor
# (NaN when fewer than two nonzero estimates exist).
lasso_non_zero_std = map(std, lasso_non_zero)

6-element Array{Float64,1}:
 NaN                    
   0.016000835734748604 
   0.12447576093152118  
   0.0561727424552273   
   0.23598985219623977  
   0.0030485918616114764

In [13]:
# Sample standard deviation of the marginal estimates per true predictor
# (NaN when fewer than two nonzero estimates exist).
marginal_non_zero_std = map(std, marginal_non_zero)

6-element Array{Float64,1}:
 NaN                  
   0.06850217631592975
   0.20157064520457768
   0.10067925912184575
   0.4104671630625075 
 NaN                  

In [14]:
# Summary table with one row per true predictor. Rows follow the order of
# correct_position (position in true_b after shuffling), not effect-size order.
# Each method contributes the mean and standard deviation of its nonzero estimates;
# entries are NaN where a method produced no nonzero estimate for that predictor.
final = DataFrame(
    true_β = true_b[correct_position],
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std, 
    β_lasso_mean = lasso_non_zero_mean,
    β_lasso_std = lasso_non_zero_std,
    β_marginal_mean = marginal_non_zero_mean,
    β_marginal_std = marginal_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_lasso_mean,β_lasso_std,β_marginal_mean,β_marginal_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.01,,,,,,
2,0.05,0.0585867,0.00784598,0.0169185,0.0160008,0.155994,0.0685022
3,0.25,0.245557,0.0141262,0.231907,0.124476,0.472516,0.201571
4,0.1,0.0982742,0.0141324,0.0673202,0.0561727,0.200955,0.100679
5,0.5,0.493378,0.0278085,0.55658,0.23599,0.989291,0.410467
6,0.03,0.0505872,0.0048419,0.00347342,0.00304859,,
