# Logistic - Compare coefficient estimates for IHT and lasso

In [1]:
using MendelIHT
using MendelGWAS
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using CSV
using GLM
using RCall
R"library(glmnet)"

│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /Users/biona001/.julia/packages/RCall/g7dhB/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [2]:
"""
    run_repeat(n, p, repeats, z, true_b, cor_pos, d, l)

Simulate `repeats` independent data sets and, on each one, estimate the effect
sizes of the true predictors with three methods: IHT (`L0_reg`), cross-validated
lasso (R's `glmnet` through RCall) and a marginal per-SNP analysis (MendelGWAS).

# Arguments
- `n`, `p`: number of samples and number of SNPs to simulate.
- `repeats`: number of simulated data sets.
- `z`: non-genetic covariate matrix; it is appended to the genotype matrix for
  lasso and passed to `L0_reg`.
- `true_b`: true effect-size vector used to simulate the phenotype.
- `cor_pos`: positions of the true predictors in `true_b`; its length is also
  used as the sparsity level given to `L0_reg`.
- `d`: response distribution (`Normal`, `Poisson`, `Bernoulli`,
  `NegativeBinomial` or `Gamma`).
- `l`: GLM link function.

# Returns
Three `length(cor_pos) × repeats` matrices holding the IHT, lasso and marginal
estimates at `cor_pos`; column `i` belongs to the `i`-th simulated data set.

# Notes
- The lasso fit (`family='binomial'`) and the MendelGWAS control file
  (`regression = logistic`) are hard-coded for a binary trait, so the lasso and
  marginal columns are only meaningful when `d == Bernoulli`.
- Every iteration overwrites files with the `tmp` prefix (`tmp.bed`,
  `tmp_control.txt`, `tmp_table.txt`, ...) in the current working directory.
"""
function run_repeat(
    n        :: Int,
    p        :: Int,
    repeats  :: Int,
    z        :: AbstractMatrix,
    true_b   :: Vector,
    cor_pos  :: Vector, #correct position of the true model
    d        :: UnionAll,
    l        :: GLM.Link
    )

    k = size(cor_pos, 1)
    IHT_estimated_β = zeros(k, repeats)
    lasso_estimated_β = zeros(k, repeats)
    marginal_estimated_β = zeros(k, repeats)

    for i in 1:repeats
        # simulate SNP data
        x = simulate_random_snparray(n, p, "tmp.bed")
        xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
        x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z] #Float64 version of x

        # simulate phenotypes (e.g. vector y)
        if d == Normal || d == Poisson || d == Bernoulli
            prob = GLM.linkinv.(l, xbm * true_b)
            clamp!(prob, -20, 20)
            y = [rand(d(m)) for m in prob]
        elseif d == NegativeBinomial
            r = 10
            μ = GLM.linkinv.(l, xbm * true_b)
            prob = 1 ./ (1 .+ μ ./ r)
            y = [rand(d(r, q)) for q in prob] #number of failures before r successes occur
        elseif d == Gamma
            # Distributions.Gamma takes (shape, scale) and has mean shape * scale, so the
            # scale is set to μ / shape to give every sample mean μ. The original code
            # used an undefined shape `α` here and passed a rate where a scale is expected.
            # NOTE(review): shape = 1.0 is a placeholder choice — confirm the intended value.
            α = 1.0
            μ = GLM.linkinv.(l, xbm * true_b)
            θ = μ ./ α
            y = [rand(d(α, s)) for s in θ]
        else
            throw(ArgumentError("run_repeat does not know how to simulate phenotypes from $d"))
        end
        y = Float64.(y)

        # compute IHT results, with the sparsity level set to the true model size k
        IHT_result = L0_reg(x, xbm, z, y, 1, k, d(), l)

        # compute lasso results. choose λ via cross validation
        num_folds = 3
        folds = rand(1:num_folds, size(x, 1))
        @rput x_float y folds num_folds #make variables visible to R
        R"lasso_cv_result = cv.glmnet(x_float, y, nfolds = num_folds, foldid = folds, family='binomial')"
        R"lasso_beta_tmp = glmnet(x_float, y, lambda=lasso_cv_result$lambda.min, family='binomial')$beta"
        R"lasso_beta = as.vector(lasso_beta_tmp)"
        @rget lasso_beta #pull only the coefficient vector from R to Julia

        # Now run MendelGWAS (marginal analysis)
        make_bim_fam_files(x, y, "tmp") #create .bim and .fam files for MendelGWAS
        open("tmp_control.txt", "w") do f
            write(f, "plink_input_basename = tmp \n")
            write(f, "plink_field_separator = '\t' \n")
            write(f, "output_table = tmp_table.txt \n")
            write(f, "regression = logistic \n")
            write(f, "affected_designator = 1.0 \n")
            write(f, "regression_formula = Trait ~ 1\n")
        end
        GWAS("tmp_control.txt")
        result = CSV.read("tmp_table.txt")

        # store the estimates at the true predictors' positions
        IHT_estimated_β[:, i] .= IHT_result.beta[cor_pos]
        lasso_estimated_β[:, i] .= lasso_beta[cor_pos]
        # presumably column 6 of the MendelGWAS table is the effect estimate — TODO confirm
        marginal_estimated_β[:, i] .= result[cor_pos, 6]
    end

    return IHT_estimated_β, lasso_estimated_β, marginal_estimated_β
end

run_repeat (generic function with 1 method)

## Run 100 models

Here $\beta_{true}$ is the same in all models, but the design matrix and phenotype are re-simulated for each model. To run lasso, we first cross-validate the tuning parameter $\lambda$ and then refit the standard lasso at the selected value. For each true predictor, we compute the mean and standard deviation of its estimate over the runs in which it was selected (i.e. estimated $\beta_i \neq 0$). Note that here we are not comparing false positives or false negatives.

In [3]:
# simulation size: number of samples, number of SNPs, number of replicates
n, p = 5000, 10000
repeats = 100

# binary trait with its canonical link
d = Bernoulli
l = canonicallink(d())

# the only non-genetic covariate is the intercept
z = ones(n, 1)

# fix the seed before any random draw so the run is reproducible
Random.seed!(2020)

# true model: six non-zero effects placed at random positions
true_b = zeros(p)
true_b[1:6] .= [0.01, 0.03, 0.05, 0.1, 0.25, 0.5]
shuffle!(true_b)
correct_position = findall(!iszero, true_b)

# run all replicates and time them
@time iht_result, lasso_result, marginal_result = run_repeat(n, p, repeats, z, true_b, correct_position, d, l)

[31mDid not converge after 100 iterations! The run time for IHT was 22.381141901016235 seconds[39m
 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 1.0
  control_file = tmp_control.txt
  output_table = tmp_table.txt
  pedigree_file = tmp.fam
  plink_field_separator = 	
  plink_input_basename = tmp
  regression = logistic
  regression_formula = Trait ~ 1
  snpdata_file = tmp.bed
  snpdefinition_file = tmp.bim
 
 
Analyzing the data.

 
Table of p-values for all SNPs is in file: tmp_table.txt
 
 
Mendel's analysis is finished.

 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 1.0
  control_file = tmp_control.txt
  output_table = tmp

([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.5751576419229085 0.5216087921144836 … 0.4888138331943573 0.509844242622916; 0.0 0.0 … 0.0 0.0], [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.14439694749327234 0.3357317306237613 … 0.36488718786806595 0.36578165453208805; 0.0 0.0 … 0.0 0.0], [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 2.0166273288181955 … 1.1069112514153625 0.8667065331716317; 0.0 0.0 … 0.0 0.0])

In [4]:
# IHT estimates at the true predictors' positions: one row per predictor, one column per run
iht_result

6×100 Array{Float64,2}:
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     
 0.0       0.0       0.0       0.0          0.0       0.0       0.0     
 0.252708  0.274895  0.277593  0.252662     0.295073  0.25336   0.276485
 0.0       0.137119  0.105452  0.0          0.149888  0.121019  0.107666
 0.575158  0.521609  0.542869  0.509489     0.42843   0.488814  0.509844
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     

In [5]:
# lasso estimates at the true predictors' positions: one row per predictor, one column per run
lasso_result

6×100 Array{Float64,2}:
 0.0       0.0        0.0       0.0       …  0.0        0.0        0.0     
 0.0       0.0        0.0       0.0          0.0        0.0        0.0     
 0.134702  0.157558   0.14814   0.144922     0.178309   0.131738   0.147157
 0.0       0.0220735  0.0       0.0          0.0447092  0.0219265  0.0     
 0.144397  0.335732   0.405305  0.380671     0.296971   0.364887   0.365782
 0.0       0.0        0.0       0.0       …  0.0        0.0        0.0     

In [6]:
# marginal (MendelGWAS) estimates at the true predictors' positions: one row per predictor, one column per run
marginal_result

6×100 Array{Float64,2}:
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     
 0.0       0.0       0.0       0.0          0.0       0.0       0.0     
 0.356763  0.443228  0.406038  0.353152     0.573303  0.907875  0.360882
 0.0       0.0       0.0       0.0          0.0       0.0       0.0     
 0.0       2.01663   0.742139  1.21516      1.00569   1.10691   0.866707
 0.0       0.0       0.0       0.0       …  0.0       0.0       0.0     

In [7]:
# For every true predictor (row), keep only the runs in which the method gave it a
# non-zero estimate. The rows come from each matrix's own size instead of a literal 6,
# so this keeps working if the number of true predictors changes.
iht_non_zero = [filter(!iszero, iht_result[i, :]) for i in axes(iht_result, 1)]
lasso_non_zero = [filter(!iszero, lasso_result[i, :]) for i in axes(lasso_result, 1)]
marginal_non_zero = [filter(!iszero, marginal_result[i, :]) for i in axes(marginal_result, 1)];

In [8]:
# mean IHT estimate per predictor over the runs that selected it (NaN when no run did)
iht_non_zero_mean = map(mean, iht_non_zero)

6-element Array{Float64,1}:
 NaN                  
   0.10801399841755437
   0.25596668187280786
   0.12542033546352405
   0.5077260696365136 
   0.10568050030965656

In [9]:
# mean lasso estimate per predictor over the runs that selected it (NaN when no run did)
lasso_non_zero_mean = map(mean, lasso_non_zero)

6-element Array{Float64,1}:
 NaN                    
   0.0077862127022293864
   0.13746785079123336  
   0.02202542528239795  
   0.3657946811409184   
 NaN                    

In [10]:
# mean marginal estimate per predictor over the runs with a non-zero value (NaN when there is none)
marginal_non_zero_mean = map(mean, marginal_non_zero)

6-element Array{Float64,1}:
 NaN                  
 NaN                  
   0.47982753149121943
 NaN                  
   0.9836942749912515 
 NaN                  

In [11]:
# spread of the non-zero IHT estimates per predictor (NaN when fewer than two runs selected it)
iht_non_zero_std = map(std, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.005695526527512667
   0.03826153382696935 
   0.016393578922280962
   0.03887425672919938 
 NaN                   

In [12]:
# spread of the non-zero lasso estimates per predictor (NaN when fewer than two runs selected it)
lasso_non_zero_std = map(std, lasso_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.003342709394959299
   0.032180917849519854
   0.01600352762533589 
   0.05794907765655277 
 NaN                   

In [13]:
# spread of the non-zero marginal estimates per predictor (NaN when fewer than two are non-zero)
marginal_non_zero_std = map(std, marginal_non_zero)

6-element Array{Float64,1}:
 NaN                  
 NaN                  
   0.2185114058827111 
 NaN                  
   0.47482025705724096
 NaN                  

In [14]:
# Summary table: one row per true predictor, listed in the order of `correct_position`.
# Each mean/std column is computed over the non-zero estimates only, so an entry is NaN
# when the method produced too few non-zero estimates for that predictor.
final = DataFrame(
    true_β = true_b[correct_position],
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std, 
    β_lasso_mean = lasso_non_zero_mean,
    β_lasso_std = lasso_non_zero_std,
    β_marginal_mean = marginal_non_zero_mean,
    β_marginal_std = marginal_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_lasso_mean,β_lasso_std,β_marginal_mean,β_marginal_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.01,,,,,,
2,0.05,0.108014,0.00569553,0.00778621,0.00334271,,
3,0.25,0.255967,0.0382615,0.137468,0.0321809,0.479828,0.218511
4,0.1,0.12542,0.0163936,0.0220254,0.0160035,,
5,0.5,0.507726,0.0388743,0.365795,0.0579491,0.983694,0.47482
6,0.03,0.105681,,,,,
