# Negative Binomial - Coefficient estimates for IHT

No lasso for comparison because `glmnet` does not implement negative binomial regression.

In [1]:
using MendelIHT
using MendelGWAS
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using CSV
using GLM
using RCall
R"library(glmnet)"

│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /Users/biona001/.julia/packages/RCall/g7dhB/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [2]:
# Run `repeats` independent simulations that all share the same true effect vector
# `true_b`. Every repeat simulates a NEW SNP matrix and a NEW phenotype vector, fits the
# model with IHT (`L0_reg`) and with the MendelGWAS marginal analysis, and records the
# estimates at the true-signal positions `cor_pos`.
#
# Arguments
#   n, p     : number of samples and number of SNPs to simulate
#   repeats  : number of simulated data sets
#   z        : matrix of non-genetic covariates passed to `L0_reg` (e.g. an intercept column)
#   true_b   : true effect vector used to simulate the phenotype
#   cor_pos  : indices of the true model inside `true_b`
#   d, l     : response distribution (as a type, e.g. `NegativeBinomial`) and GLM link
#
# Keywords
#   r        : number of successes of the negative binomial response (default 10)
#   α        : shape parameter of the gamma response (default 1.0)
#
# Returns two `length(cor_pos) × repeats` matrices: (IHT estimates, marginal estimates).
# Throws an `ArgumentError` if `d` is not one of the supported distributions.
function run_repeat(
    n        :: Int,
    p        :: Int,
    repeats  :: Int,
    z        :: AbstractMatrix,
    true_b   :: Vector,
    cor_pos  :: Vector, #correct position of the true model
    d        :: UnionAll,
    l        :: GLM.Link;
    r        :: Real = 10,
    α        :: Real = 1.0
    )

    k = size(cor_pos, 1)
    IHT_estimated_β = zeros(k, repeats)
    marginal_estimated_β = zeros(k, repeats)

    for i in 1:repeats
        # simulate SNP data
        x = simulate_random_snparray(n, p, "tmp.bed")
        xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)

        # simulate phenotypes (e.g. vector y) from the mean implied by the link
        μ = GLM.linkinv.(l, xbm * true_b)
        if d == Normal || d == Poisson || d == Bernoulli
            clamp!(μ, -20, 20)
            y = [rand(d(m)) for m in μ]
        elseif d == NegativeBinomial
            # success probability chosen so that the mean r(1 - prob)/prob equals μ
            prob = 1 ./ (1 .+ μ ./ r)
            y = [rand(d(r, q)) for q in prob] #number of failures before r successes occur
        elseif d == Gamma
            # Distributions.jl parameterizes Gamma by (shape, scale) with mean shape * scale,
            # so the scale giving mean μ is μ / α (not the rate 1 / μ).
            θ = μ ./ α
            y = [rand(d(α, t)) for t in θ]
        else
            throw(ArgumentError("unsupported distribution $d; expected Normal, Poisson, " *
                                "Bernoulli, NegativeBinomial or Gamma"))
        end
        y = Float64.(y)

        #compute results for data
        IHT_result = L0_reg(x, xbm, z, y, 1, k, d(), l)

        # Now run MendelGWAS (marginal analysis)
        # NOTE(review): the control file below always requests NegativeBinomial() with
        # LogLink(), whatever `d` and `l` are, so the marginal estimates are only
        # comparable to IHT when d == NegativeBinomial and l is a LogLink.
        make_bim_fam_files(x, y, "tmp") #create .bim and .fam files for MendelGWAS
        open("tmp_control.txt", "w") do f
            write(f, "plink_input_basename = tmp \n")
            write(f, "plink_field_separator = '\t' \n")
            write(f, "output_table = tmp_table.txt \n")
            write(f, "distribution = NegativeBinomial() \n")
            write(f, "link = LogLink() \n")
            write(f, "regression_formula = Trait ~ 1\n")
        end
        GWAS("tmp_control.txt")
        result = CSV.read("tmp_table.txt")

        #store the correct position in estimated model
        IHT_estimated_β[:, i] .= IHT_result.beta[cor_pos]
        # presumably column 6 of the MendelGWAS table holds the effect estimate — confirm
        marginal_estimated_β[:, i] .= result[cor_pos, 6]
    end

    return IHT_estimated_β, marginal_estimated_β
end

run_repeat (generic function with 1 method)

## Run 100 models

Here $\beta_{true}$ is the same in all models, but the design matrix and phenotype are different in each one. (When lasso is available, we first cross-validate for the tuning parameter $\lambda$ before running the standard lasso; as noted above, no lasso is run for the negative binomial model, so IHT is compared against the marginal analysis from MendelGWAS instead.) We compute the average and standard error of the estimates for correctly identified predictors (i.e. estimated $\beta_i \neq 0$). Note that here we are not comparing false positives or false negatives.

In [3]:
# Simulation settings: sample size, number of SNPs, number of simulated data sets,
# and the response model (negative binomial with a log link).
n, p = 5000, 10000
repeats = 100
d = NegativeBinomial
l = LogLink()

# Fix the seed so the shuffled effect positions and the simulations are reproducible.
Random.seed!(2020)

# Non-genetic covariates: a single intercept column of ones.
z = ones(n, 1)

# True model: six non-zero effects, placed at random positions by shuffling.
true_b = zeros(p)
copyto!(true_b, [0.01, 0.03, 0.05, 0.1, 0.25, 0.5])
shuffle!(true_b)
correct_position = findall(!iszero, true_b)

# Run all repeats and time them.
@time iht_result, marginal_result = run_repeat(
    n, p, repeats, z, true_b, correct_position, d, l)

 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 2
  control_file = tmp_control.txt
  distribution = NegativeBinomial{Float64}(r=1.0, p=0.5)
  link = LogLink()
  output_table = tmp_table.txt
  pedigree_file = tmp.fam
  plink_field_separator = 	
  plink_input_basename = tmp
  regression_formula = Trait ~ 1
  snpdata_file = tmp.bed
  snpdefinition_file = tmp.bim
 
 
Analyzing the data.

 
Table of p-values for all SNPs is in file: tmp_table.txt
 
 
Mendel's analysis is finished.

 
 
     Welcome to OpenMendel's
      GWAS analysis option
 
 
Reading the data.

The current working directory is "/Users/biona001/.julia/dev/MendelIHT/figures/repeats".

Keywords modified by the user:

  affected_designator = 2
  control_file = tmp_control.txt
  distribution = NegativeBinomial{Float64}(r=1.0, p=0.5)
  link = LogLink

([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.054208906048724956 0.053542780032999136; … ; 0.501560381623395 0.4943985147322125 … 0.5033216076362838 0.4949244653862425; 0.0 0.0 … 0.0 0.0], [0.08072566083817725 0.017314085522983726 … 0.034580905085558465 0.06611536327191805; -0.10544437079767921 0.10772485328192222 … 0.1153626694580581 -0.0028595410258840377; … ; 0.0 0.8350451702291622 … 0.7835441053895238 1.7980250978708567; 0.13399682904550747 0.08341654748812607 … 0.08196554960843991 0.09743104271044191])

In [4]:
# Show the IHT estimates: one row per true predictor, one column per repeat.
iht_result

6×100 Array{Float64,2}:
 0.0        0.0       0.0        …  0.0        0.0        0.0      
 0.0        0.0       0.0           0.0        0.0542089  0.0535428
 0.264137   0.256071  0.233858      0.255637   0.266222   0.256354 
 0.0873791  0.104394  0.0976695     0.0788668  0.0989841  0.110348 
 0.50156    0.494399  0.506778      0.51698    0.503322   0.494924 
 0.0        0.0       0.0        …  0.0        0.0        0.0      

In [5]:
# Show the marginal (MendelGWAS) estimates: one row per true predictor, one column per repeat.
marginal_result

6×100 Array{Float64,2}:
  0.0807257  0.0173141  0.0166791  …  0.0231365  0.0345809   0.0661154 
 -0.105444   0.107725   0.0625898     0.0446973  0.115363   -0.00285954
  0.405028   0.381679   0.356194      0.393092   0.430991    0.457189  
  0.0618348  0.161726   0.289639      0.151644   0.134282    0.223707  
  0.0        0.835045   0.874946      0.819226   0.783544    1.79803   
  0.133997   0.0834165  0.0850283  …  0.0620596  0.0819655   0.097431  

In [6]:
# For every true predictor (row), keep only the repeats whose estimate is non-zero.
# Rows are enumerated from the matrices themselves instead of a hard-coded 1:6, so the
# cell keeps working when the number of true predictors changes.
iht_non_zero = [filter(!iszero, iht_result[i, :]) for i in axes(iht_result, 1)]
marginal_non_zero = [
    filter(!iszero, marginal_result[i, :]) for i in axes(marginal_result, 1)
];

In [7]:
# Per-predictor mean of the non-zero IHT estimates; a predictor that was never selected
# has no estimates to average and yields NaN.
iht_non_zero_mean = map(mean, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.05983187691244681 
   0.24929115618169934 
   0.09852235467941588 
   0.5672552137783342  
   0.056034642832413294

In [8]:
# Per-predictor mean of the non-zero marginal estimates.
marginal_non_zero_mean = map(mean, marginal_non_zero)

6-element Array{Float64,1}:
 0.014344869539801876
 0.09725081747825516 
 0.4861505939215967  
 0.1895157682647415  
 0.9303986553614522  
 0.04836393794673187 

In [9]:
# Per-predictor standard deviation of the non-zero IHT estimates; NaN when a predictor
# was never selected.
iht_non_zero_std = map(std, iht_non_zero)

6-element Array{Float64,1}:
 NaN                   
   0.007852035645802809
   0.01172805511473952 
   0.012296725415838912
   0.6699456791413129  
   0.005596461286501224

In [10]:
# Per-predictor standard deviation of the non-zero marginal estimates.
marginal_non_zero_std = map(std, marginal_non_zero)

6-element Array{Float64,1}:
 0.047723351743795495
 0.06054788292790077 
 0.1782153099389692  
 0.08980522056488444 
 0.3150844676182244  
 0.12265231567413745 

In [11]:
# Summary table: true effect size next to the mean and standard deviation of the
# non-zero IHT and marginal estimates, one row per true predictor.
final = DataFrame(
    true_β = true_b[correct_position],
    β_IHT_mean = iht_non_zero_mean, 
    β_IHT_std = iht_non_zero_std,
    β_marginal_mean = marginal_non_zero_mean, 
    β_marginal_std = marginal_non_zero_std)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std,β_marginal_mean,β_marginal_std
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64
1,0.01,,,0.0143449,0.0477234
2,0.05,0.0598319,0.00785204,0.0972508,0.0605479
3,0.25,0.249291,0.0117281,0.486151,0.178215
4,0.1,0.0985224,0.0122967,0.189516,0.0898052
5,0.5,0.567255,0.669946,0.930399,0.315084
6,0.03,0.0560346,0.00559646,0.0483639,0.122652
