# Negative Binomial - Coefficient estimates for IHT

No lasso for comparison because `glmnet` does not implement negative binomial regression.

In [1]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using DelimitedFiles
using GLM
using RCall
R"library(glmnet)"

│ Loading required package: foreach
│ Loaded glmnet 2.0-16
│ 
└ @ RCall /u/home/b/biona001/.julia/packages/RCall/ffM0W/src/io.jl:113


RObject{StrSxp}
 [1] "glmnet"    "foreach"   "Matrix"    "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     


In [2]:
"""
    run_repeat(n, p, repeats, z, true_b, cor_pos, d, l; r = 10, α = 1)

Fit IHT on `repeats` simulated data sets that share the same true effect
vector `true_b` and covariates `z`, but use a freshly simulated genotype matrix
and phenotype vector each time.

# Arguments
- `n`, `p`: number of samples (rows) and SNPs (columns) to simulate.
- `repeats`: number of simulated data sets to fit.
- `z`: matrix of non-genetic covariates, passed unchanged to `L0_reg`.
- `true_b`: true effect sizes; multiplied with the `n × p` genotype matrix.
- `cor_pos`: indices of the true (nonzero) predictors in `true_b`.
- `d`: response distribution *type* (`Normal`, `Poisson`, `Bernoulli`,
  `NegativeBinomial` or `Gamma`); it is instantiated as `d()` for `L0_reg`.
- `l`: GLM link; its inverse maps the linear predictor to the mean.

# Keywords
- `r`: first parameter of the negative binomial (number of successes).
- `α`: shape parameter of the gamma distribution.

# Returns
A `length(cor_pos) × repeats` matrix whose column `i` holds the IHT estimates
at `cor_pos` for the `i`-th simulated data set.

# Throws
- `ArgumentError` if `d` is not one of the supported distribution types.
"""
function run_repeat(
    n        :: Int,
    p        :: Int,
    repeats  :: Int,
    z        :: AbstractMatrix,
    true_b   :: Vector,
    cor_pos  :: Vector, #correct position of the true model
    d        :: UnionAll,
    l        :: GLM.Link;
    r        :: Real = 10,
    α        :: Real = 1
    )

    k = size(cor_pos, 1)
    IHT_estimated_β = zeros(k, repeats)

    for i in 1:repeats
        # simulate SNP data: each SNP gets a minor allele frequency drawn from
        # Uniform(0.05, 0.5) and genotypes drawn from Binomial(2, maf)
        mafs = rand(Uniform(0.05, 0.5), p)
        x = zeros(Float32, n, p)
        for j in 1:p
            dist = Binomial(2, mafs[j])
            for ii in 1:n
                x[ii, j] = Float32(rand(dist))
            end
        end
        standardize!(x)

        # simulate phenotypes (e.g. vector y) with mean μ = g⁻¹(x * true_b)
        μ = GLM.linkinv.(l, x * true_b)
        if d == Normal || d == Poisson || d == Bernoulli
            # bound the mean to [-20, 20] before sampling
            clamp!(μ, -20, 20)
            y = [rand(d(μi)) for μi in μ]
        elseif d == NegativeBinomial
            # success probability chosen so that the mean r(1 - q)/q equals μ
            prob = 1 ./ (1 .+ μ ./ r)
            y = [rand(d(r, q)) for q in prob] #number of failures before r successes occur
        elseif d == Gamma
            # Distributions.jl uses Gamma(shape, scale) with mean shape * scale,
            # so a scale of μ / α gives a response with mean μ
            y = [rand(d(α, μi / α)) for μi in μ]
        else
            throw(ArgumentError("unsupported response distribution: $d"))
        end
        y = Float32.(y)

        #compute results for data
        # NOTE(review): the literal 1 and k are presumably the number of groups
        # and the sparsity level expected by L0_reg; confirm against MendelIHT.
        IHT_result = L0_reg(x, z, y, 1, k, d(), l)

        #store the correct position in estimated model
        IHT_estimated_β[:, i] .= IHT_result.beta[cor_pos]
    end

    return IHT_estimated_β
end

run_repeat (generic function with 1 method)

## Run 100 models

Here $\beta_{true}$ is the same in all models, but the design matrix and phenotype are simulated anew for each model. When a lasso comparison is available, we first cross validate for the tuning parameter $\lambda$ before running the standard lasso; as noted above, that step does not apply here because `glmnet` does not implement negative binomial regression. We compute the average and standard deviation of the IHT estimates for correctly identified predictors (i.e. estimated $\beta_i \neq 0$). Note that here we are not comparing false positives or false negatives. 

In [3]:
# Simulation settings: number of replicates, samples and SNPs, plus the
# response distribution type and link handed to run_repeat.
repeats = 100
n, p = 5000, 10000
d, l = NegativeBinomial, LogLink()

# seed the global RNG so the shuffle below and the simulations are reproducible
Random.seed!(2019)

# non-genetic covariates: a single column of ones (intercept)
z = ones(Float32, n, 1)

# true model b: six nonzero effects, moved to random positions by the shuffle
true_b = zeros(p)
true_b[1:6] .= [0.01, 0.03, 0.05, 0.1, 0.25, 0.5]
shuffle!(true_b)
correct_position = findall(!iszero, true_b)

# fit all replicates and time the whole run
@time iht_result = run_repeat(n, p, repeats, z, true_b, correct_position, d, l)

262.146976 seconds (36.96 M allocations: 20.626 GiB, 0.73% gc time)


6×100 Array{Float64,2}:
 0.0        0.0        0.0        …  0.0        0.0        0.0     
 0.0        0.0        0.0801787     0.0590452  0.0561173  0.0     
 0.252424   0.263886   0.26252       0.246974   0.26274    0.261156
 0.0932858  0.0938247  0.092982      0.115657   0.10887    0.105399
 0.0        0.0        0.0           0.0        0.0        0.0     
 0.502401   0.51502    0.517696   …  0.505556   0.491024   0.488696

In [4]:
# Display the estimates returned by run_repeat: one row per position in
# `correct_position`, one column per repeat.
iht_result

6×100 Array{Float64,2}:
 0.0        0.0        0.0        …  0.0        0.0        0.0     
 0.0        0.0        0.0801787     0.0590452  0.0561173  0.0     
 0.252424   0.263886   0.26252       0.246974   0.26274    0.261156
 0.0932858  0.0938247  0.092982      0.115657   0.10887    0.105399
 0.0        0.0        0.0           0.0        0.0        0.0     
 0.502401   0.51502    0.517696   …  0.505556   0.491024   0.488696

In [5]:
# For every true predictor (row of iht_result), keep only the estimates from
# the repeats in which IHT selected it, i.e. the nonzero entries. The row count
# is taken from the matrix itself rather than hard-coded, so this keeps working
# when the number of true predictors changes.
iht_non_zero = [filter(!iszero, iht_result[i, :]) for i in 1:size(iht_result, 1)]

6-element Array{Array{Float64,1},1}:
 [0.0561579, 0.0561131, 0.0603184, 0.0616651, 0.0744082, 0.055671, 0.0558102, 0.0606211, 0.0557753]                                                                                                                          
 [0.0801787, 0.0579106, 0.0810283, 0.0648133, 0.0806855, 0.0578876, 0.0700107, 0.0648967, 0.0549635, 0.0801175  …  0.0602607, 0.057151, 0.057118, 0.053157, 0.0603933, 0.0722108, 0.0552866, 0.0630291, 0.0590452, 0.0561173]
 [0.252424, 0.263886, 0.26252, 0.243379, 0.264464, 0.25534, 0.232409, 0.265255, 0.277067, 0.240343  …  0.251533, 0.258344, 0.252354, 0.210799, 0.226521, 0.265951, 0.256038, 0.246974, 0.26274, 0.261156]                    
 [0.0932858, 0.0938247, 0.092982, 0.0917915, 0.10039, 0.0952722, 0.0718392, 0.0969534, 0.0842929, 0.111284  …  0.10679, 0.0922044, 0.102391, 0.121979, 0.10211, 0.092383, 0.0937393, 0.115657, 0.10887, 0.105399]            
 []                                                                        

In [6]:
# Mean nonzero estimate per true predictor; a predictor with no nonzero
# estimates has an empty vector here and therefore yields NaN.
iht_non_zero_mean = map(mean, iht_non_zero)

6-element Array{Float64,1}:
   0.05961559505926238
   0.0635398430306287 
   0.24841989755630492
   0.10099200211465359
 NaN                  
   0.5021571403741837 

In [7]:
# Standard deviation of the nonzero estimates per true predictor; a predictor
# with no nonzero estimates yields NaN.
iht_non_zero_std = map(std, iht_non_zero)

6-element Array{Float64,1}:
   0.0060574330143880585
   0.007947521576018116 
   0.013819632207345202 
   0.014142077852623813 
 NaN                    
   0.013166054672465843 

In [8]:
# Summary table: one row per true predictor, pairing its true effect size with
# the mean and standard deviation of its nonzero IHT estimates.
final = DataFrame(
    true_β     = getindex(true_b, correct_position),
    β_IHT_mean = iht_non_zero_mean,
    β_IHT_std  = iht_non_zero_std,
)

Unnamed: 0_level_0,true_β,β_IHT_mean,β_IHT_std
Unnamed: 0_level_1,Float64,Float64,Float64
1,0.03,0.0596156,0.00605743
2,0.05,0.0635398,0.00794752
3,0.25,0.24842,0.0138196
4,0.1,0.100992,0.0141421
5,0.01,,
6,0.5,0.502157,0.0131661
