# Negative Binomial: Compute false positives/negatives

Currently `glmnet` does not support negative binomial regression. Therefore, we simply compute the number of false positives/negatives for the negative binomial model, using the same cross-validation setting as for the Normal/Bernoulli/Poisson models.

In [1]:
# Start 30 extra worker processes (presumably so that `cv_iht`, which is called with
# `parallel=true` further down, can spread its work across them).
using Distributed
addprocs(30)
nprocs() # total number of processes; the recorded output is 31

31

In [2]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using Random
using LinearAlgebra
using DelimitedFiles
using GLM

In [25]:
"""
    iht_negativebinomial(n, p, d, l)

Simulate one data set with `n` samples and `p` SNPs, generate a response from
distribution `d` with link `l` using 10 true predictors (effect sizes 0.1, 0.2, …, 1.0 at
random positions), choose the sparsity level by 3-fold cross-validated IHT over
`k = 1, …, 50`, refit at the chosen level, and report how well the true predictors were
recovered.

# Arguments
- `n::Int64`: number of samples.
- `p::Int64`: number of SNPs.
- `d::UnionAll`: distribution constructor; `Normal`, `Poisson`, `Bernoulli`,
  `NegativeBinomial` and `Gamma` are handled.
- `l::Link`: link function used both to simulate the response and to fit the model.

# Returns
The tuple `(iht_tp, iht_fp, iht_fn)`: the number of true positives, false positives and
false negatives of the refitted IHT model.

# Throws
- `ArgumentError` if `d` is not one of the handled distributions.
"""
function iht_negativebinomial(n::Int64, p::Int64, d::UnionAll, l::Link)
    # construct snpmatrix, covariate files, and true model b
    x = simulate_random_snparray(n, p, undef)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    z = ones(n, 1) # the intercept

    # true model b: k non-zero effects shuffled into random positions
    k = 10
    true_b = zeros(p)
    true_b[1:k] .= 0.1:0.1:1.0
    shuffle!(true_b)
    correct_position = findall(!iszero, true_b)

    # simulate phenotypes (e.g. vector y)
    if d == Normal || d == Poisson || d == Bernoulli
        prob = linkinv.(l, xbm * true_b)
        clamp!(prob, -20, 20)
        y = [rand(d(i)) for i in prob]
    elseif d == NegativeBinomial
        nn = 10
        μ = linkinv.(l, xbm * true_b)
        clamp!(μ, -20, 20)
        prob = 1 ./ (1 .+ μ ./ nn)
        y = [rand(d(nn, i)) for i in prob] # number of failures before nn successes occur
    elseif d == Gamma
        # The shape parameter was previously an undefined variable. It is fixed to 1 here
        # (an arbitrary choice; adjust if a different dispersion is wanted).
        α = 1.0
        μ = linkinv.(l, xbm * true_b)
        # Distributions.jl parameterizes Gamma by (shape, scale); scale = μ / α gives mean μ.
        θ = μ ./ α
        y = [rand(d(α, i)) for i in θ]
    else
        throw(ArgumentError("unsupported distribution $d"))
    end
    y = Float64.(y)

    # specify path and folds
    num_folds = 3
    folds = rand(1:num_folds, size(x, 1))

    # candidate sparsity levels tested by cross validation
    path = collect(1:50)

    # run IHT's cross validation routine
    mses = cv_iht(
        d(), l, x, z, y, 1, path, num_folds;
        folds=folds, use_maf=false, debias=false, showinfo=false, parallel=true,
    )

    # argmin returns an index into `path`; translate it into the sparsity level itself
    iht_k_est = path[argmin(mses)]
    iht_result = L0_reg(
        x, xbm, z, y, 1, iht_k_est, d(), l;
        debias=false, init=false, use_maf=false,
    )
    iht_beta = iht_result.beta

    # show IHT's reconstruction result at the true predictor positions
    compare_model = DataFrame(
        true_β  = true_b[correct_position],
        IHT_β   = iht_beta[correct_position])
    @show compare_model

    # compute true/false positives/negatives for IHT; false positives are counted from
    # the estimated support itself rather than inferred from the sparsity level
    iht_tp = count(!iszero, iht_beta[correct_position])
    iht_fp = count(!iszero, iht_beta) - iht_tp
    iht_fn = k - iht_tp

    println("IHT true positives = $iht_tp")
    println("IHT false positives = $iht_fp")
    println("IHT false negatives = $iht_fn" * "\n")

    return iht_tp, iht_fp, iht_fn
end

iht_negativebinomial (generic function with 2 methods)

In [26]:
"""
    run()

Repeat the negative binomial simulation 50 times (1000 samples, 10000 SNPs, log link)
under a fixed random seed.

Returns three vectors holding, for every repetition, the number of IHT true positives,
false positives and false negatives.
"""
function run()
    # seed once up front so the whole experiment is reproducible
    Random.seed!(2019)

    # simulation settings: problem size, response distribution and link
    nsamples, nsnps = 1000, 10000
    dist, link = NegativeBinomial, LogLink()

    # one slot per repetition for each of the three reported counts
    total_runs = 50
    tp_counts = zeros(total_runs)
    fp_counts = zeros(total_runs)
    fn_counts = zeros(total_runs)

    for rep in 1:total_runs
        println("current run = $rep")
        tp_counts[rep], fp_counts[rep], fn_counts[rep] =
            iht_negativebinomial(nsamples, nsnps, dist, link)
    end

    return tp_counts, fp_counts, fn_counts
end

run (generic function with 1 method)

In [27]:
# Run the whole simulation study; each returned vector has one entry per repetition.
iht_true_positives, iht_false_positives, iht_false_negatives = run()

current run = 1
compare_model = 10×2 DataFrame
│ Row │ true_β  │ IHT_β    │
│     │ Float64 │ Float64  │
├─────┼─────────┼──────────┤
│ 1   │ 0.3     │ 0.292211 │
│ 2   │ 0.1     │ 0.0      │
│ 3   │ 0.5     │ 0.423205 │
│ 4   │ 0.7     │ 0.628858 │
│ 5   │ 0.4     │ 0.370041 │
│ 6   │ 1.0     │ 0.922779 │
│ 7   │ 0.6     │ 0.510471 │
│ 8   │ 0.8     │ 0.682268 │
│ 9   │ 0.9     │ 0.854823 │
│ 10  │ 0.2     │ 0.160754 │
IHT true positives = 9
IHT false positives = 0
IHT false negatives = 1

current run = 2
compare_model = 10×2 DataFrame
│ Row │ true_β  │ IHT_β    │
│     │ Float64 │ Float64  │
├─────┼─────────┼──────────┤
│ 1   │ 0.6     │ 0.517774 │
│ 2   │ 0.8     │ 0.686017 │
│ 3   │ 0.3     │ 0.267898 │
│ 4   │ 0.1     │ 0.0      │
│ 5   │ 1.0     │ 0.884469 │
│ 6   │ 0.5     │ 0.444435 │
│ 7   │ 0.4     │ 0.344769 │
│ 8   │ 0.7     │ 0.60465  │
│ 9   │ 0.9     │ 0.793706 │
│ 10  │ 0.2     │ 0.178217 │
IHT true positives = 9
IHT false positives = 1
IHT false negatives = 1

current 

([9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0  …  9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 8.0, 8.0], [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0])

In [28]:
# Average each count over the simulation runs. The divisor is the number of recorded
# runs rather than a hard-coded 50, so the averages stay correct if the number of
# repetitions in `run()` is changed.
nb_iht_true_positives = sum(iht_true_positives) / length(iht_true_positives)
nb_iht_false_positives = sum(iht_false_positives) / length(iht_false_positives)
nb_iht_false_negatives = sum(iht_false_negatives) / length(iht_false_negatives)
result = [nb_iht_true_positives; nb_iht_false_positives; nb_iht_false_negatives]

3-element Array{Float64,1}:
 8.88
 0.22
 1.12