# There are many parameters to tune in Poisson
+ Should one perform debiasing?
+ Ken's or Kevin's convergence criterion?
+ Should one scale the SnpBitMatrix?
+ How many backtracking steps by default?

To answer these questions, we run simulation experiments over sample sizes $n \in \{500, 1000, \ldots, 5000\}$, each with $p = 10000$ SNPs.

In [1]:
using IHT
using SnpArrays
using DataFrames
using Distributions
using StatsFuns: logistic
using Random
using LinearAlgebra
using Distributions

┌ Info: Recompiling stale cache file /Users/biona001/.julia/compiled/v0.7/IHT/eaqWB.ji for IHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1185
│ - If you have IHT checked out for development and have
│   added SnpArrays as a dependency but haven't updated your primary
│   environment's manifest file, try `Pkg.resolve()`.
│ - Otherwise you may need to report an issue with IHT
┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /Users/biona001/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228


In [2]:
"""
    run_test(n, p; debias=false, convergence=false, scale=false, max_step=3)

Simulate a Poisson phenotype for `n` samples and `p` SNPs, fit it with
`L0_poisson_reg`, and summarise the fit.

The true model has 10 non-zero effects drawn with `randn` and shuffled to random
positions. The random seed is reset to 1111 on every call.

# Keywords
- `debias`: forwarded to `L0_poisson_reg`.
- `convergence`: forwarded to `L0_poisson_reg`; `true` = Kevin's method,
  `false` = Ken's method.
- `scale`: forwarded to `L0_poisson_reg`.
- `max_step`: forwarded to `L0_poisson_reg`.

# Returns
The tuple `(result.time, result.iter, supp_recovered)`, where `supp_recovered` is the
number of truly non-zero positions whose estimated coefficient is also non-zero.
"""
function run_test(
    n::Int64,
    p::Int64;
    debias::Bool = false,
    convergence::Bool = false,
    scale::Bool = false,
    max_step::Int = 3
)
    # Reset the seed first so that every call starts from the same random stream.
    Random.seed!(1111)

    # Model and simulation parameters.
    k = 10                                  # number of non-zero effects
    snp_rates = 0.5 .* rand(p)
    x = simulate_random_snparray(n, p, snp_rates)
    xbm = SnpBitMatrix{Float64}(x; model=ADDITIVE_MODEL, center=true, scale=true)

    # Non-genetic covariates (intercept only) and the sparse true effect vector.
    z = ones(n, 1)
    beta_true = zeros(p)
    beta_true[1:k] = randn(k)
    shuffle!(beta_true)
    causal_idx = findall(!iszero, beta_true)   # positions of the true signals

    # Poisson phenotype: the mean is the exponential of the linear predictor
    # (inverse log link); counts are stored as Float64.
    linear_predictor = xbm * beta_true
    mean_counts = exp.(linear_predictor)
    y = Float64[rand(Poisson(μ)) for μ in mean_counts]

    # Fit the model.
    result = L0_poisson_reg(x, z, y, 1, k, glm="poisson", debias=debias, scale=scale, convergence=convergence,max_step=max_step, show_info=false)

    # Count how many of the true signals received a non-zero estimate.
    supp_recovered = count(j -> !iszero(result.beta[j]), causal_idx)

    return (result.time, result.iter, supp_recovered)
end

run_test (generic function with 1 method)

## Should one debias?

In [3]:
"""
    test_poisson_debias()

Run `run_test` with and without debiasing for each sample size in `500:500:5000`
(with `p = 10000`, Ken's convergence criterion, no scaling, `max_step = 3`).

Returns a `DataFrame` with one row per sample size holding the time, iteration count
and number of recovered true signals of both runs.
"""
function test_poisson_debias()
    sample_sizes = 500:500:5000
    nn = length(sample_sizes)
    p = 10000

    debias_df = DataFrame(
        time_debias = zeros(Float64, nn),
        time_no_debias = zeros(Float64, nn),
        iter_debias = zeros(Int, nn),
        iter_no_debias = zeros(Int, nn),
        support_recovered_debias = zeros(Int, nn),
        support_recovered_no_debias = zeros(Int, nn),
    )

    for (row, n) in enumerate(sample_sizes)
        time_on, iter_on, supp_on = run_test(
            n, p; debias=true, convergence=false, scale=false, max_step=3
        )
        time_off, iter_off, supp_off = run_test(
            n, p; debias=false, convergence=false, scale=false, max_step=3
        )

        debias_df[row, :time_debias] = time_on
        debias_df[row, :time_no_debias] = time_off
        debias_df[row, :iter_debias] = iter_on
        debias_df[row, :iter_no_debias] = iter_off
        debias_df[row, :support_recovered_debias] = supp_on
        debias_df[row, :support_recovered_no_debias] = supp_off
    end

    return debias_df
end

test_poisson_debias (generic function with 1 method)

In [4]:
# Run the debiasing comparison and keep the resulting table.
should_I_debias = test_poisson_debias()

Unnamed: 0_level_0,time_debias,time_no_debias,iter_debias,iter_no_debias,support_recovered_debias,support_recovered_no_debias
Unnamed: 0_level_1,Float64,Float64,Int64,Int64,Int64,Int64
1,24.1622,23.5209,861,861,4,4
2,19.0641,18.6802,351,351,2,2
3,24.0687,23.0432,269,269,8,8
4,24.8863,24.4421,221,221,10,10
5,6.97411,8.96395,51,67,8,8
6,107.087,106.209,685,685,3,3
7,22.1921,21.9222,120,120,10,10
8,33.8737,33.1712,162,162,9,9
9,25.1572,24.0459,107,104,9,9
10,58.527,57.6429,227,227,10,10


In [None]:
    #keep track of: time, iteration, support recovery, 

    #kevin's vs ken's convergence
    #scale or not
    #how many backtracking

## Test which convergence criterion I should use

In [6]:
"""
    test_poisson_convg()

Run `run_test` under both convergence criteria (`convergence=true` is Kevin's method,
`convergence=false` is Ken's method) for each sample size in `500:500:5000`, with
`p = 10000`, no debiasing, no scaling and `max_step = 3`.

Returns a `DataFrame` with one row per sample size holding the time, iteration count
and number of recovered true signals of both runs.
"""
function test_poisson_convg()
    sample_sizes = 500:500:5000
    nn = length(sample_sizes)
    p = 10000

    # NOTE(review): the last column is named `support_recovered_no_ken` but stores the
    # result of Ken's method; the name is kept unchanged so existing column lookups
    # still work.
    convg_df = DataFrame(
        time_kevin = zeros(nn),
        time_ken = zeros(nn),
        iter_kevin = zeros(Int, nn),
        iter_ken = zeros(Int, nn),
        support_recovered_kevin = zeros(Int, nn),
        support_recovered_no_ken = zeros(Int, nn),
    )

    for (row, n) in enumerate(sample_sizes)
        time_kevin, iter_kevin, supp_kevin = run_test(
            n, p; debias=false, convergence=true, scale=false, max_step=3
        )
        time_ken, iter_ken, supp_ken = run_test(
            n, p; debias=false, convergence=false, scale=false, max_step=3
        )

        convg_df[row, :time_kevin] = time_kevin
        convg_df[row, :time_ken] = time_ken
        convg_df[row, :iter_kevin] = iter_kevin
        convg_df[row, :iter_ken] = iter_ken
        convg_df[row, :support_recovered_kevin] = supp_kevin
        convg_df[row, :support_recovered_no_ken] = supp_ken
    end

    return convg_df
end

test_poisson_convg (generic function with 1 method)

In [7]:
# Run the convergence-criterion comparison and keep the resulting table.
which_convergence_criteria = test_poisson_convg()

Unnamed: 0_level_0,time_kevin,time_ken,iter_kevin,iter_ken,support_recovered_kevin,support_recovered_no_ken
Unnamed: 0_level_1,Float64,Float64,Int64,Int64,Int64,Int64
1,10.3769,23.5718,377,861,4,4
2,11.3464,18.3091,212,351,2,2
3,21.6453,23.4103,247,269,8,8
4,19.8966,25.4802,178,221,10,10
5,9.22129,9.22906,67,67,8,8
6,56.6324,108.642,360,685,2,3
7,17.7581,21.8035,99,120,10,10
8,25.2194,33.433,122,162,9,9
9,23.1952,24.1049,100,104,9,9
10,51.2782,57.7842,201,227,10,10


## Should I scale SnpBitMatrix?

In [3]:
"""
    test_poisson_scale()

Run `run_test` with `scale=true` and `scale=false` for each sample size in
`500:500:5000`, with `p = 10000`, no debiasing, `convergence=true` and `max_step = 3`.

Returns a `DataFrame` with one row per sample size holding the time, iteration count
and number of recovered true signals of both runs.
"""
function test_poisson_scale()
    sample_sizes = 500:500:5000
    nn = length(sample_sizes)
    p = 10000

    scale_df = DataFrame(
        time_yes_scale = zeros(nn),
        time_no_scale = zeros(nn),
        iter_yes_scale = zeros(Int, nn),
        iter_no_scale = zeros(Int, nn),
        supp_recov_yes_scale = zeros(Int, nn),
        supp_recov_no_scale = zeros(Int, nn),
    )

    for (row, n) in enumerate(sample_sizes)
        time_yes, iter_yes, supp_yes = run_test(
            n, p; debias=false, convergence=true, scale=true, max_step=3
        )
        time_no, iter_no, supp_no = run_test(
            n, p; debias=false, convergence=true, scale=false, max_step=3
        )

        scale_df[row, :time_yes_scale] = time_yes
        scale_df[row, :time_no_scale] = time_no
        scale_df[row, :iter_yes_scale] = iter_yes
        scale_df[row, :iter_no_scale] = iter_no
        scale_df[row, :supp_recov_yes_scale] = supp_yes
        scale_df[row, :supp_recov_no_scale] = supp_no
    end

    return scale_df
end

test_poisson_scale (generic function with 1 method)

In [6]:
# Run the scaling comparison and keep the resulting table.
should_I_scale = test_poisson_scale()

[31mDid not converge!!!!! The run time for IHT was 90.84418201446533seconds and model size was10[39m

Unnamed: 0_level_0,time_yes_scale,time_no_scale,iter_yes_scale,iter_no_scale,supp_recov_yes_scale,supp_recov_no_scale
Unnamed: 0_level_1,Float64,Float64,Int64,Int64,Int64,Int64
1,11.7174,10.1912,424,377,1,4
2,23.1371,10.9557,399,212,2,2
3,90.8442,20.6237,1000,247,8,8
4,37.579,21.5381,347,178,6,10
5,11.6094,8.99532,86,67,9,8
6,89.7073,58.6053,558,360,2,2
7,38.8959,20.9325,210,99,9,10
8,114.839,25.0874,517,122,9,9
9,47.2276,22.8083,209,100,9,9
10,83.6891,50.3003,333,201,10,10


## Test how many backtracking steps to allow

In [7]:
"""
    test_poisson_backtrack()

Run `run_test` with `max_step` set to 3, 4 and 5 for each sample size in
`500:500:5000`, with `p = 10000`, no debiasing, `convergence=true` and `scale=true`.

Returns a `DataFrame` with one row per sample size and, for each `max_step` value, the
time (`time_*`), iteration count (`iter_*`) and number of recovered true signals
(`supp_recov_*`).
"""
function test_poisson_backtrack()
    sample_sizes = 500:500:5000
    nn = length(sample_sizes)
    p = 10000

    backtrack_df = DataFrame(
        time_3 = zeros(nn),
        time_4 = zeros(nn),
        time_5 = zeros(nn),
        iter_3 = zeros(Int, nn),
        iter_4 = zeros(Int, nn),
        iter_5 = zeros(Int, nn),
        supp_recov_3 = zeros(Int, nn),
        supp_recov_4 = zeros(Int, nn),
        supp_recov_5 = zeros(Int, nn),
    )

    for (row, n) in enumerate(sample_sizes)
        # Same settings for every run; only the backtracking limit varies.
        for step in 3:5
            elapsed, iters, supp = run_test(
                n, p; debias=false, convergence=true, scale=true, max_step=step
            )
            # Column names carry the step count as a suffix, e.g. `time_3`.
            backtrack_df[row, Symbol("time_", step)] = elapsed
            backtrack_df[row, Symbol("iter_", step)] = iters
            backtrack_df[row, Symbol("supp_recov_", step)] = supp
        end
    end

    return backtrack_df
end

test_poisson_backtrack (generic function with 1 method)

In [8]:
# Run the backtracking comparison and keep the resulting table.
how_many_times_should_I_backtrack = test_poisson_backtrack()

[31mDid not converge!!!!! The run time for IHT was 80.88692903518677seconds and model size was10[39m

Unnamed: 0_level_0,time_3,time_4,time_5,iter_3,iter_4,iter_5,supp_recov_3,supp_recov_4,supp_recov_5
Unnamed: 0_level_1,Float64,Float64,Float64,Int64,Int64,Int64,Int64,Int64,Int64
1,11.5746,10.3514,14.6715,424,386,554,1,1,1
2,22.505,23.1776,20.4193,399,412,371,2,2,2
3,80.8869,47.051,60.0428,1000,581,732,8,7,8
4,37.2058,40.2342,40.0411,347,347,347,6,6,6
5,11.8397,12.4022,12.0313,86,86,86,9,9,9
6,93.7168,100.461,96.9217,558,610,592,2,2,2
7,39.2462,41.4451,41.7887,210,210,210,9,9,9
8,119.161,125.392,122.959,517,552,551,9,9,9
9,51.7593,51.79,52.6283,209,209,209,9,9,9
10,87.4533,82.191,82.9427,333,333,333,10,10,10
