In [1]:
using Revise
using IHT
using SnpArrays
using DataFrames
using Distributions
using StatsFuns: logistic
using Random
using LinearAlgebra

┌ Info: Recompiling stale cache file /Users/biona001/.julia/compiled/v1.0/IHT/eaqWB.ji for IHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1190
┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /Users/biona001/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228


In [2]:
"""
    run_poisson(n, p, debias=false)

Simulate a random SNP array with `n` samples and `p` SNPs, draw Poisson
phenotypes from a sparse true model with `k = 10` non-zero effects, fit the
model by calling `L0_poisson_reg`, and print a table comparing the true and
estimated effect sizes at the true positions.

# Arguments
- `n`: number of samples (rows of the simulated SNP array).
- `p`: number of SNPs (columns); must be at least 10.
- `debias`: forwarded to the `debias` keyword of `L0_poisson_reg`. Defaults to
  `false`, which is the value that was previously hard-coded.

# Returns
`nothing`; the results are printed.

# Throws
- `ArgumentError` if `p` is smaller than the number of true predictors.

Note that the global RNG is re-seeded with `1111` on every call, so the
simulated data depend only on `n` and `p`.
"""
function run_poisson(n :: Int64, p :: Int64, debias :: Bool = false)
    k = 10 # number of true predictors
    p >= k || throw(ArgumentError("p must be at least $k, got $p"))

    #set random seed
    Random.seed!(1111)

    #simulate data
    bernoulli_rates = 0.5rand(p) #minor allele frequencies are drawn from uniform (0, 0.5)
    x = simulate_random_snparray(n, p, bernoulli_rates)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)

    #construct covariates (just the intercept) and the sparse true model
    z           = ones(n, 1)     # non-genetic covariates, just the intercept
    true_b      = zeros(p)       # model vector
    true_b[1:k] = randn(k)       # initialize k non-zero entries in the true model
    shuffle!(true_b)             # scatter the non-zero entries over random positions
    correct_position = findall(!iszero, true_b) # indices of the true predictors

    #simulate Poisson phenotypes: the mean is the inverse log link of X * b
    λ = exp.(xbm * true_b)
    y = Float64[rand(Poisson(rate)) for rate in λ]

    #compute poisson IHT result
    #NOTE(review): the positional `1` and `k` are presumably the group count and
    #the sparsity level expected by L0_poisson_reg -- confirm against IHT.
    result = L0_poisson_reg(x, z, y, 1, k, glm = "poisson", debias=debias, convg=false, show_info=false)

    #compare true and estimated effects at the true positions
    compare_model = DataFrame(
        correct_position = correct_position,
        true_β           = true_b[correct_position],
        estimated_β      = result.beta[correct_position])

    #display results
    @show compare_model
    println("n = " * string(n) * ", and p = " * string(p))
    println("Total iteration number was " * string(result.iter))
    println("Total time was " * string(result.time))
    return nothing
end

"""
    run_logistic(n, p, debias)

Simulate a random SNP array with `n` samples and `p` SNPs, draw Bernoulli
phenotypes from a sparse true model with `k = 10` non-zero effects, fit the
model by calling `L0_logistic_reg`, and print a table comparing the true and
estimated effect sizes at the true positions.

# Arguments
- `n`: number of samples (rows of the simulated SNP array).
- `p`: number of SNPs (columns); must be at least 10.
- `debias`: forwarded to the `debias` keyword of `L0_logistic_reg`.

# Returns
`nothing`; the results are printed.

# Throws
- `ArgumentError` if `p` is smaller than the number of true predictors.

Note that the global RNG is re-seeded with `1111` on every call, so the
simulated data depend only on `n` and `p`.
"""
function run_logistic(n :: Int64, p :: Int64, debias :: Bool)
    k = 10 # number of true predictors
    p >= k || throw(ArgumentError("p must be at least $k, got $p"))

    #set random seed
    Random.seed!(1111)

    #simulate data
    bernoulli_rates = 0.5rand(p) #minor allele frequencies are drawn from uniform (0, 0.5)
    x = simulate_random_snparray(n, p, bernoulli_rates)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)

    #construct covariates (just the intercept) and the sparse true model
    z           = ones(n, 1)     # non-genetic covariates, just the intercept
    true_b      = zeros(p)       # model vector
    true_b[1:k] = randn(k)       # initialize k non-zero entries in the true model
    shuffle!(true_b)             # scatter the non-zero entries over random positions
    correct_position = findall(!iszero, true_b) # indices of the true predictors

    #simulate binary phenotypes: success probability is the inverse logit of X * b
    prob = logistic.(xbm * true_b)
    y = Float64[rand(Bernoulli(success)) for success in prob]

    #compute logistic IHT result
    #NOTE(review): the positional `1` and `k` are presumably the group count and
    #the sparsity level expected by L0_logistic_reg -- confirm against IHT.
    result = L0_logistic_reg(x, z, y, 1, k, glm = "logistic", debias=debias, show_info=false)

    #compare true and estimated effects at the true positions
    compare_model = DataFrame(
        correct_position = correct_position,
        true_β           = true_b[correct_position],
        estimated_β      = result.beta[correct_position])

    #display results; iteration count and time are reported once, after the table
    @show compare_model
    println("n = " * string(n) * ", and p = " * string(p))
    println("Total iteration number was " * string(result.iter))
    println("Total time was " * string(result.time))
    return nothing
end

run_logistic (generic function with 1 method)

# Poisson sample size = 100~1000 (no debias)

In [3]:
# Run 100 Poisson simulations, each with a random sample size n in 100:1000
# and a SNP count p that is a random integer multiple (1 to 10) of n.
for i in 1:100
    println("running the $i th model")
    n = rand(100:1000)
    p = rand(1:10) * n
    run_poisson(n, p)
end

running the 1 th model
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β     │ estimated_β │
│     │ Int64            │ Float64    │ Float64     │
├─────┼──────────────────┼────────────┼─────────────┤
│ 1   │ 2                │ 0.00897563 │ 0.0         │
│ 2   │ 9                │ 0.0701975  │ 0.0         │
│ 3   │ 44               │ -0.777736  │ -0.317116   │
│ 4   │ 130              │ 0.210051   │ 0.0         │
│ 5   │ 145              │ -0.185546  │ 0.0         │
│ 6   │ 148              │ 1.21199    │ 1.03397     │
│ 7   │ 167              │ -1.98988   │ 0.0         │
│ 8   │ 248              │ 0.284483   │ 0.0         │
│ 9   │ 258              │ -0.123728  │ 0.0         │
│ 10  │ 273              │ -0.473523  │ 0.0         │
n = 278, and p = 278
Total iteration number was 252
Total time was 0.5504870414733887
running the 2 th model
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β     │ estimated_β │
│     │ Int64            │ Float64    │ Float64   

# Poisson sample size = 100~1000 (with debias)


In [5]:
# Run 100 Poisson simulations, each with a random sample size n in 100:1000
# and a SNP count p that is a random integer multiple (1 to 10) of n.
# NOTE(review): this call passes no debias option, so it is the same call as in
# the "no debias" cell; the "with debias" heading presumably reflects a change
# made to run_poisson before this cell was executed -- confirm.
for i in 1:100
    println("running the $i th model")
    n = rand(100:1000)
    p = rand(1:10) * n
    run_poisson(n, p)
end

running the 1 th model
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β     │ estimated_β │
│     │ Int64            │ Float64    │ Float64     │
├─────┼──────────────────┼────────────┼─────────────┤
│ 1   │ 5                │ 0.963311   │ 1.07236     │
│ 2   │ 25               │ -0.0162929 │ 0.0         │
│ 3   │ 332              │ -0.833918  │ 0.0         │
│ 4   │ 661              │ -1.11357   │ -1.32764    │
│ 5   │ 1279             │ -0.0145944 │ 0.0         │
│ 6   │ 1751             │ -0.706056  │ 0.0         │
│ 7   │ 2184             │ 0.958318   │ 1.11266     │
│ 8   │ 2195             │ 1.07606    │ 0.975752    │
│ 9   │ 2258             │ -0.607827  │ 0.0         │
│ 10  │ 2517             │ 0.075377   │ 0.0         │
n = 381, and p = 2667
Total iteration number was 152
Total time was 0.9217848777770996
running the 2 th model
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β    │ estimated_β │
│     │ Int64            │ Float64   │ Float64    

# Poisson: testing ad hoc scaling of v.df (divide v.b by 10)

In [5]:
# Original note for this run: v.b scaled down by 2, v.df scaled down by exp(1/2).
# NOTE(review): the section heading for this cell says "divide v.b by 10" --
# confirm which scaling factor was actually in effect when this was executed.
# Run 100 Poisson simulations with random n in 100:1000 and p = (1 to 10) * n.
for i in 1:100
    println("running the $i th model")
    n = rand(100:1000)
    p = rand(1:10) * n
    run_poisson(n, p)
end

running the 1 th model
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β    │ estimated_β │
│     │ Int64            │ Float64   │ Float64     │
├─────┼──────────────────┼───────────┼─────────────┤
│ 1   │ 61               │ -0.985009 │ -0.999038   │
│ 2   │ 195              │ -0.134038 │ 0.0         │
│ 3   │ 339              │ -0.541968 │ -0.547913   │
│ 4   │ 353              │ -0.24548  │ -0.192981   │
│ 5   │ 363              │ -1.75137  │ -1.7294     │
│ 6   │ 403              │ -0.232117 │ -0.205514   │
│ 7   │ 446              │ -1.22987  │ -1.20587    │
│ 8   │ 463              │ -0.184661 │ -0.188596   │
│ 9   │ 578              │ 0.752658  │ 0.760266    │
│ 10  │ 587              │ -0.933025 │ -0.899582   │
n = 288, and p = 864
Total iteration number was 127
Total time was 0.19269108772277832
running the 2 th model
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β     │ estimated_β │
│     │ Int64            │ Float64    │ Float64     │
├─────┼─

# Logistic sample size = 100~1000 (with debias)

In [3]:
# Run 100 logistic simulations with debiasing switched on, each with a random
# sample size n in 100:1000 and a SNP count p = (random integer in 1:10) * n.
for i in 1:100
    println("running the $i th model")
    n = rand(100:1000)
    p = rand(1:10) * n
    debias = true
    run_logistic(n, p, debias)
end

running the 1 th model
Total iteration number was 10
Total time was 0.8166790008544922
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β    │ estimated_β │
│     │ Int64            │ Float64   │ Float64     │
├─────┼──────────────────┼───────────┼─────────────┤
│ 1   │ 142              │ -1.29408  │ -1.329      │
│ 2   │ 1361             │ 1.3569    │ 1.51201     │
│ 3   │ 2545             │ 1.59891   │ 1.70748     │
│ 4   │ 3195             │ -0.639507 │ -0.776259   │
│ 5   │ 3960             │ 0.569607  │ 0.721466    │
│ 6   │ 4701             │ 1.12605   │ 1.15373     │
│ 7   │ 4920             │ -0.67882  │ -0.628831   │
│ 8   │ 5429             │ 0.0773713 │ 0.0         │
│ 9   │ 6640             │ 1.08847   │ 1.02796     │
│ 10  │ 6729             │ -0.383984 │ -0.573983   │
n = 774, and p = 7740
Total iteration number was 10
Total time was 0.8166790008544922
running the 2 th model
Total iteration number was 11
Total time was 0.07755780220031738
compare_model = 10×