# IHT code on fitting generalized linear models

In [1]:
using Revise
using IHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using StatsFuns: logistic

┌ Info: Recompiling stale cache file /Users/biona001/.julia/compiled/v1.0/IHT/eaqWB.ji for IHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1190
┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /Users/biona001/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228


## Logistic Regression: 

In [2]:
# Dimensions of the simulated study: n samples, p SNPs
n = 2000
p = 25000

# Fix the RNG state so the whole cell is reproducible
Random.seed!(1111)

# k causal SNPs; per-SNP minor allele frequencies are uniform on (0, 0.5)
k = 10
bernoulli_rates = 0.5 .* rand(p)
x = simulate_random_snparray(n, p, bernoulli_rates)
xbm = SnpBitMatrix{Float64}(x; model=ADDITIVE_MODEL, center=true, scale=true);

# Intercept-only covariate matrix and a sparse true model with k N(0, 1) effects
z = ones(n, 1)
true_b = zeros(p)
true_b[1:k] .= randn(k)
shuffle!(true_b)                             # scatter the k effects over all p positions
correct_position = findall(!iszero, true_b)  # indices of the truly nonzero effects

# Linear predictor Xb
y_temp = xbm * true_b

# Inverse logit link gives success probabilities; draw one Bernoulli response per sample
prob = logistic.(y_temp)
y = Float64[rand(Bernoulli(q)) for q in prob]

# Logistic IHT fit (the original author notes that debiasing accelerates convergence)
result = L0_logistic_reg(x, z, y, 1, k; glm="logistic", debias=true, show_info=false)
@show result

# Compare true and estimated effects at the true nonzero positions
estimated_models = result.beta[correct_position]
true_model = true_b[correct_position]
compare_model = DataFrame(
    correct_position = correct_position,
    true_β           = true_model,
    estimated_β      = estimated_models)

result = IHT results:

Compute time (sec):     4.952028036117554
Final loglikelihood:    -670.8572856298258
Iterations:             14
Max number of groups:   1
Max predictors/group:   10
IHT estimated 10 nonzero coefficients.
10×3 DataFrame
│ Row │ Group │ Predictor │ Estimated_β │
│     │ Int64 │ Int64     │ Float64     │
├─────┼───────┼───────────┼─────────────┤
│ 1   │ 1     │ 3281      │ 0.254733    │
│ 2   │ 1     │ 5036      │ -0.336928   │
│ 3   │ 1     │ 7955      │ 0.705645    │
│ 4   │ 1     │ 11125     │ 1.3001      │
│ 5   │ 1     │ 14778     │ 0.265708    │
│ 6   │ 1     │ 15259     │ 0.440616    │
│ 7   │ 1     │ 15611     │ -0.825903   │
│ 8   │ 1     │ 16824     │ -1.21126    │
│ 9   │ 1     │ 18529     │ 2.19817     │
│ 10  │ 1     │ 22323     │ 1.06874     │

Intercept of model = 0.0



Unnamed: 0_level_0,correct_position,true_β,estimated_β
Unnamed: 0_level_1,Int64,Float64,Float64
1,2671,0.14408,0.0
2,5036,-0.499722,-0.336928
3,7955,0.899909,0.705645
4,11125,1.45065,1.3001
5,15259,0.44709,0.440616
6,15611,-0.875885,-0.825903
7,16627,0.243581,0.0
8,16824,-1.16319,-1.21126
9,18529,2.29413,2.19817
10,22323,1.04997,1.06874


## Poisson regression:

In [3]:
# Dimensions of the simulated study: n samples, p SNPs
n = 2000
p = 20001

# Fix the RNG state so the whole cell is reproducible
Random.seed!(1111)

# k causal SNPs; per-SNP minor allele frequencies are uniform on (0, 0.5)
k = 10
bernoulli_rates = 0.5 .* rand(p)
x = simulate_random_snparray(n, p, bernoulli_rates)
xbm = SnpBitMatrix{Float64}(x; model=ADDITIVE_MODEL, center=true, scale=true);

# Intercept-only covariate matrix and a sparse true model with k N(0, 1) effects
z = ones(n, 1)
true_b = zeros(p)
true_b[1:k] .= randn(k)
shuffle!(true_b)                             # scatter the k effects over all p positions
correct_position = findall(!iszero, true_b)  # indices of the truly nonzero effects

# Allele frequencies of the causal SNPs. This is not the last expression of the
# cell, and the recorded output does not show its value.
bernoulli_rates[correct_position]

# Linear predictor Xb (no additive noise; randomness enters through the Poisson draw)
y_temp = xbm * true_b

# Inverse log link gives the Poisson means; draw one count per sample
λ = exp.(y_temp)
y = Float64[rand(Poisson(rate)) for rate in λ]

# Poisson IHT fit.
# NOTE(review): true_beta=true_b hands the simulated truth to the solver here, while the
# run_poisson functions below do not pass it — confirm it is used for diagnostics only.
result = L0_poisson_reg(x, z, y, 1, k; glm="poisson", debias=false, convg=false,
    show_info=false, true_beta=true_b)
@show result

# Compare true and estimated effects at the true nonzero positions
estimated_models = result.beta[correct_position]
true_model = true_b[correct_position]
compare_model = DataFrame(
    correct_position = correct_position,
    true_β           = true_model,
    estimated_β      = estimated_models)

result = IHT results:

Compute time (sec):     35.481719970703125
Final loglikelihood:    653148.285035393
Iterations:             127
Max number of groups:   1
Max predictors/group:   10
IHT estimated 10 nonzero coefficients.
10×3 DataFrame
│ Row │ Group │ Predictor │ Estimated_β │
│     │ Int64 │ Int64     │ Float64     │
├─────┼───────┼───────────┼─────────────┤
│ 1   │ 1     │ 3435      │ -1.65424    │
│ 2   │ 1     │ 5454      │ -0.583612   │
│ 3   │ 1     │ 8250      │ 1.20102     │
│ 4   │ 1     │ 9447      │ -0.270882   │
│ 5   │ 1     │ 10707     │ 0.155146    │
│ 6   │ 1     │ 10822     │ -1.38683    │
│ 7   │ 1     │ 11077     │ 0.674554    │
│ 8   │ 1     │ 12764     │ 0.361803    │
│ 9   │ 1     │ 13340     │ 1.65805     │
│ 10  │ 1     │ 17035     │ 0.911785    │

Intercept of model = 0.0



Unnamed: 0_level_0,correct_position,true_β,estimated_β
Unnamed: 0_level_1,Int64,Float64,Float64
1,3435,-1.6435,-1.65424
2,5454,-0.574795,-0.583612
3,8250,1.20082,1.20102
4,9447,-0.27556,-0.270882
5,10707,0.15172,0.155146
6,10822,-1.47364,-1.38683
7,11077,0.668192,0.674554
8,12764,0.362815,0.361803
9,13340,1.64449,1.65805
10,17035,0.909084,0.911785


# Poisson regression often does not work.

The example above is hand-picked. About 40% of the time, Poisson regression performs poorly.

In [4]:
"""
    run_poisson(n::Int64, p::Int64; k::Int64=10, seed::Integer=1111)

Simulate an `n × p` SNP array, build a sparse true model with `k` nonzero effects,
sample Poisson phenotypes through the log link, fit the model with `L0_poisson_reg`,
and print a table of true versus estimated effects at the true nonzero positions,
followed by the iteration count and the run time reported by the solver.

# Arguments
- `n`: number of samples (rows of the simulated SNP array).
- `p`: number of SNPs (columns). Must be at least `k`, otherwise the assignment
  `true_b[1:k] = randn(k)` is out of bounds.

# Keywords
- `k`: number of nonzero entries in the true model; also passed to `L0_poisson_reg`.
- `seed`: value given to `Random.seed!`. The global RNG is reseeded on every call, so
  calls with identical arguments simulate identical data.

# Returns
`nothing`; all results are printed.
"""
function run_poisson(n :: Int64, p :: Int64; k :: Int64 = 10, seed :: Integer = 1111)
    #set random seed
    Random.seed!(seed)

    #simulate genotypes; minor allele frequencies are drawn from uniform (0, 0.5)
    bernoulli_rates = 0.5rand(p)
    x = simulate_random_snparray(n, p, bernoulli_rates)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)

    #construct covariates (intercept only) and the true model b
    z           = ones(n, 1)                   # non-genetic covariates, just the intercept
    true_b      = zeros(p)                     # model vector
    true_b[1:k] = randn(k)                     # k non-zero entries in the true model
    shuffle!(true_b)                           # scatter them over all p positions
    correct_position = findall(!iszero, true_b) # keep track of what the true entries are

    #linear predictor Xb; no additive noise, randomness comes from the Poisson draw below
    y_temp = xbm * true_b

    #simulate poisson data; the typed comprehension keeps y a Vector{Float64} throughout
    λ = exp.(y_temp) #inverse log link
    y = Float64[rand(Poisson(rate)) for rate in λ]

    #compute poisson IHT result
    result = L0_poisson_reg(x, z, y, 1, k, glm = "poisson", debias=false, convg=false, show_info=false)

    #check result
    estimated_models = result.beta[correct_position]
    true_model = true_b[correct_position]
    compare_model = DataFrame(
        correct_position = correct_position, 
        true_β           = true_model, 
        estimated_β      = estimated_models)
    
    #display results
    @show compare_model
    println("Total iteration number was ", result.iter)
    println("Total time was ", result.time)
end


run_poisson (generic function with 1 method)

In [5]:
# Fit 25 Poisson models on randomly sized problems: n is drawn from 500:2000 and
# p is a random integer multiple (1 to 10) of n.
for i in 1:25
    println("running the $i th model")
    n = rand(500:2000)
    p = rand(1:10) * n
    println("n, p = ", n, ", ", p)
    run_poisson(n, p)
end

running the 1 th model
n, p = 1682, 6728
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β    │ estimated_β │
│     │ Int64            │ Float64   │ Float64     │
├─────┼──────────────────┼───────────┼─────────────┤
│ 1   │ 192              │ -1.27736  │ 0.0         │
│ 2   │ 1342             │ 1.33404   │ 0.0         │
│ 3   │ 1373             │ -2.3186   │ -1.82063    │
│ 4   │ 1382             │ 0.128027  │ 0.0         │
│ 5   │ 2345             │ 1.09643   │ 1.02478     │
│ 6   │ 3293             │ 0.907412  │ 0.0         │
│ 7   │ 3525             │ -1.78121  │ -1.76787    │
│ 8   │ 4387             │ -0.920503 │ 0.0         │
│ 9   │ 4559             │ -1.91075  │ -1.14125    │
│ 10  │ 6097             │ 0.120748  │ 0.0         │
Total iteration number was 292
Total time was 21.481292009353638
running the 2 th model
n, p = 1175, 2350
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β    │ estimated_β │
│     │ Int64            │ Float64   │ Float64   

## Reconstruction becomes harder as k increases

In [6]:
"""
    run_poisson()

Simulate one 2000 × 20000 SNP array with a fixed seed, then for each sparsity level
`k` in `1:30` draw a new true model with `k` nonzero effects, sample Poisson phenotypes
through the log link, fit with `L0_poisson_reg`, and print the true versus estimated
effects together with the solver's iteration count and run time.
"""
function run_poisson()
    # Study dimensions shared by every fit in the sweep
    n = 2000
    p = 20000

    # Reseed the global RNG so the sweep is reproducible
    Random.seed!(1111)

    # Genotypes are simulated once; minor allele frequencies are uniform on (0, 0.5)
    maf = 0.5 .* rand(p)
    genotypes = simulate_random_snparray(n, p, maf)
    design = SnpBitMatrix{Float64}(genotypes; model=ADDITIVE_MODEL, center=true, scale=true)
    intercept = ones(n, 1)   # non-genetic covariates: intercept only

    for k in 1:30
        # Sparse true model: k N(0, 1) effects scattered over the p positions
        beta = zeros(p)
        beta[1:k] .= randn(k)
        shuffle!(beta)
        correct_position = findall(!iszero, beta)

        # Linear predictor, then Poisson means via the inverse log link
        η = design * beta
        λ = exp.(η)
        counts = Float64[rand(Poisson(rate)) for rate in λ]

        # Poisson IHT fit at sparsity level k
        result = L0_poisson_reg(genotypes, intercept, counts, 1, k;
            glm="poisson", debias=false, convg=false, show_info=false)

        # True versus estimated effects at the true nonzero positions
        compare_model = DataFrame(
            correct_position = correct_position,
            true_β           = beta[correct_position],
            estimated_β      = result.beta[correct_position])

        @show compare_model
        println("Total iteration number was ", result.iter)
        println("Total time was ", result.time)
    end
end


run_poisson (generic function with 2 methods)

In [7]:
# Run the zero-argument method: one Poisson fit per sparsity level k = 1, ..., 30
run_poisson()

compare_model = 1×3 DataFrame
│ Row │ correct_position │ true_β   │ estimated_β │
│     │ Int64            │ Float64  │ Float64     │
├─────┼──────────────────┼──────────┼─────────────┤
│ 1   │ 9568             │ -0.18273 │ 0.0         │
Total iteration number was 11
Total time was 9.347173929214478
compare_model = 2×3 DataFrame
│ Row │ correct_position │ true_β   │ estimated_β │
│     │ Int64            │ Float64  │ Float64     │
├─────┼──────────────────┼──────────┼─────────────┤
│ 1   │ 10299            │ 0.873072 │ 0.8714      │
│ 2   │ 12735            │ 0.187676 │ 0.179331    │
Total iteration number was 18
Total time was 14.012959003448486
compare_model = 3×3 DataFrame
│ Row │ correct_position │ true_β    │ estimated_β │
│     │ Int64            │ Float64   │ Float64     │
├─────┼──────────────────┼───────────┼─────────────┤
│ 1   │ 1010             │ -0.138243 │ -0.140331   │
│ 2   │ 14539            │ -0.118317 │ -0.123353   │
│ 3   │ 18887            │ -0.432062 │ -0.385693  