# Various illustrations of IHT Poisson regression fitting

In [1]:
using Revise
using IHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using StatsFuns: logistic

┌ Info: Recompiling stale cache file /Users/biona001/.julia/compiled/v1.0/IHT/eaqWB.ji for IHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1190
┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /Users/biona001/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228


# Poisson regression

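First we simulate 50 different SNP matrices, each with its own true effect vector whose nonzero entries are drawn as $\beta_j \sim N(0, 0.25^2)$ (i.e. standard deviation $0.25$), to examine reconstruction behavior. Overall, IHT struggles to recover effect sizes with $|\beta_j| < 0.1$ but performs well for larger values.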

In [21]:
# Simulate one Poisson-response data set with n samples, p SNPs and k causal
# SNPs, fit it with IHT at the true sparsity level k, and print the true versus
# estimated effects followed by summary statistics of the simulated counts.
# Returns nothing; everything is reported through printed output.
function run_poisson(n :: Int64, p :: Int64, k :: Int64)
    # the global RNG is restarted from the same seed on every call
    Random.seed!(1111)

    # genotypes, their centered/scaled bit-matrix view, and an intercept-only covariate
    x, _ = simulate_random_snparray(n, p)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    z = ones(n, 1)

    # true model: k Normal(0, 0.25) effects moved to random positions
    true_b = zeros(p)
    true_b[1:k] = rand(Normal(0, 0.25), k)
    shuffle!(true_b)
    correct_position = findall(!iszero, true_b)

    # Poisson counts whose rates are exp(Xb), the inverse of the log link
    λ = exp.(xbm * true_b)
    y = Float64[rand(Poisson(rate)) for rate in λ]

    # IHT fit at sparsity k
    result = L0_poisson_reg(x, z, y, 1, k, glm = "poisson", debias=false, convg=false, show_info=false)

    # true and estimated effects, side by side, at the causal positions only
    compare_model = DataFrame(
        correct_position = correct_position,
        true_β           = true_b[correct_position],
        estimated_β      = result.beta[correct_position])

    # report
    @show compare_model
    println("Total iteration number was ", result.iter)
    println("Total time was ", result.time)
    println("median of y = ", median(y))
    println("maximum of y = ", maximum(y))
    println("minimum of y = ", minimum(y))
    println("skewness of y = ", skewness(y))
    println("kurtosis of y = ", kurtosis(y))
    println("mean of y = ", mean(y))
    println("variance of y = ", var(y))
    println("var / mean = ", var(y) / mean(y), "\n\n")
end

run_poisson (generic function with 1 method)

In [23]:
# Fit 50 simulated data sets with k = 10 causal SNPs each. Every run draws a
# sample size n in 500:2000 and a SNP count p that is a random multiple
# (1 to 10) of n.
k = 10
for i in 1:50
    println("running the $i th model")
    n = rand(500:2000)
    p = rand(1:10) * n
    println("n, p = ", n, ", ", p)
    run_poisson(n, p, k)
end

running the 1 th model
n, p = 1828, 7312
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β     │ estimated_β │
│     │ Int64            │ Float64    │ Float64     │
├─────┼──────────────────┼────────────┼─────────────┤
│ 1   │ 363              │ 0.238954   │ 0.228253    │
│ 2   │ 437              │ -0.0727749 │ -0.10205    │
│ 3   │ 516              │ -0.0730439 │ 0.0         │
│ 4   │ 1832             │ -0.183891  │ -0.141271   │
│ 5   │ 2004             │ -0.137655  │ -0.106951   │
│ 6   │ 3746             │ 0.206968   │ 0.194926    │
│ 7   │ 4181             │ 0.355171   │ 0.354624    │
│ 8   │ 5979             │ 0.138518   │ 0.147283    │
│ 9   │ 6338             │ 0.0290228  │ 0.0         │
│ 10  │ 6599             │ -0.449113  │ -0.485518   │
Total iteration number was 11
Total time was 0.9138290882110596
median of y = 1.0
maximum of y = 14.0
minimum of y = 0.0
skewness of y = 1.9510798615215827
kurtosis of y = 6.479998361865022
mean of y = 1.298140043763676
varian

# Should I initialize beta?

## Examine reconstruction results as k increases

Here we use the same SNP matrix but different sparsity parameters, $k \in \{1, 2, \ldots, 30\}$, to examine reconstruction results.

In [23]:
# Reuse one simulated SNP matrix (n = 2000, p = 20000) and, for every sparsity
# level k = 1, ..., 30, draw a fresh k-sparse true model, simulate Poisson
# counts from it, fit IHT at that k, and print the true versus estimated
# effects together with the iteration count and run time.
function run_poisson_increasing_k()
    n, p = 2000, 20000

    # fixed seed for the global RNG
    Random.seed!(1111)

    # genotypes with per-SNP minor allele frequencies drawn uniformly below 0.5
    bernoulli_rates = 0.5rand(p)
    x = simulate_random_snparray(n, p, bernoulli_rates)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    z = ones(n, 1)                      # intercept is the only non-genetic covariate
    effect_dist = Normal(0, 0.25)       # distribution of the nonzero effects

    for k in 1:30
        # k nonzero effects placed at random positions
        true_b = zeros(p)
        true_b[1:k] = rand(effect_dist, k)
        shuffle!(true_b)
        correct_position = findall(!iszero, true_b)

        # Poisson counts whose rates are exp(Xb), the inverse of the log link
        λ = exp.(xbm * true_b)
        y = Float64[rand(Poisson(rate)) for rate in λ]

        # IHT fit at sparsity k
        result = L0_poisson_reg(x, z, y, 1, k, glm = "poisson", debias=false, convg=false, show_info=false)

        # true and estimated effects at the causal positions only
        compare_model = DataFrame(
            correct_position = correct_position,
            true_β           = true_b[correct_position],
            estimated_β      = result.beta[correct_position])

        # report
        @show compare_model
        println("Total iteration number was ", result.iter)
        println("Total time was ", result.time)
    end
end


run_poisson_increasing_k (generic function with 1 method)

In [24]:
# Run the k = 1, ..., 30 sweep; for each k this prints the comparison table,
# the iteration count and the run time.
run_poisson_increasing_k()

compare_model = 1×3 DataFrame
│ Row │ correct_position │ true_β     │ estimated_β │
│     │ Int64            │ Float64    │ Float64     │
├─────┼──────────────────┼────────────┼─────────────┤
│ 1   │ 9568             │ -0.0456826 │ 0.0         │
Total iteration number was 8
Total time was 13.029578924179077
compare_model = 2×3 DataFrame
│ Row │ correct_position │ true_β    │ estimated_β │
│     │ Int64            │ Float64   │ Float64     │
├─────┼──────────────────┼───────────┼─────────────┤
│ 1   │ 10299            │ 0.218268  │ 0.2111      │
│ 2   │ 12735            │ 0.0469189 │ 0.0         │
Total iteration number was 13
Total time was 11.204272985458374
compare_model = 3×3 DataFrame
│ Row │ correct_position │ true_β     │ estimated_β │
│     │ Int64            │ Float64    │ Float64     │
├─────┼──────────────────┼────────────┼─────────────┤
│ 1   │ 962              │ -0.0913304 │ 0.0         │
│ 2   │ 14502            │ 0.0597354  │ 0.0         │
│ 3   │ 18863            │ 0.421

# Comparison with LASSO: cross validation

In [1]:
using Revise
using GLMNet #julia wrapper for GLMNet package in R, which calls fortran
using GLM
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using StatsFuns: logistic
using Random
using LinearAlgebra
using DelimitedFiles

┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /Users/biona001/.julia/packages/Gadfly/09PWZ/src/mapping.jl:228


In [2]:
Threads.nthreads() # number of threads available to this session; should be > 1 if multithreading is enabled

8

In [19]:
# Simulate one Poisson-response data set with n samples, p SNPs and k = 10
# causal SNPs, then compare three fits at the causal positions:
#   * IHT, with the sparsity level chosen by cross validation (`cv_iht`),
#   * LASSO, with the penalty chosen by `glmnetcv` on the same folds,
#   * an ordinary Poisson GLM that is given only the true predictors.
# The simulated response is also written to
# ./IHT_poisson_simulations_mean/data_simulation_<sim>.txt.
#
# Arguments:
#   n, p : number of samples and number of SNPs
#   sim  : simulation index, used only in the output file name
#
# Returns the 6-tuple
#   (lasso correct, lasso false positives, lasso false negatives,
#    IHT correct,   IHT false positives,   IHT false negatives).
function iht_lasso_poisson(n :: Int64, p :: Int64, sim :: Int64)
    # true model size
    k = 10

    # the global RNG is restarted from the same seed on every call
    Random.seed!(1111)

    # Create the output directory up front: writedlm does not create missing
    # directories, and failing here is cheaper than failing after all the fits.
    outfile = "./IHT_poisson_simulations_mean/data_simulation_$sim.txt"
    mkpath(dirname(outfile))

    #construct snpmatrix, covariate files, and true model b
    x, maf = simulate_random_snparray(n, p)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)
    z           = ones(n, 1)                   # non-genetic covariates, just the intercept
    true_b      = zeros(p)                     # model vector
    true_b[1:k] = rand(Normal(0, 0.25), k)     # k true effects
    shuffle!(true_b)                           # move them to random positions
    correct_position = findall(!iszero, true_b) # keep track of what the true entries are

    # Simulate poisson data
    y_temp = xbm * true_b
    λ = exp.(y_temp) #inverse log link
    y = [rand(Poisson(rate)) for rate in λ]
    y = Float64.(y)

    #compute poisson IHT result: cross validate the sparsity level, then refit
    path = collect(1:15)
    num_folds = 5
    folds = rand(1:num_folds, size(x, 1))
    k_est_iht = cv_iht(x, z, y, 1, path, folds, num_folds, use_maf=false, glm="poisson", debias=false)
    iht_result = L0_poisson_reg(x, z, y, 1, k_est_iht, glm = "poisson", debias=false, convg=false, show_info=false, true_beta=true_b, init=false)

    #compute poisson lasso result on the same folds (last column of x_float is z)
    x_float = [convert(Matrix{Float64}, x, center=true, scale=true) z]
    cv = glmnetcv(x_float, y, Poisson(), dfmax=15, nfolds=num_folds, folds=folds)
    best = argmin(cv.meanloss)
    lasso_result = cv.path.betas[:, best]
    k_est_lasso = count(!iszero, lasso_result)

    #compute regular poisson regression using only true predictors (plus intercept)
    x_true = [x_float[:, correct_position] z]
    regular_fit = glm(x_true, y, Poisson(), LogLink())
    regular_result = coef(regular_fit)  # public accessor instead of the internal pp.beta0 field

    #check result
    IHT_model = iht_result.beta[correct_position]
    lasso_model = lasso_result[correct_position]
    true_model = true_b[correct_position]
    compare_model = DataFrame(
        true_β  = true_model,
        iht_β   = IHT_model,
        lasso_β = lasso_model,
        regular_β = regular_result[1:k])       # drop the trailing intercept coefficient
    @show compare_model

    #compute summary statistics
    lasso_num_correct_predictors = count(!iszero, lasso_model)
    lasso_false_positives = k_est_lasso - lasso_num_correct_predictors
    lasso_false_negatives = k - lasso_num_correct_predictors
    iht_num_correct_predictors = count(!iszero, IHT_model)
    iht_false_positives = k_est_iht - iht_num_correct_predictors
    iht_false_negatives = k - iht_num_correct_predictors
    println("IHT cv found $iht_false_positives false positives and $iht_false_negatives false negatives")
    println("lasso cv found $lasso_false_positives false positives and $lasso_false_negatives false negatives\n\n")

    #write y to file to view distribution later
    writedlm(outfile, y)

    return lasso_num_correct_predictors, lasso_false_positives, lasso_false_negatives, iht_num_correct_predictors, iht_false_positives, iht_false_negatives
end

iht_lasso_poisson (generic function with 1 method)

In [20]:
# Run `iht_lasso_poisson` on 10 randomly sized problems and print, for IHT and
# for LASSO, the total number of correctly recovered predictors, false
# positives and false negatives over all runs. Returns nothing.
function run_iht_lasso_poisson()
    iter = 10
    total_true = 10iter    # reported denominator: 10 predictors per run

    # running totals over all simulations
    lasso_total_found, lasso_false_positives, lasso_false_negatives = 0, 0, 0
    iht_total_found, iht_false_positives, iht_false_negatives = 0, 0, 0

    for i in 1:iter
        # sample size, and a SNP count that is a random multiple (1 to 10) of it
        n = rand(500:1000)
        p = rand(1:10) * n
        println("Running the $i th model where ", "n, p = ", n, ", ", p)

        found_l, fpos_l, fneg_l, found_i, fpos_i, fneg_i = iht_lasso_poisson(n, p, i)

        lasso_total_found     += found_l
        lasso_false_positives += fpos_l
        lasso_false_negatives += fneg_l
        iht_total_found       += found_i
        iht_false_positives   += fpos_i
        iht_false_negatives   += fneg_i
    end

    println("IHT  : Found $iht_total_found ", "correct predictors, out of ", total_true)
    println("IHT  : False positives = $iht_false_positives")
    println("IHT  : False negatives = $iht_false_negatives")
    println("Lasso: Found $lasso_total_found ", "correct predictors, out of ", total_true)
    println("Lasso: False positives = $lasso_false_positives")
    println("Lasso: False negatives = $lasso_false_negatives")
end

run_iht_lasso_poisson (generic function with 1 method)

In [18]:
# Seed the global RNG so the first (n, p) draw in run_iht_lasso_poisson is reproducible.
# NOTE: iht_lasso_poisson calls Random.seed!(1111) itself, so every later draw
# continues from that reseeded state rather than directly from this seed.
Random.seed!(2019)
run_iht_lasso_poisson()

Running the 1 th model where n, p = 601, 3005
compare_model = 10×4 DataFrame
│ Row │ true_β     │ iht_β     │ lasso_β    │ regular_β │
│     │ Float64    │ Float64   │ Float64    │ Float64   │
├─────┼────────────┼───────────┼────────────┼───────────┤
│ 1   │ -0.19935   │ 0.0       │ -0.0529655 │ -0.171662 │
│ 2   │ 0.18049    │ 0.0       │ 0.105772   │ 0.167837  │
│ 3   │ 0.524747   │ 0.591968  │ 0.512252   │ 0.582238  │
│ 4   │ 0.292468   │ 0.340894  │ 0.252993   │ 0.312079  │
│ 5   │ 0.429149   │ 0.460376  │ 0.359645   │ 0.451225  │
│ 6   │ -0.22333   │ 0.0       │ -0.0916251 │ -0.240026 │
│ 7   │ -0.367907  │ -0.360886 │ -0.240433  │ -0.374834 │
│ 8   │ -0.0342718 │ 0.0       │ 0.0        │ 0.0374873 │
│ 9   │ 0.0696975  │ 0.0       │ 0.0        │ 0.076362  │
│ 10  │ -0.142232  │ 0.0       │ -0.0378819 │ -0.145426 │
IHT cv   found 4predictors, with 0 false positives and 6 false negatives
lasso cv found 16predictors, with 8 false positives and 2 false negatives
Running the 2 th model

In [15]:
# Simulate one Poisson-response data set with n samples, p SNPs and k = 10
# causal SNPs, fit it with IHT at the true sparsity level, and print the true
# versus estimated effects followed by summary statistics of the simulated
# counts. Returns nothing; everything is reported through printed output.
function run_poisson(n :: Int64, p :: Int64)
    #set random seed (the global RNG is restarted from the same seed on every call)
    Random.seed!(1111)

    #simulate genotypes
    k = 10 # number of true predictors
    bernoulli_rates = 0.5rand(p) #minor allele frequencies are drawn from uniform (0, 0.5)
    x = simulate_random_snparray(n, p, bernoulli_rates)
    xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)

    #construct covariates and true model b
    z           = ones(n, 1)                   # non-genetic covariates, just the intercept
    true_b      = zeros(p)                     # model vector
    true_b[1:k] = 0.2randn(k)                  # Initialize k non-zero entries in the true model
    shuffle!(true_b)                           # Shuffle the entries
    correct_position = findall(!iszero, true_b) # keep track of what the true entries are

    # Simulate poisson data: rates are exp(Xb), the inverse of the log link
    y_temp = xbm * true_b
    λ = exp.(y_temp)
    y = [rand(Poisson(rate)) for rate in λ]
    y = Float64.(y)

    #compute poisson IHT result
    result = L0_poisson_reg(x, z, y, 1, k, glm = "poisson", debias=false, convg=false, show_info=false)

    #check result
    estimated_models = result.beta[correct_position]
    true_model = true_b[correct_position]
    compare_model = DataFrame(
        correct_position = correct_position,
        true_β           = true_model,
        estimated_β      = estimated_models)

    #display results
    @show compare_model
    println("Total iteration number was " * string(result.iter))
    println("Total time was " * string(result.time))
    println("median of y = " * string(median(y)))
    println("maximum of y = " * string(maximum(y)))
    println("minimum of y = " * string(minimum(y)))
    println("skewness of y = " * string(skewness(y)))
    println("kurtosis of y = " * string(kurtosis(y)))
    println("mean of y = " * string(mean(y)))
    println("variance of y = " * string(var(y)))
    println("var / mean = " * string(var(y) / mean(y)) * "\n\n")
end

run_poisson (generic function with 1 method)

In [17]:
# Fit 25 simulated data sets. Every run draws a sample size n in 2000:5000 and
# a SNP count p that is a random multiple (1 to 10) of n.
for i in 1:25
    n = rand(2000:5000)
    p = rand(1:10) * n
    println("Running the $i th model where ", "n, p = ", n, ", ", p)
    run_poisson(n, p)
end

Running the 1 th model where n, p = 3047, 9141
compare_model = 10×3 DataFrame
│ Row │ correct_position │ true_β    │ estimated_β │
│     │ Int64            │ Float64   │ Float64     │
├─────┼──────────────────┼───────────┼─────────────┤
│ 1   │ 996              │ -0.441003 │ -0.43984    │
│ 2   │ 1847             │ -0.502956 │ -0.46739    │
│ 3   │ 2935             │ 0.351857  │ 0.354962    │
│ 4   │ 3016             │ -0.414902 │ -0.419244   │
│ 5   │ 3379             │ 0.207074  │ 0.188294    │
│ 6   │ 3577             │ 0.0125818 │ 0.0         │
│ 7   │ 3984             │ 0.0480669 │ 0.0         │
│ 8   │ 8060             │ 0.147444  │ 0.146366    │
│ 9   │ 8211             │ 0.215924  │ 0.204016    │
│ 10  │ 8999             │ -0.26669  │ -0.277494   │
Total iteration number was 84
Total time was 19.836469888687134
median of y = 1.0
maximum of y = 20.0
minimum of y = 0.0
skewness of y = 2.9606381620155013
kurtosis of y = 15.91986963097509
mean of y = 1.4972103708565803
variance of 

InterruptException: InterruptException: