# Run normal, logistic, and Poisson IHT on the same model with freshly simulated data, many times

In [18]:
using IHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using StatsFuns: logistic
using DelimitedFiles

# Wrapper functions for running code a bunch of times 

In [2]:
"""
    normal_repeat(repeats, x, xbm, true_b, noise, position)

Simulate `repeats` Gaussian phenotype vectors from the same design and true
model, fit each one with `L0_reg`, and collect the estimates found at the
true-model positions.

# Arguments
- `repeats`: number of simulated data sets to fit.
- `x`: genotype matrix handed to `L0_reg`.
- `xbm`: matrix used to form the noiseless mean `xbm * true_b`
  (presumably built from the same `x` -- confirm at the call site).
- `true_b`: true coefficient vector.
- `noise`: second parameter of the `Normal(0, noise)` error distribution.
- `position`: indices of the true model; its length is also used as the
  sparsity level `k` passed to `L0_reg`.

# Returns
A `length(position) × repeats` matrix whose `i`-th column is
`result.beta[position]` from the `i`-th fit.
"""
function normal_repeat(
    repeats  :: Int,
    x        :: SnpArray,
    xbm      :: SnpBitMatrix,
    true_b   :: Vector{Float64},
    noise    :: Float64,
    position :: Vector{Int} #correct position of the true model
)
    n = size(xbm, 1)
    k = length(position)
    estimated_β = zeros(k, repeats)

    # The noiseless mean Xb and the error distribution are identical for every
    # repeat, so build them once instead of redoing the matrix-vector product.
    μ = xbm * true_b
    error_dist = Normal(0, noise)

    for i in 1:repeats
        # simulate phenotypes via y = Xb + ϵ, drawing fresh noise each repeat
        y = μ + rand(error_dist, n)

        # intercept-only covariate matrix; rebuilt per fit so each call to
        # L0_reg receives its own copy
        z = ones(n, 1)
        result = L0_reg(x, z, y, 1, k, debias=false)

        # keep only the estimates at the true-model positions
        estimated_β[:, i] .= result.beta[position]
    end

    return estimated_β
end

normal_repeat (generic function with 1 method)

In [3]:
"""
    logistic_repeat(repeats, x, xbm, true_b, noise, position)

Simulate `repeats` binary phenotype vectors from the same design and true
model, fit each one with `L0_logistic_reg`, and collect the estimates found at
the true-model positions.

# Arguments
- `repeats`: number of simulated data sets to fit.
- `x`: genotype matrix handed to `L0_logistic_reg`.
- `xbm`: matrix used to form the linear predictor `xbm * true_b`
  (presumably built from the same `x` -- confirm at the call site).
- `true_b`: true coefficient vector.
- `noise`: not used by this function; kept so the signature matches
  `normal_repeat`.
- `position`: indices of the true model; its length is also used as the
  sparsity level `k` passed to the solver.

# Returns
A `length(position) × repeats` matrix whose `i`-th column is
`result.beta[position]` from the `i`-th fit.
"""
function logistic_repeat(
    repeats  :: Int,
    x        :: SnpArray,
    xbm      :: SnpBitMatrix,
    true_b   :: Vector{Float64},
    noise    :: Float64,
    position :: Vector{Int} #correct position of the true model
)
    n = size(xbm, 1)
    k = length(position)
    estimated_β = zeros(k, repeats)

    # Success probabilities (inverse logit of Xb) do not change between
    # repeats, so compute them once outside the loop.
    prob = logistic.(xbm * true_b)

    for i in 1:repeats
        # draw one Bernoulli outcome per sample and store it as Float64
        y = Float64[rand(Bernoulli(p)) for p in prob]

        # intercept-only covariate matrix; rebuilt per fit so each call to the
        # solver receives its own copy
        z = ones(n, 1)
        # NOTE(review): debias=true here, whereas normal_repeat and
        # poisson_repeat pass debias=false -- confirm this is intended.
        result = L0_logistic_reg(x, z, y, 1, k, glm = "logistic", debias=true, show_info=false)

        # keep only the estimates at the true-model positions
        estimated_β[:, i] .= result.beta[position]
    end

    return estimated_β
end

logistic_repeat (generic function with 1 method)

In [14]:
"""
    poisson_repeat(repeats, x, xbm, true_b, noise, position)

Simulate `repeats` count phenotype vectors from the same design and true model,
fit each one with `L0_poisson_reg`, and collect the estimates found at the
true-model positions.

# Arguments
- `repeats`: number of simulated data sets to fit.
- `x`: genotype matrix handed to `L0_poisson_reg`.
- `xbm`: matrix used to form the linear predictor `xbm * true_b`
  (presumably built from the same `x` -- confirm at the call site).
- `true_b`: true coefficient vector.
- `noise`: not used by this function; kept so the signature matches
  `normal_repeat`.
- `position`: indices of the true model; its length is also used as the
  sparsity level `k` passed to the solver.

# Returns
A `length(position) × repeats` matrix whose `i`-th column is
`result.beta[position]` from the `i`-th fit.
"""
function poisson_repeat(
    repeats  :: Int,
    x        :: SnpArray,
    xbm      :: SnpBitMatrix,
    true_b   :: Vector{Float64},
    noise    :: Float64,
    position :: Vector{Int} #correct position of the true model
)
    n = size(xbm, 1)
    k = length(position)
    estimated_β = zeros(k, repeats)

    # Poisson means (inverse log link of Xb) do not change between repeats,
    # so compute them once outside the loop.
    λ = exp.(xbm * true_b)

    for i in 1:repeats
        # draw one Poisson count per sample and store it as Float64
        y = Float64[rand(Poisson(rate)) for rate in λ]

        # intercept-only covariate matrix; rebuilt per fit so each call to the
        # solver receives its own copy
        z = ones(n, 1)
        result = L0_poisson_reg(x, z, y, 1, k, glm = "poisson", debias=false, convg=false, show_info=false)

        # keep only the estimates at the true-model positions
        estimated_β[:, i] .= result.beta[position]
    end

    return estimated_β
end

poisson_repeat (generic function with 1 method)

# Simulate data

In [8]:
# Seed the global RNG so the simulated data below are reproducible
Random.seed!(123)

# Simulate data: n samples and p SNPs
n = 2000
p = 10000

# minor allele frequencies are drawn from uniform (0, 0.5)
bernoulli_rates = 0.5 .* rand(p)
x = simulate_random_snparray(n, p, bernoulli_rates)
xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true);

# True model size and noise level of the data
k = 10   # number of true predictors
s = 0.1  # noise

# Build the true model b: k standard-normal effects, all other entries zero,
# then shuffle so the nonzero entries land at random positions
true_b = zeros(p)
true_b[1:k] .= randn(k)
shuffle!(true_b)

# Record which entries are nonzero, then display their values
correct_position = findall(!iszero, true_b)
true_b[correct_position]

10-element Array{Float64,1}:
 -1.032632104380422  
 -0.2239554516566982 
  0.07273374165749658
 -0.9191484037333554 
  0.45122947328314994
  0.7111366326563213 
  0.2641836722931301 
 -0.0699173021489764 
 -0.4464860534793655 
 -0.6418130951634765 

# Perform simulation a bunch of times

In [5]:
# number of simulated data sets to fit
repeats = 10
# normal_repeat takes both the SnpArray `x` (used for fitting) and the
# SnpBitMatrix `xbm` (used for simulating y); omitting `x` is a MethodError
normal_result = normal_repeat(repeats, x, xbm, true_b, s, correct_position)

10×10 Array{Float64,2}:
 -1.03097    -1.03614    -1.02993    …  -1.0359     -1.03192    -1.03389  
 -0.224721   -0.223331   -0.22911       -0.223697   -0.22428    -0.221284 
  0.0710394   0.0750143   0.0722183      0.0750609   0.0715057   0.0719568
 -0.918373   -0.920954   -0.918275      -0.921586   -0.919323   -0.92075  
  0.452428    0.451672    0.448629       0.455558    0.446528    0.453115 
  0.713601    0.710315    0.708198   …   0.710077    0.711536    0.714367 
  0.262455    0.264357    0.268168       0.26665     0.261241    0.263857 
 -0.0659158  -0.0706653  -0.0705811     -0.0713897  -0.0692596  -0.0712174
 -0.449508   -0.448497   -0.446629      -0.443119   -0.448306   -0.444949 
 -0.642566   -0.641768   -0.644232      -0.641812   -0.643826   -0.643875 

In [16]:
# number of simulated data sets to fit
repeats = 10
# logistic_repeat takes both the SnpArray `x` (used for fitting) and the
# SnpBitMatrix `xbm` (used for simulating y); omitting `x` is a MethodError
logistic_result = logistic_repeat(repeats, x, xbm, true_b, s, correct_position)

10×10 Array{Float64,2}:
 -1.05325   -0.946425  -1.042     …  -0.951251  -1.09671   -1.05655 
  0.0       -0.369396   0.0           0.0       -0.205502   0.0     
  0.2336     0.0        0.0           0.0        0.0        0.0     
 -0.993427  -0.937221  -0.880625     -0.987428  -0.946767  -0.895962
  0.540484   0.52665    0.464331      0.463708   0.382106   0.538334
  0.731927   0.685709   0.619302  …   0.796292   0.840397   0.69273 
  0.272807   0.306217   0.336967      0.300038   0.356199   0.304941
  0.0        0.0        0.0           0.0        0.0        0.0     
 -0.407213  -0.437987  -0.468822     -0.488452  -0.486859  -0.465732
 -0.560326  -0.581322  -0.681331     -0.728032  -0.713123  -0.631274

In [15]:
# number of simulated data sets to fit
repeats = 10
# poisson_repeat takes both the SnpArray `x` (used for fitting) and the
# SnpBitMatrix `xbm` (used for simulating y); omitting `x` is a MethodError
poisson_result = poisson_repeat(repeats, x, xbm, true_b, s, correct_position)

10×10 Array{Float64,2}:
 -1.01311    -1.03886    -1.0194     …  -1.04351    -1.02035    -1.025    
  0.0         0.0         0.0            0.0         0.0         0.0      
  0.0752493   0.0613097   0.076923       0.0684839   0.0683997   0.0774494
 -0.921249   -0.928067   -0.941155      -0.927725   -0.90217    -0.916345 
  0.43986     0.455245    0.449927       0.447075    0.449652    0.450708 
  0.708496    0.715644    0.704064   …   0.725813    0.70221     0.722058 
  0.266938    0.269339    0.295152       0.25666     0.266401    0.250103 
 -0.0557379  -0.0665173  -0.0715434     -0.0786794  -0.081211   -0.0802913
 -0.433837   -0.443607   -0.460737      -0.441779   -0.463928   -0.464449 
 -0.617545   -0.642085   -0.635184      -0.648386   -0.650688   -0.640051 

In [20]:
#writedlm("normal_result", normal_result)
#writedlm("logistic_result", logistic_result)
#writedlm("poisson_result", poisson_result)