# Prior Weight IHT

In [2]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using GLM
using Statistics

In [4]:
function compare(n, p, k, d, l)
    # construct snpmatrix, covariate files, and true model b
    x = simulate_random_snparray(n, p, "tmp.bed")
    X = convert(Matrix{Float64}, x, center=true, scale=true)
    z = ones(n, 1) # the intercept
    
    #define true_b 
    true_b = zeros(p)
    true_b[1:10] .= collect(0.1:0.1:1.0)
    shuffle!(true_b)
    correct_position = findall(!iszero, true_b)

    #simulate phenotypes (e.g. vector y)
    if d == Normal || d == Poisson || d == Bernoulli
        prob = linkinv.(l, X * true_b)
        clamp!(prob, -20, 20)
        y = [rand(d(i)) for i in prob]
    elseif d == NegativeBinomial
        r = 10
        μ = GLM.linkinv.(l, X * true_b)
        clamp!(μ, -20, 20)
        prob = 1 ./ (1 .+ μ ./ r)
        y = [rand(d(r, i)) for i in prob] #number of failtures before r success occurs
    end
    y = Float64.(y)
    
    # construct weight vector
    w = ones(p + 1)
    w[correct_position] .= 2.0
    one_tenth = round(Int, p/10)
    idx = rand(1:p, one_tenth)
    w[idx] .= 2.0 #randomly set ~1/10 of all predictors to 2

    #run IHT
    unweighted = L0_reg(X, z, y, 1, k, d(), l, verbose=false)
    weighted   = L0_reg(X, z, y, 1, k, d(), l, verbose=false, weight=w)

    #check result
    compare_model = DataFrame(
        position    = correct_position,
        correct     = true_b[correct_position],
        unweighted  = unweighted.beta[correct_position], 
        weighted    = weighted.beta[correct_position])
    @show compare_model
    println("\n")

    #clean up
    rm("tmp.bed", force=true)
    
    unweighted_tp = count(!iszero, unweighted.beta[correct_position])
    unweighted_fp = 10 - unweighted_tp
    weightedp_tp = count(!iszero, weighted.beta[correct_position])
    weighted_fp = 10 - weightedp_tp
    
    return unweighted_tp, unweighted_fp, weightedp_tp, weighted_fp
end

compare (generic function with 1 method)

## Normal

In [7]:
Random.seed!(2019)

d = Normal
l = canonicallink(d())
n = 1000
p = 10000
k = 10

num_runs = 100
ungroup_true_positives = zeros(num_runs)
ungroup_false_positives = zeros(num_runs)
variable_true_positives = zeros(num_runs)
variable_false_positives = zeros(num_runs)

@time for i in 1:num_runs
    utp, ufp, wtp, wfp = compare(n, p, k, d, l)
    ungroup_true_positives[i] = utp
    ungroup_false_positives[i] = ufp
    variable_true_positives[i] = wtp
    variable_false_positives[i] = wfp
end

compare_model = 10×4 DataFrame
│ Row │ position │ correct │ unweighted │ weighted │
│     │ Int64    │ Float64 │ Float64    │ Float64  │
├─────┼──────────┼─────────┼────────────┼──────────┤
│ 1   │ 844      │ 0.3     │ 0.275855   │ 0.275863 │
│ 2   │ 868      │ 0.1     │ 0.15148    │ 0.151472 │
│ 3   │ 915      │ 0.5     │ 0.50666    │ 0.506665 │
│ 4   │ 2692     │ 0.7     │ 0.680781   │ 0.680771 │
│ 5   │ 4234     │ 0.4     │ 0.384727   │ 0.38472  │
│ 6   │ 4775     │ 1.0     │ 1.05168    │ 1.05168  │
│ 7   │ 5086     │ 0.6     │ 0.608062   │ 0.608054 │
│ 8   │ 5276     │ 0.8     │ 0.801632   │ 0.801627 │
│ 9   │ 7747     │ 0.9     │ 0.899538   │ 0.899544 │
│ 10  │ 8248     │ 0.2     │ 0.192054   │ 0.192059 │


compare_model = 10×4 DataFrame
│ Row │ position │ correct │ unweighted │ weighted │
│     │ Int64    │ Float64 │ Float64    │ Float64  │
├─────┼──────────┼─────────┼────────────┼──────────┤
│ 1   │ 78       │ 0.6     │ 0.599909   │ 0.606404 │
│ 2   │ 551      │ 0.9     │ 0.9488

In [8]:
@show mean(ungroup_true_positives)
@show mean(ungroup_false_positives)
@show mean(variable_true_positives)
@show mean(variable_false_positives)

@show std(ungroup_true_positives)
@show std(ungroup_false_positives)
@show std(variable_true_positives)
@show std(variable_false_positives)

mean(ungroup_true_positives) = 9.21
mean(ungroup_false_positives) = 0.79
mean(variable_true_positives) = 9.43
mean(variable_false_positives) = 0.57
std(ungroup_true_positives) = 0.4333333333333333
std(ungroup_false_positives) = 0.4333333333333334
std(variable_true_positives) = 0.5174724898753342
std(variable_false_positives) = 0.5174724898753341


0.5174724898753341

## Logistic

In [9]:
Random.seed!(2019)

d = Bernoulli
l = canonicallink(d())
n = 1000
p = 10000
k = 10

num_runs = 100
ungroup_true_positives = zeros(num_runs)
ungroup_false_positives = zeros(num_runs)
variable_true_positives = zeros(num_runs)
variable_false_positives = zeros(num_runs)

@time for i in 1:num_runs
    utp, ufp, wtp, wfp = compare(n, p, k, d, l)
    ungroup_true_positives[i] = utp
    ungroup_false_positives[i] = ufp
    variable_true_positives[i] = wtp
    variable_false_positives[i] = wfp
end

compare_model = 10×4 DataFrame
│ Row │ position │ correct │ unweighted │ weighted │
│     │ Int64    │ Float64 │ Float64    │ Float64  │
├─────┼──────────┼─────────┼────────────┼──────────┤
│ 1   │ 844      │ 0.3     │ 0.0        │ 0.238193 │
│ 2   │ 868      │ 0.1     │ 0.0        │ 0.0      │
│ 3   │ 915      │ 0.5     │ 0.477457   │ 0.478861 │
│ 4   │ 2692     │ 0.7     │ 0.693755   │ 0.705615 │
│ 5   │ 4234     │ 0.4     │ 0.494545   │ 0.520017 │
│ 6   │ 4775     │ 1.0     │ 1.06651    │ 1.05308  │
│ 7   │ 5086     │ 0.6     │ 0.545771   │ 0.55663  │
│ 8   │ 5276     │ 0.8     │ 0.811478   │ 0.779552 │
│ 9   │ 7747     │ 0.9     │ 0.938243   │ 0.926799 │
│ 10  │ 8248     │ 0.2     │ 0.0        │ 0.0      │


compare_model = 10×4 DataFrame
│ Row │ position │ correct │ unweighted │ weighted │
│     │ Int64    │ Float64 │ Float64    │ Float64  │
├─────┼──────────┼─────────┼────────────┼──────────┤
│ 1   │ 455      │ 0.6     │ 0.637819   │ 0.617106 │
│ 2   │ 1260     │ 0.8     │ 0.8239

In [10]:
@show mean(ungroup_true_positives)
@show mean(ungroup_false_positives)
@show mean(variable_true_positives)
@show mean(variable_false_positives)

@show std(ungroup_true_positives)
@show std(ungroup_false_positives)
@show std(variable_true_positives)
@show std(variable_false_positives)

mean(ungroup_true_positives) = 7.31
mean(ungroup_false_positives) = 2.69
mean(variable_true_positives) = 7.96
mean(variable_false_positives) = 2.04
std(ungroup_true_positives) = 0.646591685749831
std(ungroup_false_positives) = 0.6465916857498308
std(variable_true_positives) = 0.6343691688790072
std(variable_false_positives) = 0.6343691688790072


0.6343691688790072

## Poisson

In [11]:
Random.seed!(2019)

d = Poisson
l = canonicallink(d())
n = 1000
p = 10000
k = 10

num_runs = 100
ungroup_true_positives = zeros(num_runs)
ungroup_false_positives = zeros(num_runs)
variable_true_positives = zeros(num_runs)
variable_false_positives = zeros(num_runs)

@time for i in 1:num_runs
    utp, ufp, wtp, wfp = compare(n, p, k, d, l)
    ungroup_true_positives[i] = utp
    ungroup_false_positives[i] = ufp
    variable_true_positives[i] = wtp
    variable_false_positives[i] = wfp
end

compare_model = 10×4 DataFrame
│ Row │ position │ correct │ unweighted │ weighted │
│     │ Int64    │ Float64 │ Float64    │ Float64  │
├─────┼──────────┼─────────┼────────────┼──────────┤
│ 1   │ 844      │ 0.3     │ 0.227018   │ 0.226985 │
│ 2   │ 868      │ 0.1     │ 0.0        │ 0.0      │
│ 3   │ 915      │ 0.5     │ 0.334441   │ 0.334396 │
│ 4   │ 2692     │ 0.7     │ 0.506295   │ 0.506207 │
│ 5   │ 4234     │ 0.4     │ 0.299686   │ 0.299648 │
│ 6   │ 4775     │ 1.0     │ 0.67595    │ 0.675845 │
│ 7   │ 5086     │ 0.6     │ 0.387774   │ 0.387709 │
│ 8   │ 5276     │ 0.8     │ 0.554518   │ 0.55445  │
│ 9   │ 7747     │ 0.9     │ 0.598139   │ 0.598063 │
│ 10  │ 8248     │ 0.2     │ 0.156778   │ 0.156755 │


compare_model = 10×4 DataFrame
│ Row │ position │ correct │ unweighted │ weighted │
│     │ Int64    │ Float64 │ Float64    │ Float64  │
├─────┼──────────┼─────────┼────────────┼──────────┤
│ 1   │ 378      │ 0.3     │ 0.190027   │ 0.190051 │
│ 2   │ 467      │ 0.9     │ 0.4338

In [12]:
@show mean(ungroup_true_positives)
@show mean(ungroup_false_positives)
@show mean(variable_true_positives)
@show mean(variable_false_positives)

@show std(ungroup_true_positives)
@show std(ungroup_false_positives)
@show std(variable_true_positives)
@show std(variable_false_positives)

mean(ungroup_true_positives) = 8.0
mean(ungroup_false_positives) = 2.0
mean(variable_true_positives) = 8.29
mean(variable_false_positives) = 1.71
std(ungroup_true_positives) = 0.5860327153276884
std(ungroup_false_positives) = 0.5860327153276884
std(variable_true_positives) = 0.6243380332784314
std(variable_false_positives) = 0.6243380332784314


0.6243380332784314

## Negative Binomial

In [13]:
Random.seed!(2019)

d = NegativeBinomial
l = LogLink()
n = 1000
p = 10000
k = 10

num_runs = 100
ungroup_true_positives = zeros(num_runs)
ungroup_false_positives = zeros(num_runs)
variable_true_positives = zeros(num_runs)
variable_false_positives = zeros(num_runs)

@time for i in 1:num_runs
    utp, ufp, wtp, wfp = compare(n, p, k, d, l)
    ungroup_true_positives[i] = utp
    ungroup_false_positives[i] = ufp
    variable_true_positives[i] = wtp
    variable_false_positives[i] = wfp
end

compare_model = 10×4 DataFrame
│ Row │ position │ correct │ unweighted │ weighted │
│     │ Int64    │ Float64 │ Float64    │ Float64  │
├─────┼──────────┼─────────┼────────────┼──────────┤
│ 1   │ 844      │ 0.3     │ 0.293356   │ 0.295085 │
│ 2   │ 868      │ 0.1     │ 0.0        │ 0.109169 │
│ 3   │ 915      │ 0.5     │ 0.421309   │ 0.42044  │
│ 4   │ 2692     │ 0.7     │ 0.626021   │ 0.629411 │
│ 5   │ 4234     │ 0.4     │ 0.368917   │ 0.365827 │
│ 6   │ 4775     │ 1.0     │ 0.931231   │ 0.927338 │
│ 7   │ 5086     │ 0.6     │ 0.504519   │ 0.514288 │
│ 8   │ 5276     │ 0.8     │ 0.682545   │ 0.68146  │
│ 9   │ 7747     │ 0.9     │ 0.854359   │ 0.862193 │
│ 10  │ 8248     │ 0.2     │ 0.159843   │ 0.162776 │


compare_model = 10×4 DataFrame
│ Row │ position │ correct │ unweighted │ weighted │
│     │ Int64    │ Float64 │ Float64    │ Float64  │
├─────┼──────────┼─────────┼────────────┼──────────┤
│ 1   │ 192      │ 0.9     │ 0.792279   │ 0.789752 │
│ 2   │ 2245     │ 0.7     │ 0.6602

In [14]:
@show mean(ungroup_true_positives)
@show mean(ungroup_false_positives)
@show mean(variable_true_positives)
@show mean(variable_false_positives)

@show std(ungroup_true_positives)
@show std(ungroup_false_positives)
@show std(variable_true_positives)
@show std(variable_false_positives)

mean(ungroup_true_positives) = 9.18
mean(ungroup_false_positives) = 0.82
mean(variable_true_positives) = 9.39
mean(variable_false_positives) = 0.61
std(ungroup_true_positives) = 0.4579268169663901
std(ungroup_false_positives) = 0.45792681696639
std(variable_true_positives) = 0.5103969538555756
std(variable_false_positives) = 0.5103969538555755


0.5103969538555755