# Doubly sparse projection for rare variant discovery

In [1]:
using Revise
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using GLM

In [63]:
"""
    compare(d::UnionAll, l::Link)

Simulate one data set and compare ungrouped IHT against group (doubly sparse) IHT.

The simulation uses `n = 1000` samples and `p = 10000` SNPs partitioned into
consecutive blocks (groups) of 20 SNPs. Five groups are chosen at random and
three SNPs inside each of them receive an effect of `±0.2`, giving 15 causal
SNPs in total. A phenotype is then drawn from distribution `d` with link `l`,
and `L0_reg` is run twice: once with a single group and sparsity 15, and once
with `J = 5` groups and `k = 3` SNPs per group using the block membership.

# Arguments
- `d`: distribution type (not an instance); one of `Normal`, `Bernoulli`,
  `Poisson` or `NegativeBinomial`.
- `l`: link function instance passed to `linkinv` and `L0_reg`.

# Returns
A tuple `(group, ungroup)`: the number of causal SNPs with a non-zero estimate
in the grouped and in the ungrouped fit, respectively.

# Throws
- `ArgumentError` if `d` is not one of the supported distributions.

# Side effects
Prints a per-SNP comparison table and the iteration counts. The simulated
genotypes are backed by the file `tmp.bed` in the working directory, which is
removed before returning, even when an error occurs.
"""
function compare(d::UnionAll, l::Link)
    # Fail fast, before the expensive genotype simulation, instead of hitting an
    # undefined `y` further down.
    d in (Normal, Bernoulli, Poisson, NegativeBinomial) ||
        throw(ArgumentError("unsupported distribution $d; expected Normal, " *
                            "Bernoulli, Poisson, or NegativeBinomial"))

    # define problem size
    n = 1000
    p = 10000
    block_size = 20
    num_blocks = p ÷ block_size

    # size of the true model: 5 causal groups with 3 causal SNPs each
    num_true_groups = 5
    snps_per_group = 3
    num_true_snps = num_true_groups * snps_per_group

    # assign group membership: SNP j belongs to block ceil(j / block_size).
    # The extra (p + 1)-th entry is for the intercept column `z`, which is put
    # in the last group.
    g = zeros(Int64, p + 1)
    for i in 1:num_blocks
        g[(block_size * (i - 1) + 1):(block_size * i)] .= i
    end
    g[end] = num_blocks

    bedfile = "tmp.bed"
    try
        # simulate correlated snparray
        x = simulate_correlated_snparray(n, p, bedfile)
        z = ones(n, 1) # the intercept
        x_float = convert(Matrix{Float64}, x, model=ADDITIVE_MODEL, center=true, scale=true)

        # simulate true model, where 5 groups each with 3 snps contribute
        true_b = zeros(p)
        true_groups = randperm(num_blocks)[1:num_true_groups]
        # snps_per_group × num_true_groups matrix; column i holds the
        # within-block offsets of the causal SNPs of the i-th causal group
        within_group = reduce(hcat,
            [randperm(block_size)[1:snps_per_group] for _ in 1:num_true_groups])
        correct_position = zeros(Int64, num_true_snps)
        for i in 1:num_true_groups
            block_start = block_size * (true_groups[i] - 1)
            rows = (snps_per_group * (i - 1) + 1):(snps_per_group * i)
            correct_position[rows] .= block_start .+ within_group[:, i]
        end
        for pos in correct_position
            true_b[pos] = rand(-1:2:1) * 0.2 # effect is -0.2 or +0.2
        end
        sort!(correct_position)

        # simulate phenotype
        if d == NegativeBinomial
            nn = 10
            μ = linkinv.(l, x_float * true_b)
            clamp!(μ, -20, 20)
            prob = 1 ./ (1 .+ μ ./ nn)
            y = [rand(d(nn, i)) for i in prob] # number of failures before nn successes occur
        else
            # Normal, Bernoulli or Poisson: one parameter given by the mean
            prob = linkinv.(l, x_float * true_b)
            clamp!(prob, -20, 20)
            y = [rand(d(i)) for i in prob]
        end
        y = Float64.(y)

        # run IHT without groups: one group, all causal SNPs allowed in it
        ungrouped = L0_reg(x_float, z, y, 1, num_true_snps, d(), l, debias=false)

        # run IHT with groups: J groups, k SNPs within each group
        J = num_true_groups
        k = snps_per_group
        grouped = L0_reg(x_float, z, y, J, k, d(), l, debias=false, group=g)

        # check result
        compare_model = DataFrame(
            position   = correct_position,
            correct    = true_b[correct_position],
            ungrouped  = ungrouped.beta[correct_position],
            grouped    = grouped.beta[correct_position])
        @show compare_model
        println("Grouped ran: ", grouped.iter, " iters and Ungrouped ran: ", ungrouped.iter, " iters")
        println("\n")

        # number of causal SNPs recovered (non-zero estimate) by each method
        group = count(!iszero, grouped.beta[correct_position])
        ungroup = count(!iszero, ungrouped.beta[correct_position])
        return group, ungroup
    finally
        # clean up, also when the simulation or the fit throws
        rm(bedfile, force=true)
    end
end

compare (generic function with 1 method)

# Normal

In [64]:
# Gaussian benchmark: repeat the grouped-vs-ungrouped comparison `num_runs`
# times with a fixed seed and summarize how many of the 15 causal SNPs per run
# each method recovers.
Random.seed!(2019)
d = Normal
l = canonicallink(d())

num_runs = 100
group_found = zeros(num_runs)
ungroup_found = zeros(num_runs)

for run in 1:num_runs
    println("running $run th model")
    # compare returns (grouped count, ungrouped count) for one simulated data set
    group_found[run], ungroup_found[run] = compare(d, l)
end

# 15 causal SNPs are simulated per run, so 15num_runs is the total number of
# SNPs that could have been found.
# NOTE(review): the last two lines print the sample standard deviation of the
# per-run counts, although the label says "standard errors" - confirm intent.
let total = 15num_runs, g_sum = sum(group_found), u_sum = sum(ungroup_found)
    println("Group IHT found: ", g_sum, "/", total, " variants", ", percentage = ", g_sum / total)
    println("Ungrouped IHT found: ", u_sum, "/", total, " variants" , ", percentage = ", u_sum / total)
    println("Grouped common variant standard errors = ", std(group_found), ", percentage = ", std(group_found) / 15)
    println("Ungrouped common variant standard errors = ", std(ungroup_found), ", percentage = ", std(ungroup_found) / 15)
end

running 1 th model
compare_model = 15×4 DataFrame
│ Row │ position │ correct │ ungrouped │ grouped   │
│     │ Int64    │ Float64 │ Float64   │ Float64   │
├─────┼──────────┼─────────┼───────────┼───────────┤
│ 1   │ 3464     │ -0.2    │ -0.148166 │ -0.159439 │
│ 2   │ 3466     │ -0.2    │ -0.183184 │ -0.173504 │
│ 3   │ 3468     │ 0.2     │ 0.196883  │ 0.189621  │
│ 4   │ 4924     │ -0.2    │ -0.221287 │ -0.26612  │
│ 5   │ 4930     │ 0.2     │ 0.210063  │ 0.211576  │
│ 6   │ 4938     │ 0.2     │ 0.0       │ 0.168352  │
│ 7   │ 5004     │ 0.2     │ 0.21265   │ 0.21483   │
│ 8   │ 5016     │ -0.2    │ -0.177054 │ -0.179432 │
│ 9   │ 5020     │ 0.2     │ 0.198026  │ 0.196929  │
│ 10  │ 5081     │ -0.2    │ -0.177789 │ -0.177659 │
│ 11  │ 5091     │ -0.2    │ 0.0       │ -0.102868 │
│ 12  │ 5098     │ 0.2     │ 0.0       │ 0.0       │
│ 13  │ 7003     │ -0.2    │ 0.0       │ -0.202866 │
│ 14  │ 7007     │ -0.2    │ 0.0       │ -0.188353 │
│ 15  │ 7018     │ -0.2    │ -0.228586 │ -0.23565

# Logistic

In [65]:
# Logistic benchmark: repeat the grouped-vs-ungrouped comparison `num_runs`
# times with a fixed seed and summarize how many of the 15 causal SNPs per run
# each method recovers.
Random.seed!(2019)
d = Bernoulli
l = canonicallink(d())

num_runs = 100
group_found = zeros(num_runs)
ungroup_found = zeros(num_runs)

for i in 1:num_runs
    println("running $i th model")
    # compare returns (grouped count, ungrouped count) for one simulated data set
    gf, uf = compare(d, l)
    group_found[i] = gf
    ungroup_found[i] = uf
end

# 15 causal SNPs are simulated per run, so 15num_runs is the total number of
# SNPs that could have been found. The ", " separator keeps the value and the
# "percentage" label from running together in the output.
# NOTE(review): the last two lines print the sample standard deviation of the
# per-run counts, although the label says "standard errors" - confirm intent.
println("Group IHT found: ", sum(group_found), "/", 15num_runs, " variants", ", percentage = ", sum(group_found)/15num_runs)
println("Ungrouped IHT found: ", sum(ungroup_found), "/", 15num_runs, " variants" , ", percentage = ", sum(ungroup_found)/15num_runs)
println("Grouped common variant standard errors = ", std(group_found), ", percentage = ", std(group_found)/15)
println("Ungrouped common variant standard errors = ", std(ungroup_found), ", percentage = ", std(ungroup_found)/15)

running 1 th model
compare_model = 15×4 DataFrame
│ Row │ position │ correct │ ungrouped │ grouped    │
│     │ Int64    │ Float64 │ Float64   │ Float64    │
├─────┼──────────┼─────────┼───────────┼────────────┤
│ 1   │ 3464     │ -0.2    │ 0.0       │ 0.0        │
│ 2   │ 3466     │ -0.2    │ -0.269934 │ 0.0        │
│ 3   │ 3468     │ 0.2     │ 0.0       │ 0.0        │
│ 4   │ 4924     │ -0.2    │ 0.0       │ -0.0855497 │
│ 5   │ 4930     │ 0.2     │ 0.286553  │ 0.354747   │
│ 6   │ 4938     │ 0.2     │ 0.405353  │ 0.402313   │
│ 7   │ 5004     │ 0.2     │ 0.28868   │ 0.266985   │
│ 8   │ 5016     │ -0.2    │ 0.0       │ 0.0        │
│ 9   │ 5020     │ 0.2     │ 0.237608  │ 0.155275   │
│ 10  │ 5081     │ -0.2    │ 0.0       │ 0.0        │
│ 11  │ 5091     │ -0.2    │ 0.0       │ 0.0        │
│ 12  │ 5098     │ 0.2     │ 0.0       │ 0.0        │
│ 13  │ 7003     │ -0.2    │ 0.0       │ 0.0        │
│ 14  │ 7007     │ -0.2    │ 0.0       │ 0.0        │
│ 15  │ 7018     │ -0.2    │ -0.

# Poisson

In [66]:
# Poisson benchmark: repeat the grouped-vs-ungrouped comparison `num_runs`
# times with a fixed seed and summarize how many of the 15 causal SNPs per run
# each method recovers.
Random.seed!(2019)
d = Poisson
l = canonicallink(d())

num_runs = 100
group_found = zeros(num_runs)
ungroup_found = zeros(num_runs)

for i in 1:num_runs
    println("running $i th model")
    # compare returns (grouped count, ungrouped count) for one simulated data set
    gf, uf = compare(d, l)
    group_found[i] = gf
    ungroup_found[i] = uf
end

# 15 causal SNPs are simulated per run, so 15num_runs is the total number of
# SNPs that could have been found. The ", " separator keeps the value and the
# "percentage" label from running together in the output.
# NOTE(review): the last two lines print the sample standard deviation of the
# per-run counts, although the label says "standard errors" - confirm intent.
println("Group IHT found: ", sum(group_found), "/", 15num_runs, " variants", ", percentage = ", sum(group_found)/15num_runs)
println("Ungrouped IHT found: ", sum(ungroup_found), "/", 15num_runs, " variants" , ", percentage = ", sum(ungroup_found)/15num_runs)
println("Grouped common variant standard errors = ", std(group_found), ", percentage = ", std(group_found)/15)
println("Ungrouped common variant standard errors = ", std(ungroup_found), ", percentage = ", std(ungroup_found)/15)

running 1 th model
compare_model = 15×4 DataFrame
│ Row │ position │ correct │ ungrouped │ grouped   │
│     │ Int64    │ Float64 │ Float64   │ Float64   │
├─────┼──────────┼─────────┼───────────┼───────────┤
│ 1   │ 3464     │ -0.2    │ -0.180238 │ -0.196032 │
│ 2   │ 3466     │ -0.2    │ -0.211899 │ -0.203317 │
│ 3   │ 3468     │ 0.2     │ 0.196402  │ 0.187232  │
│ 4   │ 4924     │ -0.2    │ -0.235714 │ -0.24824  │
│ 5   │ 4930     │ 0.2     │ 0.0       │ 0.143477  │
│ 6   │ 4938     │ 0.2     │ 0.0       │ 0.0       │
│ 7   │ 5004     │ 0.2     │ 0.200452  │ 0.197379  │
│ 8   │ 5016     │ -0.2    │ -0.186234 │ -0.195465 │
│ 9   │ 5020     │ 0.2     │ 0.188199  │ 0.185596  │
│ 10  │ 5081     │ -0.2    │ -0.234073 │ -0.149247 │
│ 11  │ 5091     │ -0.2    │ -0.230353 │ -0.156976 │
│ 12  │ 5098     │ 0.2     │ 0.221201  │ 0.0       │
│ 13  │ 7003     │ -0.2    │ -0.161933 │ -0.167107 │
│ 14  │ 7007     │ -0.2    │ -0.193825 │ -0.196326 │
│ 15  │ 7018     │ -0.2    │ -0.121225 │ -0.12747

# Negative Binomial

In [67]:
# Negative binomial benchmark: repeat the grouped-vs-ungrouped comparison
# `num_runs` times with a fixed seed and summarize how many of the 15 causal
# SNPs per run each method recovers. The log link is set explicitly here rather
# than taken from `canonicallink`.
Random.seed!(2019)
d = NegativeBinomial
l = LogLink()

num_runs = 100
group_found = zeros(num_runs)
ungroup_found = zeros(num_runs)

for i in 1:num_runs
    println("running $i th model")
    # compare returns (grouped count, ungrouped count) for one simulated data set
    gf, uf = compare(d, l)
    group_found[i] = gf
    ungroup_found[i] = uf
end

# 15 causal SNPs are simulated per run, so 15num_runs is the total number of
# SNPs that could have been found. The ", " separator keeps the value and the
# "percentage" label from running together in the output.
# NOTE(review): the last two lines print the sample standard deviation of the
# per-run counts, although the label says "standard errors" - confirm intent.
println("Group IHT found: ", sum(group_found), "/", 15num_runs, " variants", ", percentage = ", sum(group_found)/15num_runs)
println("Ungrouped IHT found: ", sum(ungroup_found), "/", 15num_runs, " variants" , ", percentage = ", sum(ungroup_found)/15num_runs)
println("Grouped common variant standard errors = ", std(group_found), ", percentage = ", std(group_found)/15)
println("Ungrouped common variant standard errors = ", std(ungroup_found), ", percentage = ", std(ungroup_found)/15)

running 1 th model
compare_model = 15×4 DataFrame
│ Row │ position │ correct │ ungrouped │ grouped   │
│     │ Int64    │ Float64 │ Float64   │ Float64   │
├─────┼──────────┼─────────┼───────────┼───────────┤
│ 1   │ 3464     │ -0.2    │ -0.214793 │ -0.229371 │
│ 2   │ 3466     │ -0.2    │ -0.20044  │ -0.192839 │
│ 3   │ 3468     │ 0.2     │ 0.205078  │ 0.1956    │
│ 4   │ 4924     │ -0.2    │ -0.147362 │ -0.161833 │
│ 5   │ 4930     │ 0.2     │ 0.231746  │ 0.229945  │
│ 6   │ 4938     │ 0.2     │ 0.21064   │ 0.199346  │
│ 7   │ 5004     │ 0.2     │ 0.19831   │ 0.201262  │
│ 8   │ 5016     │ -0.2    │ -0.189168 │ -0.198957 │
│ 9   │ 5020     │ 0.2     │ 0.192298  │ 0.202311  │
│ 10  │ 5081     │ -0.2    │ -0.123427 │ -0.122367 │
│ 11  │ 5091     │ -0.2    │ 0.0       │ -0.109817 │
│ 12  │ 5098     │ 0.2     │ 0.0       │ 0.0       │
│ 13  │ 7003     │ -0.2    │ -0.240588 │ -0.23592  │
│ 14  │ 7007     │ -0.2    │ -0.219081 │ -0.219578 │
│ 15  │ 7018     │ -0.2    │ -0.239348 │ -0.24982