# Doubly sparse projection for rare variant discovery

In [1]:
using Revise
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using GLM

In [2]:
#simulate 5 genes, each with 5 rare variants and 1 non-rare variant
"""
    compare(d::UnionAll, l::Link)

Simulate one data set with 5 groups (genes) of SNPs, fit it with ungrouped and
grouped IHT via `L0_reg`, and report how many of the truly causal SNPs each fit
recovered.

Every gene contains 5 rare causal variants (minor allele frequency 0.005) and 1
common causal variant (minor allele frequency 0.4), all with true effect size 0.5.

# Arguments
- `d`: distribution family used to simulate the response (called as `d(μ)` to draw
  a response and as `d()` when passed to `L0_reg`).
- `l`: link function whose inverse maps the linear predictor to the mean.

# Returns
A tuple `(group_rare, group_common, ungroup_rare, ungroup_common)` holding the
number of nonzero estimated coefficients at the rare / common causal positions for
the grouped and the ungrouped fit, respectively.

# Side effects
Prints the number of cases and a table comparing true and estimated effects.
Writes the simulated genotypes to `tmp.bed` in the working directory and removes
that file again before returning, even if fitting throws.
"""
function compare(d::UnionAll, l::Link)

    # define problem size
    n = 1000
    p = 100000

    # assign group membership: 5 contiguous groups of 20000 SNPs each.
    # The vector has p + 1 entries (presumably one extra for the intercept column
    # `z` -- confirm against `L0_reg`); the extra entry lands in group 5.
    g = ones(Int64, p + 1)
    g[20001:40000] .= 2
    g[40001:60000] .= 3
    g[60001:80000] .= 4
    g[80001:end] .= 5

    # construct 5 rare variants and 1 non rare variant in each group (gene):
    # 25 rare positions (every 4000th SNP) and 5 common positions in total
    true_b = zeros(p)
    rare = collect(4000:4000:100000)
    common = collect(10000:20000:100000)
    true_b[rare] .= 0.5
    true_b[common] .= 0.5

    # construct snpmatrix, covariate files, and true model b using specific minor allele freq
    mafs = 0.5rand(p)
    clamp!(mafs, 0.05, 0.5)
    mafs[rare] .= 0.005
    mafs[common] .= 0.4

    try
        x = simulate_random_snparray(n, p, "tmp.bed", mafs)
        z = ones(n, 1) # the intercept
        xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)

        # use the SnpBitMatrix to simulate data
        prob = linkinv.(l, xbm * true_b)
        y = Float64[rand(d(μ)) for μ in prob]
        println("number of cases = " * string(sum(y)))

        # run IHT without groups: a single group holding k = 30 predictors,
        # which equals the total number of causal SNPs
        k = 30
        ungrouped = L0_reg(x, xbm, z, y, 1, k, d(), l, debias=true, init=false, use_maf=false)

        # run IHT with groups: J = 5 and k = 6 match the 5 genes with 6 causal
        # SNPs each (presumably max groups / max predictors per group -- confirm)
        J = 5
        k = 6
        grouped = L0_reg(x, xbm, z, y, J, k, d(), l, debias=true, init=false, use_maf=false, group=g)

        # check result
        correct_position = [common; rare] # 10000, 30000, 50000, 70000, 90000 are common ones, other are rares
        compare_model = DataFrame(
            position   = correct_position,
            correct    = true_b[correct_position],
            ungrouped  = ungrouped.beta[correct_position],
            grouped    = grouped.beta[correct_position])
        @show compare_model
        println("\n")

        # count the causal SNPs that each fit kept in its model
        group_rare = count(!iszero, grouped.beta[rare])
        group_common = count(!iszero, grouped.beta[common])
        ungroup_rare = count(!iszero, ungrouped.beta[rare])
        ungroup_common = count(!iszero, ungrouped.beta[common])
        return group_rare, group_common, ungroup_rare, ungroup_common
    finally
        # clean up the temporary genotype file even when simulation or fitting fails
        rm("tmp.bed", force=true)
    end
end

compare (generic function with 1 method)

In [5]:
# Repeat the simulation `num_runs` times from a fixed seed and tally how many of the
# causal rare / common variants the grouped and ungrouped fits recover in total.
Random.seed!(2019)
d = Bernoulli
l = canonicallink(d())

num_runs = 10

# Accumulate into a vector that is updated in place. Unlike `+=` on global scalars,
# this never rebinds a global inside the loop, so the cell behaves the same in a
# notebook, at the REPL, and when run as a script file.
# Entry order follows compare's return value:
# (group_rare, group_common, ungroup_rare, ungroup_common)
totals = zeros(Int, 4)
for i in 1:num_runs
    println("running $i th model")
    totals .+= compare(d, l)
end
group_rare_found, group_common_found, ungroup_rare_found, ungroup_common_found = totals

# Each run plants 5 common and 25 rare causal variants.
println("Group IHT found: $group_common_found / $(5 * num_runs) common variants and " *
        "$group_rare_found / $(25 * num_runs) rare variants")
println("Ungrouped IHT found: $ungroup_common_found / $(5 * num_runs) common variants and " *
        "$ungroup_rare_found / $(25 * num_runs) rare variants")


running 1 th model
number of cases = 426.0
compare_model = 30×4 DataFrame
│ Row │ position │ correct │ ungrouped │ grouped  │
│     │ Int64    │ Float64 │ Float64   │ Float64  │
├─────┼──────────┼─────────┼───────────┼──────────┤
│ 1   │ 10000    │ 0.5     │ 0.583125  │ 0.624073 │
│ 2   │ 30000    │ 0.5     │ 0.51649   │ 0.50558  │
│ 3   │ 50000    │ 0.5     │ 0.0       │ 0.0      │
│ 4   │ 70000    │ 0.5     │ 0.524118  │ 0.556743 │
│ 5   │ 90000    │ 0.5     │ 0.0       │ 0.442866 │
│ 6   │ 4000     │ 0.5     │ 0.0       │ 0.417332 │
│ 7   │ 8000     │ 0.5     │ 0.840596  │ 1.02274  │
│ 8   │ 12000    │ 0.5     │ 0.0       │ 0.0      │
│ 9   │ 16000    │ 0.5     │ 0.956049  │ 0.883687 │
│ 10  │ 20000    │ 0.5     │ 0.0       │ 0.0      │
│ 11  │ 24000    │ 0.5     │ 0.697486  │ 0.702569 │
│ 12  │ 28000    │ 0.5     │ 0.487665  │ 0.550583 │
│ 13  │ 32000    │ 0.5     │ 0.0       │ 0.0      │
│ 14  │ 36000    │ 0.5     │ 0.0       │ 0.0      │
│ 15  │ 40000    │ 0.5     │ 0.0       │ 0

# Note: The final table in the paper was generated by running this code in a terminal, so its output is not reproduced here; the code is the same.