# Doubly sparse projection: when it performs better than ungrouped IHT

In [2]:
using Revise
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using GLM

┌ Info: Recompiling stale cache file /Users/biona001/.julia/compiled/v1.0/MendelIHT/eaqWB.ji for MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1190


In [5]:
# Simulate one data set and compare IHT fitted with and without group information.
#
# Three causal SNPs (positions 1000, 5000, 9000; effect size 1.0) are placed so that
# exactly one falls in each of the three groups defined below. Two decoy SNPs (4900 and
# 5100) in the same group as SNP 5000 are made highly correlated with it.
#
# Arguments
# - `d`: response distribution passed as a type (e.g. `Normal`); it is called as `d(μ)`
#   to draw each response and as `d()` when handed to `L0_reg`.
# - `l`: GLM link; `linkinv.(l, ...)` maps the linear predictor to the mean.
#
# Prints a DataFrame holding the true and the estimated effects at the causal positions
# and returns `nothing`. The temporary file "tmp.bed" is removed even when the
# simulation or the fit throws.
function compare(d::UnionAll, l::Link)
    # problem size: n samples, p SNPs
    n = 1000
    p = 10000

    # target correlation between SNP 5000 and its decoys
    ρ = 0.98
    correct_position = [1000, 5000, 9000]
    correlated_snps = [4900, 5100]

    try
        # construct snpmatrix and the intercept-only covariate matrix
        x, = simulate_random_snparray(n, p, "tmp.bed")
        z = ones(n, 1)

        # true model: 3 causal SNPs; then make the 2 decoys highly (~0.98) correlated
        # with SNP 5000
        true_b = zeros(p)
        true_b[correct_position] .= 1.0
        adhoc_add_correlation(x, ρ, 5000, correlated_snps)
        xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)

        # use the SnpBitMatrix to simulate the response
        prob = linkinv.(l, xbm * true_b)
        y = [rand(d(μ)) for μ in prob]

        # group membership has p + 1 entries (presumably the p SNPs followed by the
        # intercept column of z; confirm against L0_reg): thirds of the index range
        g = ones(Int64, p + 1)
        g[3334:6666] .= 2
        g[6667:end] .= 3

        # run IHT without groups: a single group holding up to 5 predictors
        J = 1
        k = 5
        ungrouped = L0_reg(x, xbm, z, y, J, k, d(), l, debias=false, init=false,
            use_maf=false)

        # run IHT with groups: 3 groups, 1 predictor each
        J = 3
        k = 1
        grouped = L0_reg(x, xbm, z, y, J, k, d(), l, debias=false, init=false,
            use_maf=false, group=g)

        # check result
        compare_model = DataFrame(
            position   = correct_position,
            correct    = true_b[correct_position],
            ungrouped  = ungrouped.beta[correct_position],
            grouped    = grouped.beta[correct_position])
        @show compare_model
        println("\n")
    finally
        # clean up, also on failure; force=true tolerates a file that was never created
        rm("tmp.bed", force=true)
    end

    return nothing
end

compare (generic function with 1 method)

In [6]:
# Seed the global RNG so the replicates below can be reproduced.
Random.seed!(2019)

# Response distribution (given as the type, not an instance) and the link derived
# from a default-constructed instance of it.
d = Normal
l = canonicallink(d())

# Run the grouped-vs-ungrouped comparison ten times; each call simulates fresh data.
foreach(_ -> compare(d, l), 1:10)

for SNP 4900 the simulated correlation is 0.9852305660907742
for SNP 5100 the simulated correlation is 0.9837305195571029
[31mDid not converge!!!!! The run time for IHT was 10.149780988693237 seconds and model size was4[39m
compare_model = 3×4 DataFrame
│ Row │ position │ correct │ ungrouped │ grouped  │
│     │ Int64    │ Float64 │ Float64   │ Float64  │
├─────┼──────────┼─────────┼───────────┼──────────┤
│ 1   │ 1000     │ 1.0     │ 0.994382  │ 0.994151 │
│ 2   │ 5000     │ 1.0     │ 0.728327  │ 0.971643 │
│ 3   │ 9000     │ 1.0     │ 1.02342   │ 1.02187  │


for SNP 4900 the simulated correlation is 0.9840690205796696
for SNP 5100 the simulated correlation is 0.982466805657332
compare_model = 3×4 DataFrame
│ Row │ position │ correct │ ungrouped │ grouped  │
│     │ Int64    │ Float64 │ Float64   │ Float64  │
├─────┼──────────┼─────────┼───────────┼──────────┤
│ 1   │ 1000     │ 1.0     │ 1.00828   │ 1.00506  │
│ 2   │ 5000     │ 1.0     │ 0.600483  │ 0.0      │
│ 3   │ 9000     │ 

In [None]:
# Simulate 5 genes (groups of consecutive SNPs), each containing 6 causal SNPs: 5 at the
# "rare" positions and 1 at the "common" position, all with effect size 1.0. Then compare
# IHT fitted with and without group information.
#
# Arguments
# - `d`: response distribution passed as a type (e.g. `Normal`); it is called as `d(μ)`
#   to draw each response and as `d()` when handed to `L0_reg`.
# - `l`: GLM link; `linkinv.(l, ...)` maps the linear predictor to the mean.
#
# Prints a DataFrame holding the true and the estimated effects at the 30 causal
# positions and returns `nothing`. The temporary file "tmp.bed" is removed even when the
# simulation or the fit throws.
#
# This method has the same signature as the earlier `compare` in this notebook and
# therefore replaces it once this cell is run.
function compare(d::UnionAll, l::Link)
    # problem size: n samples, p SNPs
    n = 1000
    p = 10000

    try
        # construct snpmatrix and the intercept-only covariate matrix
        x, = simulate_random_snparray(n, p, "tmp.bed")
        z = ones(n, 1)

        # group membership has p + 1 entries (presumably the p SNPs followed by the
        # intercept column of z; confirm against L0_reg): 5 genes of 2000 SNPs each
        g = ones(Int64, p + 1)
        g[2001:4000] .= 2
        g[4001:6000] .= 3
        g[6001:8000] .= 4
        g[8001:end] .= 5

        # 5 "rare" positions and 1 "common" position per gene; the two sets are disjoint
        # NOTE(review): nothing below makes the "rare" SNPs differ from the "common"
        # ones (same effect size, no MAF adjustment); confirm whether that is intended.
        true_b = zeros(p)
        rare = 400:400:p
        common = 1000:2000:p
        true_b[rare] .= 1.0
        true_b[common] .= 1.0

        # every causal position, in increasing order, for the summary table
        correct_position = sort(vcat(rare, common))

        xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true)

        # use the SnpBitMatrix to simulate the response
        prob = linkinv.(l, xbm * true_b)
        y = [rand(d(μ)) for μ in prob]

        # NOTE(review): J and k below are unchanged from the 3-SNP experiment. With 5
        # genes of 6 causal SNPs each (30 in total) these settings cannot recover the
        # full model; J = 5, k = 6 (grouped) and k = 30 (ungrouped) look like the
        # matching values. Confirm before running.

        # run IHT without groups
        J = 1
        k = 5
        ungrouped = L0_reg(x, xbm, z, y, J, k, d(), l, debias=false, init=false,
            use_maf=false)

        # run IHT with groups
        J = 3
        k = 1
        grouped = L0_reg(x, xbm, z, y, J, k, d(), l, debias=false, init=false,
            use_maf=false, group=g)

        # check result
        compare_model = DataFrame(
            position   = correct_position,
            correct    = true_b[correct_position],
            ungrouped  = ungrouped.beta[correct_position],
            grouped    = grouped.beta[correct_position])
        @show compare_model
        println("\n")
    finally
        # clean up, also on failure; force=true tolerates a file that was never created
        rm("tmp.bed", force=true)
    end

    return nothing
end