# Doubly sparse projection on simulated data

Data simulated here have strong within-group correlation but 0 between-group correlation. We simulate $p = 10000$ SNPs all divided into groups of length 20. For how simulation was carried out, see our [paper](https://www.biorxiv.org/content/10.1101/697755v1.article-metrics). 

In [1]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using GLM
using Statistics

In [19]:
function compare(d, l)
    
    # define problem size
    n = 1000
    p = 10000
    block_size = 20
    num_blocks = Int(p / block_size)

    # assign group membership
    membership = collect(1:num_blocks)
    g = zeros(Int64, p + 1)
    for i in 1:length(membership)
        for j in 1:block_size
            cur_row = block_size * (i - 1) + j
            g[block_size*(i - 1) + j] = membership[i]
        end
    end
    g[end] = membership[end]
    
    #simulate correlated snparray
    x = simulate_correlated_snparray(n, p, "tmp.bed")
    z = ones(n, 1) # the intercept
    x_float = convert(Matrix{Float64}, x, model=ADDITIVE_MODEL, center=true, scale=true)
    
    #simulate true model, where 5 groups each with 1~5 snps contribute
    true_b = zeros(p)
    true_groups = randperm(num_blocks)[1:5]
    sort!(true_groups)
    within_group = [randperm(block_size)[1:1], randperm(block_size)[1:2], 
                    randperm(block_size)[1:3], randperm(block_size)[1:4], 
                    randperm(block_size)[1:5]]
    correct_position = zeros(Int64, 15)
    for i in 1:5
        cur_group = block_size * (true_groups[i] - 1)
        cur_group_snps = cur_group .+ within_group[i]
        start, last = Int(i*(i-1)/2 + 1), Int(i*(i+1)/2)
        correct_position[start:last] .= cur_group_snps
    end
    for i in 1:15
        true_b[correct_position[i]] = rand(-1:2:1) * 0.2
    end
    sort!(correct_position)
        
    # simulate phenotype
    if d == Normal || d == Bernoulli || d == Poisson
        prob = linkinv.(l, x_float * true_b)
        clamp!(prob, -20, 20)
        y = [rand(d(i)) for i in prob]
    elseif d == NegativeBinomial
        r = 10
        μ = linkinv.(l, x_float * true_b)
        clamp!(μ, -20, 20)
        prob = 1 ./ (1 .+ μ ./ r)
        y = [rand(d(r, i)) for i in prob] #number of failtures before r success occurs
    end
    y = Float64.(y)
    
    #run IHT without groups
    k = 15
    ungrouped = L0_reg(x_float, z, y, 1, k, d(), l, verbose=false)

    # constant group IHT
#     J = 5
#     k = 5
#     constant_group = L0_reg(x_float, z, y, J, k, d(), l, group=g, verbose=false)

    #variable group IHT
    J = 5
    max_group_snps = ones(Int, num_blocks)
    max_group_snps[true_groups] .= collect(1:5)
    variable_group = L0_reg(x_float, z, y, J, max_group_snps, d(), l, verbose=false, group=g)

    #check result
    correct_position = findall(!iszero, true_b)
    compare_model = DataFrame(
        position   = correct_position,
        correct    = true_b[correct_position],
        ungrouped  = ungrouped.beta[correct_position], 
#         constant_group = constant_group.beta[correct_position],
        variable_group = variable_group.beta[correct_position])
    @show compare_model
    println("\n")

    #clean up
    rm("tmp.bed", force=true)

    #compute true positives and false positives
    ungroup_tp = count(!iszero, ungrouped.beta[correct_position])
#     constant_group_tp = count(!iszero, constant_group.beta[correct_position])
    variable_group_tp = count(!iszero, variable_group.beta[correct_position])
    ungroup_fp = 15 - ungroup_tp
#     constant_group_fp = 25 - constant_group_tp
    variable_group_fp = 15 - variable_group_tp

    return ungroup_tp, ungroup_fp, variable_group_tp, variable_group_fp
end

compare (generic function with 1 method)

## Normal

In [3]:
Random.seed!(2019)

d = Normal
l = canonicallink(d())

num_runs = 100
ungroup_true_positives = zeros(num_runs)
ungroup_false_positives = zeros(num_runs)
variable_true_positives = zeros(num_runs)
variable_false_positives = zeros(num_runs)

@time for i in 1:num_runs
    println("running $i th model")
    utp, ufp, vtp, vfp = compare(d, l)
    ungroup_true_positives[i] = utp
    ungroup_false_positives[i] = ufp
    variable_true_positives[i] = vtp
    variable_false_positives[i] = vfp
end

running 1 th model
compare_model = 15×4 DataFrame
│ Row │ position │ correct │ ungrouped │ variable_group │
│     │ Int64    │ Float64 │ Float64   │ Float64        │
├─────┼──────────┼─────────┼───────────┼────────────────┤
│ 1   │ 3464     │ -0.2    │ -0.154814 │ -0.143728      │
│ 2   │ 4927     │ 0.2     │ 0.157603  │ 0.159271       │
│ 3   │ 4938     │ -0.2    │ -0.243461 │ -0.241363      │
│ 4   │ 5001     │ -0.2    │ -0.194176 │ -0.194224      │
│ 5   │ 5011     │ -0.2    │ -0.196923 │ -0.196444      │
│ 6   │ 5018     │ -0.2    │ -0.19521  │ -0.190188      │
│ 7   │ 5084     │ -0.2    │ 0.0       │ -0.158618      │
│ 8   │ 5090     │ 0.2     │ 0.219459  │ 0.214238       │
│ 9   │ 5098     │ -0.2    │ -0.118704 │ -0.236296      │
│ 10  │ 5100     │ -0.2    │ -0.22058  │ -0.197157      │
│ 11  │ 7004     │ 0.2     │ 0.303529  │ 0.28044        │
│ 12  │ 7011     │ 0.2     │ 0.0       │ 0.0            │
│ 13  │ 7015     │ -0.2    │ 0.0       │ -0.102599      │
│ 14  │ 7016     │ 0.2

In [4]:
@show mean(ungroup_true_positives)
@show mean(ungroup_false_positives)
@show mean(variable_true_positives)
@show mean(variable_false_positives)

@show std(ungroup_true_positives)
@show std(ungroup_false_positives)
@show std(variable_true_positives)
@show std(variable_false_positives)

mean(ungroup_true_positives) = 11.12
mean(ungroup_false_positives) = 3.88
mean(variable_true_positives) = 12.21
mean(variable_false_positives) = 2.79
std(ungroup_true_positives) = 1.9346912681433965
std(ungroup_false_positives) = 1.9346912681433965
std(variable_true_positives) = 1.9709762745267823
std(variable_false_positives) = 1.9709762745267825


1.9709762745267825

## Logistic 

In [5]:
Random.seed!(2019)

d = Bernoulli
l = canonicallink(d())

num_runs = 100
ungroup_true_positives = zeros(num_runs)
ungroup_false_positives = zeros(num_runs)
variable_true_positives = zeros(num_runs)
variable_false_positives = zeros(num_runs)

@time for i in 1:num_runs
    println("running $i th model")
    utp, ufp, vtp, vfp = compare(d, l)
    ungroup_true_positives[i] = utp
    ungroup_false_positives[i] = ufp
    variable_true_positives[i] = vtp
    variable_false_positives[i] = vfp
end

running 1 th model
compare_model = 15×4 DataFrame
│ Row │ position │ correct │ ungrouped │ variable_group │
│     │ Int64    │ Float64 │ Float64   │ Float64        │
├─────┼──────────┼─────────┼───────────┼────────────────┤
│ 1   │ 3464     │ -0.2    │ 0.0       │ 0.0            │
│ 2   │ 4927     │ 0.2     │ 0.284375  │ 0.214257       │
│ 3   │ 4938     │ -0.2    │ 0.0       │ 0.0            │
│ 4   │ 5001     │ -0.2    │ 0.0       │ -0.242449      │
│ 5   │ 5011     │ -0.2    │ 0.0       │ -0.221208      │
│ 6   │ 5018     │ -0.2    │ 0.0       │ 0.0            │
│ 7   │ 5084     │ -0.2    │ 0.0       │ 0.0            │
│ 8   │ 5090     │ 0.2     │ 0.249169  │ 0.215036       │
│ 9   │ 5098     │ -0.2    │ 0.0       │ 0.0            │
│ 10  │ 5100     │ -0.2    │ 0.0       │ -0.187706      │
│ 11  │ 7004     │ 0.2     │ 0.0       │ 0.0            │
│ 12  │ 7011     │ 0.2     │ 0.0       │ 0.0            │
│ 13  │ 7015     │ -0.2    │ 0.0       │ 0.0            │
│ 14  │ 7016     │ 0.2

In [6]:
@show mean(ungroup_true_positives)
@show mean(ungroup_false_positives)
@show mean(variable_true_positives)
@show mean(variable_false_positives)

@show std(ungroup_true_positives)
@show std(ungroup_false_positives)
@show std(variable_true_positives)
@show std(variable_false_positives)

mean(ungroup_true_positives) = 3.82
mean(ungroup_false_positives) = 11.18
mean(variable_true_positives) = 7.7
mean(variable_false_positives) = 7.3
std(ungroup_true_positives) = 1.6041612554021287
std(ungroup_false_positives) = 1.604161255402129
std(variable_true_positives) = 2.222474733128352
std(variable_false_positives) = 2.222474733128352


2.222474733128352

## Poisson 

In [20]:
Random.seed!(2019)

d = Poisson
l = canonicallink(d())

num_runs = 100
ungroup_true_positives = zeros(num_runs)
ungroup_false_positives = zeros(num_runs)
variable_true_positives = zeros(num_runs)
variable_false_positives = zeros(num_runs)

@time for i in 1:num_runs
    println("running $i th model")
    utp, ufp, vtp, vfp = compare(d, l)
    ungroup_true_positives[i] = utp
    ungroup_false_positives[i] = ufp
    variable_true_positives[i] = vtp
    variable_false_positives[i] = vfp
end

running 1 th model
compare_model = 15×4 DataFrame
│ Row │ position │ correct │ ungrouped │ variable_group │
│     │ Int64    │ Float64 │ Float64   │ Float64        │
├─────┼──────────┼─────────┼───────────┼────────────────┤
│ 1   │ 3464     │ -0.2    │ -0.173963 │ -0.177852      │
│ 2   │ 4927     │ 0.2     │ 0.0       │ 0.194465       │
│ 3   │ 4938     │ -0.2    │ -0.256207 │ -0.243026      │
│ 4   │ 5001     │ -0.2    │ -0.223954 │ -0.226218      │
│ 5   │ 5011     │ -0.2    │ -0.197047 │ -0.194907      │
│ 6   │ 5018     │ -0.2    │ -0.212477 │ -0.206918      │
│ 7   │ 5084     │ -0.2    │ 0.0       │ -0.117447      │
│ 8   │ 5090     │ 0.2     │ 0.218054  │ 0.215562       │
│ 9   │ 5098     │ -0.2    │ 0.0       │ 0.0            │
│ 10  │ 5100     │ -0.2    │ -0.246471 │ -0.213671      │
│ 11  │ 7004     │ 0.2     │ 0.217706  │ 0.224078       │
│ 12  │ 7011     │ 0.2     │ 0.189572  │ 0.182618       │
│ 13  │ 7015     │ -0.2    │ -0.222002 │ -0.231538      │
│ 14  │ 7016     │ 0.2

In [21]:
@show mean(ungroup_true_positives)
@show mean(ungroup_false_positives)
@show mean(variable_true_positives)
@show mean(variable_false_positives)

@show std(ungroup_true_positives)
@show std(ungroup_false_positives)
@show std(variable_true_positives)
@show std(variable_false_positives)

mean(ungroup_true_positives) = 11.46
mean(ungroup_false_positives) = 3.54
mean(variable_true_positives) = 12.44
mean(variable_false_positives) = 2.56
std(ungroup_true_positives) = 2.2130010703507894
std(ungroup_false_positives) = 2.213001070350789
std(variable_true_positives) = 1.6956271745733242
std(variable_false_positives) = 1.6956271745733242


1.6956271745733242

## Negative Binomial

In [22]:
Random.seed!(2019)

d = NegativeBinomial
l = LogLink()

num_runs = 100
ungroup_true_positives = zeros(num_runs)
ungroup_false_positives = zeros(num_runs)
variable_true_positives = zeros(num_runs)
variable_false_positives = zeros(num_runs)

@time for i in 1:num_runs
    println("running $i th model")
    utp, ufp, vtp, vfp = compare(d, l)
    ungroup_true_positives[i] = utp
    ungroup_false_positives[i] = ufp
    variable_true_positives[i] = vtp
    variable_false_positives[i] = vfp
end

running 1 th model
compare_model = 15×4 DataFrame
│ Row │ position │ correct │ ungrouped │ variable_group │
│     │ Int64    │ Float64 │ Float64   │ Float64        │
├─────┼──────────┼─────────┼───────────┼────────────────┤
│ 1   │ 3464     │ -0.2    │ -0.234958 │ -0.245853      │
│ 2   │ 4927     │ 0.2     │ 0.158171  │ 0.160904       │
│ 3   │ 4938     │ -0.2    │ -0.222613 │ -0.213439      │
│ 4   │ 5001     │ -0.2    │ -0.193739 │ -0.19624       │
│ 5   │ 5011     │ -0.2    │ -0.162718 │ -0.149913      │
│ 6   │ 5018     │ -0.2    │ -0.190532 │ -0.181966      │
│ 7   │ 5084     │ -0.2    │ 0.0       │ 0.0            │
│ 8   │ 5090     │ 0.2     │ 0.226509  │ 0.21164        │
│ 9   │ 5098     │ -0.2    │ 0.0       │ 0.0            │
│ 10  │ 5100     │ -0.2    │ -0.140337 │ -0.157655      │
│ 11  │ 7004     │ 0.2     │ 0.151748  │ 0.190224       │
│ 12  │ 7011     │ 0.2     │ 0.206449  │ 0.21294        │
│ 13  │ 7015     │ -0.2    │ -0.284706 │ -0.256058      │
│ 14  │ 7016     │ 0.2

In [23]:
@show mean(ungroup_true_positives)
@show mean(ungroup_false_positives)
@show mean(variable_true_positives)
@show mean(variable_false_positives)

@show std(ungroup_true_positives)
@show std(ungroup_false_positives)
@show std(variable_true_positives)
@show std(variable_false_positives)

mean(ungroup_true_positives) = 11.0
mean(ungroup_false_positives) = 4.0
mean(variable_true_positives) = 12.35
mean(variable_false_positives) = 2.65
std(ungroup_true_positives) = 2.098580568870398
std(ungroup_false_positives) = 2.098580568870398
std(variable_true_positives) = 1.5851265762160014
std(variable_false_positives) = 1.5851265762160014


1.5851265762160014