# Doubly sparse projection on NFBC data

In [1]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using GLM
using Statistics
using StatsBase

## Import NFBC1966 data, subset it, then define causal SNPs

We select the first 30,000 SNPs (~1%) from the NFBC dataset as our data. 

### We designated the following causal/non-causal genes and non-coding regions:

+ Two "large genes" of length 2000, one of which contains 5 causal SNPs.
+ Five "medium genes" of length 500, 2 of which contain 3 causal SNPs. 
+ Ten "small genes" of length 100, 3 of which contain 2 causal SNPs.
+ Non-coding regions, containing 2 causal SNPs. 
+ Total = 19 causal SNPs
+ Total groups = 18
+ Total active groups = 7


In [2]:
#full genotype data after basic quality control
stampeed = SnpArray("../stampeed/HDL/kevin_imputed_filtered_HDL.bed")

# use first 30000 (standardized) SNPs in simulation
X = convert(Matrix{Float32}, @view(stampeed[:, 1:30000]), scale=true, center=true)

4907×30000 Array{Float32,2}:
  0.304199   0.67661   0.665899   0.677065  …  -0.23099   0.754171   1.24577 
  0.304199  -1.13965  -1.16878   -1.13843       1.20196   0.754171   1.24577 
 -3.13522    0.67661   0.665899   0.677065     -0.23099   0.754171  -0.179834
  0.304199   0.67661   0.665899   0.677065     -0.23099  -0.948874  -0.179834
  0.304199   0.67661   0.665899   0.677065     -0.23099   0.754171  -0.179834
  0.304199   0.67661   0.665899   0.677065  …   1.20196   0.754171  -1.60544 
  0.304199   0.67661   0.665899   0.677065     -1.66394   0.754171  -1.60544 
  0.304199   0.67661   0.665899   0.677065     -0.23099  -0.948874  -0.179834
  0.304199   0.67661   0.665899   0.677065      1.20196   0.754171  -1.60544 
 -3.13522   -2.95591  -3.00346   -2.95393       1.20196   0.754171  -0.179834
  0.304199   0.67661   0.665899   0.677065  …   1.20196  -0.948874  -1.60544 
  0.304199   0.67661   0.665899   0.677065     -0.23099   0.754171   1.24577 
  0.304199  -1.13965  -1.16878   -1

### SNPs can be highly correlated due to LD

In [3]:
snp2 = X[:, 2]
snp3 = X[:, 3]
correlation = convert(Float64, cor(snp2, snp3))

0.9772464632987976

### Define group information

In [4]:
#number of snpss
p = 30000

# non coding region belongs in group 1
group = ones(Int64, p + 1) # +1 because of intercept

# 2 large genes
group[1:2000] .= 2
group[10001:12000] .= 3

# 5 medium genes
group[4001:4500] .= 4
group[12501:13000] .= 5
group[18001:18500] .= 6
group[22001:22500] .= 7
group[28501:29000] .= 8

# 10 small genes
group[2001:2100] .= 9
group[2501:2600] .= 10
group[3301:3400] .= 11
group[3501:3600] .= 12
group[17001:17100] .= 13
group[19001:19100] .= 14
group[21001:22000] .= 15
group[23601:23700] .= 16
group[24401:24500] .= 17
group[29301:29400] .= 18;

## Run IHT for ungrouped, constant group, and variable group sizes

In [5]:
function compare(d, l, X, group)
    
    # randomly choose among genes
    selected_large_gene  = rand(2:3)
    selected_medium_gene = sample(4:8, 2, replace=false)
    selected_small_gene  = sample(9:18, 3, replace=false)

    # randomly select snps in genes and non-coding regions to have effect sizes -0.2 or 0.2
    n, p = size(X)
    true_b = zeros(p)
    for i in selected_large_gene
        large_gene_index = findall(x -> x == i, group)
        @view(true_b[large_gene_index])[1:5] .= rand(-1:2:1) * 0.2
        shuffle!(@view(true_b[large_gene_index]))
    end
    for i in selected_medium_gene
        medium_gene_index = findall(x -> x == i, group)
        @view(true_b[medium_gene_index])[1:3] .= rand(-1:2:1) * 0.2
        shuffle!(@view(true_b[medium_gene_index]))
    end
    for i in selected_small_gene
        small_gene_index = findall(x -> x == i, group)
        @view(true_b[small_gene_index])[1:2] .= rand(-1:2:1) * 0.2
        shuffle!(@view(true_b[small_gene_index]))
    end
    true_b[12001] = rand(-1:2:1) * 0.2 
    true_b[24501] = rand(-1:2:1) * 0.2 
    shuffle!(@view(true_b[12000:12500])) # let a SNP in 12000:12500 be causal
    shuffle!(@view(true_b[24501:28500])) # let a SNP in 24501:28500 be causal
    correct_position = findall(!iszero, true_b)
    
    # construct nongenetic covariate (intercept only)
    z = ones(Float32, n, 1) # the intercept

    # simulate phenotype
    if d == Normal || d == Bernoulli || d == Poisson
        prob = GLM.linkinv.(l, X * true_b)
        clamp!(prob, -20, 20)
        y = [rand(d(i)) for i in prob]
    elseif d == NegativeBinomial
        r = 10
        μ = GLM.linkinv.(l, X * true_b)
        clamp!(μ, -20, 20)
        prob = 1 ./ (1 .+ μ ./ r)
        y = [rand(d(r, Float64(i))) for i in prob]
    end
    y = Float32.(y)
        
    #IHT without groups
    k = 19
    ungrouped = L0_reg(X, z, y, 1, k, d(), l, verbose=false)
        
    #IHT with constant groups
    J = 7
    k = 5
    constant_group = L0_reg(X, z, y, J, k, d(), l, group=group, verbose=false)

    #IHT with variable groups. 2 in non-coding, 5 for large gene, 3 for medium gene, 2 for small gene. 
    J = 7
    max_group_snps = zeros(Int, 18)
    max_group_snps[1] = 2
    max_group_snps[selected_large_gene] = 5
    max_group_snps[selected_medium_gene] .= 3
    max_group_snps[selected_small_gene] .= 2 
    variable_group = L0_reg(X, z, y, J, max_group_snps, d(), l, group=group, verbose=false)

    #check result
    correct_position = findall(!iszero, true_b)
    compare_model = DataFrame(
        position   = correct_position,
        correct    = true_b[correct_position],
        ungrouped  = ungrouped.beta[correct_position], 
        constant_group = constant_group.beta[correct_position],
        variable_group = variable_group.beta[correct_position])
    @show compare_model
    println("\n")

    #compute false positives and false negatives
    ungroup_found = count(!iszero, ungrouped.beta[correct_position])
    constant_group_found = count(!iszero, constant_group.beta[correct_position])
    variable_group_found = count(!iszero, variable_group.beta[correct_position])
    
    ungroup_fp = 19 - ungroup_found
    constant_group_fp = 35 - constant_group_found
    variable_group_fp = 19 - variable_group_found
    
    ungroup_fn = count(iszero, ungrouped.beta[correct_position])
    constant_group_fn = count(iszero, constant_group.beta[correct_position])
    variable_group_fn = count(iszero, variable_group.beta[correct_position])
    
    return ungroup_fp, ungroup_fn, constant_group_fp,
            constant_group_fn, variable_group_fp, variable_group_fn
end

compare (generic function with 1 method)

## Normal

In [6]:
Random.seed!(2019)

d = Normal
l = canonicallink(d())

num_runs = 100
ungroup_false_positives = zeros(num_runs)
ungroup_false_negatives = zeros(num_runs)
constant_false_positives = zeros(num_runs)
constant_false_negatives = zeros(num_runs)
variable_false_positives = zeros(num_runs)
variable_false_negatives = zeros(num_runs)

@time for i in 1:num_runs
    println("running $i th model")
    ufp, ufn, cfp, cfn, vfp, vfn = compare(d, l, X, group)
    ungroup_false_positives[i] = ufp
    ungroup_false_negatives[i] = ufn
    constant_false_positives[i] = cfp
    constant_false_negatives[i] = cfn
    variable_false_positives[i] = vfp
    variable_false_negatives[i] = vfn
end

running 1 th model




compare_model = 19×5 DataFrame
│ Row │ position │ correct │ ungrouped │ constant_group │ variable_group │
│     │ Int64    │ Float64 │ Float32   │ Float32        │ Float32        │
├─────┼──────────┼─────────┼───────────┼────────────────┼────────────────┤
│ 1   │ 3337     │ -0.2    │ -0.222992 │ -0.200582      │ -0.213006      │
│ 2   │ 3394     │ -0.2    │ 0.0       │ -0.0845098     │ -0.159637      │
│ 3   │ 3527     │ 0.2     │ 0.213736  │ 0.226742       │ 0.230826       │
│ 4   │ 3557     │ 0.2     │ 0.191726  │ 0.20554        │ 0.190352       │
│ 5   │ 4041     │ -0.2    │ -0.193185 │ -0.197958      │ -0.193631      │
│ 6   │ 4255     │ -0.2    │ -0.201264 │ -0.194245      │ -0.194682      │
│ 7   │ 4418     │ -0.2    │ -0.205089 │ -0.20328       │ -0.204669      │
│ 8   │ 10168    │ 0.2     │ 0.108262  │ 0.212045       │ 0.215621       │
│ 9   │ 10251    │ 0.2     │ 0.220193  │ 0.220925       │ 0.221087       │
│ 10  │ 10515    │ 0.2     │ 0.195701  │ 0.194497       │ 0.195405   

In [7]:
@show mean(ungroup_false_positives)
@show mean(ungroup_false_negatives)
@show mean(constant_false_positives)
@show mean(constant_false_negatives)
@show mean(variable_false_positives)
@show mean(variable_false_negatives)

@show std(ungroup_false_positives)
@show std(ungroup_false_negatives)
@show std(constant_false_positives)
@show std(constant_false_negatives)
@show std(variable_false_positives)
@show std(variable_false_negatives)

mean(ungroup_false_positives) = 1.98
mean(ungroup_false_negatives) = 1.98
mean(constant_false_positives) = 16.99
mean(constant_false_negatives) = 0.99
mean(variable_false_positives) = 2.05
mean(variable_false_negatives) = 2.05
std(ungroup_false_positives) = 1.238930786989126
std(ungroup_false_negatives) = 1.238930786989126
std(constant_false_positives) = 1.0298082590973188
std(constant_false_negatives) = 1.0298082590973188
std(variable_false_positives) = 1.3880805728658825
std(variable_false_negatives) = 1.3880805728658825


1.3880805728658825

## Logistic

In [8]:
Random.seed!(2019)

d = Bernoulli
l = canonicallink(d())

num_runs = 100
ungroup_false_positives = zeros(num_runs)
ungroup_false_negatives = zeros(num_runs)
constant_false_positives = zeros(num_runs)
constant_false_negatives = zeros(num_runs)
variable_false_positives = zeros(num_runs)
variable_false_negatives = zeros(num_runs)

@time for i in 1:num_runs
    println("running $i th model")
    ufp, ufn, cfp, cfn, vfp, vfn = compare(d, l, X, group)
    ungroup_false_positives[i] = ufp
    ungroup_false_negatives[i] = ufn
    constant_false_positives[i] = cfp
    constant_false_negatives[i] = cfn
    variable_false_positives[i] = vfp
    variable_false_negatives[i] = vfn
end

running 1 th model
compare_model = 19×5 DataFrame
│ Row │ position │ correct │ ungrouped │ constant_group │ variable_group │
│     │ Int64    │ Float64 │ Float32   │ Float32        │ Float32        │
├─────┼──────────┼─────────┼───────────┼────────────────┼────────────────┤
│ 1   │ 3337     │ -0.2    │ -0.18343  │ -0.181064      │ -0.183427      │
│ 2   │ 3394     │ -0.2    │ -0.182837 │ -0.1071        │ -0.18284       │
│ 3   │ 3527     │ 0.2     │ 0.165419  │ 0.179865       │ 0.165409       │
│ 4   │ 3557     │ 0.2     │ 0.196164  │ 0.176832       │ 0.196164       │
│ 5   │ 4041     │ -0.2    │ -0.219186 │ -0.228678      │ -0.219182      │
│ 6   │ 4255     │ -0.2    │ -0.237326 │ -0.225719      │ -0.23732       │
│ 7   │ 4418     │ -0.2    │ -0.245374 │ -0.251625      │ -0.245368      │
│ 8   │ 10168    │ 0.2     │ 0.0       │ 0.0            │ 0.0            │
│ 9   │ 10251    │ 0.2     │ 0.18268   │ 0.160913       │ 0.182678       │
│ 10  │ 10515    │ 0.2     │ 0.202338  │ 0.200499 

In [9]:
@show mean(ungroup_false_positives)
@show mean(ungroup_false_negatives)
@show mean(constant_false_positives)
@show mean(constant_false_negatives)
@show mean(variable_false_positives)
@show mean(variable_false_negatives)

@show std(ungroup_false_positives)
@show std(ungroup_false_negatives)
@show std(constant_false_positives)
@show std(constant_false_negatives)
@show std(variable_false_positives)
@show std(variable_false_negatives)

mean(ungroup_false_positives) = 3.33
mean(ungroup_false_negatives) = 3.33
mean(constant_false_positives) = 18.75
mean(constant_false_negatives) = 2.75
mean(variable_false_positives) = 3.2
mean(variable_false_negatives) = 3.2
std(ungroup_false_positives) = 1.504572156374487
std(ungroup_false_negatives) = 1.504572156374487
std(constant_false_positives) = 1.8167292177165673
std(constant_false_negatives) = 1.8167292177165673
std(variable_false_positives) = 1.5954480704349312
std(variable_false_negatives) = 1.5954480704349312


1.5954480704349312

## Poisson

In [10]:
Random.seed!(2019)

d = Poisson
l = canonicallink(d())

num_runs = 100
ungroup_false_positives = zeros(num_runs)
ungroup_false_negatives = zeros(num_runs)
constant_false_positives = zeros(num_runs)
constant_false_negatives = zeros(num_runs)
variable_false_positives = zeros(num_runs)
variable_false_negatives = zeros(num_runs)

@time for i in 1:num_runs
    println("running $i th model")
    ufp, ufn, cfp, cfn, vfp, vfn = compare(d, l, X, group)
    ungroup_false_positives[i] = ufp
    ungroup_false_negatives[i] = ufn
    constant_false_positives[i] = cfp
    constant_false_negatives[i] = cfn
    variable_false_positives[i] = vfp
    variable_false_negatives[i] = vfn
end

running 1 th model


│   caller = poislogpdf at pois.jl:18 [inlined]
└ @ Core /u/home/b/biona001/.julia/packages/StatsFuns/hg4pG/src/distrs/pois.jl:18


compare_model = 19×5 DataFrame
│ Row │ position │ correct │ ungrouped │ constant_group │ variable_group │
│     │ Int64    │ Float64 │ Float32   │ Float32        │ Float32        │
├─────┼──────────┼─────────┼───────────┼────────────────┼────────────────┤
│ 1   │ 3337     │ -0.2    │ -0.202416 │ -0.199745      │ -0.202439      │
│ 2   │ 3394     │ -0.2    │ -0.198976 │ -0.146028      │ -0.199002      │
│ 3   │ 3527     │ 0.2     │ 0.218042  │ 0.211008       │ 0.218083       │
│ 4   │ 3557     │ 0.2     │ 0.192489  │ 0.193658       │ 0.192504       │
│ 5   │ 4041     │ -0.2    │ -0.206992 │ -0.20408       │ -0.206983      │
│ 6   │ 4255     │ -0.2    │ -0.207989 │ -0.200464      │ -0.208044      │
│ 7   │ 4418     │ -0.2    │ -0.21342  │ -0.215571      │ -0.213442      │
│ 8   │ 10168    │ 0.2     │ 0.209567  │ 0.21264        │ 0.209537       │
│ 9   │ 10251    │ 0.2     │ 0.175556  │ 0.172039       │ 0.175505       │
│ 10  │ 10515    │ 0.2     │ 0.197339  │ 0.194341       │ 0.197379   

In [11]:
@show mean(ungroup_false_positives)
@show mean(ungroup_false_negatives)
@show mean(constant_false_positives)
@show mean(constant_false_negatives)
@show mean(variable_false_positives)
@show mean(variable_false_negatives)

@show std(ungroup_false_positives)
@show std(ungroup_false_negatives)
@show std(constant_false_positives)
@show std(constant_false_negatives)
@show std(variable_false_positives)
@show std(variable_false_negatives)

mean(ungroup_false_positives) = 1.91
mean(ungroup_false_negatives) = 1.91
mean(constant_false_positives) = 16.76
mean(constant_false_negatives) = 0.76
mean(variable_false_positives) = 2.01
mean(variable_false_negatives) = 2.01
std(ungroup_false_positives) = 1.3034143197344765
std(ungroup_false_negatives) = 1.3034143197344765
std(constant_false_positives) = 0.8775539926984608
std(constant_false_negatives) = 0.877553992698461
std(variable_false_positives) = 1.4034229439466777
std(variable_false_negatives) = 1.4034229439466777


1.4034229439466777

## Negative Binomial

In [12]:
Random.seed!(2019)

d = NegativeBinomial
l = LogLink()

num_runs = 100
ungroup_false_positives = zeros(num_runs)
ungroup_false_negatives = zeros(num_runs)
constant_false_positives = zeros(num_runs)
constant_false_negatives = zeros(num_runs)
variable_false_positives = zeros(num_runs)
variable_false_negatives = zeros(num_runs)

@time for i in 1:num_runs
    println("running $i th model")
    ufp, ufn, cfp, cfn, vfp, vfn = compare(d, l, X, group)
    ungroup_false_positives[i] = ufp
    ungroup_false_negatives[i] = ufn
    constant_false_positives[i] = cfp
    constant_false_negatives[i] = cfn
    variable_false_positives[i] = vfp
    variable_false_negatives[i] = vfn
end

running 1 th model
compare_model = 19×5 DataFrame
│ Row │ position │ correct │ ungrouped │ constant_group │ variable_group │
│     │ Int64    │ Float64 │ Float32   │ Float32        │ Float32        │
├─────┼──────────┼─────────┼───────────┼────────────────┼────────────────┤
│ 1   │ 3337     │ -0.2    │ -0.192441 │ -0.19778       │ -0.192419      │
│ 2   │ 3394     │ -0.2    │ 0.0       │ -0.0640694     │ 0.0            │
│ 3   │ 3527     │ 0.2     │ 0.193724  │ 0.186301       │ 0.193677       │
│ 4   │ 3557     │ 0.2     │ 0.234895  │ 0.254115       │ 0.234961       │
│ 5   │ 4041     │ -0.2    │ -0.184299 │ -0.182811      │ -0.184234      │
│ 6   │ 4255     │ -0.2    │ -0.196029 │ -0.197029      │ -0.196145      │
│ 7   │ 4418     │ -0.2    │ -0.219734 │ -0.22525       │ -0.219589      │
│ 8   │ 10168    │ 0.2     │ 0.191402  │ 0.0            │ 0.0            │
│ 9   │ 10251    │ 0.2     │ 0.208235  │ 0.211297       │ 0.208164       │
│ 10  │ 10515    │ 0.2     │ 0.185834  │ 0.186996 

In [13]:
@show mean(ungroup_false_positives)
@show mean(ungroup_false_negatives)
@show mean(constant_false_positives)
@show mean(constant_false_negatives)
@show mean(variable_false_positives)
@show mean(variable_false_negatives)

@show std(ungroup_false_positives)
@show std(ungroup_false_negatives)
@show std(constant_false_positives)
@show std(constant_false_negatives)
@show std(variable_false_positives)
@show std(variable_false_negatives)

mean(ungroup_false_positives) = 1.81
mean(ungroup_false_negatives) = 1.81
mean(constant_false_positives) = 16.86
mean(constant_false_negatives) = 0.86
mean(variable_false_positives) = 2.05
mean(variable_false_negatives) = 2.05
std(ungroup_false_positives) = 1.475010700183753
std(ungroup_false_negatives) = 1.475010700183753
std(constant_false_positives) = 0.9213516640723504
std(constant_false_negatives) = 0.9213516640723501
std(variable_false_positives) = 1.520067781582802
std(variable_false_negatives) = 1.520067781582802


1.520067781582802