In [1]:
using Distributed
addprocs(4)

@everywhere begin
    using Revise
    using MendelIHT
    using SnpArrays
    using Random
    using GLM
    using DelimitedFiles
    using Test
    using Distributions
    using LinearAlgebra
    using CSV
    using DataFrames
    using StatsBase
    BLAS.set_num_threads(1) # remember to set BLAS threads to 1 !!!
end

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1278
│   exception = ArgumentError: Invalid checksum in cache file /Users/biona001/.julia/compiled/v1.5/MendelIHT/eaqWB_zEoHw.ji.
└ @ Base loading.jl:1042


# Univariate IHT

In [99]:
n = 1000  # number of samples
p = 10000 # number of SNPs
k = 10    # number of causal SNPs per trait
d = Normal
l = canonicallink(d())
# l = LogLink()

# set random seed for reproducibility
Random.seed!(2021)

# simulate `.bed` file with no missing data
x = simulate_random_snparray("univariate.bed", n, p)
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# intercept is the only nongenetic covariate
z = ones(n)
intercept = 1.0

# simulate response y, true model b, and the correct non-0 positions of b
y, true_b, correct_position = simulate_random_response(xla, k, d, l, Zu=z*intercept);

univariate.bed


In [105]:
# create covariate files
writedlm("covariates.txt", z)

# create `.bim` and `.bam` files using phenotype
make_bim_fam_files(x, y, "univariate")

# save phenotypes in separate file
writedlm("univariate.phen", y, ',')

## Run IHT

In [101]:
result1 = iht("univariate", 11, d = d(), l = l, verbose=false)
result2 = iht("univariate", "covariates.txt", 11, d = d(), l = l, verbose=false)
result3 = iht("univariate.phen", "univariate", "covariates.txt", 11, d = d(), l = l, verbose=false);

In [102]:
[result1.beta[correct_position] result2.beta[correct_position] result3.beta[correct_position]]

10×3 Array{Float64,2}:
 -0.437828  -0.437828  -0.437828
  0.747956   0.747956   0.747956
  0.691327   0.691327   0.691327
 -1.42505   -1.42505   -1.42505
 -0.19456   -0.19456   -0.19456
 -0.861591  -0.861591  -0.861591
  0.338606   0.338606   0.338606
  0.0        0.0        0.0
  0.0        0.0        0.0
 -1.21895   -1.21895   -1.21895

In [103]:
rm("univariate.bim", force=true)
rm("univariate.bed", force=true)
rm("univariate.fam", force=true)
rm("univariate.phen", force=true)
rm("covariates.txt", force=true)

## Check answer

In [31]:
# first beta
[result.beta[correct_position] true_b[correct_position]]

10×2 Array{Float64,2}:
  0.0       -0.402269
  0.94532    0.758756
  0.0        0.729135
 -1.49538   -1.47163
  0.0       -0.172668
 -0.776139  -0.847906
  0.385436   0.296183
  0.0       -0.0034339
  0.0        0.125965
 -1.52028   -1.24972

In [32]:
# non genetic covariates
[result.c intercept]

1×2 Array{Float64,2}:
 1.01066  1.0

## Test Cross validation

In [11]:
Random.seed!(2020)
Yt = Matrix(Y')
Zt = Matrix(z')
@time mses = cv_iht(Yt, Transpose(xla), Zt);

[32mCross validating...100%|████████████████████████████████| Time: 0:00:07[39m




Crossvalidation Results:
	k	MSE
	1	2888.7160633632484
	2	2560.1358620535425
	3	2067.943067029389
	4	1812.0395079444286
	5	1554.312036744936
	6	1277.3237598020085
	7	1154.9320629872832
	8	1098.5910963871872
	9	1019.4597637296985
	10	1030.14124647156
	11	1023.5545874904793
	12	1007.9022110997687
	13	1012.6193656356761
	14	1019.1491606608182
	15	1024.6877890077092
	16	1022.9300595671256
	17	1040.0286509856787
	18	1033.345570850089
	19	1039.8828186471897
	20	1036.274158344765

Best k = 12

  7.249031 seconds (35.04 k allocations: 7.841 MiB)


# Multivariate IHT

In [91]:
"""
k = Number of causal SNPs
p = Total number of SNPs
traits = Number of traits (phenotypes)
overlap = number of causal SNPs shared in each trait
"""
function simulate_random_beta(k::Int, p::Int, traits::Int; overlap::Int=0)
    true_b = zeros(p, traits)
    if overlap == 0
        causal_snps = sample(1:(traits * p), k, replace=false)
        true_b[causal_snps] = randn(k)
    else
        shared_snps = sample(1:p, overlap, replace=false)
        weight_vector = aweights(1 / (traits * (p - overlap)) * ones(traits * p))
        for i in 1:traits
            weight_vector[i*shared_snps] .= 0.0 # avoid sampling from shared snps
        end
        @assert sum(weight_vector) ≈ 1.0
        # simulate β for shared predictors
        for i in 1:traits
            true_b[shared_snps, i] = randn(overlap)
        end
        # simulate β for none shared predictors
        nonshared_snps = sample(1:(traits * p), weight_vector, k - traits * overlap, replace=false)
        true_b[nonshared_snps] = randn(k - traits * overlap)
    end

    return true_b
end

simulate_random_beta

In [24]:
n = 1000  # number of samples
p = 10000 # number of SNPs
k = 10    # number of causal SNPs
r = 2     # number of traits

# set random seed for reproducibility
Random.seed!(111)

# simulate `.bed` file with no missing data
x = simulate_random_snparray("multivariate_$(r)traits.bed", n, p)
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# intercept is the only nongenetic covariate
z = ones(n, 1)
intercepts = [0.5 1.0] # each trait have different intercept

# simulate β
B = simulate_random_beta(k, p, r, overlap=2)

# between trait covariance matrix
Σ = random_covariance_matrix(r)

# between sample covariance is identity + GRM (2 times because in SnpArrays grm is halved)
Φ = 2grm(x)
σg = 0.6
σe = 0.4
V = σg * Φ + σe * I

# simulate y using TraitSimulations.jl
# VCM_model = VCMTrait(z, intercepts, x, B, [Σ], [V]) #https://github.com/OpenMendel/TraitSimulation.jl/blob/6d1f09c7332471a74b4dd6c8ef2d2b95a96c585c/src/modelframework.jl#L159
# Y = simulate(VCM_model)

# simulate using naive model
μ = z * intercepts + xla * B
Yt = rand(MatrixNormal(μ', Σ, V))

2×1000 Array{Float64,2}:
 6.99773  6.69385  -1.65585  -3.52854   …  -3.11521   -4.13734  -2.32211
 4.49129  5.74267   4.72216   0.575205     -0.585688  -4.44368   4.02092

In [75]:
# create covariate files (each sample occupies a row)
writedlm("covariates.txt", z)

# create `.bim` and `.bam` files using phenotype
make_bim_fam_files(x, Transpose(Yt), "multivariate_$(r)traits")

# save phenotypes in separate file (each sample occupies a row)
writedlm("multivariate_$(r)traits.phen", Yt', ',')

## Run IHT 

In [92]:
result1 = iht("multivariate_$(r)traits", 17, col=[6, 7], verbose=false)
result2 = iht("multivariate_$(r)traits", "covariates.txt", 17, col=[6, 7], verbose=false)
result3 = iht("multivariate_$(r)traits.phen", "multivariate_$(r)traits", "covariates.txt", 17, col=[6, 7], verbose=false);

In [93]:
true_b1_idx = findall(!iszero, B[:, 1])
[result1.beta[1, true_b1_idx] result2.beta[1, true_b1_idx] result3.beta[1, true_b1_idx]]

5×3 Array{Float64,2}:
 -0.254593  -0.254593  -0.254593
  0.820815   0.820815   0.820815
  0.0        0.0        0.0
 -1.40296   -1.40296   -1.40296
  1.24518    1.24518    1.24518

In [94]:
true_b2_idx = findall(!iszero, B[:, 2])
[result1.beta[2, true_b2_idx] result2.beta[2, true_b2_idx] result3.beta[2, true_b2_idx]]

5×3 Array{Float64,2}:
  0.0        0.0        0.0
 -0.11455   -0.11455   -0.11455
 -2.70483   -2.70483   -2.70483
 -0.218901  -0.218901  -0.218901
  0.312761   0.312761   0.312761

In [95]:
# covariance matrix
[vec(result1.Σ) vec(result2.Σ) vec(result3.Σ)]

4×3 Array{Float64,2}:
 5.9949    5.9949    5.9949
 2.0924    2.0924    2.0924
 2.0924    2.0924    2.0924
 0.741367  0.741367  0.741367

In [96]:
rm("multivariate_$(r)traits.bim", force=true)
rm("multivariate_$(r)traits.bed", force=true)
rm("multivariate_$(r)traits.fam", force=true)
rm("multivariate_$(r)traits.phen", force=true)
rm("covariates.txt", force=true)

In [21]:
# 4 cores
Random.seed!(2020)
@time mses = cv_iht(Yt, Transpose(xla), Zt, path=1:20, parallel=true, max_iter=20);

[32mCross validating...100%|████████████████████████████████| Time: 0:00:07[39m




Crossvalidation Results:
	k	MSE
	1	8079.418619716553
	2	3627.3096599448268
	3	1865.7585855184807
	4	1692.7063084360427
	5	1542.7593074646843
	6	1499.8616100579775
	7	1433.6667306296458
	8	1421.9052004472687
	9	1421.2552305804886
	10	1423.4614532506505
	11	1420.2679722482376
	12	1420.0053086973176
	13	1389.0348278471117
	14	1384.3136635494318
	15	1381.4910373174566
	16	1376.2158720886953
	17	1365.8686420435863
	18	1367.0589543865267
	19	1370.884134297744
	20	1370.9739594368748

Best k = 17

  7.276093 seconds (36.59 k allocations: 8.038 MiB)
