In [1]:
using Distributed
addprocs(4)

@everywhere begin
    using Revise
    using MendelIHT
    using SnpArrays
    using Random
    using GLM
    using DelimitedFiles
    using Test
    using Distributions
    using LinearAlgebra
    using CSV
    using DataFrames
    using StatsBase
    BLAS.set_num_threads(1) # remember to set BLAS threads to 1 !!!
end

┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1278


# Univariate IHT

In [72]:
n = 1000  # number of samples
p = 10000 # number of SNPs
k = 10    # number of causal SNPs per trait
d = Normal
l = canonicallink(d())
# l = LogLink()

# set random seed for reproducibility
Random.seed!(2021)

# simulate `.bed` file with no missing data
x = simulate_random_snparray("univariate.bed", n, p)
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# intercept is the only nongenetic covariate
z = ones(n)
intercept = 1.0

# simulate response y, true model b, and the correct non-0 positions of b
y, true_b, correct_position = simulate_random_response(xla, k, d, l, Zu=z*intercept);

In [73]:
# create covariate files
writedlm("covariates.txt", z)

# create `.bim` and `.bam` files using phenotype
make_bim_fam_files(x, y, "univariate")

# save phenotypes in separate file
writedlm("univariate.phen", y, ',')

## Run IHT

In [74]:
result1 = iht("univariate", 11, d, verbose=false)
result2 = iht("univariate", 11, d, covariates="covariates.txt", verbose=false)
result3 = iht("univariate", 11, d, covariates="covariates.txt", phenotypes="univariate.phen", verbose=false);

result = 
IHT estimated 10 nonzero SNP predictors and 1 non-genetic predictors.

Compute time (sec):     0.4673960208892822
Final loglikelihood:    -1472.3905989669602
SNP PVE:                0.8426823000266495
Iterations:             10

Selected genetic predictors:
10×2 DataFrame
 Row │ Position  Estimated_β
     │ Int64     Float64
─────┼───────────────────────
   1 │      782    -0.437828
   2 │      901     0.747956
   3 │     1204     0.691327
   4 │     1306    -1.42505
   5 │     1655    -0.19456
   6 │     3160    -0.861591
   7 │     3936    -0.147235
   8 │     4201     0.338606
   9 │     4402    -0.126472
  10 │     6879    -1.21895

Selected nongenetic predictors:
1×2 DataFrame
 Row │ Position  Estimated_β
     │ Int64     Float64
─────┼───────────────────────
   1 │        1      1.02016
result = 
IHT estimated 10 nonzero SNP predictors and 1 non-genetic predictors.

Compute time (sec):     0.4794940948486328
Final loglikelihood:    -1472.3905989669602
SNP PVE:          

In [39]:
[result1.beta[correct_position] result2.beta[correct_position] result3.beta[correct_position]]

10×3 Array{Float64,2}:
 -0.437828  -0.437828  -0.437828
  0.747956   0.747956   0.747956
  0.691327   0.691327   0.691327
 -1.42505   -1.42505   -1.42505
 -0.19456   -0.19456   -0.19456
 -0.861591  -0.861591  -0.861591
  0.338606   0.338606   0.338606
  0.0        0.0        0.0
  0.0        0.0        0.0
 -1.21895   -1.21895   -1.21895

## Check answer

In [40]:
# first beta
[result1.beta[correct_position] true_b[correct_position]]

10×2 Array{Float64,2}:
 -0.437828  -0.402269
  0.747956   0.758756
  0.691327   0.729135
 -1.42505   -1.47163
 -0.19456   -0.172668
 -0.861591  -0.847906
  0.338606   0.296183
  0.0       -0.0034339
  0.0        0.125965
 -1.21895   -1.24972

In [41]:
# non genetic covariates
[result1.c intercept]

1×2 Array{Float64,2}:
 1.02016  1.0

## Test Cross validation

In [80]:
Random.seed!(2020)
result1 = cross_validate("univariate", d, verbose=false)
Random.seed!(2020)
result2 = cross_validate("univariate", d, covariates="covariates.txt", verbose=false)
Random.seed!(2020)
result3 = cross_validate("univariate", d, covariates="covariates.txt", phenotypes="univariate.phen", verbose=false);

└ @ Base.Docs docs/Docs.jl:227
└ @ Base.Docs docs/Docs.jl:227
└ @ Base.Docs docs/Docs.jl:227
└ @ Base.Docs docs/Docs.jl:227
[32mCross validating...100%|████████████████████████████████| Time: 0:00:01[39m
[32mCross validating...100%|████████████████████████████████| Time: 0:00:01[39m
[32mCross validating...100%|████████████████████████████████| Time: 0:00:01[39m


In [81]:
[result1 result2 result3]

20×3 Array{Float64,2}:
 1218.5    1218.5    1218.5
  842.556   842.556   842.556
  634.041   634.041   634.041
  487.584   487.584   487.584
  391.393   391.393   391.393
  305.317   305.317   305.317
  267.913   267.913   267.913
  243.057   243.057   243.057
  243.476   243.476   243.476
  245.647   245.647   245.647
  250.631   250.631   250.631
  253.988   253.988   253.988
  254.795   254.795   254.795
  255.896   255.896   255.896
  263.607   263.607   263.607
  269.066   269.066   269.066
  271.058   271.058   271.058
  274.433   274.433   274.433
  279.33    279.33    279.33
  284.583   284.583   284.583

In [82]:
rm("univariate.bim", force=true)
rm("univariate.bed", force=true)
rm("univariate.fam", force=true)
rm("univariate.phen", force=true)
rm("covariates.txt", force=true)
rm("cviht.summary.txt", force=true)
rm("iht.summary.txt", force=true)
rm("iht.beta.txt", force=true)

# Multivariate IHT

In [83]:
"""
k = Number of causal SNPs
p = Total number of SNPs
traits = Number of traits (phenotypes)
overlap = number of causal SNPs shared in each trait
"""
function simulate_random_beta(k::Int, p::Int, traits::Int; overlap::Int=0)
    true_b = zeros(p, traits)
    if overlap == 0
        causal_snps = sample(1:(traits * p), k, replace=false)
        true_b[causal_snps] = randn(k)
    else
        shared_snps = sample(1:p, overlap, replace=false)
        weight_vector = aweights(1 / (traits * (p - overlap)) * ones(traits * p))
        for i in 1:traits
            weight_vector[i*shared_snps] .= 0.0 # avoid sampling from shared snps
        end
        @assert sum(weight_vector) ≈ 1.0
        # simulate β for shared predictors
        for i in 1:traits
            true_b[shared_snps, i] = randn(overlap)
        end
        # simulate β for none shared predictors
        nonshared_snps = sample(1:(traits * p), weight_vector, k - traits * overlap, replace=false)
        true_b[nonshared_snps] = randn(k - traits * overlap)
    end

    return true_b
end

simulate_random_beta

In [84]:
n = 1000  # number of samples
p = 10000 # number of SNPs
k = 10    # number of causal SNPs
r = 2     # number of traits
d = MvNormal

# set random seed for reproducibility
Random.seed!(111)

# simulate `.bed` file with no missing data
x = simulate_random_snparray("multivariate_$(r)traits.bed", n, p)
xla = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true) 

# intercept is the only nongenetic covariate
z = ones(n, 1)
intercepts = [0.5 1.0] # each trait have different intercept

# simulate β
B = simulate_random_beta(k, p, r, overlap=2)

# between trait covariance matrix
Σ = random_covariance_matrix(r)

# between sample covariance is identity + GRM (2 times because in SnpArrays grm is halved)
Φ = 2grm(x)
σg = 0.6
σe = 0.4
V = σg * Φ + σe * I

# simulate y using TraitSimulations.jl
# VCM_model = VCMTrait(z, intercepts, x, B, [Σ], [V]) #https://github.com/OpenMendel/TraitSimulation.jl/blob/6d1f09c7332471a74b4dd6c8ef2d2b95a96c585c/src/modelframework.jl#L159
# Y = simulate(VCM_model)

# simulate using naive model
μ = z * intercepts + xla * B
Yt = rand(MatrixNormal(μ', Σ, V))

2×1000 Array{Float64,2}:
 6.99773  6.69385  -1.65585  -3.52854   …  -3.11521   -4.13734  -2.32211
 4.49129  5.74267   4.72216   0.575205     -0.585688  -4.44368   4.02092

In [85]:
# create covariate files (each sample occupies a row)
writedlm("covariates.txt", z)

# create `.bim` and `.bam` files using phenotype
make_bim_fam_files(x, Transpose(Yt), "multivariate_$(r)traits")

# save phenotypes in separate file (each sample occupies a row)
writedlm("multivariate_$(r)traits.phen", Yt', ',')

## Run IHT 

In [94]:
result1 = iht("multivariate_$(r)traits", 17, d, phenotypes=[6, 7], verbose=false)
result2 = iht("multivariate_$(r)traits", 17, d, phenotypes=[6, 7], covariates="covariates.txt", verbose=false)
result3 = iht("multivariate_$(r)traits", 17, d, phenotypes="multivariate_$(r)traits.phen",
    covariates="covariates.txt", verbose=false);



result = 
Compute time (sec):     3.1750080585479736
Final loglikelihood:    3357.84017429941
Iterations:             33
Trait 1's SNP PVE:      0.40782236985473796
Trait 2's SNP PVE:      0.8925672476623521

Trait 1: IHT estimated 7 nonzero SNP predictors
7×2 DataFrame
 Row │ Position  Estimated_β
     │ Int64     Float64
─────┼───────────────────────
   1 │     2351   -0.0173905
   2 │     2529   -0.254593
   3 │     2850   -0.0151754
   4 │     2872    0.820815
   5 │     2986   -0.0125935
   6 │     4248   -1.40296
   7 │     8921    1.24518

Trait 1: IHT estimated 1 non-genetic predictors
1×2 DataFrame
 Row │ Position  Estimated_β
     │ Int64     Float64
─────┼───────────────────────
   1 │        1     0.506032

Trait 2: IHT estimated 8 nonzero SNP predictors
8×2 DataFrame
 Row │ Position  Estimated_β
     │ Int64     Float64
─────┼───────────────────────
   1 │      359   -0.0123443
   2 │     4710   -0.11455
   3 │     6991   -2.70483
   4 │     8738    0.0127784
   5 │     

└ @ Base.Docs docs/Docs.jl:227
└ @ Base.Docs docs/Docs.jl:227
└ @ Base.Docs docs/Docs.jl:227
└ @ Base.Docs docs/Docs.jl:227


In [87]:
true_b1_idx = findall(!iszero, B[:, 1])
[result1.beta[1, true_b1_idx] result2.beta[1, true_b1_idx] result3.beta[1, true_b1_idx]]

5×3 Array{Float64,2}:
 -0.254593  -0.254593  -0.254593
  0.820815   0.820815   0.820815
  0.0        0.0        0.0
 -1.40296   -1.40296   -1.40296
  1.24518    1.24518    1.24518

In [88]:
true_b2_idx = findall(!iszero, B[:, 2])
[result1.beta[2, true_b2_idx] result2.beta[2, true_b2_idx] result3.beta[2, true_b2_idx]]

5×3 Array{Float64,2}:
  0.0        0.0        0.0
 -0.11455   -0.11455   -0.11455
 -2.70483   -2.70483   -2.70483
 -0.218901  -0.218901  -0.218901
  0.312761   0.312761   0.312761

In [89]:
# covariance matrix
[vec(result1.Σ) vec(result2.Σ) vec(result3.Σ)]

4×3 Array{Float64,2}:
 5.9949    5.9949    5.9949
 2.0924    2.0924    2.0924
 2.0924    2.0924    2.0924
 0.741367  0.741367  0.741367

## Multivariate Cross validation

In [90]:
Random.seed!(2020)
result1 = cross_validate("multivariate_$(r)traits", d, phenotypes=[6, 7], verbose=false)
Random.seed!(2020)
result2 = cross_validate("multivariate_$(r)traits", d, phenotypes=[6, 7], covariates="covariates.txt", verbose=false)
Random.seed!(2020)
result3 = cross_validate("multivariate_$(r)traits", d, phenotypes="multivariate_$(r)traits.phen", 
    covariates="covariates.txt", verbose=false);

[32mCross validating...100%|████████████████████████████████| Time: 0:00:14[39m
[32mCross validating...100%|████████████████████████████████| Time: 0:00:15[39m
[32mCross validating...100%|████████████████████████████████| Time: 0:00:15[39m


In [91]:
[result1 result2 result3]

20×3 Array{Float64,2}:
 8079.42  8079.42  8079.42
 2601.87  2601.87  2601.87
 1865.9   1865.9   1865.9
 1704.34  1704.34  1704.34
 1552.04  1552.04  1552.04
 1542.36  1542.36  1542.36
 1480.45  1480.45  1480.45
 1443.06  1443.06  1443.06
 1447.93  1447.93  1447.93
 1443.11  1443.11  1443.11
 1443.45  1443.45  1443.45
 1441.69  1441.69  1441.69
 1396.29  1396.29  1396.29
 1387.62  1387.62  1387.62
 1376.78  1376.78  1376.78
 1375.95  1375.95  1375.95
 1364.46  1364.46  1364.46
 1364.62  1364.62  1364.62
 1365.16  1365.16  1365.16
 1365.5   1365.5   1365.5

In [70]:
rm("multivariate_$(r)traits.bim", force=true)
rm("multivariate_$(r)traits.bed", force=true)
rm("multivariate_$(r)traits.fam", force=true)
rm("multivariate_$(r)traits.phen", force=true)
rm("covariates.txt", force=true)
rm("cviht.summary.txt", force=true)
rm("iht.summary.txt", force=true)
rm("iht.beta.txt", force=true)