# Analyze LDL by thresholding at 145mg/dL

In [1]:
using Distributed
addprocs(30)
nprocs()

31

In [2]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using DelimitedFiles
using BenchmarkTools
using Random
using LinearAlgebra
using GLM
using CSV
using Dates

# Import data

+ ** For description of what each phenotype column means**, see here:
https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/dataset.cgi?study_id=phs000276.v2.p1&phv=129612&phd=&pha=2896&pht=2005&phvf=&phdf=&phaf=&phtf=&dssp=1&consent=&temp=1

+ **We have already matched up phenotype with genotypes** in the files below

In [3]:
# import full genotype data
kevin_stampeed = SnpArray("../kevin_imputed.bed")

# import full phenotype data
phenotype_data = CSV.read("full_phenotype_sorted", delim=',', header=true)

Unnamed: 0_level_0,dbGaP_Subject_ID,SUBJID,SEX,a10atc,crp3dec,FASTING_STATUS,FB_GLUK,FS_INS,FS_KOL,FS_KOL_H,FS_KOL_L,FS_TRIGL,HOMA_IR,PAILAHDE,Pills31,PITLAHDE,ZP4202U,ZT20,ZT27,ZT28,ZT29,ZT30,CASE,RACE,BMI,Systolic Blood Pressure MEAN,Diastolic Blood Pressure MEAN
Unnamed: 0_level_1,Int64⍰,Int64⍰,Int64⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,Int64⍰,String⍰,String⍰,String⍰,String⍰
1,356529,2864,2,0,0.373,0,4.8,8.8,6.73,2.26,4.2,0.64,1.13,1,1,1,1,2,166.8,55.7,92.4,71.8,2,Caucasian,20,110,79
2,358247,2374,1,0,0.38,0,5.6,7,3.95,1.55,2.1,0.67,0.94,1,X,1,X,X,179.6,67.2,92.5,79,2,Caucasian,20.8,117,72
3,356035,2803,1,0,9.214,0,5.7,12.9,7.44,1.19,5.6,1.52,1.72,1,X,1,X,X,177,101.7,112,117,2,Caucasian,32.5,147,97
4,353849,4709,1,0,0.61,0,4.9,8.7,3.92,1.44,2.3,0.35,1.12,1,X,1,X,X,173.4,89.9,103,102,2,Caucasian,29.9,125,60
5,354502,4350,1,0,0.593,0,4.9,3.6,3.53,1.53,1.8,0.39,0.47,1,X,1,1,2,170.8,66.5,90,75,2,Caucasian,22.8,X,X
6,356015,3480,2,0,1.774,0,4.7,8.7,3.59,1.48,1.8,0.69,1.11,1,0,1,0,2,170.5,71.6,99,78,2,Caucasian,24.6,138,84
7,356268,3729,1,0,2.564,0,4.6,5.7,5.71,2.01,3.4,0.6,0.73,1,X,1,X,X,182,73.5,91,84,2,Caucasian,22.2,132,76
8,358341,4031,1,0,3.998,0,5.9,7.6,4.11,1.31,2.3,1.03,1.03,1,X,1,X,X,182.3,85.4,102,92,2,Caucasian,25.7,132,100
9,357054,1104,1,0,3.238,0,5.4,12.2,5.93,0.92,3.5,3.24,1.6,1,X,1,X,X,178.4,91.1,102,99,2,Caucasian,28.6,140,110
10,354105,395,1,0,0.83,0,4.1,3.4,3.97,1.58,2.1,0.54,0.42,1,X,1,1,X,172.5,68.3,89.5,78.5,2,Caucasian,23,118,87


# Filtered phenotype and genotype data for LDL (FS_KOL_L)

In [4]:
# exclude samples without HDL measurements
LDL = phenotype_data[:FS_KOL_L]
missing_LDL_data = LDL .== "X"

# exclude people that are fasting
fasting_data = phenotype_data[:FASTING_STATUS]
contains_nonfasting_blood = (fasting_data .== "1") .+ (fasting_data .== "X")

# exlucde people on diabetes medication
diabetes_med = phenotype_data[:a10atc]
contains_diabetes_medication = (diabetes_med .== "1") .+ (diabetes_med .== "X")

# exclude SNPs with maf < 0.01 and SNPs with HWE p-value < 0.00001
rowmask, snps_to_keep = SnpArrays.filter(kevin_stampeed, min_success_rate_per_row=1.0, 
    min_success_rate_per_col=1.0, min_maf=0.01, min_hwe_pval=1e-5)

# combine
samples_to_exclude = missing_LDL_data .+ contains_nonfasting_blood .+ contains_diabetes_medication .+ rowmask
samples_to_keep = samples_to_exclude .== 0

@show count(snps_to_keep)
@show count(samples_to_keep)

count(snps_to_keep) = 324789
count(samples_to_keep) = 4893


4893

In [5]:
SnpArrays.filter("../kevin_imputed", samples_to_keep, snps_to_keep, des="kevin_imputed_filtered_LDL")

4893×324789 SnpArray:
 0x03  0x03  0x03  0x03  0x03  0x03  …  0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x02  0x02  0x02  0x02  0x03     0x03  0x03  0x02  0x03  0x03  0x03
 0x02  0x03  0x03  0x03  0x02  0x03     0x02  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03     0x03  0x03  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x03  0x03     0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03  …  0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03     0x02  0x02  0x02  0x02  0x02  0x03
 0x03  0x03  0x03  0x03  0x03  0x03     0x02  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x03  0x03     0x03  0x00  0x00  0x00  0x00  0x03
 0x02  0x00  0x00  0x00  0x03  0x03     0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03  …  0x03  0x00  0x03  0x03  0x03  0x02
 0x03  0x03  0x03  0x03  0x02  0x03     0x03  0x00  0x00  0x03  0x03  0x03
 0x03  0x02  0x02  0x02  0x03  0x03     0x03  0x03  0x00  0x02  0x02  0x03
   

## Compute top 2 principal components on resulting file using plink2

The following command was executed:
```
./plink2 --bfile kevin_imputed_filtered_LDL --pca 2
```

# Begin analysis

Here we truncate the LDL level at 160 mg/dL, which is the range for [high LDL cholesterol category](https://medlineplus.gov/ldlthebadcholesterol.html). 

In [6]:
# first check genotype and phenotype files actually match 
genotype_order = CSV.read("kevin_imputed_filtered_LDL.fam", delim=' ', header=false)[:, 1]
phenotype_order = phenotype_data[samples_to_keep, 2]
all(phenotype_order .== genotype_order)

true

In [7]:
cutoff = 145
molecular_weight = 386.654 #g/mol 

# convert LDL from mmol/L to mg/dL
LDL_converted = parse.(Float64, LDL[samples_to_keep]) .* molecular_weight ./ 10

# truncate 
LDL_truncated = LDL_converted .>= cutoff
y = convert(Vector{Float64}, LDL_truncated)
@show count(!iszero, y)

# check truncation
[y LDL_converted]

count(!iszero, y) = 932


4893×2 Array{Float64,2}:
 1.0  162.395 
 0.0   81.1973
 1.0  216.526 
 0.0   88.9304
 0.0   69.5977
 0.0   69.5977
 0.0  131.462 
 0.0   88.9304
 0.0  135.329 
 0.0   81.1973
 0.0  104.397 
 1.0  170.128 
 0.0  119.863 
 ⋮            
 0.0  139.195 
 0.0   88.9304
 0.0  131.462 
 0.0  123.729 
 0.0  123.729 
 1.0  173.994 
 1.0  158.528 
 0.0  123.729 
 1.0  158.528 
 0.0   85.0639
 0.0  143.062 
 0.0   61.8646

In [8]:
x = SnpArray("kevin_imputed_filtered_LDL.bed")

4893×324789 SnpArray:
 0x03  0x03  0x03  0x03  0x03  0x03  …  0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x02  0x02  0x02  0x02  0x03     0x03  0x03  0x02  0x03  0x03  0x03
 0x02  0x03  0x03  0x03  0x02  0x03     0x02  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03     0x03  0x03  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x03  0x03     0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03  …  0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03     0x02  0x02  0x02  0x02  0x02  0x03
 0x03  0x03  0x03  0x03  0x03  0x03     0x02  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x03  0x03     0x03  0x00  0x00  0x00  0x00  0x03
 0x02  0x00  0x00  0x00  0x03  0x03     0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03  …  0x03  0x00  0x03  0x03  0x03  0x02
 0x03  0x03  0x03  0x03  0x02  0x03     0x03  0x00  0x00  0x03  0x03  0x03
 0x03  0x02  0x02  0x02  0x03  0x03     0x03  0x03  0x00  0x02  0x02  0x03
   

# add non-genetic covariates

In [9]:
n, p = size(x)
z = zeros(n, 4) 

# add intercept
z[:, 1] .= ones(n) 

# add sexOCPG 
for i in 1:n
    if phenotype_data[:SEX][i] == 1 #males
        z[i, 2] = 0.0
    else
        # oral contraceptive: 0 = no, 1 = yes, X = unknown
        # pregnancy: 1 = yes, 2 = no, 3 and X = unknown
        my_covariates = (phenotype_data[:ZP4202U][i], phenotype_data[:ZT20][i])
        
        my_covariates == ("0", "1") && (z[i, 2] = 1)
        my_covariates == ("0", "2") && (z[i, 2] = 2)
        my_covariates == ("0", "3") && (z[i, 2] = 3)
        my_covariates == ("0", "X") && (z[i, 2] = 3)
        
        my_covariates == ("1", "1") && (z[i, 2] = 4)
        my_covariates == ("1", "2") && (z[i, 2] = 5)
        my_covariates == ("1", "3") && (z[i, 2] = 6)
        my_covariates == ("1", "X") && (z[i, 2] = 6)
        
        my_covariates == ("X", "1") && (z[i, 2] = 7)
        my_covariates == ("X", "2") && (z[i, 2] = 8)
        my_covariates == ("X", "3") && (z[i, 2] = 9)
        my_covariates == ("X", "X") && (z[i, 2] = 9)
    end 
end

# add first 2 principal components
pc = CSV.read("kevin_imputed_filtered_LDL.eigenvec", delim="\t", header=true)
pc1 = pc[:, 3]
pc2 = pc[:, 4]

# standardize all covariates
mean_sexOCPG = mean(z[:, 2])
mean_pc1 = mean(pc1)
mean_pc2 = mean(pc2)
std_pc1 = std(pc1)
std_pc2 = std(pc2)
std_sexOCPG = std(z[:, 2])
z[:, 2] .= (z[:, 2] .- mean_sexOCPG) ./ std_sexOCPG
z[:, 3] .= (pc1 .- mean_pc1) ./ std_pc1
z[:, 4] .= (pc2 .- mean_pc2) ./ std_pc2

4893-element view(::Array{Float64,2}, :, 4) with eltype Float64:
  0.10918913881138093 
 -0.7345256818351892  
 -0.40101026842192733 
  0.22617552428135446 
 -0.4699760018100405  
 -1.1970016940344952  
  0.38950673151161685 
 -0.17837942757154795 
 -0.5241565193518678  
 -0.7190543267796122  
 -0.7309446086939345  
 -0.752549950360606   
 -0.04493052713418643 
  ⋮                   
 -0.10517306011010917 
 -0.966162362093146   
 -0.6935398800765202  
 -0.33943679324278325 
 -0.23620676451933265 
 -0.16548196883624772 
  0.257249726921914   
 -0.4297553746639755  
  0.023923787318106026
  0.07236913229281212 
 -0.4437726181842657  
  0.07376309299017944 

# Run Cross validation

In [10]:
#set seed
Random.seed!(2019)

d = Bernoulli
l = canonicallink(d())
path = collect(1:20)
num_folds = 5
folds = rand(1:num_folds, size(x, 1))

4893-element Array{Int64,1}:
 5
 5
 4
 3
 4
 4
 4
 1
 4
 5
 4
 4
 4
 ⋮
 1
 5
 1
 2
 4
 4
 3
 5
 2
 2
 3
 4

In [11]:
println("start time = " * string(Dates.format(now(), "HH:MM")))
mses = cv_iht(d(), l, x, z, y, 1, path, folds, num_folds, debias=false, parallel=true)
println("end time = " * string(Dates.format(now(), "HH:MM")))

start time = 11:42


Crossvalidation Results:
	k	MSE
	1	954.0325463026837
	2	941.357036600655
	3	938.7044350380628
	4	939.9824474819263
	5	942.9919415257091
	6	946.6448432498211
	7	946.0341458088332
	8	954.109030428348
	9	960.6091985739399
	10	959.3305785180676
	11	968.1859464169147
	12	961.7063457539242
	13	970.6841587103604
	14	970.0187888789875
	15	977.2596277686467
	16	977.231016599025
	17	982.1684065356247
	18	993.8036026006517
	19	999.4501797610317
	20	993.1084554757742

The lowest MSE is achieved at k = 3 

end time = 12:49


# Run with full dataset on best $k$

In [12]:
xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true); 
@show k = argmin(mses)
d = Normal
l = canonicallink(d())
result = L0_reg(x, xbm, z, y, 1, k, d(), l, debias=true)

k = argmin(mses) = 3


IHT results:

Compute time (sec):     45.00387215614319
Final loglikelihood:    -2318.800016368827
Iterations:             3
Max number of groups:   1
Max predictors/group:   3
IHT estimated 2 nonzero SNP predictors and 1 non-genetic predictors.

Genetic predictors:
2×2 DataFrame
│ Row │ Position │ Estimated_β │
│     │ [90mInt64[39m    │ [90mFloat64[39m     │
├─────┼──────────┼─────────────┤
│ 1   │ 11780    │ 0.0322938   │
│ 2   │ 119293   │ -0.0467026  │

Nongenetic predictors:
1×2 DataFrame
│ Row │ Position │ Estimated_β │
│     │ [90mInt64[39m    │ [90mFloat64[39m     │
├─────┼──────────┼─────────────┤
│ 1   │ 1        │ 0.19047     │

In [13]:
estimated_b = result.beta
position = findall(!iszero, estimated_b)
found_snps = CSV.read("kevin_imputed_filtered_LDL.bim", delim='\t', header=false)[position, :]

Unnamed: 0_level_0,Column1,Column2,Column3,Column4,Column5,Column6
Unnamed: 0_level_1,Int64⍰,String⍰,Int64⍰,Int64⍰,String⍰,String⍰
1,1,rs646776,0,109620053,G,A
2,6,rs6917603,0,30125050,G,A


In [14]:
xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true); 
@show k = argmin(mses)
d = Normal
l = canonicallink(d())
result = L0_reg(x, xbm, z, y, 1, k, d(), l, debias=false)

k = argmin(mses) = 3


IHT results:

Compute time (sec):     43.26154685020447
Final loglikelihood:    -2318.789024629622
Iterations:             3
Max number of groups:   1
Max predictors/group:   3
IHT estimated 2 nonzero SNP predictors and 1 non-genetic predictors.

Genetic predictors:
2×2 DataFrame
│ Row │ Position │ Estimated_β │
│     │ [90mInt64[39m    │ [90mFloat64[39m     │
├─────┼──────────┼─────────────┤
│ 1   │ 11780    │ 0.0322938   │
│ 2   │ 119293   │ -0.0467025  │

Nongenetic predictors:
1×2 DataFrame
│ Row │ Position │ Estimated_β │
│     │ [90mInt64[39m    │ [90mFloat64[39m     │
├─────┼──────────┼─────────────┤
│ 1   │ 1        │ 0.190476    │

In [15]:
estimated_b = result.beta
position = findall(!iszero, estimated_b)
found_snps = CSV.read("kevin_imputed_filtered_LDL.bim", delim='\t', header=false)[position, :]

Unnamed: 0_level_0,Column1,Column2,Column3,Column4,Column5,Column6
Unnamed: 0_level_1,Int64⍰,String⍰,Int64⍰,Int64⍰,String⍰,String⍰
1,1,rs646776,0,109620053,G,A
2,6,rs6917603,0,30125050,G,A
