# Analyze HDL using imputed stampeed data

Things to check:
+ Why did I not get the same number of samples after exclusion criterion for HDL?

In [1]:
using Distributed
addprocs(30)
nprocs()

31

In [2]:
using MendelIHT
using SnpArrays
using DataFrames
using Distributions
using BenchmarkTools
using Random
using LinearAlgebra
using GLM
using CSV
using Dates

# Import data

+ ** For description of what each phenotype column means**, see here:
https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/dataset.cgi?study_id=phs000276.v2.p1&phv=129612&phd=&pha=2896&pht=2005&phvf=&phdf=&phaf=&phtf=&dssp=1&consent=&temp=1

+ **To match up phenotype with genotype**, see `/Volumes/ExternalDrive/stampeed_data/original_dbgap/StudyFiles_decrypted/Release_Notes.phs000276.NFBC66.v2.p1.MULTI.pdf` on page 2. Basically, SUBJID in `phenotype_data` below corresponds to the first column of the `.fam` file (same as `kevin_imputed_filtered_HDL.fam`) at  `/Volumes/ExternalDrive/stampeed_data/original_dbgap/genotypes_decrypted/Mat/NFBC_dbGaP_20091127.fam`.

In [3]:
# import full genotype data
kevin_stampeed = SnpArray("kevin_imputed.bed")

# import full phenotype data
phenotype_data = CSV.read("phs000276.v2.pht002005.v1.p1.c1.NFBC66_Subject_Phenotypes.GRU.txt", delim='\t', header=true)

Unnamed: 0_level_0,dbGaP_Subject_ID,SUBJID,SEX,a10atc,crp3dec,FASTING_STATUS,FB_GLUK,FS_INS,FS_KOL,FS_KOL_H,FS_KOL_L,FS_TRIGL,HOMA_IR,PAILAHDE,Pills31,PITLAHDE,ZP4202U,ZT20,ZT27,ZT28,ZT29,ZT30,CASE,RACE,BMI,Systolic Blood Pressure MEAN,Diastolic Blood Pressure MEAN
Unnamed: 0_level_1,Int64⍰,Int64⍰,Int64⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,Int64⍰,String⍰,String⍰,String⍰,String⍰
1,353881,1,1,0,0.308,0,4.8,6.6,5.79,1.22,4,1.18,0.85,1,X,1,X,X,178.5,70.9,90,78.5,2,Caucasian,22.3,121,70
2,354298,2,1,0,0.472,0,4.8,14.4,5.26,1.21,2.7,3.06,1.83,1,X,1,X,X,182,86.9,98,94,2,Caucasian,26.2,126,77
3,355106,3,2,X,0.073,0,5,5.7,4.44,1.95,2.2,0.65,0.74,1,0,1,X,2,176.9,66.5,95.4,73.9,2,Caucasian,21.3,102,52
4,356463,4,2,0,0.355,0,4.4,5.8,5.29,1.6,3.4,0.57,0.73,1,0,1,X,2,170.7,59.1,92.5,71,2,Caucasian,20.3,100,66
5,356890,5,2,0,4.335,0,4.8,5.3,4.16,1.89,2,0.69,0.68,1,0,1,X,2,168.4,58.8,93.5,71.5,2,Caucasian,20.7,108,61
6,357608,6,2,0,0.539,0,4.8,14.4,5.27,2.24,2.6,0.94,1.83,1,0,1,X,2,166.6,64.3,96,70.5,2,Caucasian,23.2,111,66
7,358051,7,1,0,0.689,0,5.8,12.2,4.2,1.32,2.7,0.47,1.63,1,X,1,X,X,179.4,77.4,99,85,2,Caucasian,24,116,61
8,353837,8,1,0,2.042,0,5.3,5,5.36,1.46,3.5,0.84,0.66,1,X,1,1,X,175,71.4,100.5,80.5,2,Caucasian,23.3,130,82
9,354785,9,2,0,0.907,0,5.2,7.2,4.77,1.68,2.3,1.73,0.95,1,0,1,X,2,168.1,71.1,100.5,81.5,2,Caucasian,25.2,131,70
10,358398,10,2,0,0.599,0,4.6,6.2,4.27,1.74,2.2,0.77,0.79,1,0,1,0,2,162.6,64.1,98,77.5,2,Caucasian,24.2,120,76


# First sort phenotype data so it matches genotype

In [4]:
genotype_order = CSV.read("kevin_imputed.fam", delim=' ', header=false)[:, 1]
phenotype_order = phenotype_data[:, 2]

phenotype_data_sorted = similar(phenotype_data, 0) 
for i in 1:size(phenotype_data, 1)
    row_in_phenotype = findall(x -> x == genotype_order[i], phenotype_data[:, 2])
    length(row_in_phenotype) == 1 || error("should have only 1 element")
    push!(phenotype_data_sorted, phenotype_data[row_in_phenotype[1], :])
end
phenotype_data_sorted

Unnamed: 0_level_0,dbGaP_Subject_ID,SUBJID,SEX,a10atc,crp3dec,FASTING_STATUS,FB_GLUK,FS_INS,FS_KOL,FS_KOL_H,FS_KOL_L,FS_TRIGL,HOMA_IR,PAILAHDE,Pills31,PITLAHDE,ZP4202U,ZT20,ZT27,ZT28,ZT29,ZT30,CASE,RACE,BMI,Systolic Blood Pressure MEAN,Diastolic Blood Pressure MEAN
Unnamed: 0_level_1,Int64⍰,Int64⍰,Int64⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,String⍰,Int64⍰,String⍰,String⍰,String⍰,String⍰
1,356529,2864,2,0,0.373,0,4.8,8.8,6.73,2.26,4.2,0.64,1.13,1,1,1,1,2,166.8,55.7,92.4,71.8,2,Caucasian,20,110,79
2,358247,2374,1,0,0.38,0,5.6,7,3.95,1.55,2.1,0.67,0.94,1,X,1,X,X,179.6,67.2,92.5,79,2,Caucasian,20.8,117,72
3,356035,2803,1,0,9.214,0,5.7,12.9,7.44,1.19,5.6,1.52,1.72,1,X,1,X,X,177,101.7,112,117,2,Caucasian,32.5,147,97
4,353849,4709,1,0,0.61,0,4.9,8.7,3.92,1.44,2.3,0.35,1.12,1,X,1,X,X,173.4,89.9,103,102,2,Caucasian,29.9,125,60
5,354502,4350,1,0,0.593,0,4.9,3.6,3.53,1.53,1.8,0.39,0.47,1,X,1,1,2,170.8,66.5,90,75,2,Caucasian,22.8,X,X
6,356015,3480,2,0,1.774,0,4.7,8.7,3.59,1.48,1.8,0.69,1.11,1,0,1,0,2,170.5,71.6,99,78,2,Caucasian,24.6,138,84
7,356268,3729,1,0,2.564,0,4.6,5.7,5.71,2.01,3.4,0.6,0.73,1,X,1,X,X,182,73.5,91,84,2,Caucasian,22.2,132,76
8,358341,4031,1,0,3.998,0,5.9,7.6,4.11,1.31,2.3,1.03,1.03,1,X,1,X,X,182.3,85.4,102,92,2,Caucasian,25.7,132,100
9,357054,1104,1,0,3.238,0,5.4,12.2,5.93,0.92,3.5,3.24,1.6,1,X,1,X,X,178.4,91.1,102,99,2,Caucasian,28.6,140,110
10,354105,395,1,0,0.83,0,4.1,3.4,3.97,1.58,2.1,0.54,0.42,1,X,1,1,X,172.5,68.3,89.5,78.5,2,Caucasian,23,118,87


In [5]:
all(phenotype_data_sorted[:, 2] .== genotype_order)

true

In [6]:
#save file
CSV.write("full_phenotype_sorted", phenotype_data_sorted)

"full_phenotype_sorted"

# Filtered phenotype and genotype data for HDL phenotype

In [7]:
#import phenotype data whose order matches the genotypes
phenotype_data_sorted = CSV.read("full_phenotype_sorted")

# exclude samples without HDL measurements
HDL_chol = phenotype_data_sorted[:FS_KOL_H]
missing_HDL_data = HDL_chol .== "X"

# exclude people that are fasting
fasting_data = phenotype_data_sorted[:FASTING_STATUS]
contains_nonfasting_blood = (fasting_data .== "1") .+ (fasting_data .== "X")

# exlucde people on diabetes medication
diabetes_med = phenotype_data_sorted[:a10atc]
contains_diabetes_medication = (diabetes_med .== "1") .+ (diabetes_med .== "X")

# exclude SNPs with maf < 0.01 and SNPs with HWE p-value < 0.00001
rowmask, snps_to_keep = SnpArrays.filter(kevin_stampeed, min_success_rate_per_row=1.0, 
    min_success_rate_per_col=1.0, min_maf=0.01, min_hwe_pval=1e-5)

# combine
samples_to_exclude = missing_HDL_data .+ contains_nonfasting_blood .+ contains_diabetes_medication .+ rowmask
samples_to_keep = samples_to_exclude .== 0

@show count(snps_to_keep)
@show count(samples_to_keep)

count(snps_to_keep) = 324789
count(samples_to_keep) = 4907


4907

## write filtered result to new file

In [8]:
SnpArrays.filter("kevin_imputed", samples_to_keep, snps_to_keep, des="kevin_imputed_filtered_HDL")

4907×324789 SnpArray:
 0x03  0x03  0x03  0x03  0x03  0x03  …  0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x02  0x02  0x02  0x02  0x03     0x03  0x03  0x02  0x03  0x03  0x03
 0x02  0x03  0x03  0x03  0x02  0x03     0x02  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03     0x03  0x03  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x03  0x03     0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03  …  0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03     0x02  0x02  0x02  0x02  0x02  0x03
 0x03  0x03  0x03  0x03  0x03  0x03     0x02  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x03  0x03     0x03  0x00  0x00  0x00  0x00  0x03
 0x02  0x00  0x00  0x00  0x03  0x03     0x03  0x02  0x03  0x03  0x03  0x03
 0x03  0x03  0x03  0x03  0x02  0x03  …  0x03  0x00  0x03  0x03  0x03  0x02
 0x03  0x03  0x03  0x03  0x02  0x03     0x03  0x00  0x00  0x03  0x03  0x03
 0x03  0x02  0x02  0x02  0x03  0x03     0x03  0x03  0x00  0x02  0x02  0x03
   

## Compute top 2 principal components on resulting file using plink2

The following command was executed:
```
./plink2 --bfile kevin_imputed_filtered_HDL --pca 2
```

In [9]:
genotype_order = CSV.read("kevin_imputed.fam", delim=' ', header=false)[:, 1]
all(phenotype_data_sorted[:, 2] .== genotype_order)

true

In [10]:
# first filter out samples not used
filtered_phenotypes = phenotype_data_sorted[samples_to_keep, :]

# order phenotype data so it's columns matches the genotype data
filtered_genotype_order = CSV.read("kevin_imputed_filtered_HDL.fam", delim=' ', header=false)[:, 1]

# check if both are the same
all(filtered_phenotypes[:, 2] .== filtered_genotype_order)

true

# Begin analysis

In [11]:
x = SnpArray("kevin_imputed_filtered_HDL.bed")
y = parse.(Float64, HDL_chol[samples_to_keep])

4907-element Array{Float64,1}:
 2.26
 1.55
 1.19
 1.44
 1.53
 1.48
 2.01
 1.31
 0.92
 1.58
 1.74
 1.4 
 1.66
 ⋮   
 1.67
 1.62
 0.95
 2.2 
 1.67
 1.05
 2.2 
 1.25
 1.27
 1.73
 1.43
 1.49

# Include nongenetic covariates

As per Keys and Sabatti, we include intercept, sexOCPG, and first 2 principal components as nongenetic covariates. Here we standardized all covariates other than the intercept, because we want to ensure equal penalty to all covariates.

In [12]:
n, p = size(x)
z = zeros(n, 4) 

# add intercept
z[:, 1] .= ones(n) 

# add sexOCPG 
for i in 1:n
    if phenotype_data[:SEX][i] == 1 #males
        z[i, 2] = 0.0
    else
        # oral contraceptive: 0 = no, 1 = yes, X = unknown
        # pregnancy: 1 = yes, 2 = no, 3 and X = unknown
        my_covariates = (phenotype_data[:ZP4202U][i], phenotype_data[:ZT20][i])
        
        my_covariates == ("0", "1") && (z[i, 2] = 1)
        my_covariates == ("0", "2") && (z[i, 2] = 2)
        my_covariates == ("0", "3") && (z[i, 2] = 3)
        my_covariates == ("0", "X") && (z[i, 2] = 3)
        
        my_covariates == ("1", "1") && (z[i, 2] = 4)
        my_covariates == ("1", "2") && (z[i, 2] = 5)
        my_covariates == ("1", "3") && (z[i, 2] = 6)
        my_covariates == ("1", "X") && (z[i, 2] = 6)
        
        my_covariates == ("X", "1") && (z[i, 2] = 7)
        my_covariates == ("X", "2") && (z[i, 2] = 8)
        my_covariates == ("X", "3") && (z[i, 2] = 9)
        my_covariates == ("X", "X") && (z[i, 2] = 9)
    end 
end

# add first 2 principal components
pc = CSV.read("kevin_imputed_filtered_HDL.eigenvec", delim="\t", header=true)
pc1 = pc[:, 3]
pc2 = pc[:, 4]

# standardize all covariates
mean_sexOCPG = mean(z[:, 2])
mean_pc1 = mean(pc1)
mean_pc2 = mean(pc2)
std_pc1 = std(pc1)
std_pc2 = std(pc2)
std_sexOCPG = std(z[:, 2])
z[:, 2] .= (z[:, 2] .- mean_sexOCPG) ./ std_sexOCPG
z[:, 3] .= (pc1 .- mean_pc1) ./ std_pc1
z[:, 4] .= (pc2 .- mean_pc2) ./ std_pc2

4907-element view(::Array{Float64,2}, :, 4) with eltype Float64:
  0.11501383899262699
 -0.731947701751833  
 -0.40013164774817345
  0.233679720519238  
 -0.47001409107384234
 -1.1977466058465738 
  0.3955613347813178 
 -0.18019782607838233
 -0.5253507375192138 
 -0.7215813612198158 
 -0.7305818663438984 
 -0.7541022524969485 
 -0.04512293804652651
  ⋮                  
 -0.10299240368094033
 -0.9690497242851563 
 -0.7020604213125643 
 -0.33975191671965843
 -0.23874243466136452
 -0.15942592102314546
  0.25845317310551646
 -0.433975648299967  
  0.03560718021748424
  0.07674803386119655
 -0.4389416857575347 
  0.07464324647614845

In [13]:
@show mean(z[:, 2]), var(z[:, 2])
@show mean(z[:, 3]), var(z[:, 3])
@show mean(z[:, 4]), var(z[:, 4])

(mean(z[:, 2]), var(z[:, 2])) = (9.267319153993562e-17, 1.0000000000000022)
(mean(z[:, 3]), var(z[:, 3])) = (-9.267319153993562e-17, 1.0000000000000002)
(mean(z[:, 4]), var(z[:, 4])) = (-5.792074471245977e-17, 1.0)


(-5.792074471245977e-17, 1.0)

# Run cross validation

In [23]:
Random.seed!(2019)
d = Normal
l = canonicallink(d())
path = collect(1:20)
num_folds = 5
folds = rand(1:num_folds, size(x, 1))

4907-element Array{Int64,1}:
 5
 5
 4
 3
 4
 4
 4
 1
 4
 5
 4
 4
 4
 ⋮
 1
 4
 3
 3
 1
 3
 5
 3
 3
 2
 1
 5

In [24]:
println("start time = " * string(Dates.format(now(), "HH:MM")))
mses = cv_iht(d(), l, x, z, y, 1, path, folds, num_folds, debias=false, parallel=true)
println("end time = " * string(Dates.format(now(), "HH:MM")))

start time = 15:13


Crossvalidation Results:
	k	MSE
	1	145.12410532503964
	2	134.36040289022387
	3	130.72472497157796
	4	125.4161959385786
	5	126.2359895716371
	6	125.78257289367798
	7	124.2757976109952
	8	125.25632875344488
	9	124.02063808893105
	10	124.350144844104
	11	124.97068957148014
	12	125.32195687543482
	13	125.45767566168111
	14	125.96429524409824
	15	125.73779936320864
	16	126.19412010177521
	17	126.6648667684983
	18	126.54742881742231
	19	127.20724353860666
	20	127.00630454726273

The lowest MSE is achieved at k = 9 

end time = 17:17


# Compute final model

In [25]:
xbm = SnpBitMatrix{Float64}(x, model=ADDITIVE_MODEL, center=true, scale=true); 
@show k = argmin(mses)
d = Normal
l = canonicallink(d())
result = L0_reg(x, xbm, z, y, 1, k, d(), l, debias=true)

k = argmin(mses) = 9


IHT results:

Compute time (sec):     81.92405891418457
Final loglikelihood:    -1850.6283699345115
Iterations:             6
Max number of groups:   1
Max predictors/group:   9
IHT estimated 8 nonzero SNP predictors and 1 non-genetic predictors.

Genetic predictors:
8×2 DataFrame
│ Row │ Position │ Estimated_β │
│     │ [90mInt64[39m    │ [90mFloat64[39m     │
├─────┼──────────┼─────────────┤
│ 1   │ 119288   │ -0.0296878  │
│ 2   │ 119293   │ 0.172518    │
│ 3   │ 119294   │ -0.0721499  │
│ 4   │ 213243   │ -0.0269039  │
│ 5   │ 265116   │ -0.0351221  │
│ 6   │ 275336   │ -0.0514734  │
│ 7   │ 275338   │ 0.0252196   │
│ 8   │ 276491   │ -0.026729   │

Nongenetic predictors:
1×2 DataFrame
│ Row │ Position │ Estimated_β │
│     │ [90mInt64[39m    │ [90mFloat64[39m     │
├─────┼──────────┼─────────────┤
│ 1   │ 1        │ 1.56218     │

In [26]:
estimated_b = result.beta
position = findall(!iszero, estimated_b)
found_snps = CSV.read("kevin_imputed_filtered_HDL.bim", delim='\t', header=false)[position, :]

Unnamed: 0_level_0,Column1,Column2,Column3,Column4,Column5,Column6
Unnamed: 0_level_1,Int64⍰,String⍰,Int64⍰,Int64⍰,String⍰,String⍰
1,6,rs9261224,0,30121866,A,G
2,6,rs6917603,0,30125050,G,A
3,6,rs9261256,0,30129920,C,G
4,11,rs7120118,0,47242866,G,A
5,15,rs1532085,0,56470658,A,G
6,16,rs3764261,0,55550825,A,C
7,16,rs7499892,0,55564091,A,G
8,16,rs3852700,0,65829359,G,A


In [27]:
# try no debias
result = L0_reg(x, xbm, z, y, 1, k, d(), l, debias=false)

IHT results:

Compute time (sec):     859.1547918319702
Final loglikelihood:    -1831.2980574010048
Iterations:             75
Max number of groups:   1
Max predictors/group:   9
IHT estimated 8 nonzero SNP predictors and 1 non-genetic predictors.

Genetic predictors:
8×2 DataFrame
│ Row │ Position │ Estimated_β │
│     │ [90mInt64[39m    │ [90mFloat64[39m     │
├─────┼──────────┼─────────────┤
│ 1   │ 119288   │ -0.0415433  │
│ 2   │ 119293   │ 0.172384    │
│ 3   │ 119294   │ -0.0598533  │
│ 4   │ 213243   │ -0.0269279  │
│ 5   │ 265116   │ -0.0340865  │
│ 6   │ 275336   │ -0.0586943  │
│ 7   │ 276491   │ -0.0275546  │
│ 8   │ 310746   │ 0.0259204   │

Nongenetic predictors:
1×2 DataFrame
│ Row │ Position │ Estimated_β │
│     │ [90mInt64[39m    │ [90mFloat64[39m     │
├─────┼──────────┼─────────────┤
│ 1   │ 1        │ 1.56209     │

In [28]:
estimated_b = result.beta
position = findall(!iszero, estimated_b)
found_snps = CSV.read("kevin_imputed_filtered_HDL.bim", delim='\t', header=false)[position, :]

Unnamed: 0_level_0,Column1,Column2,Column3,Column4,Column5,Column6
Unnamed: 0_level_1,Int64⍰,String⍰,Int64⍰,Int64⍰,String⍰,String⍰
1,6,rs9261224,0,30121866,A,G
2,6,rs6917603,0,30125050,G,A
3,6,rs9261256,0,30129920,C,G
4,11,rs7120118,0,47242866,G,A
5,15,rs1532085,0,56470658,A,G
6,16,rs3764261,0,55550825,A,C
7,16,rs3852700,0,65829359,G,A
8,20,rs1800961,0,42475778,A,G
