# Analyze associations between eQTLs and traits
- We have clustered loci based on their colocalization (Kohonen SOMs)
- Here we look at how eQTLs colocalize with each of these clusters

In [2]:
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_gwas_coloc")

source("script/functions.R"  )


In [3]:
# Create the output folder for this step, then its coloc_data subfolder
# (order matters: the parent has to exist before the child is created)
invisible(lapply(c("pipeline/5.4.analyze_coloc_qtl_som_maps",
                   "pipeline/5.4.analyze_coloc_qtl_som_maps/coloc_data"),
                 dir.create, showWarnings = FALSE))


# Get GWAS data

In [4]:
suppressPackageStartupMessages(library(kohonen))


In [5]:
# Load the GWAS manifest, the genome-wide significant loci and the locus-to-study table.
# NOTE(review): add_rownames() is not defined in this view (presumably in script/functions.R);
# later code indexes manifest and loci by row name (manifest[study, ...], loci[locus, ]),
# so it presumably sets row names from an id column -- confirm.
manifest   = add_rownames(fread("pipeline/1.1.sumstats/manifest.txt"                     , sep = "\t", header = TRUE, data.table = FALSE))
loci       = add_rownames(fread("pipeline/1.2.genomewide_significant_loci/loci.txt"      , sep = "\t", header = TRUE, data.table = FALSE))
loci2study = add_rownames(fread("pipeline/1.2.genomewide_significant_loci/loci2study.txt", sep = "\t", header = TRUE, data.table = FALSE))


In [6]:
# Population labels; the analyses in this notebook only use "meta".
# This global is not referenced elsewhere in the visible code
# (create_dataset builds its own local `populations` from the manifest).
populations = c('meta','AFR','AMR','CSA','EAS','EUR','MID')

In [7]:
# Load the outputs of pipeline step 2.2 (moloc): a flat table plus two RDS objects.
# Only moloc_map_list is used further down in the visible code.
moloc_df       = fread  ("pipeline/2.2.moloc/moloc.txt", sep = "\t", header = TRUE, data.table = FALSE)
moloc_list     = readRDS("pipeline/2.2.moloc/moloc_list.rds")
moloc_map_list = readRDS("pipeline/2.2.moloc/moloc_map_list.rds")


In [8]:
# Unpack the map list: the `input` element of its `som` entry, the locus-to-class table
# and the trait-to-class table
moloc_loci_groups = moloc_map_list[["som"]][["input"]]
moloc_loci2class  = moloc_map_list[["loci2class"]]
moloc_trait2class = moloc_map_list[["trait2class"]]

# Class label "V<som>"; get_coloc_qtl_by_locus uses it to select a row of moloc_trait2class
moloc_loci2class$class = paste0("V", moloc_loci2class$som)

# Index by id so that rows can be looked up as moloc_loci2class[id, ]
rownames(moloc_loci2class) = moloc_loci2class$id

## Get eQTL coloc data

In [9]:
# Load the QTL results and the expression data from the cardiac_qtls_combined project.
# The elements extracted from these objects below are phenotypes, qtl and coordinates.
qtl_list = readRDS("/frazer01/projects/CARDIPS/analysis/cardiac_qtls_combined/input/qtl/qtls.RDS"   )
exp_list = readRDS("/frazer01/projects/CARDIPS/analysis/cardiac_qtls_combined/input/expdata_qtl.rds")

In [10]:
# Pull out the pieces used downstream; note that analyze_locus later defines
# its own local `qtls`, which is unrelated to this global
phenotypes  = qtl_list$phenotypes
qtls        = qtl_list$qtl
coordinates = exp_list$coordinates
# Index coordinates by transcript_id (find_lead looks up gene_name by transcript_id)
rownames(coordinates) = coordinates$transcript_id

In [11]:
# eQTL-vs-GWAS colocalization results from pipeline step 4.1
# (one row per transcript / locus / GWAS / population, see the str() output below)
coloc_qtls = fread("pipeline/4.1.coloc_qtls/coloc_eqtls.txt", sep = "\t", header = TRUE, data.table = FALSE)


In [12]:
str(coloc_qtls)

'data.frame':	860462 obs. of  16 variables:
 $ transcript_id: chr  "ENSG00000000457.14_7" "ENSG00000000457.14_7" "ENSG00000000457.14_7" "ENSG00000000457.14_7" ...
 $ nsnps        : int  1513 1513 1513 1513 1513 1513 1513 1513 1513 1513 ...
 $ PP.H0.abf    : num  1.56e-07 3.48e-01 3.54e-01 1.58e-07 3.61e-01 ...
 $ PP.H1.abf    : num  0.387 0.0305 0.0362 0.3902 0.0279 ...
 $ PP.H2.abf    : num  2.38e-07 5.33e-01 5.42e-01 2.41e-07 5.53e-01 ...
 $ PP.H3.abf    : num  0.5922 0.0466 0.0554 0.5972 0.0426 ...
 $ PP.H4.abf    : num  0.0208 0.0415 0.0116 0.0126 0.0148 ...
 $ locus        : chr  "1_168611028_169868981" "1_168611028_169868981" "1_168611028_169868981" "1_168611028_169868981" ...
 $ gwas         : chr  "icd10-I26-both_sexes" "continuous-PP-both_sexes-combined_medadj_irnt" "continuous-PP-both_sexes-combined_medadj_irnt" "continuous-PP-both_sexes-combined_medadj_irnt" ...
 $ pop          : chr  "meta" "AMR" "CSA" "meta" ...
 $ type         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ id         

# Integrate coloc with moloc data

In [13]:
# Collect, for one SOM locus id, the traits of its SOM class and the eQTLs that
# colocalize with those traits.
#
# Arguments:
#   id               row name of moloc_loci2class identifying the locus/signal
#   moloc_loci2class data frame indexed by id, with columns "locus" and "class"
#   coloc_qtls       eQTL-GWAS coloc table with columns locus, pop, gwas,
#                    PP.H4.abf and transcript_id
#   trait2class      class-by-trait weight table (rows = class labels, columns = traits).
#                    Defaults to the global moloc_trait2class, which the original
#                    implementation read implicitly.
#   min_weight       minimum class weight for a trait to belong to the cluster
#   min_pp_h4        minimum PP.H4 for an eQTL to count as colocalized
#
# Returns a list with:
#   som            the moloc_loci2class row for id
#   traits_cluster traits with weight >= min_weight in the locus' class
#   eqtls          transcript ids with PP.H4 >= min_pp_h4 (meta population only)
#   traits         all traits with their weights for this class
#   coloc          coloc rows for this locus, population "meta" and the cluster traits
get_coloc_qtl_by_locus = function(id, moloc_loci2class, coloc_qtls, trait2class = moloc_trait2class, min_weight = 0.5, min_pp_h4 = 0.8)
{
    locus       = moloc_loci2class[id, "locus"]
    som         = moloc_loci2class[id, "class"]
    
    # keep trait names as character so they can safely be used as row names later on
    traits2test = data.frame(trait            = colnames(trait2class), 
                             weight           = as.numeric(trait2class[som,]), 
                             stringsAsFactors = FALSE)
    traits      = traits2test[which(traits2test$weight >= min_weight), "trait"]
    coloc       = coloc_qtls[ coloc_qtls$locus == locus & coloc_qtls$pop == "meta" & coloc_qtls$gwas %in% traits,]
    
    # which() drops rows whose PP.H4 is NA instead of returning NA transcript ids
    eqtls       = coloc[which(coloc$PP.H4.abf >= min_pp_h4), "transcript_id"]
    
    out = list(som            = moloc_loci2class[id,],
               traits_cluster = traits,
               eqtls          = eqtls,
               traits         = traits2test,
               coloc          = coloc
              )
    
    return(out)
}

# Example ids left over from interactive testing (the second assignment wins)
id = "5_130403286_131933599.5"
id = "5_130403286_131933599.3"

# One entry per SOM locus id, named by that id
coloc_list = setNames(lapply(moloc_loci2class$id, 
                             get_coloc_qtl_by_locus, 
                             moloc_loci2class = moloc_loci2class, 
                             coloc_qtls       = coloc_qtls), 
                      moloc_loci2class$id)

saveRDS(coloc_list, file = "pipeline/5.4.analyze_coloc_qtl_som_maps/coloc_list.rds")

In [15]:
# Flatten coloc_list into one row per locus id: the SOM annotation plus the
# cluster traits and colocalizing eQTLs (as "; "-separated strings and as counts)
locus2coloc = as.data.frame(rbindlist(lapply(coloc_list, function(entry)
{
    cluster_traits = unique(entry$traits_cluster)
    shared_eqtls   = unique(entry$eqtls)
    
    cbind(entry$som, 
          data.frame(traits           = paste(sort(cluster_traits), collapse = "; "),
                     traits_n         = length(cluster_traits),
                     eqtls            = paste(sort(shared_eqtls), collapse = "; "),
                     eqtls_n          = length(shared_eqtls),
                     stringsAsFactors = FALSE))
})), stringsAsFactors = FALSE)

fwrite(locus2coloc, file = "pipeline/5.4.analyze_coloc_qtl_som_maps/locus2coloc.txt", sep = "\t", col.names = TRUE, row.names = FALSE)

In [16]:
locus2coloc[ locus2coloc$id == "17_40144007_48070076.2",]

Unnamed: 0_level_0,som,id,locus,cluster,class,traits,traits_n,eqtls,eqtls_n
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>
12,24,17_40144007_48070076.2,17_40144007_48070076,20,V24,biomarkers-30690-both_sexes-irnt; biomarkers-30780-both_sexes-irnt; continuous-LDLC-both_sexes-medadj_irnt,3,ENSG00000177469.13_8; ENST00000322157.9_3; ENST00000517484.5_3,3


In [15]:
# Loci to analyze: at least one cluster trait, and at least two datasets
# (traits + eQTLs) so that a pairwise colocalization is possible
totest = with(locus2coloc, id[traits_n > 0 & (traits_n + eqtls_n) > 1])

length(totest)

# Analyze loci

In [16]:
suppressPackageStartupMessages(library(coloc))

In [17]:
# Read the GWAS summary statistics that fall inside one genomic region.
#
# Arguments:
#   coord     region string passed to tabix (built by the caller as "chrom:from-to")
#   gwas_file path to a compressed summary-statistics file readable by zcat and tabix
#
# Returns a data frame with the file's own column names plus an `id` column
# ("VAR_<chr>_<pos>_<ref>_<alt>"), which is also used as row names.
get_gwas_data = function(coord, gwas_file)
{
    # The column names come from the first lines of the file, because they are
    # overwritten on the tabix result below. The path is shell-quoted so that
    # spaces or shell metacharacters in it cannot break (or alter) the command.
    header_cmd           = paste("zcat", shQuote(path.expand(gwas_file)), "|", "head -n 2")
    my_head              = colnames(fread(cmd = header_cmd, sep = "\t", header = TRUE, data.table = FALSE))
    gwas_data            = suppressWarnings(tabix.read.table(gwas_file, coord, col.names = TRUE, stringsAsFactors = FALSE))
    colnames(gwas_data)  = my_head
    
    # Variant id shared with the QTL data; row names allow lookups by variant id
    gwas_data$id         = paste("VAR", gwas_data$chr, gwas_data$pos, gwas_data$ref, gwas_data$alt, sep = "_")
    rownames(gwas_data)  = gwas_data$id
    
    return(gwas_data)
}

# Build the dataset list passed to coloc.abf for one study.
#
# Arguments:
#   study      study id; used as a row name of manifest for GWAS studies
#   trait_type one of "categorical", "icd10", "phecode" (case/control),
#              "biomarkers", "continuous" (quantitative) or "qtl"
#   totest     summary statistics: for GWAS, indexed by variant id, with columns
#              pval_<pop> and af_<pop> (or af_controls_<pop> for case/control);
#              for QTLs, with columns id, pval and af
#   variants   variant ids to include, in output order
#   pop        population label; "meta" sums sample sizes over the study's populations
#   manifest   study table with columns pops, n_cases_<pop> and n_controls_<pop>
#   n_qtl      sample size of the QTL study (defaults to 966, the previously hard-coded value)
#
# Returns a list with snp, pvalues, N, MAF and type ("cc" or "quant");
# case/control datasets also carry s, the fraction of cases.
# Stops with an explicit error for an unsupported trait_type.
create_dataset = function(study, trait_type, totest, variants, pop, manifest, n_qtl = 966)
{
    cc_types    = c("categorical", "icd10", "phecode")
    quant_types = c("biomarkers", "continuous")
    
    if(trait_type %in% "qtl")
    {
        rownames(totest) = totest$id
        dataset = list(snp     = variants, 
                       pvalues = totest[variants, "pval"], 
                       N       = n_qtl, 
                       MAF     = totest[variants, "af"], 
                       type    = "quant")
        
        return(dataset)
    }
    
    if(!trait_type %in% c(cc_types, quant_types))
    {
        stop("create_dataset: unsupported trait_type '", trait_type, "' for study '", study, "'", call. = FALSE)
    }
    
    is_cc       = trait_type %in% cc_types
    pval_col    = paste("pval", pop, sep = "_")
    maf_col     = paste(if(is_cc){"af_controls"}else{"af"}, pop, sep = "_")
    
    # "meta" pools the sample sizes of every population listed for the study
    populations = unlist(strsplit(manifest[study, "pops"], ","))
    count_pops  = if(pop == "meta"){populations}else{pop}
    n_cases     = sum(manifest[study, paste("n_cases", count_pops, sep = "_")])
    
    # Variants lacking a p-value or an allele frequency are removed first, so that
    # looking them up by id below yields NA for both fields
    totest      = totest[!is.na(totest[, pval_col]) & !is.na(totest[, maf_col]), ]
    pvalues     = totest[variants, pval_col]
    maf         = totest[variants, maf_col ]
    
    if(is_cc)
    {
        n       = n_cases + sum(manifest[study, paste("n_controls", count_pops, sep = "_")])
        dataset = list(snp = variants, pvalues = pvalues, N = n, s = n_cases / n, MAF = maf, type = "cc")
    }else
    {
        dataset = list(snp = variants, pvalues = pvalues, N = n_cases, MAF = maf, type = "quant")
    }
    
    return(dataset)
}

# Run a pairwise colocalization (coloc.abf) between two studies at one locus.
#
# Arguments:
#   locus            locus id (only reported in the fallback output)
#   study1, study2   study ids; an id that is not in manifest$id is treated as a QTL dataset
#   pop              population label used for the GWAS columns (pval_<pop>, af_<pop>, ...)
#   totest1, totest2 summary statistics of the two studies, each with an `id` column
#   manifest         study table with columns id and trait_type, indexed by study id
#
# Returns a list with:
#   top  one-row data frame: the coloc summary plus the variant with the highest posterior
#   pp   per-variant posterior (id, pp_snp), cumulative sum (cum) and 99% credible set flag (cs)
# When 100 or fewer variants are shared, coloc is not run and placeholder
# values (PP.H0 = 1, a single empty-id row in pp) are returned.
run_coloc_by_pop = function(locus, study1, study2, pop, totest1, totest2, manifest)
{
    cc_types    = c("categorical", "icd10", "phecode")
    quant_types = c("biomarkers", "continuous")
    
    # A study that is not listed in the manifest is a QTL dataset
    get_trait_type = function(study)
    {
        if(study %in% manifest$id){manifest[study, "trait_type"]}else{"qtl"}
    }
    
    # Variant ids with both a p-value and an allele frequency for this population
    usable_variants = function(totest, trait_type)
    {
        if(trait_type %in% "qtl"){return(totest$id)}
        
        if(trait_type %in% cc_types)
        {
            af_col = paste("af_controls", pop, sep = "_")
        }else if(trait_type %in% quant_types)
        {
            af_col = paste("af", pop, sep = "_")
        }else
        {
            stop("run_coloc_by_pop: unsupported trait_type '", trait_type, "'", call. = FALSE)
        }
        
        keep = !is.na(totest[, paste("pval", pop, sep = "_")]) & !is.na(totest[, af_col])
        
        return(totest[keep, "id"])
    }
    
    trait_type1 = get_trait_type(study1)
    trait_type2 = get_trait_type(study2)
    
    # QTL tables have no population suffix in their column names
    pop1        = if(trait_type1 %in% "qtl"){""}else{pop}
    pop2        = if(trait_type2 %in% "qtl"){""}else{pop}
    
    # Both studies contribute their filtered variants. The previous version reset the
    # second set to every id in totest2, which let variants with missing p-values or
    # allele frequencies through to coloc.abf.
    variants    = intersect(usable_variants(totest1, trait_type1), 
                            usable_variants(totest2, trait_type2))
    
    if(length(variants) > 100)
    {
        dataset1        = create_dataset(study1, trait_type1, totest1, variants, pop1, manifest)
        dataset2        = create_dataset(study2, trait_type2, totest2, variants, pop2, manifest)
        coloc_mapped    = coloc.abf(dataset1 = dataset1, dataset2 = dataset2) 
        probs           = as.data.frame(t(coloc_mapped$summary))
        myres           = coloc_mapped$results
        
        # keep the variant id and the last column of the results table
        # (presumably the per-variant posterior of the shared-variant hypothesis -- confirm)
        myres           = myres[, c(which(colnames(myres) == "snp"), ncol(myres))]
        colnames(myres) = c("id", "pp_snp")
        myres           = myres[order(myres$pp_snp, decreasing = TRUE), ]
        myres$cum       = cumsum(myres$pp_snp)
        
        # 99% credible set: every variant before the cumulative posterior reaches 0.99,
        # plus the variant that crosses the threshold (if any does)
        in_cs           = myres$cum < 0.99
        first_over      = which(myres$cum >= 0.99)[1]
        
        if(!is.na(first_over)){in_cs[first_over] = TRUE}
        
        myres$cs        = in_cs
        out             = cbind(probs, myres[1, ])
    }else
    {
        out = data.frame(nsnps = 0, PP.H0.abf = 1, PP.H1.abf = 0, PP.H2.abf = 0, PP.H3.abf = 0, PP.H4.abf = 0, 
                         locus = locus, study1 = study1, study2 = study2, id = "", pp_snp = 0
                        )
        myres = data.frame(id = "", pp_snp = 0, cum = 1, cs = FALSE)
    }

    return(list(top = out, pp = myres))
}

# Collect the per-variant posteriors of every pairwise coloc test into one matrix.
#
# Arguments:
#   totest_comb data frame of tested pairs; its `id` column names the columns of the result
#   coloc_tests named list (by pair id) of run_coloc_by_pop results; the `pp` element is used
#   totest_list list of per-study tables; the union of their `id` columns defines the rows
#
# Returns a numeric matrix (variants x pairs), 0 where a variant has no posterior
# for a pair. The result is always a matrix, even with a single variant or pair.
combine_variants = function(totest_comb, coloc_tests, totest_list)
{
    myvars = sort(unique(unlist(lapply(totest_list, function(x){x$id}))))
    out    = matrix(0, nrow = length(myvars), ncol = nrow(totest_comb), dimnames = list(myvars, totest_comb$id))
    
    for(ii in seq_len(nrow(totest_comb)))
    {
        this = coloc_tests[[totest_comb[ii, "id"]]][["pp"]]
        
        # a single-row table is the placeholder returned when coloc was not run
        if(nrow(this) > 1){out[ this$id, ii] = this$pp_snp}
    }
    
    # drop = FALSE keeps the matrix shape when only one variant is left
    # (the caller applies rowMeans to the result)
    out = out[rownames(out) != "", , drop = FALSE]
    
    return(out)
}

# Evaluate an expression while discarding everything it prints to standard output.
# Messages and warnings (standard error) are not affected.
#
# Arguments:
#   x  expression to evaluate (evaluated lazily, after the output is diverted)
#
# Returns the value of x, invisibly.
quiet = function(x) 
{ 
    sink_file = tempfile()
    sink(sink_file) 
    
    # restore the output and remove the scratch file, even if x fails
    on.exit({sink(); unlink(sink_file)}, add = TRUE) 
    
    invisible(force(x)) 
} 

# Re-run colocalization between every pair of datasets (cluster GWAS traits and
# colocalizing eQTLs) at one SOM locus and save the result to
# pipeline/5.4.analyze_coloc_qtl_som_maps/coloc_data/<id>.rds
#
# Arguments:
#   id         locus id; a name of coloc_list, also used as the output file name
#   coloc_list list built with get_coloc_qtl_by_locus (elements som, traits_cluster,
#              eqtls, traits, coloc)
#   loci       locus table indexed by locus, with columns chrom, from and to
#   manifest   study table indexed by study id, with column sumstat_file
#   to_return  if TRUE, also return the result list (otherwise the function only writes the file)
#
# The saved/returned list has: comb (tested pairs), totest_list (input data per dataset),
# coloc_tests (run_coloc_by_pop output per pair), pps (variant x pair posterior matrix)
# and var2pp (variants sorted by their mean posterior across pairs).
analyze_locus = function(id, coloc_list, loci, manifest, to_return = FALSE)
{
    message(id)
    som            = coloc_list[[id]][["som"           ]]
    traits_cluster = coloc_list[[id]][["traits_cluster"]]
    eqtls          = coloc_list[[id]][["eqtls"         ]]
    traits         = coloc_list[[id]][["traits"        ]]
    coloc          = coloc_list[[id]][["coloc"         ]]   
    
    # keep only the colocalizing eQTLs (PP.H4 >= 0.8) and build the tabix region string
    locus         = som[1, "locus"]
    coloc         = coloc[ coloc$PP.H4.abf >= 0.8,]
    coloc         = coloc[order(coloc$PP.H4.abf, decreasing = TRUE),]
    # transcript id + QTL type: used below as the dataset name of each QTL
    coloc$tr2type = paste(coloc$transcript_id, coloc$type)
    coord         = loci[locus,]
    coord         = paste0(coord$chrom, ":", coord$from, "-", coord$to)
    
    # get GWAS data
    totest_gwas_list        = lapply(traits_cluster, function(gwas){get_gwas_data(coord, manifest[gwas, "sumstat_file"])})
    names(totest_gwas_list) = traits_cluster
    
    # get QTL data
    qtls          = unique(coloc[,c("transcript_id", "gene_id", "phenotype", "type", "tr2type")])
    
    if(nrow(qtls) > 0)
    {
        # one QTL summary-statistics file per transcript, stored under its phenotype folder
        qtls$qtl_file = paste("/frazer01/projects/CARDIPS/analysis/cardiac_qtls_combined/input/qtl/processing", qtls$phenotype, paste("qtl", qtls$transcript_id, "txt", sep = "."), sep = "/")

        qtl_list = lapply(1:nrow(qtls), function(ii)
        {
            # a file can hold several QTL types; keep only the rows of the colocalizing type
            indata = fread(qtls[ii, "qtl_file"], sep = "\t", header = TRUE, data.table = FALSE)
            type   =       qtls[ii, "type"    ]
            indata = indata[indata$type == type,]

            return(indata)
        })

        names(qtl_list) = qtls$tr2type
        
        totest_list = c(totest_gwas_list, qtl_list)
    }else
    {
        totest_list = totest_gwas_list
    }
    
    # Generate matrix for coloc (all vs. all)
    # NOTE(review): combn() requires at least two datasets in totest_list
    
    totest_comb           = as.data.frame(t(combn(names(totest_list), 2)))
    colnames(totest_comb) = c("study1", "study2")
    totest_comb$id        = paste(totest_comb$study1, totest_comb$study2, sep = ":")
    rownames(totest_comb) = totest_comb$id
    
    # run every pair on the "meta" population (the inner `id` is the pair id, not the locus id)
    coloc_tests = lapply(totest_comb$id, function(id)
    {
        study1 = totest_comb[id, "study1"]
        study2 = totest_comb[id, "study2"]
        out    = run_coloc_by_pop(locus, study1, study2, "meta", totest_list[[study1]], totest_list[[study2]], manifest)
        
        return(out)
    })
    
    names(coloc_tests) = totest_comb$id
    
    # average each variant's posterior over all pairs and rank the variants by it
    pps                = combine_variants(totest_comb, coloc_tests, totest_list)
    var2pp             = data.frame(id = rownames(pps), pp_mean = as.numeric(rowMeans(pps)))
    var2pp             = var2pp[order(var2pp$pp_mean, decreasing = TRUE),]    
    out                = list(comb        = totest_comb, 
                              totest_list = totest_list, 
                              coloc_tests = coloc_tests,
                              pps         = pps        ,
                              var2pp      = var2pp
                             )
    
    # here `id` is the locus id again (function argument)
    saveRDS(out, paste0("pipeline/5.4.analyze_coloc_qtl_som_maps/coloc_data/", id, ".rds"))
    
    if(to_return == TRUE){return(out)}
}

# Example locus used to try the analysis on a single locus
id = "5_136864391_137972681.2" # atrial fibrillation + pulse rate
#id = "5_16423249_16671484.1" 

# Run one locus with printed output silenced and keep the result in memory (to_return = TRUE)
x = quiet(analyze_locus(id, coloc_list, loci, manifest, TRUE))

# Batch run over every locus in totest (disabled here). It writes one .rds per locus,
# which find_lead() reads back in the summary section below.
#invisible(lapply(totest, function(id){quiet(analyze_locus(id, coloc_list, loci, manifest, FALSE))}))


5_136864391_137972681.2



# Summarize colocalization results

In [37]:
# Summarize one analyzed locus: its locus2coloc row, the top-ranked variant
# (highest mean posterior) and readable gene / trait names.
#
# Reads pipeline/5.4.analyze_coloc_qtl_som_maps/coloc_data/<id>.rds and uses the
# globals locus2coloc, coordinates and manifest.
#
# Returns the locus2coloc row for id with the extra columns variant_id, pp,
# eqtls_gene_names, eqtls_genes_n and traits_name.
find_lead = function(id)
{
    locus_data = readRDS(paste0("pipeline/5.4.analyze_coloc_qtl_som_maps/coloc_data/", id, ".rds"))
    lead       = locus_data[["var2pp"]][1,]
    out        = cbind(locus2coloc[ locus2coloc$id == id, ], data.frame(variant_id = lead$id, pp = lead$pp_mean))
    
    # defaults for loci without colocalizing eQTLs
    out$eqtls_gene_names = ""
    out$eqtls_genes_n    = 0
    
    if(out$eqtls_n >  0)
    {
        # translate the transcript ids into unique, sorted gene names
        eqtl_ids             = unlist(strsplit(out$eqtls, "; "))
        genes                = sort(unique(coordinates[ coordinates$transcript_id %in% eqtl_ids, "gene_name"]))
        out$eqtls_gene_names = paste(genes, collapse = "; ")
        out$eqtls_genes_n    = length(genes)
    }
    
    # translate the trait ids into their names from the manifest
    trait_ids       = unlist(strsplit(out$traits, "; "))
    out$traits_name = paste(sort(unique(manifest[trait_ids, "name"])), collapse = "; ")
    
    return(out)
}

# Example locus for trying find_lead() interactively
id = "5_136864391_137972681.2" # atrial fibrillation + pulse rate
#find_lead(id)

# One summary row per analyzed locus, sorted by the mean posterior of the top variant.
# This needs the per-locus .rds file of every id in totest to exist already.
pps = as.data.frame(rbindlist(lapply(totest, find_lead)), stringsAsFactors = FALSE)
pps = pps[order(pps$pp, decreasing = TRUE),]
fwrite(pps, "pipeline/5.4.analyze_coloc_qtl_som_maps/pps.txt", sep = "\t", col.names = TRUE, row.names = FALSE)

In [39]:
nrow(pps[ pps$traits_n > 0 & pps$eqtls_n > 0 & pps$pp > 0.9, ])
nrow(pps[ pps$traits_n > 0 & pps$eqtls_n > 0 & pps$pp > 0.5 & pps$pp <= 0.9, ])


In [40]:
pps[ pps$traits_n > 0 & pps$eqtls_n > 0 & pps$pp > 0.9, ]

Unnamed: 0_level_0,som,id,locus,cluster,class,traits,traits_n,eqtls,eqtls_n,variant_id,pp,eqtls_gene_names,eqtls_genes_n,traits_name
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<dbl>,<chr>,<dbl>,<chr>
306,95,14_72565647_74785483.2,14_72565647_74785483,19,V95,continuous-102-both_sexes-irnt,1,ENSG00000182732.18_5,1,VAR_14_72885471_G_C,1.0,RGS6,1,Pulse rate
96,37,19_53257172_53457172.1,19_53257172_53457172,5,V37,continuous-20022-both_sexes-irnt,1,ENSG00000204604.12_7; ENST00000601980.1_2,2,VAR_19_53357172_C_T,1.0,ZNF468,1,Birth weight
729,214,19_6644254_6845138.1,19_6644254_6845138,14,V214,continuous-PP-both_sexes-combined_medadj_irnt,1,ENSG00000125733.18_7,1,VAR_19_6745138_G_A,1.0,TRIP10,1,Pulse pressure
895,269,9_35586407_36048588.1,9_35586407_36048588,4,V269,continuous-DBP-both_sexes-combined_medadj_irnt; continuous-MAP-both_sexes-combined_medadj_irnt; continuous-SBP-both_sexes-combined_medadj_irnt; phecode-401-both_sexes,4,ENSG00000196196.3_4,1,VAR_9_35906471_C_T,1.0,HRCT1,1,Diastolic blood pressure; Hypertension; Mean arterial pressure; Systolic blood pressure
1517,376,3_86937543_87221775.1,3_86937543_87221775,7,V376,biomarkers-30760-both_sexes-irnt; biomarkers-30870-both_sexes-irnt,2,ENSG00000206538.9_6,1,VAR_3_87037543_A_G,1.0,VGLL3,1,HDL cholesterol; Triglycerides
234,73,22_27930915_28313866.1,22_27930915_28313866,19,V73,continuous-102-both_sexes-irnt,1,ENSG00000169184.6_6,1,VAR_22_28181399_C_T,1.0,MN1,1,Pulse rate
130,40,17_48495988_48737385.1,17_48495988_48737385,15,V40,biomarkers-30870-both_sexes-irnt,1,ENSG00000006282.21_8,1,VAR_17_48624523_A_C,0.9999998,SPATA20,1,Triglycerides
138,40,22_35824713_36142986.1,22_35824713_36142986,15,V40,biomarkers-30870-both_sexes-irnt,1,ENSG00000221963.6_4,1,VAR_22_36042986_C_T,0.9999993,APOL6,1,Triglycerides
307,95,6_117676756_119700041.1,6_117676756_119700041,19,V95,continuous-102-both_sexes-irnt,1,ENSG00000198523.6_4,1,VAR_6_118667522_T_C,0.9999929,PLN,1,Pulse rate
573,163,19_51026824_51232746.1,19_51026824_51232746,11,V163,continuous-21001-both_sexes-irnt,1,ENSG00000131409.13_7,1,VAR_19_51130909_G_A,0.9999929,LRRC4B,1,Body mass index (BMI)


In [41]:
coordinates[ coordinates$transcript_id == "ENSG00000164867.11_5", ]

Unnamed: 0_level_0,chrom,start,end,strand,gene_id,gene_name,gene_type,transcript_id,transcript_type,phenotype
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
ENSG00000164867.11_5,chr7,150688105,150711676,+,ENSG00000164867.11_5,NOS3,protein_coding,ENSG00000164867.11_5,protein_coding,rna
