# Analysis of meta-analyzed data

In [1]:
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_gwas_coloc")

source("script/functions.R"  )


# Input data

In [2]:
suppressPackageStartupMessages(library(kohonen))


In [3]:
# Read the three tab-separated pipeline tables (header row, plain data.frames)
# and pass each one through add_rownames(), a helper that is not defined in
# this notebook (presumably it comes from script/functions.R).
manifest <- add_rownames(
    fread("pipeline/1.1.sumstats/manifest.txt", sep = "\t", header = TRUE, data.table = FALSE)
)
loci <- add_rownames(
    fread("pipeline/1.2.genomewide_significant_loci/loci.txt", sep = "\t", header = TRUE, data.table = FALSE)
)
loci2study <- add_rownames(
    fread("pipeline/1.2.genomewide_significant_loci/loci2study.txt", sep = "\t", header = TRUE, data.table = FALSE)
)


In [4]:
# Serialized QTL results and expression data from the combined-QTL project.
qtl_list <- readRDS(
    "/frazer01/projects/CARDIPS/analysis/cardiac_qtls_combined/input/qtl/qtls.RDS"
)
exp_list <- readRDS(
    "/frazer01/projects/CARDIPS/analysis/cardiac_qtls_combined/input/expdata_qtl.rds"
)

In [5]:
# Pull the pieces used below out of the two loaded lists.
phenotypes <- qtl_list$phenotypes
qtls       <- qtl_list$qtl

# Index the coordinate table by transcript id so rows can be looked up by name.
coordinates           <- exp_list$coordinates
rownames(coordinates) <- coordinates$transcript_id

In [6]:
# Colocalization results and the locus-to-colocalization map from step 5.4.
coloc_list  <- readRDS("pipeline/5.4.analyze_coloc_qtl_som_maps/coloc_list.rds")
locus2coloc <- fread("pipeline/5.4.analyze_coloc_qtl_som_maps//locus2coloc.txt",
                     sep = "\t", header = TRUE, data.table = FALSE)

# QTL/locus intersections and the list of QTLs to test from step 4.1.
intersected <- fread("pipeline/4.1.coloc_qtls/intersected_qtls_loci.txt",
                     sep = "\t", header = TRUE, data.table = FALSE)
qtls2test   <- fread("pipeline/4.1.coloc_qtls/qtls2test.txt",
                     sep = "\t", header = TRUE, data.table = FALSE)

# Keep only intersections whose phenotype is "rna" or "isoform".
intersected <- intersected[intersected$phenotype %in% c("rna", "isoform"), ]


In [7]:
# Load the per-signal meta-analysis object for every id in locus2coloc and
# name each list element after its id.
meta_list <- setNames(
    lapply(locus2coloc$id, function(signal_id)
    {
        readRDS(paste0("pipeline/5.5.meta_analysis/coloc_data/", signal_id, ".rds"))
    }),
    locus2coloc$id
)

In [8]:
# Variant annotation table, decompressed on the fly with zcat.
snps <- fread(
    cmd        = "zcat /frazer01/reference/public/ukbb/variants.tsv.bgz",
    sep        = "\t",
    header     = TRUE,
    data.table = FALSE
)

# Numeric chromosome used as a merge key further down. Labels that are not
# numbers become NA here, which is what triggers the coercion warning.
snps$chrom <- as.numeric(snps$chr)

“NAs introduced by coercion”


In [9]:
# Collect the QTLs that colocalize with a signal (PP.H4.abf >= 0.8).
#
# coloc: nested list indexed as coloc[[transcript_id]][[type]][["pp"]], where
#        the type names look like "type <number>" and every "pp" table carries
#        a PP.H4.abf column.
#
# Returns a character vector of "<transcript_id>:<type>" ids, or NULL when
# nothing passes the threshold (or when `coloc` is empty).
find_coloc_qtls <- function(coloc)
{
    # Stack all "pp" tables, tagging each with its transcript and numeric type.
    out <- as.data.frame(rbindlist(lapply(names(coloc), function(transcript_id)
    {
        as.data.frame(rbindlist(lapply(names(coloc[[transcript_id]]), function(type)
        {
            x               <- coloc[[transcript_id]][[type]][["pp"]]
            x$transcript_id <- transcript_id
            x$type          <- as.numeric(sub("type ", "", type))

            return(x)
        })), stringsAsFactors = FALSE)
    })), stringsAsFactors = FALSE)

    pp_h4 <- out[["PP.H4.abf"]]

    # Empty input: there is no posterior column to filter on.
    if (is.null(pp_h4))
    {
        return(NULL)
    }

    # Treat a missing posterior as "not colocalized". Subsetting rows with an
    # NA mask would otherwise yield an all-NA row and a bogus "NA:NA" id.
    keep <- !is.na(pp_h4) & pp_h4 >= 0.8

    if (!any(keep))
    {
        return(NULL)
    }

    # Return the ids explicitly rather than relying on the (invisible) value
    # of an assignment being the last evaluated expression.
    return(paste(out$transcript_id[keep], out$type[keep], sep = ":"))
}

# Summarise the fine-mapping result of one signal.
#
# x: list with
#    - "credible_set": variant ids; entries equal to "null" are dropped
#    - "pp"          : table whose first "id" value is taken as the index variant
#
# Returns a list with
#    - cs        : credible-set variant ids without "null"
#    - fm        : TRUE when the signal counts as fine-mapped, i.e. the first
#                  id in "pp" is not "null" and the credible set has fewer
#                  than 10000 variants
#    - index_var : id of the index variant, or "" when fm is FALSE
find_credible_set <- function(x)
{
    cs <- x[["credible_set"]]
    pp <- x[["pp"          ]]
    cs <- cs[cs != "null"]

    # First id of the posterior table; NA when the table is missing or empty,
    # so such signals are reported as not fine-mapped instead of failing with
    # "missing value where TRUE/FALSE needed".
    lead_id <- as.character(pp[["id"]])[1]

    is_finemapped <- !is.na(lead_id) && lead_id != "null" && length(cs) < 10000

    return(list(cs = cs, fm = is_finemapped, index_var = if (is_finemapped) lead_id else ""))
}

# Rebuild locus2coloc with one summary row per meta-analysed signal: the
# signal's "som" record plus trait, colocalized-eQTL and credible-set columns.
locus2coloc <- as.data.frame(rbindlist(lapply(meta_list, function(signal)
{
    signal_row <- signal$som

    # Trait clusters, de-duplicated and sorted.
    signal_row$traits   <- paste(sort(unique(signal$traits_cluster)), collapse = "; ")
    signal_row$traits_n <- length(unique(signal$traits_cluster))

    # Colocalized QTL ids as returned by find_coloc_qtls().
    coloc_ids          <- find_coloc_qtls(signal$coloc)
    signal_row$eqtls   <- paste(coloc_ids, collapse = ";")
    signal_row$eqtls_n <- length(coloc_ids)

    # Credible-set size and fine-mapping flag.
    fine_map                <- find_credible_set(signal[["finemapped_gwas"]])
    cs_variants             <- fine_map[["cs"]]
    signal_row$credible_set <- length(cs_variants)
    signal_row$finemapped   <- fine_map[["fm"]]

    if (length(cs_variants) == 0)
    {
        signal_row$cs_from <- NA
        signal_row$cs_to   <- NA
        signal_row$cs_snp  <- ""
    } else
    {
        # The third "_"-separated field of each variant id is read as its position.
        positions <- as.numeric(unlist(lapply(cs_variants, function(variant_id)
        {
            unlist(strsplit(variant_id, "_"))[[3]]
        })))

        signal_row$cs_from <- min(positions)
        signal_row$cs_to   <- max(positions)
        signal_row$cs_snp  <- fine_map[["index_var"]]
    }

    signal_row
})), stringsAsFactors = FALSE)

# Inclusive span between the first and last credible-set position.
locus2coloc$cs_size <- locus2coloc$cs_to - locus2coloc$cs_from + 1

# Chromosome and position of the index variant, taken from the 2nd and 3rd
# "_"-separated fields of cs_snp; signals without an index variant get 0.
locus2coloc$chrom <- as.numeric(unlist(lapply(locus2coloc$cs_snp, function(snp_id)
{
    ifelse(snp_id == "", yes = 0, no = unlist(strsplit(snp_id, "_"))[[2]])
})))
locus2coloc$pos <- as.numeric(unlist(lapply(locus2coloc$cs_snp, function(snp_id)
{
    ifelse(snp_id == "", yes = 0, no = unlist(strsplit(snp_id, "_"))[[3]])
})))

# Left join on chrom/pos only: a position listed more than once in snps
# yields more than one row per signal.
locus2coloc <- merge(
    locus2coloc,
    snps[, c("chrom", "pos", "ref", "alt", "rsid")],
    by    = c("chrom", "pos"),
    all.x = TRUE
)

# Drop the alt == "G" rows for two specific index variants.
# NOTE(review): presumably these are the extra rows created by the
# position-only join above; other duplicated positions are not handled.
locus2coloc <- locus2coloc[!((locus2coloc$cs_snp == "VAR_13_38249726_C_A" & locus2coloc$alt == "G") |
                             (locus2coloc$cs_snp == "VAR_15_99258710_A_T" & locus2coloc$alt == "G")), ]

fwrite(locus2coloc, "pipeline/5.4.analyze_coloc_qtl_som_maps/signal2coloc.txt",
       sep = "\t", col.names = TRUE, row.names = FALSE)

# Create credible set list

In [11]:
# Fine-mapping result ("finemapped_gwas") of every signal listed in
# locus2coloc, keyed by signal id, saved for later steps.
credible_sets <- setNames(
    lapply(locus2coloc$id, function(signal_id) meta_list[[signal_id]][["finemapped_gwas"]]),
    locus2coloc$id
)

saveRDS(credible_sets, "pipeline/5.4.analyze_coloc_qtl_som_maps/credible_sets.rds")

## Annotate fine-mapped variants based on genes
- https://adairama.wordpress.com/2013/02/15/functionally-annotate-snps-and-indels-in-bioconductor/

In [13]:
# Signals flagged as fine-mapped whose credible set holds exactly one variant.
fm99 <- locus2coloc[locus2coloc$finemapped == TRUE & locus2coloc$credible_set == 1, ]

# NOTE(review): this takes the first entry of the unfiltered credible_set and
# assumes it is the variant itself rather than a "null" entry.
fm99$variant_id <- unlist(lapply(fm99$id, function(signal_id)
{
    meta_list[[signal_id]][["finemapped_gwas"]][["credible_set"]][[1]]
}))

# Chromosome and position are the 2nd and 3rd "_"-separated fields of the id.
fm99$chrom <- as.numeric(unlist(lapply(fm99$variant_id, function(variant_id)
{
    unlist(strsplit(variant_id, "_"))[[2]]
})))
fm99$pos <- as.numeric(unlist(lapply(fm99$variant_id, function(variant_id)
{
    unlist(strsplit(variant_id, "_"))[[3]]
})))

# Attach the consequence annotation by position (left join).
fm99 <- merge(
    fm99,
    snps[, c("chrom", "pos", "rsid", "consequence", "consequence_category")],
    by    = c("chrom", "pos"),
    all.x = TRUE
)

# Variants without an annotated consequence are labelled intergenic and
# non-coding. The category is filled first because both assignments select
# rows through the still-missing consequence column.
fm99[is.na(fm99$consequence), "consequence_category"] <- "non_coding"
fm99[is.na(fm99$consequence), "consequence"         ] <- "intergenic_variant"

fwrite(fm99, "pipeline/5.4.analyze_coloc_qtl_som_maps/finemapped_99.txt",
       sep = "\t", col.names = TRUE, row.names = FALSE)

In [1]:
# Display the fine-mapped single-variant table.
# NOTE(review): the recorded output is an "object 'fm99' not found" error and
# the cell counter is In [1], so this was apparently run in a fresh session
# before fm99 was created; re-run after the cell that builds fm99.
fm99

ERROR: Error in eval(expr, envir, enclos): object 'fm99' not found


# Process meta-analysis
## 1. Summarize results:
- How many signals per locus (take NULL signals into account!)
- Distribution of credible set sizes


In [None]:
# Count the rows of locus2coloc (one per signal) for each locus and reshape
# the counts into a two-column data frame.
signal2locus <- table(locus2coloc$locus)
signal2locus <- data.frame(
    locus   = names(signal2locus),
    signals = as.numeric(signal2locus)
)

In [None]:
# Number of loci for each signals-per-locus count.
table(signal2locus$signals)

# Histogram of the same signals-per-locus counts.
hist(signal2locus$signals, main = "Signals per locus")

In [None]:
# Credible-set size bins. A size n belongs to a bin when from < n <= to.
cs2bin <- data.frame(
    bin  = 1:7,
    from = c(0, 1,  5, 10, 20,  50,   100),
    to   = c(1, 5, 10, 20, 50, 100, 10000)
)
cs2bin$name <- paste(cs2bin$from + 1, cs2bin$to, sep = "-")

# The first and the last bin get hand-written labels.
cs2bin[c(1, 7), "name"] <- c("1", "101+")

# Posterior-probability bins, expressed in percent, from highest to lowest.
pp2bin <- data.frame(
    bin  = 1:8,
    from = c( 99, 95, 90, 80, 50, 10,  1, 0),
    to   = c(100, 99, 95, 90, 80, 50, 10, 1)
)
pp2bin$name <- paste(pp2bin$from, pp2bin$to, sep = "-")

# One row per credible-set variant across all signals, labelled with the
# posterior-probability bin (pp2bin) its pp_snp falls into.
csets <- as.data.frame(rbindlist(lapply(names(meta_list), function(signal_id)
{
    pp_table <- meta_list[[signal_id]]$finemapped_gwas$pp

    # Keep the variants flagged as credible-set members.
    in_cs                <- pp_table[pp_table$cs == TRUE, c("id", "pval", "pp_snp")]
    colnames(in_cs)[[1]] <- "variant_id"
    in_cs$bin            <- ""
    in_cs$id             <- signal_id

    for (bin_row in pp2bin$bin)
    {
        # Bin limits are stored in percent; pp_snp is compared as a fraction.
        lower  <- pp2bin[bin_row, "from"] / 100
        upper  <- pp2bin[bin_row, "to"  ] / 100
        in_bin <- in_cs$pp_snp > lower & in_cs$pp_snp <= upper

        # Only assign when at least one row is selected. Rows matching no
        # bin (e.g. pp_snp equal to 0) keep the empty label.
        if (nrow(in_cs[in_bin, ]) > 0)
        {
            in_cs[in_bin, "bin"] <- pp2bin[bin_row, "name"]
        }
    }

    in_cs
})), stringsAsFactors = FALSE)

In [None]:
# Number of credible-set variants per signal.
csets2n                <- aggregate(pp_snp ~ id, data = csets, FUN = length)
colnames(csets2n)[[2]] <- "n_variants"

# Label every signal with its credible-set size bin (from < n <= to). The
# "bin" column is created by the first assignment that selects any rows.
for (bin in cs2bin$bin)
{
    from <- cs2bin[bin, "from"]
    to   <- cs2bin[bin, "to"  ]
    name <- cs2bin[bin, "name"]

    if (nrow(csets2n[csets2n$n_variants > from & csets2n$n_variants <= to, ]) > 0)
    {
        csets2n[csets2n$n_variants > from & csets2n$n_variants <= to, "bin"] <- name
    }
}



In [None]:
# Number of signals in each credible-set size bin, ordered by bin index.
# Bins that contain no signal are dropped by the (inner) merge.
cs2bin_n <- table(csets2n$bin)
cs2bin_n <- merge(cs2bin, data.frame(name = names(cs2bin_n), n_signals = as.numeric(cs2bin_n)), by = "name")
cs2bin_n <- cs2bin_n[order(cs2bin_n$bin),]

# Plot the counts from cs2bin_n itself. The previous call read them from
# `toplot`, an object that is never created in this notebook.
barplot(cs2bin_n$n_signals, names.arg = cs2bin_n$name, horiz = TRUE, las = 2, main = "Number of variants in credible set, by signal")

In [None]:
# Display the per-bin signal counts.
cs2bin_n

## 2. Overlap between meta-analyzed signals and colocalized eQTLs

In [None]:
# Long table with one row per (signal, transcript, type) colocalization test.
gwas2coloc <- as.data.frame(rbindlist(lapply(names(meta_list), function(signal_id)
{
    coloc_by_transcript <- meta_list[[signal_id]][["coloc"]]

    # Signals without any colocalization test contribute a placeholder row
    # with nsnps = 0, which is filtered out again after binding.
    if (length(coloc_by_transcript) == 0)
    {
        return(data.frame(id = signal_id, transcript_id = "", type = 0,
                          nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 1, PP.H2.abf = 0, PP.H3.abf = 0, PP.H4.abf = 0, pp_snp = 0, variant_id = ""
                         ))
    }

    as.data.frame(rbindlist(lapply(names(coloc_by_transcript), function(transcript_id)
    {
        as.data.frame(rbindlist(lapply(names(coloc_by_transcript[[transcript_id]]), function(type_label)
        {
            # Move the table's own "id" to "variant_id" (as the last column)
            # so that "id" can hold the signal id instead.
            pp            <- coloc_by_transcript[[transcript_id]][[type_label]][["pp"]]
            pp$variant_id <- pp$id
            pp$id         <- NULL

            cbind(data.frame(id = signal_id, transcript_id = transcript_id, type = sub("type ", "", type_label)), pp)
        })), stringsAsFactors = FALSE)
    })), stringsAsFactors = FALSE)
})), stringsAsFactors = FALSE)

# Remove the placeholder rows.
gwas2coloc <- gwas2coloc[gwas2coloc$nsnps > 0, ]

## Combine all

In [None]:
# Per-signal summary table: credible-set size and bin, the signal's "som"
# record, its traits and the eQTLs that colocalize with PP.H4.abf >= 0.8.
#
# The ids are forced to character. Under R < 4.0 data.frame() would turn them
# into a factor, and meta_list[[<factor>]] indexes by the factor's integer
# code rather than by the signal name.
signal2data <- data.frame(id               = as.character(csets2n$id),
                          csets_n_variants = csets2n$n_variants,
                          csets_bin        = csets2n$bin,
                          stringsAsFactors = FALSE
                          )

signal2data <- merge(signal2data, as.data.frame(rbindlist(lapply(signal2data$id, function(id)
{
    x            <- meta_list[[id]]
    cluster_data <- x[["som"]]
    traits       <- x[["traits_cluster"]]
    out          <- cluster_data

    # NOTE(review): unlike the signal2coloc summary earlier in this notebook,
    # traits are neither de-duplicated nor sorted here — confirm that this is
    # intended.
    out$traits   <- paste(traits, collapse = ";")
    out$n_traits <- length(traits)

    # Colocalized eQTLs of this signal, written as "<transcript_id>:<type>".
    coloc        <- gwas2coloc[ gwas2coloc$id == id & gwas2coloc$PP.H4.abf >= 0.8, ]

    if(nrow(coloc) > 0)
    {
        out$transcript_ids   <- paste(coloc$transcript_id, coloc$type, sep = ":", collapse = ";")
        out$n_transcript_ids <- nrow(coloc)
    }else
    {
        out$transcript_ids   <- ""
        out$n_transcript_ids <- 0
    }

    return(out)
})), stringsAsFactors = FALSE))

fwrite(signal2data, "pipeline/5.5.meta_analysis/signal2data.txt", sep = "\t", col.names = TRUE, row.names = FALSE)

In [None]:
# Distribution of signals per locus. The inner table counts signals for each
# locus; the outer table counts how many loci share each of those counts.
# Here n_loci holds the signals-per-locus value and n the number of loci.
signal2locus_n <- table(table(signal2data$locus))
signal2locus_n <- data.frame(
    n_loci = as.numeric(names(signal2locus_n)),
    n      = as.numeric(signal2locus_n)
)
signal2locus_n <- signal2locus_n[order(signal2locus_n$n_loci), ]

barplot(signal2locus_n$n,
        names.arg = signal2locus_n$n_loci,
        horiz     = TRUE,
        las       = 2,
        main      = "Signals per locus")

In [None]:
# Number of signals (n) for each count of associated traits (n_traits).
signal2trait_n <- table(signal2data$n_traits)
signal2trait_n <- data.frame(
    n_traits = as.numeric(names(signal2trait_n)),
    n        = as.numeric(signal2trait_n)
)
signal2trait_n <- signal2trait_n[order(signal2trait_n$n_traits), ]

barplot(signal2trait_n$n,
        names.arg = signal2trait_n$n_traits,
        horiz     = TRUE,
        las       = 2,
        main      = "Traits per signal")

In [None]:
# Number of signals (n) for each count of colocalizing eQTLs (n_transcript_ids).
signal2coloc_n <- table(signal2data$n_transcript_ids)
signal2coloc_n <- data.frame(
    n_transcript_ids = as.numeric(names(signal2coloc_n)),
    n                = as.numeric(signal2coloc_n)
)
signal2coloc_n <- signal2coloc_n[order(signal2coloc_n$n_transcript_ids), ]

# Total number of signals with at least one colocalizing eQTL.
sum(signal2coloc_n[signal2coloc_n$n_transcript_ids > 0, "n"])

barplot(signal2coloc_n$n,
        names.arg = signal2coloc_n$n_transcript_ids,
        horiz     = TRUE,
        las       = 2,
        main      = "Colocalizing eQTLs by signal")

In [None]:
# Display the three count tables built above: signals per locus, traits per
# signal and colocalizing eQTLs per signal.
signal2locus_n
signal2trait_n
signal2coloc_n

# Add cell type information to eQTLs

In [None]:
# Colocalized eQTLs (PP.H4.abf >= 0.8) joined to the per-signal summary.
# all.y = TRUE keeps signals without any colocalized eQTL; their
# transcript_id is NA.
qtl2gwas <- merge(
    gwas2coloc[gwas2coloc$PP.H4.abf >= 0.8, c("id", "transcript_id", "type")],
    signal2data[, c("id", "locus", "csets_n_variants", "csets_bin", "som", "cluster", "class", "traits", "n_traits")],
    all.y = TRUE
)

# Interaction results for the "rna" and "isoform" phenotypes, stacked.
qtl_int <- as.data.frame(rbindlist(qtl_list[["interactions"]][c("rna", "isoform")]))

# All annotation names: tissue and organ body sites plus cell types.
cell_data   <- exp_list[["color"]]
cell_annots <- unique(c(cell_data[["by_tissue"]][, "body_site"],
                        cell_data[["by_organ" ]][, "body_site"],
                        cell_data[["by_cell"  ]][, "cell_type"]))

# Temporary "<transcript_id>:<type>" key shared by both tables.
qtl2gwas$id2type <- paste(qtl2gwas$transcript_id, qtl2gwas$type, sep = ":")
qtl_int$id2type  <- paste(qtl_int$transcript_id, qtl_int$type, sep = ":")

# One 0/1 indicator column per annotation, set to 1 for eQTLs that have an
# interaction row for that annotation with cell == TRUE.
qtl2gwas[, cell_annots] <- 0

for (annot in cell_annots)
{
    x <- qtl_int[qtl_int$interaction == annot & qtl_int$cell == TRUE, "id2type"]
    qtl2gwas[qtl2gwas$id2type %in% x, annot] <- 1
}

qtl2gwas$id2type <- NULL

# Signals without a colocalized eQTL carry NA rather than 0 in the indicators.
qtl2gwas[is.na(qtl2gwas$transcript_id), cell_annots] <- NA

In [None]:
# Number of colocalized eQTLs flagged for each annotation (NA rows ignored).
colSums(qtl2gwas[,cell_annots], na.rm = TRUE)

In [None]:
# Inspect the structure (column names and types) of the combined table.
str(qtl2gwas)

