In [2]:
library(CPBayes)

# Perform colocalization at each locus, between each pair of traits, for all ethnicities

In [8]:
# Every relative path used below ("pipeline/...", "script/...") is resolved against this project directory.
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_gwas_coloc")

# Load the project's shared helper script.
# NOTE(review): presumably this is where add_rownames() and the data.table / tabix dependencies come from - confirm.
source("script/functions.R"  )


In [9]:
# Create the output folders for this step; showWarnings = FALSE keeps re-runs quiet when they already exist.
# The parent is created first because dir.create() is not called with recursive = TRUE.
dir.create("pipeline/2.1.coloc"               , showWarnings = FALSE)
dir.create("pipeline/2.1.coloc/coloc_by_locus", showWarnings = FALSE)


In [10]:
# Load the tab-separated tables produced by the earlier pipeline steps (1.1 and 1.2) as plain data.frames.
# add_rownames() is not defined in this notebook; the functions further down index `manifest` by study id
# and `loci` by locus id via row names, so it presumably sets those row names - TODO confirm.
manifest   = add_rownames(fread("pipeline/1.1.sumstats/manifest.txt"                     , sep = "\t", header = TRUE, data.table = FALSE))
loci       = add_rownames(fread("pipeline/1.2.genomewide_significant_loci/loci.txt"      , sep = "\t", header = TRUE, data.table = FALSE))
loci2study = add_rownames(fread("pipeline/1.2.genomewide_significant_loci/loci2study.txt", sep = "\t", header = TRUE, data.table = FALSE))


In [11]:
# Population labels: the meta-analysis ("meta") plus six population codes.
# NOTE(review): run_coloc_gwas() below builds its own local `populations` from the manifest, so this
# global is not read by the functions visible in this notebook - confirm whether it is still needed.
populations = c('meta','AFR','AMR','CSA','EAS','EUR','MID')

# Create qsub for each locus

In [6]:
# Submit the per-locus colocalization job to the scheduler.
#
# locus: locus identifier, passed as the only argument to script/2.1.coloc_by_locus.sh.
# Returns whatever system() returns for the qsub call.
run_qsub = function(locus)
{
    # The job script is addressed by absolute path, built from the current working directory.
    job_script = paste(getwd(), "script", "2.1.coloc_by_locus.sh", sep = "/")
    submit_cmd = paste("qsub", job_script, locus)
    
    #message(submit_cmd)
    
    system(submit_cmd)
}

# Submit one job per locus; invisible() hides the list of system() return values from the notebook output.
invisible(lapply(loci$locus, run_qsub))


# After colocalization is done, combine all data

In [12]:
# Read the per-locus result table of every locus and stack them into one data.frame.
coloc = local({
    per_locus = lapply(loci$locus, function(locus_id)
    {
        locus_file = paste0("pipeline/2.1.coloc/coloc_by_locus/", locus_id, ".txt")
        
        fread(locus_file, sep = "\t", header = TRUE, data.table = FALSE)
    })
    
    as.data.frame(rbindlist(per_locus), stringsAsFactors = FALSE)
})

# Write the combined table next to the per-locus folder.
fwrite(coloc, "pipeline/2.1.coloc/coloc.txt", sep = "\t", col.names = TRUE, row.names = FALSE)

# SCRATCH:
- Create functions to run coloc

In [None]:
suppressPackageStartupMessages(library(coloc))

In [None]:
# Extract the rows of a bgzipped, tabix-indexed summary-statistics file that fall inside a region.
#
# coord    : region string passed straight to tabix.read.table (run_coloc_gwas builds it as "chrom:from-to").
# gwas_file: path to the summary-statistics file; it is read with zcat for the header and with tabix for the rows.
#
# Returns a data.frame whose column names come from the file's header line, with an added
# `variant_id` column ("chr_pos_ref_alt") that is also used as the row names.
get_gwas_data = function(coord, gwas_file)
{
    # Column names are taken from the first lines of the file, not from the tabix output.
    header_cmd  = paste("zcat", gwas_file, "|", "head -n 2")
    header_peek = fread(cmd = header_cmd, sep = "\t", header = TRUE, data.table = FALSE)
    
    region_rows = suppressWarnings(tabix.read.table(gwas_file, coord, col.names = TRUE, stringsAsFactors = FALSE))
    
    colnames(region_rows)  = colnames(header_peek)
    region_rows$variant_id = paste(region_rows$chr, region_rows$pos, region_rows$ref, region_rows$alt, sep = "_")
    rownames(region_rows)  = region_rows$variant_id
    
    region_rows
}

# Build the dataset list handed to coloc.abf for one study and one population.
#
# study      : row name of the study in `manifest`.
# totest     : summary statistics for the region, row-named by variant id, with columns
#              pval_<pop> and af_controls_<pop> (case/control traits) or af_<pop> (other traits).
# variants   : variant ids to include, in the order they should appear in the dataset.
# pop        : population label, or "meta" for the meta-analysis.
# populations: population labels whose manifest counts are summed when pop == "meta".
# manifest   : study table with trait_type and n_cases_<pop> / n_controls_<pop> columns.
#
# Returns list(snp, pvalues, N, s, MAF, type = "cc") for trait types "categorical", "icd10"
# and "phecode", and list(snp, pvalues, N, MAF, type = "quant") for every other trait type.
create_dataset = function(study, totest, variants, pop, populations, manifest)
{
    # Manifest count for one column prefix: summed over `populations` for the meta-analysis,
    # otherwise the single value of the requested population.
    count_for = function(prefix)
    {
        if(pop == "meta"){return(sum(manifest[study, paste(prefix, populations, sep = "_")]))}
        
        manifest[study, paste(prefix, pop, sep = "_")]
    }
    
    pval_col        = paste("pval", pop, sep = "_")
    is_case_control = manifest[study, "trait_type"] %in% c("categorical", "icd10", "phecode")
    
    if(is_case_control)
    {
        af_col  = paste("af_controls", pop, sep = "_")
        n_cases = count_for("n_cases")
        n_total = n_cases + count_for("n_controls")
        
        # Drop rows lacking a p-value or an allele frequency before picking the requested variants.
        usable  = totest[!is.na(totest[, pval_col]) & !is.na(totest[, af_col]), ]
        
        return(list(snp     = variants,
                    pvalues = usable[variants, pval_col],
                    N       = n_total,
                    s       = n_cases / n_total,
                    MAF     = usable[variants, af_col],
                    type    = "cc"))
    }
    
    # Non case/control traits: the sample size is read from the n_cases column(s).
    af_col = paste("af", pop, sep = "_")
    usable = totest[!is.na(totest[, pval_col]) & !is.na(totest[, af_col]), ]
    
    list(snp     = variants,
         pvalues = usable[variants, pval_col],
         N       = count_for("n_cases"),
         MAF     = usable[variants, af_col],
         type    = "quant")
}

# Colocalize two studies at one locus for a single population.
#
# locus      : locus identifier, copied into the output row.
# study1/2   : row names of the two studies in `manifest`.
# pop        : population label, or "meta" for the meta-analysis.
# populations: population labels shared by both studies; "meta" is removed before the
#              labels are handed to create_dataset.
# totest1/2  : regional summary statistics of each study, with a variant_id column plus
#              pval_<pop> and af_controls_<pop> (case/control traits) or af_<pop> (other traits).
# manifest   : study table (trait_type and sample-size columns).
#
# Returns a one-row data.frame with the columns
#   nsnps, PP.H0.abf .. PP.H4.abf, locus, study1, study2, pop, variant_id, pp_snp
# where variant_id / pp_snp describe the variant with the highest value in the last column
# of the coloc per-SNP results. When 100 or fewer variants are usable in both studies, coloc
# is not run and a placeholder row is returned (nsnps = 0, PP.H0.abf = 1, empty variant_id).
#
# Fixes relative to the previous version:
#   * coloc.abf was called with dataset2 for both arguments, so study2 was colocalized with itself;
#   * the placeholder row named all five posterior columns PP.H0.abf, which data.frame() turned
#     into PP.H0.abf.1 .. PP.H0.abf.4 instead of PP.H1.abf .. PP.H4.abf.
run_coloc_by_pop = function(locus, study1, study2, pop, populations, totest1, totest2, manifest)
{
    trait_type1     = manifest[study1, "trait_type"]
    trait_type2     = manifest[study2, "trait_type"]
    
    message(paste(trait_type1, trait_type2))
    
    # Variants with both a p-value and an allele frequency for this population. Case/control
    # traits use the allele frequency in controls, every other trait type the overall one.
    usable_variants = function(totest, trait_type)
    {
        af_prefix = if(trait_type %in% c("categorical", "icd10", "phecode")) "af_controls" else "af"
        keep      = !is.na(totest[, paste("pval", pop, sep = "_")]) & !is.na(totest[, paste(af_prefix, pop, sep = "_")])
        
        totest[keep, "variant_id"]
    }
    
    variants        = intersect(usable_variants(totest1, trait_type1), usable_variants(totest2, trait_type2))
    
    if(length(variants) > 100)
    {
        dataset1        = create_dataset(study1, totest1, variants, pop, populations[populations != "meta"], manifest)
        dataset2        = create_dataset(study2, totest2, variants, pop, populations[populations != "meta"], manifest)
        coloc_mapped    = coloc.abf(dataset1 = dataset1, dataset2 = dataset2)
        probs           = as.data.frame(t(coloc_mapped$summary))
        
        # Keep the SNP id and the last column of the per-SNP results, then report the top variant.
        myres           = coloc_mapped$results
        myres           = myres[, c(which(colnames(myres) == "snp"), ncol(myres))]
        colnames(myres) = c("variant_id", "pp_snp")
        myres           = cbind(data.frame(locus   = locus,
                                           study1  = study1,
                                           study2  = study2,
                                           pop     = pop,
                                           stringsAsFactors = FALSE
                                          ) , myres)
        myres           = myres[order(myres$pp_snp, decreasing = TRUE), ]
        out             = cbind(probs, myres[1, ])
    }else
    {
        # Too few shared variants: placeholder row with the same column names as the coloc branch.
        out = data.frame(nsnps = 0, PP.H0.abf = 1, PP.H1.abf = 0, PP.H2.abf = 0, PP.H3.abf = 0, PP.H4.abf = 0, 
                         locus = locus, study1 = study1, study2 = study2, pop = pop, variant_id = "", pp_snp = 0,
                         stringsAsFactors = FALSE
                        )
    }
    
    return(out)
}

# Colocalize two studies at one locus in every population they have in common.
#
# locus   : row name of the locus in `loci` (columns chrom, from, to define the region).
# study1/2: row names of the two studies in `manifest` (columns sumstat_file and pops are read here).
# loci    : locus table.
# manifest: study table.
#
# Returns a data.frame with one row per shared population ("meta" is always included),
# as produced by run_coloc_by_pop.
run_coloc_gwas = function(locus, study1, study2, loci, manifest)
{
    # Populations of a study: "meta" plus the comma-separated labels in its `pops` field.
    pops_of = function(study)
    {
        c("meta", unlist(strsplit(manifest[study, "pops"], ",")))
    }
    
    region      = paste0(loci[locus, "chrom"], ":", loci[locus, "from"], "-", loci[locus, "to"])
    totest1     = get_gwas_data(region, manifest[study1, "sumstat_file"])
    totest2     = get_gwas_data(region, manifest[study2, "sumstat_file"])
    populations = intersect(pops_of(study1), pops_of(study2))
    
    per_pop     = lapply(populations, function(pop)
    {
        run_coloc_by_pop(locus, study1, study2, pop, populations, totest1, totest2, manifest)
    })
    
    as.data.frame(rbindlist(per_pop), stringsAsFactors = FALSE)
}

#ii    = 1
#jj    = 2
# Scratch example: one locus id (used as a row name of `loci` inside run_coloc_gwas).
locus = "1_965800_2544414"

# Two study ids (used as row names of `manifest` inside run_coloc_gwas).
id1   = "biomarkers-30600-both_sexes-irnt"
id2   = "phecode-366-both_sexes"

# Run the colocalization for this single pair of studies and keep the result for inspection.
x = run_coloc_gwas(locus, id1, id2, loci, manifest)

#out = as.data.frame(rbindlist(lapply(1:(nrow(manifest) - 1), function(ii)
#{
#    id1 = manifest[ii, "id"]
#    return(as.data.frame(rbindlist(lapply((ii + 1):nrow(manifest), function(jj)
#    {
#        id2   = manifest[jj, "id"]
#        
#        return(run_coloc_gwas(locus, id1, id2, loci, manifest))
#    })), stringsAsFactors = FALSE))
#})), stringsAsFactors = FALSE)
#
#fwrite(out, paste0("pipeline/2.1.coloc/coloc_by_locus/", locus, ".txt"), sep = "\t", col.names = TRUE, row.names = FALSE)




In [None]:
# Inspect the structure (columns and types) of the scratch result.
str(x)

In [None]:
# List the distinct trait types in the manifest; create_dataset treats "categorical", "icd10" and
# "phecode" as case/control and every other value as quantitative.
sort(unique(manifest$trait_type))