In [1]:
# Work from the project root so that the relative paths used below resolve.
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_eqtls")

# Run the three shared project scripts in their original order. source() is
# called with its defaults, so each script is evaluated in the global
# environment; invisible() keeps the list of return values out of the output.
invisible(lapply(
    c("script/packages.R", "script/input_data.R", "script/functions.R"),
    source
))

# Output folder for this notebook; stay quiet if it already exists.
dir.create("pipeline/4.1.differential_expression", showWarnings = FALSE)


# Input data


In [2]:
# Metadata table, read as a plain data.frame.
metadata <- fread(
    "pipeline/3.1.covariates/metadata.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)

# Covariate table; add_rownames() is a project helper defined outside this
# chunk (presumably it moves an identifier column into the row names -- confirm).
covariates <- add_rownames(fread(
    "pipeline/3.1.covariates/covariates.txt",
    sep = "\t", header = TRUE, data.table = FALSE
))

# Covariate names listed for the eQTL analysis. This value is replaced two
# statements below; the read is kept so the cell behaves exactly as before.
int_list <- readLines("pipeline/3.2.eqtls/vars/cardiac_eqtls.txt")
#int_list   = int_list[grepl("^peer", int_list) == FALSE & grepl("^pc", int_list) == FALSE] # original

# Keep only these two covariate columns (mitochondrial reads removed as a covariate).
int_list <- c("sex", "total_reads_norm")
covariates <- covariates[, int_list]

# Normalized expression / usage tables, each passed through add_rownames().
gene_tpm <- add_rownames(fread(
    "pipeline/1.2.expression/tpm_gene.normalized.txt",
    sep = "\t", header = TRUE, data.table = FALSE
))
isof_tpm <- add_rownames(fread(
    "pipeline/1.2.expression/tpm_isoform.normalized.txt",
    sep = "\t", header = TRUE, data.table = FALSE
))
isof_use <- add_rownames(fread(
    "pipeline/1.2.expression/use_isoform.normalized.txt",
    sep = "\t", header = TRUE, data.table = FALSE
))


In [3]:

# Gene and isoform annotation tables, read as plain data.frames.
gene_info <- fread("input/phenotypes/gene_info.txt",    sep = "\t", header = TRUE, data.table = FALSE)
isof_info <- fread("input/phenotypes/isoform_info.txt", sep = "\t", header = TRUE, data.table = FALSE)

# Give the gene table a transcript_id column that mirrors gene_id.
gene_info$transcript_id <- gene_info$gene_id

# Differential expression results written earlier to the 4.1 output folder.
diffgene <- fread(file.path("pipeline/4.1.differential_expression", "diffexp.txt"),
                  sep = "\t", header = TRUE, data.table = FALSE)
diffcell <- fread(file.path("pipeline/4.1.differential_expression", "diffexp_cell.txt"),
                  sep = "\t", header = TRUE, data.table = FALSE)

# Flag rows with q-value below 0.05. Computed as a single vectorised
# expression so that rows with a missing q-value are flagged FALSE (a logical
# row subscript containing NA is rejected by data-frame assignment) and so
# that a table with zero rows is handled as well.
diffgene$diffexp <- !is.na(diffgene$qval) & diffgene$qval < 0.05


# Get gene sets

In [4]:
# Read one MSigDB v7.1 gene-symbol GMT collection into a named list.
#
# Args:
#   gs: collection prefix (e.g. "h.all"). The file read is
#       input/phenotypes/msigdb/<gs>.v7.1.symbols.gmt, relative to the
#       working directory.
#
# Returns:
#   A list with one element per line of the file, named by gene set. Each
#   element is list(gene_set, url, gene_names), taken from tab-separated
#   field 1, field 2 and fields 3 onward respectively. A line with only two
#   fields yields gene_names = character(0).
gmt2list <- function(gs) {
    gmt_path <- file.path("input/phenotypes/msigdb", paste0(gs, ".v7.1.symbols.gmt"))

    records <- lapply(strsplit(readLines(gmt_path), "\t"), function(fields) {
        # Drop the two leading fields by negative index: 3:length(fields)
        # would count down to c(3, 2) when a set lists no genes and return
        # c(NA, url) as its "genes".
        list(gene_set   = fields[[1]],
             url        = fields[[2]],
             gene_names = fields[-(1:2)])
    })
    names(records) <- vapply(records, function(rec) rec$gene_set, character(1))

    records
}

# MSigDB collection prefixes to load; each one is handed to gmt2list().
msigdb <- c(
    "c2.cp.biocarta", "c2.cp.kegg", "c2.cp.reactome",
    "c5.bp", "c5.cc", "c5.mf",
    "h.all"
)
#msigdb               = c("h.all")

# One parsed collection per prefix, named by that prefix.
genesets2test <- setNames(lapply(msigdb, gmt2list), msigdb)


# Functional enrichment


In [12]:
# Fisher's exact test for over-representation of one gene set among the
# differential genes of one tissue comparison.
#
# Args:
#   tissue1, tissue2, type, gs: labels copied into the output row
#       (gs is stored in the gs_source column).
#   gene_set:  list with elements gene_set, url and gene_names.
#   diffexp:   data frame with a gene_name column; only used to decide
#              whether the test is run (more than one row must belong to the set).
#   genes_dif: gene names counted as differential.
#   genes_ns:  gene names counted as not differential.
#
# Returns:
#   A one-row data frame: the labels, ngenes (set members found among
#   genes_dif/genes_ns), the four cells of the 2x2 table, and the estimate,
#   confidence interval and p-value from fisher.test(). The last eight
#   columns are NA when the test is not run.
calculate_functional_enrichment_tissue_by_gene_set <- function(tissue1, tissue2, type, gs, gene_set, diffexp, genes_dif, genes_ns) {
    # Set members that are among the genes under consideration.
    genes_in <- intersect(gene_set[["gene_names"]], c(genes_dif, genes_ns))

    labels <- data.frame(tissue1   = tissue1,
                         tissue2   = tissue2,
                         type      = type,
                         gs_source = gs,
                         gene_set  = gene_set[["gene_set"]],
                         url       = gene_set[["url"]],
                         ngenes    = length(genes_in))

    # Guard clause: without more than one matching row, report NA statistics.
    rows_in_set <- diffexp[diffexp$gene_name %in% genes_in, ]
    if (!(nrow(rows_in_set) > 1)) {
        return(cbind(labels, data.frame(dif_yes  = NA,
                                        dif_no   = NA,
                                        ns_yes   = NA,
                                        ns_no    = NA,
                                        estimate = NA,
                                        ci1      = NA,
                                        ci2      = NA,
                                        pval     = NA)))
    }

    dif_hits <- length(intersect(genes_dif, genes_in))
    ns_hits  <- length(intersect(genes_ns, genes_in))

    # Rows: differential / not significant; columns: in the set / not in it.
    # Stored as doubles, so the count columns of the result are numeric.
    counts <- matrix(as.numeric(c(dif_hits, length(genes_dif) - dif_hits,
                                  ns_hits,  length(genes_ns)  - ns_hits)),
                     nrow = 2, byrow = TRUE,
                     dimnames = list(c("dif", "ns"), c("yes", "no")))

    fisher <- fisher.test(counts)

    cbind(labels, data.frame(dif_yes  = counts["dif", "yes"],
                             dif_no   = counts["dif", "no"],
                             ns_yes   = counts["ns", "yes"],
                             ns_no    = counts["ns", "no"],
                             estimate = fisher$estimate,
                             ci1      = fisher$conf.int[[1]],
                             ci2      = fisher$conf.int[[2]],
                             pval     = fisher$p.value))
}

# Run the gene-set enrichment test for one tissue comparison.
#
# Args:
#   tissue1, tissue2, type: values matched against the columns of the same
#       name in diffgene to pick the rows of this comparison.
#   gs:       gene-set collection label, passed through to the output.
#   gene_set: list(gene_set, url, gene_names) for the set being tested.
#   diffgene: data frame with columns type, tissue1, tissue2, gene_name,
#             qval and beta.
#
# Returns:
#   The one-row data frame produced by
#   calculate_functional_enrichment_tissue_by_gene_set().
run_test <- function(tissue1, tissue2, type, gs, gene_set, diffgene) {
    # which() drops comparisons that evaluate to NA. A bare logical subscript
    # containing NA would return all-NA rows and so add a spurious NA gene
    # name to the differential set.
    in_comparison <- which(diffgene$type == type &
                           diffgene$tissue1 == tissue1 &
                           diffgene$tissue2 == tissue2)
    diffexp <- diffgene[in_comparison, ]

    # Differential: q-value below 0.05 and absolute beta above 2.
    is_dif    <- which(diffexp$qval < 0.05 & abs(diffexp$beta) > 2)
    genes_dif <- unique(diffexp[is_dif, "gene_name"])

    # Every other gene of this comparison is treated as not significant.
    genes_ns  <- unique(diffexp[!diffexp$gene_name %in% genes_dif, "gene_name"])

    calculate_functional_enrichment_tissue_by_gene_set(tissue1, tissue2, type, gs, gene_set, diffexp, genes_dif, genes_ns)
}

# Value matched against diffgene$type by run_test() for every test below.
type <- "isoform_use"
#gs        = "c2.cp.biocarta"
#gene_sets = genesets2test[[gs]]
#gene_set  = gene_sets[[1]]
#tissue1   = "ipsc_cvpc"
#tissue2   = "heart"

# For every collection and every gene set in it, test the three pairwise
# tissue comparisons and stack the one-row results into a single data frame.
tests <- as.data.frame(rbindlist(lapply(msigdb, function(gs) {
    message(paste(type, gs))  # status line, one per collection
    gene_sets <- genesets2test[[gs]]
    as.data.frame(rbindlist(lapply(gene_sets, function(gene_set) {
        # The unreachable run_test() call that used to follow the return here
        # (wrong arity, undefined variables) has been removed.
        rbind(run_test("ipsc_cvpc", "heart"  , type, gs, gene_set, diffgene),
              run_test("ipsc_cvpc", "arteria", type, gs, gene_set, diffgene),
              run_test("heart"    , "arteria", type, gs, gene_set, diffgene))
    })), stringsAsFactors = FALSE)
})), stringsAsFactors = FALSE)

# Drop rows where no test was run, then apply Benjamini-Hochberg correction
# across all remaining tests (every collection and comparison together).
tests     <- tests[!is.na(tests$estimate), ]
tests$fdr <- p.adjust(tests$pval, method = "BH")

fwrite(tests, "pipeline/4.1.differential_expression/functional_enrichment_isoforms.tissue.txt", sep = "\t", col.names = TRUE, row.names = FALSE)




isoform_use c2.cp.biocarta

isoform_use c2.cp.kegg

isoform_use c2.cp.reactome

isoform_use c5.bp

isoform_use c5.cc

isoform_use c5.mf

isoform_use h.all



In [13]:
# Display the enrichment results table; the notebook renders it below.
tests

Unnamed: 0_level_0,tissue1,tissue2,type,gs_source,gene_set,url,ngenes,dif_yes,dif_no,ns_yes,ns_no,estimate,ci1,ci2,pval,fdr
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,ipsc_cvpc,heart,isoform_use,c2.cp.biocarta,BIOCARTA_RELA_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_RELA_PATHWAY,13,0,357,13,10303,0.000000,0.00000000,9.508920,1.00000000,1
2,ipsc_cvpc,arteria,isoform_use,c2.cp.biocarta,BIOCARTA_RELA_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_RELA_PATHWAY,13,0,587,13,10073,0.000000,0.00000000,5.646502,1.00000000,1
3,heart,arteria,isoform_use,c2.cp.biocarta,BIOCARTA_RELA_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_RELA_PATHWAY,13,0,105,13,10555,0.000000,0.00000000,33.461653,1.00000000,1
4,ipsc_cvpc,heart,isoform_use,c2.cp.biocarta,BIOCARTA_NO1_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_NO1_PATHWAY,12,1,356,11,10305,2.631128,0.06096736,18.190303,0.33533183,1
5,ipsc_cvpc,arteria,isoform_use,c2.cp.biocarta,BIOCARTA_NO1_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_NO1_PATHWAY,12,1,586,11,10075,1.562884,0.03625091,10.788693,0.49297072,1
6,heart,arteria,isoform_use,c2.cp.biocarta,BIOCARTA_NO1_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_NO1_PATHWAY,12,0,105,12,10556,0.000000,0.00000000,36.674787,1.00000000,1
7,ipsc_cvpc,heart,isoform_use,c2.cp.biocarta,BIOCARTA_CSK_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_CSK_PATHWAY,7,0,357,7,10309,0.000000,0.00000000,20.104871,1.00000000,1
8,ipsc_cvpc,arteria,isoform_use,c2.cp.biocarta,BIOCARTA_CSK_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_CSK_PATHWAY,7,0,587,7,10079,0.000000,0.00000000,11.939444,1.00000000,1
9,heart,arteria,isoform_use,c2.cp.biocarta,BIOCARTA_CSK_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_CSK_PATHWAY,7,0,105,7,10561,0.000000,0.00000000,70.617145,1.00000000,1
10,ipsc_cvpc,heart,isoform_use,c2.cp.biocarta,BIOCARTA_SRCRPTP_PATHWAY,http://www.gsea-msigdb.org/gsea/msigdb/cards/BIOCARTA_SRCRPTP_PATHWAY,6,0,357,6,10310,0.000000,0.00000000,24.611645,1.00000000,1
