In [1]:
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_eqtls")

source("script/packages.R"  )
source("script/input_data.R")
source("script/functions.R" )
source("script/colors.R"    )


In [2]:
dir.create("pipeline/4.10.diffexp_eclip_enrichment"    , showWarnings = FALSE)
dir.create("pipeline/4.10.diffexp_eclip_enrichment/tmp", showWarnings = FALSE)


In [3]:
geneinfo_genes    = fread("pipeline/1.2.expression/gene_info.txt"   , sep = "\t", header = TRUE , data.table = FALSE)
geneinfo_isoforms = fread("pipeline/1.2.expression/isoform_info.txt", sep = "\t", header = TRUE , data.table = FALSE)


In [4]:
diffgene = fread(paste("pipeline/4.1.differential_expression", "diffexp.txt"     , sep = "/"), sep = "\t", header = TRUE, data.table = FALSE)
diffcell = fread(paste("pipeline/4.1.differential_expression", "diffexp_cell.txt", sep = "/"), sep = "\t", header = TRUE, data.table = FALSE)

diffgene_gene = diffgene[diffgene$type == "gene_tpm",]

diffgene = diffgene[diffgene$type == "isoform_use",]
diffcell = diffcell[diffcell$type == "isoform_use",]




In [12]:
domains      = fread("/reference/public/Prot2HG//prot2hg.txt"                        , sep = ";" , header = TRUE, data.table = FALSE)
exons_unique = fread("pipeline/4.10.diffexp_eclip_enrichment/exons_unique.txt"       , sep = "\t", header = TRUE, data.table = FALSE)
exons2domain = fread("pipeline/4.10.diffexp_eclip_enrichment/exons_unique2domain.txt", sep = "\t", header = TRUE, data.table = FALSE)


In [21]:
exons2domain_single = as.data.frame(rbindlist(lapply(1:nrow(exons_unique), function(ii)
{
    gene_id = unlist(strsplit(exons_unique[ii, "gene_id"], "\\."))[[1]]
    from    = exons_unique[ii, "start"  ]
    to      = exons_unique[ii, "end"    ]
    domain  = sort(unique(domains[domains$ensembl == gene_id & ((domains$chr_start >= from & domains$chr_start <= to) | (domains$chr_end >= from & domains$chr_end <= to)), "feature_name"]))
    
    if(length(domain) > 0)
    {
        out        = exons_unique[rep(ii, length(domain)),]
        out$domain = domain
    }else
    {
        out        = exons_unique[ii,]
        out$domain = ""
    }
    
    return(out)
})), stringsAsFactors = FALSE)

fwrite(exons2domain_single, "pipeline/4.10.diffexp_eclip_enrichment/exons_unique2domain_single.txt", sep = "\t", col.names = TRUE, row.names = FALSE)



In [58]:
domains2test = sort(unique(exons2domain_single[ exons2domain_single$domain != "", "domain"]))
exons        = unique(exons2domain_single[, c("gene_id", "tissue", "tissue1", "tissue2")])

In [60]:
calculate_enrichment = function(domain, tissue1, tissue2, x, exons)
{
    x       = x[x$tissue1 == tissue1 & x$tissue2 == tissue2, ]
    exons   = exons[exons$tissue1 == tissue1 & exons$tissue2 == tissue2, ]
    exons11 = unique(x[x$tissue == tissue1 & x$domain == domain, "gene_id"])
    exons21 = unique(x[x$tissue == tissue2 & x$domain == domain, "gene_id"])
    exons12 = unique(exons[exons$tissue == tissue1 & !exons$gene_id %in% exons11, "gene_id"])
    exons22 = unique(exons[exons$tissue == tissue2 & !exons$gene_id %in% exons21, "gene_id"])
    
    out = data.frame(domain  = domain , 
                     tissue1 = tissue1, 
                     tissue2 = tissue2,
                     x11     = length(exons11),
                     x12     = length(exons12),
                     x21     = length(exons21),
                     x22     = length(exons22)
                    )
    
    totest = matrix(as.numeric(out[1,c("x11", "x12", "x21", "x22")]), nrow = 2, byrow = TRUE)
    test   = fisher.test(totest)
    out    = cbind(out,
                   data.frame(odds_ratio = test$estimate, 
                              log2r      = log2(test$estimate),
                              ci1        = test$conf.int[[1]],
                              ci2        = test$conf.int[[2]],
                              pval       = test$p.value
                             )
                  )
    
    
    return(out)
}

test_ipsc_cvpc_heart   = as.data.frame(rbindlist(lapply(domains2test, function(domain){calculate_enrichment(domain, "ipsc_cvpc", "heart"  , exons2domain_single, exons)})), stringsAsFactors = FALSE)
test_ipsc_cvpc_arteria = as.data.frame(rbindlist(lapply(domains2test, function(domain){calculate_enrichment(domain, "ipsc_cvpc", "arteria", exons2domain_single, exons)})), stringsAsFactors = FALSE)
test_heart_arteria     = as.data.frame(rbindlist(lapply(domains2test, function(domain){calculate_enrichment(domain, "heart"    , "arteria", exons2domain_single, exons)})), stringsAsFactors = FALSE)




In [63]:
test_ipsc_cvpc_heart   = test_ipsc_cvpc_heart  [ test_ipsc_cvpc_heart  $x11 >0 | test_ipsc_cvpc_heart  $x21 > 0,]
test_ipsc_cvpc_arteria = test_ipsc_cvpc_arteria[ test_ipsc_cvpc_arteria$x11 >0 | test_ipsc_cvpc_arteria$x21 > 0,]
test_heart_arteria     = test_heart_arteria    [ test_heart_arteria    $x11 >0 | test_heart_arteria    $x21 > 0,]

test_ipsc_cvpc_heart  $qval = p.adjust(test_ipsc_cvpc_heart  $pval, method = "BH")
test_ipsc_cvpc_arteria$qval = p.adjust(test_ipsc_cvpc_arteria$pval, method = "BH")
test_heart_arteria    $qval = p.adjust(test_heart_arteria    $pval, method = "BH")

nrow(test_ipsc_cvpc_heart  [test_ipsc_cvpc_heart  $qval < 0.05,])
nrow(test_ipsc_cvpc_arteria[test_ipsc_cvpc_arteria$qval < 0.05,])
nrow(test_heart_arteria    [test_heart_arteria    $qval < 0.05,])

domain_data           = unique(domains[,c("feature_name", "CDD")])
colnames(domain_data) = c("domain", "cdd")
#domain_tests          = merge(rbind(test_ipsc_cvpc_heart, test_ipsc_cvpc_arteria, test_heart_arteria), domain_data)
#domain_tests$url      = ""
#domain_tests[domain_tests$cdd != "", "url"] = paste("https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid", sub("CDD:", "", domain_tests[domain_tests$cdd != "", "cdd"]), sep = "=")

domain_tests = rbind(test_ipsc_cvpc_heart, test_ipsc_cvpc_arteria, test_heart_arteria)
fwrite(domain_tests, "pipeline/4.10.diffexp_eclip_enrichment/domain_tests.txt", sep = "\t", col.names = TRUE, row.names = FALSE)


In [64]:
domain_tests[ domain_tests$qval < 0.05,]

Unnamed: 0_level_0,domain,tissue1,tissue2,x11,x12,x21,x22,odds_ratio,log2r,ci1,ci2,pval,qval
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
48,acetylation,ipsc_cvpc,heart,72,424,32,463,2.454805,1.2956085,1.561413,3.930989,4.42544e-05,0.027813891
1155,phosphorylation,ipsc_cvpc,heart,212,284,137,358,1.949342,0.9629872,1.483265,2.56717,8.075507e-07,0.001015091
11551,phosphorylation,ipsc_cvpc,arteria,253,336,172,417,1.824615,0.8675917,1.424169,2.3412,1.149528e-06,0.001535769


In [56]:
head(domain_tests)

Unnamed: 0_level_0,domain,tissue1,tissue2,x11,x12,x21,x22,odds_ratio,log2r,ci1,ci2,pval,qval,cdd,url
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
1,"[DE]-X(1,2)-F-X-X-[FL]-X-X-X-R motif",ipsc_cvpc,heart,1,5450,0,2760,inf,inf,0.01299822,inf,1,1,,
2,"[DE]-X(1,2)-F-X-X-[FL]-X-X-X-R motif",ipsc_cvpc,arteria,1,5873,0,3331,inf,inf,0.01455579,inf,1,1,,
3,11 X 7 AA tandem repeats of [DR]-P-Y-R-[LI][AG][QHP],ipsc_cvpc,heart,1,5450,0,2760,inf,inf,0.01299822,inf,1,1,,
4,11 X 7 AA tandem repeats of [DR]-P-Y-R-[LI][AG][QHP],ipsc_cvpc,arteria,1,5873,0,3331,inf,inf,0.01455579,inf,1,1,,
5,11beta-HSD1_like_SDR_c,ipsc_cvpc,heart,3,5448,1,2759,1.519201,0.6033132,0.12191425,79.75708,1,1,CDD:187593,https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=187593
6,14 X 6 AA repeats of [ED]-R-S-M-M-S,ipsc_cvpc,heart,1,5450,0,2760,inf,inf,0.01299822,inf,1,1,,


In [45]:
test_ipsc_cvpc_heart[ order(test_ipsc_cvpc_heart$pval),]

Unnamed: 0_level_0,domain,tissue1,tissue2,x11,x12,x21,x22,odds_ratio,log2r,ci1,ci2,pval
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
706,Integrin_alpha2,ipsc_cvpc,heart,4,5447,26,2734,0.07723351,-3.694629,0.019558813,0.2229884,2.627612e-09
496,FANCI,ipsc_cvpc,heart,4,5447,23,2737,0.08739675,-3.516277,0.021952638,0.2561814,4.545723e-08
987,Myosin_head,ipsc_cvpc,heart,38,5413,0,2760,Inf,Inf,4.995424089,Inf,2.490259e-07
670,I-set,ipsc_cvpc,heart,1,5450,16,2744,0.03147826,-4.989500,0.000755118,0.2028236,3.003939e-07
989,Myosin_tail_1,ipsc_cvpc,heart,42,5409,1,2759,21.41956350,4.420857,3.635679531,863.1504763,7.541168e-07
704,Int_alpha,ipsc_cvpc,heart,4,5447,19,2741,0.10596207,-3.238380,0.026192690,0.3192898,1.861311e-06
1726,UCH,ipsc_cvpc,heart,49,5402,4,2756,6.24928228,2.643691,2.287489901,23.8675965,1.302645e-05
1063,O-FucT,ipsc_cvpc,heart,2,5449,14,2746,0.07200765,-3.795706,0.007927086,0.3138969,1.304819e-05
1064,O-FucT-2,ipsc_cvpc,heart,2,5449,14,2746,0.07200765,-3.795706,0.007927086,0.3138969,1.304819e-05
225,Cast,ipsc_cvpc,heart,0,5451,10,2750,0.00000000,-Inf,0.000000000,0.2253085,1.821471e-05


In [8]:
exons2domain$has_domain = FALSE
exons2domain[exons2domain$domain != "", "has_domain"] = TRUE


In [9]:
test_exon2domain = function(tissue1, tissue2, exons2domain)
{
    x     = exons2domain[exons2domain$tissue1 == tissue1 & exons2domain$tissue2 == tissue2, ]
    out   = data.frame(x11 = nrow(x[x$tissue == tissue1 & x$has_domain == TRUE ,]),
                       x12 = nrow(x[x$tissue == tissue1 & x$has_domain == FALSE,]),
                       x21 = nrow(x[x$tissue == tissue2 & x$has_domain == TRUE ,]),
                       x22 = nrow(x[x$tissue == tissue2 & x$has_domain == FALSE,])
                      )
    
    totest = matrix(as.numeric(out[1,]), nrow = 2, byrow = TRUE)
    test   = fisher.test(totest)
    out    = cbind(out, data.frame(tissue1 = tissue1, tissue2 = tissue2, estimate = test$estimate, ci1 = test$conf.int[[1]], ci2 = test$conf.int[[2]], pval = test$p.value))
    
    return(out)
}
tests = as.data.frame(rbindlist(list(test_exon2domain("ipsc_cvpc", "heart"  , exons2domain), 
                                     test_exon2domain("ipsc_cvpc", "arteria", exons2domain), 
                                     test_exon2domain("heart"    , "arteria", exons2domain) 
                                    )), stringsAsFactors = FALSE)

tests

x11,x12,x21,x22,tissue1,tissue2,estimate,ci1,ci2,pval
<int>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
3611,2184,1637,1199,ipsc_cvpc,heart,1.2109655,1.1038711,1.3283887,4.412539e-05
3941,2424,2041,1444,ipsc_cvpc,arteria,1.1502442,1.0562927,1.2524879,0.001209298
328,338,450,198,heart,arteria,0.4272654,0.3383922,0.5385876,1.01601e-13


In [13]:
str(exons_unique)

'data.frame':	19795 obs. of  15 variables:
 $ chrom          : chr  "chr6" "chr6" "chr6" "chr6" ...
 $ start          : int  53379133 53379237 53380907 53385576 53387213 53388053 53388715 53362139 53364843 53365045 ...
 $ end            : int  53379135 53379295 53381020 53385758 53387325 53388121 53388726 53363765 53364963 53365148 ...
 $ strand         : chr  "-" "-" "-" "-" ...
 $ gene_id        : chr  "ENSG00000001084.13_10" "ENSG00000001084.13_10" "ENSG00000001084.13_10" "ENSG00000001084.13_10" ...
 $ gene_name      : chr  "GCLC" "GCLC" "GCLC" "GCLC" ...
 $ gene_type      : chr  "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
 $ exon_id        : chr  "ENSE00002447953.1_1" "ENSE00003627005.1_1" "ENSE00003816642.1_1" "ENSE00003829214.1_1" ...
 $ exon_number    : int  1 2 3 4 5 6 7 1 2 3 ...
 $ transcript_id  : chr  "ENST00000514933.2_3" "ENST00000514933.2_3" "ENST00000514933.2_3" "ENST00000514933.2_3" ...
 $ transcript_type: chr  "processed_transcript" "proces

In [11]:
str(exons2domain)

'data.frame':	19795 obs. of  17 variables:
 $ chrom          : chr  "chr6" "chr6" "chr6" "chr6" ...
 $ start          : int  53379133 53379237 53380907 53385576 53387213 53388053 53388715 53362139 53364843 53365045 ...
 $ end            : int  53379135 53379295 53381020 53385758 53387325 53388121 53388726 53363765 53364963 53365148 ...
 $ strand         : chr  "-" "-" "-" "-" ...
 $ gene_id        : chr  "ENSG00000001084.13_10" "ENSG00000001084.13_10" "ENSG00000001084.13_10" "ENSG00000001084.13_10" ...
 $ gene_name      : chr  "GCLC" "GCLC" "GCLC" "GCLC" ...
 $ gene_type      : chr  "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
 $ exon_id        : chr  "ENSE00002447953.1_1" "ENSE00003627005.1_1" "ENSE00003816642.1_1" "ENSE00003829214.1_1" ...
 $ exon_number    : int  1 2 3 4 5 6 7 1 2 3 ...
 $ transcript_id  : chr  "ENST00000514933.2_3" "ENST00000514933.2_3" "ENST00000514933.2_3" "ENST00000514933.2_3" ...
 $ transcript_type: chr  "processed_transcript" "proces

In [10]:
str(domains)

'data.frame':	1231249 obs. of  22 variables:
 $ id           : int  1 2 3 4 5 6 7 8 9 10 ...
 $ gene         : chr  "A2M" "A2M" "A2M" "A2M" ...
 $ protein_ID   : chr  "NP_000005.2" "NP_000005.2" "NP_000005.2" "NP_000005.2" ...
 $ gene_ID      : chr  "NM_000014" "NM_000014" "NM_000014" "NM_000014" ...
 $ strand       : chr  "-" "-" "-" "-" ...
 $ type         : chr  "Region" "Region" "Region" "Region" ...
 $ feature_name : chr  "YfaS" "YfaS" "YfaS" "YfaS" ...
 $ prot_start   : int  124 124 124 124 124 124 124 124 124 124 ...
 $ prot_end     : int  1071 1071 1071 1071 1071 1071 1071 1071 1071 1071 ...
 $ cds_start    : int  749 749 749 749 749 749 749 749 749 749 ...
 $ cds_end      : int  3593 3593 3593 3593 3593 3593 3593 3593 3593 3593 ...
 $ cds_length   : int  2844 2844 2844 2844 2844 2844 2844 2844 2844 2844 ...
 $ rev_trans_map: num  1 1 1 1 1 1 1 1 1 1 ...
 $ chr_start    : int  9230360 9231840 9232235 9232690 9241796 9242498 9242952 9243797 9246061 9247569 ...
 $ chr_end      : 