In [1]:
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_eqtls")

source("script/packages.R"  )
source("script/input_data.R")
source("script/functions.R" )
source("script/colors.R"    )


In [2]:
# Load the gene/isoform info tables and the RBP eQTL table (tab-delimited, with
# header), each returned as a plain data.frame rather than a data.table.
geneinfo_gene <- fread(
    "pipeline/1.2.expression/gene_info.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)
geneinfo_isoform <- fread(
    "pipeline/1.2.expression/isoform_info.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)
eqtl_rbp <- fread(
    "pipeline/7.1.trans/rbp/input/eqtl_rbp.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)

# The mappability and cross-mappability references are read with header = FALSE,
# so their columns are named explicitly below.
gene_map <- fread(
    "/frazer01/reference/public/cross_mapability/hg19_gencode19_75merExon_75merUTR_2mismatch_gene_mappability.txt",
    sep = "\t", header = FALSE, data.table = FALSE
)
gene_cross <- fread(
    "/frazer01/reference/public/cross_mapability/hg19_gencode19_75merExon_75merUTR_2mismatch_cross_mappability.txt",
    sep = "\t", header = FALSE, data.table = FALSE
)

names(gene_map) <- c("gene_id", "mappability")
names(gene_cross) <- c("gene_id1", "gene_id2", "cross")


In [21]:
# Composite "<gene_id> <type>" key for each RBP eQTL record.
eqtl_rbp[["gene2type"]] <- paste(eqtl_rbp[["gene_id"]], eqtl_rbp[["type"]])

In [10]:
# Isoform-to-RBP table (tab-delimited, with header), read as a plain data.frame.
isoform2rbp <- fread(
    "pipeline/7.1.trans/rbp/input/isoform2rbp.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)


In [3]:
# Read every file in the trans_eqtls_by_rbp directory and stack the tables into
# a single data.frame.
trans0 <- local({
    result_files <- list.files("pipeline/7.1.trans/rbp/trans_eqtls_by_rbp/", full.names = TRUE)
    per_file <- vector("list", length(result_files))
    for (i in seq_along(result_files)) {
        per_file[[i]] <- fread(result_files[[i]], sep = "\t", header = TRUE, data.table = FALSE)
    }
    as.data.frame(rbindlist(per_file), stringsAsFactors = FALSE)
})

# Keep an on-disk copy of the combined table before any filtering is applied.
fwrite(
    trans0,
    "pipeline/7.1.trans/rbp/trans_eqtls.unfiltered.txt",
    sep = "\t", col.names = TRUE, row.names = FALSE
)


# Filter by mappability, similar to GTEx methods:
- gene mappability >= 0.8
- remove all cross mapped interactions

In [4]:
# Derive version-less gene IDs (everything before the first "."), so that tables
# annotated with different ID version suffixes can be matched on a common key.
# sub() is vectorised over the whole column; the previous per-element
# lapply/strsplit closure was repeated five times, is very slow on multi-million
# row tables, and raised "subscript out of bounds" on an empty-string ID.
gene_cross      $gene1 = sub("\\..*$", "", gene_cross      $gene_id1)
gene_cross      $gene2 = sub("\\..*$", "", gene_cross      $gene_id2)
gene_map        $gene  = sub("\\..*$", "", gene_map        $gene_id )
geneinfo_gene   $gene  = sub("\\..*$", "", geneinfo_gene   $gene_id )
geneinfo_isoform$gene  = sub("\\..*$", "", geneinfo_isoform$gene_id )


In [5]:
# Attach the gene ID and gene name of each tested transcript. Columns that exist
# on both sides get the suffix "_rbp" (from trans0) or "_trans" (from the
# isoform info table).
trans <- local({
    isoform_annot <- geneinfo_isoform[, c("transcript_id", "gene_id", "gene_name")]
    merge(
        x = trans0,
        y = isoform_annot,
        by.x = "transcript_id_trans",
        by.y = "transcript_id",
        suffixes = c("_rbp", "_trans")
    )
})

In [35]:
# Flag genes whose version-less ID appears in gene_map with mappability >= 0.8.
geneinfo_gene$is_map <- FALSE
geneinfo_gene$is_map[geneinfo_gene$gene %in% gene_map[gene_map$mappability >= 0.8, "gene"]] <- TRUE

gene_cross$gene2gene <- paste(gene_cross$gene1, gene_cross$gene2)

# Restrict the cross-mappability pairs to those where both genes are flagged as
# mappable, translating the version-less IDs back to the gene_id values used in
# geneinfo_gene (gene_id1 / gene_id2 after the second merge).
gene_cross_filtered <- local({
    mappable <- geneinfo_gene[geneinfo_gene$is_map == TRUE, c("gene", "gene_id")]
    first_side <- merge(
        gene_cross[, c("gene1", "gene2", "cross")], mappable,
        by.x = "gene1", by.y = "gene"
    )
    both_sides <- merge(
        first_side[, c("gene1", "gene2", "cross", "gene_id")], mappable,
        by.x = "gene2", by.y = "gene", suffixes = 1:2
    )
    both_sides$gene2gene <- paste(both_sides$gene_id1, both_sides$gene_id2)
    both_sides
})

trans$gene2gene <- paste(trans$gene_id_rbp, trans$gene_id_trans)

nrow(trans)

# Keep a test only if the (RBP gene, target gene) pair is not listed as
# cross-mappable, both genes are flagged mappable, and the two genes differ.
trans_filtered <- local({
    mappable_ids <- geneinfo_gene[geneinfo_gene$is_map == TRUE, "gene_id"]
    not_cross_mapped <- !(trans$gene2gene %in% gene_cross_filtered$gene2gene)
    rbp_mappable <- trans$gene_id_rbp %in% mappable_ids
    target_mappable <- trans$gene_id_trans %in% mappable_ids
    different_gene <- trans$gene_id_rbp != trans$gene_id_trans
    trans[not_cross_mapped & rbp_mappable & target_mappable & different_gene, ]
})

nrow(trans_filtered)


In [36]:
# Add chrom/start/end for the RBP gene and for the target gene (suffixes "_rbp"
# and "_trans"), then keep only pairs whose two genes are on different
# chromosomes.
trans_filtered <- local({
    gene_coords <- geneinfo_gene[, c("gene_id", "chrom", "start", "end")]
    with_rbp <- merge(trans_filtered, gene_coords, by.x = "gene_id_rbp", by.y = "gene_id")
    with_both <- merge(
        with_rbp, gene_coords,
        by.x = "gene_id_trans", by.y = "gene_id",
        suffixes = c("_rbp", "_trans")
    )
    with_both[with_both$chrom_rbp != with_both$chrom_trans, ]
})


In [40]:
# Report how many tests remain in trans_filtered at this point.
nrow(trans_filtered)


In [38]:
# Build a shared "<RBP name> <transcript id>" key on both tables.
isoform2rbp$rbp2iso <- paste(isoform2rbp$rbp_name, isoform2rbp$transcript_id)
# n_bs is a copy of column x (presumably the number of RBP binding sites on the
# isoform -- confirm against the script that writes isoform2rbp.txt).
isoform2rbp$n_bs <- isoform2rbp$x
trans_filtered$rbp2iso <- paste(trans_filtered$gene_name_rbp, trans_filtered$transcript_id_trans)

# Left join: every test is kept; tests with no matching isoform2rbp row get
# n_bs = NA, which is then replaced by 0.
trans2rbp <- merge(
    x = trans_filtered,
    y = isoform2rbp[, c("rbp2iso", "n_bs")],
    all.x = TRUE
)
trans2rbp[is.na(trans2rbp$n_bs), "n_bs"] <- 0

In [45]:
# Composite "<RBP gene_id> <type>" key, in the same format as eqtl_rbp$gene2type.
trans2rbp[["rbp2type"]] <- paste(trans2rbp[["gene_id_rbp"]], trans2rbp[["type"]])


In [44]:
# Inspect the column names and types of the annotated test table.
str(trans2rbp)

'data.frame':	7748617 obs. of  21 variables:
 $ rbp2iso            : chr  "ABCF1 ENST00000000233.10_2" "ABCF1 ENST00000000233.10_2" "ABCF1 ENST00000000233.10_2" "ABCF1 ENST00000000412.3" ...
 $ gene_id_trans      : chr  "ENSG00000004059.11_6" "ENSG00000004059.11_6" "ENSG00000004059.11_6" "ENSG00000003056.3" ...
 $ gene_id_rbp        : chr  "ENSG00000204574.13_6" "ENSG00000204574.13_6" "ENSG00000204574.13_6" "ENSG00000204574.13_6" ...
 $ transcript_id_trans: chr  "ENST00000000233.10_2" "ENST00000000233.10_2" "ENST00000000233.10_2" "ENST00000000412.3" ...
 $ gene_name_rbp      : chr  "ABCF1" "ABCF1" "ABCF1" "ABCF1" ...
 $ type               : int  2 0 1 2 1 0 0 1 2 0 ...
 $ beta_gt            : num  0.00488 -0.00129 -0.00911 -0.03142 -0.04261 ...
 $ se_gt              : num  0.0809 0.1068 0.1362 0.0428 0.072 ...
 $ pval_gt            : num  0.952 0.99 0.947 0.463 0.554 ...
 $ beta_exp           : num  0.1929 0.1929 0.1929 0.0539 0.0539 ...
 $ se_exp             : num  0.083 0.083 0.083 0

# Find trans-eQTLs:
- RBP is eGene
- tested isoform has at least one entry for the RBP in `isoform2rbp` (`n_bs > 0`)

In [46]:
# Candidate trans-eQTL tests: keep rows with n_bs > 0 whose "<RBP gene_id> <type>"
# key matches an eqtl_rbp record flagged egene == TRUE.
trans_fdr        = trans2rbp[trans2rbp$n_bs > 0 & trans2rbp$rbp2type %in% eqtl_rbp[ eqtl_rbp$egene == TRUE, "gene2type"],]
# NOTE: despite the column name, fdr_gt holds Bonferroni-adjusted p-values here.
trans_fdr$fdr_gt = p.adjust(trans_fdr$pval_gt, method = "bonferroni")

# Number of tests passing the 0.05 threshold. sum(..., na.rm = TRUE) replaces
# nrow(df[cond, ]): an NA in a logical row index produces an all-NA row, so tests
# with a missing p-value were previously counted as significant.
sum(trans_fdr$fdr_gt < 0.05, na.rm = TRUE)


In [50]:
# Show the RBP and target gene names for the six tests with the smallest pval_gt.
head(trans_fdr[order(trans_fdr$pval_gt), c("gene_name_rbp", "gene_name_trans")])

Unnamed: 0_level_0,gene_name_rbp,gene_name_trans
Unnamed: 0_level_1,<chr>,<chr>
5549643,RPS3,HLA-C
6849474,U2AF2,P3H3
3883922,LIN28B,DCXR
4027990,METAP2,CD151
4014895,METAP2,ZC3HC1
7481060,YBX3,SLCO3A1


In [41]:
# Pearson correlation (cor.test default) between absolute genotype effect size
# and n_bs across all tests in trans2rbp.
cor.test(abs(trans2rbp$beta_gt), trans2rbp$n_bs)
# Welch two-sample t-test (t.test default) comparing absolute effect sizes of
# tests with n_bs == 0 against tests with n_bs > 0.
t.test(abs(trans2rbp[trans2rbp$n_bs == 0, "beta_gt"]), abs(trans2rbp[trans2rbp$n_bs > 0, "beta_gt"]))


	Pearson's product-moment correlation

data:  abs(trans2rbp$beta_gt) and trans2rbp$n_bs
t = -80.821, df = 7748615, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.02972575 -0.02831873
sample estimates:
        cor 
-0.02902226 



	Welch Two Sample t-test

data:  abs(trans2rbp[trans2rbp$n_bs == 0, "beta_gt"]) and abs(trans2rbp[trans2rbp$n_bs > 0, "beta_gt"])
t = 94.61, df = 4455378, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.005471112 0.005702589
sample estimates:
 mean of x  mean of y 
0.09035696 0.08477011 


In [19]:
# Inspect the column names and types of the RBP eQTL table and the annotated
# test table.
str(eqtl_rbp)
str(trans2rbp)

'data.frame':	240 obs. of  24 variables:
 $ transcript_id: chr  "ENSG00000003756.17_8" "ENSG00000004478.8_4" "ENSG00000004478.8_4" "ENSG00000004478.8_4" ...
 $ gene_id      : chr  "ENSG00000003756.17_8" "ENSG00000004478.8_4" "ENSG00000004478.8_4" "ENSG00000004478.8_4" ...
 $ gene_name    : chr  "RBM5" "FKBP4" "FKBP4" "FKBP4" ...
 $ gene_type    : chr  "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
 $ start        : int  50126352 2904136 2904136 2904136 18942768 15490861 15490861 49337907 49337907 797075 ...
 $ end          : int  50156454 2914589 2914589 2914589 18979039 15529952 15529952 49375297 49375297 812327 ...
 $ strand       : chr  "+" "+" "+" "+" ...
 $ chrom        : int  3 12 12 12 19 19 19 17 17 19 ...
 $ pos          : int  50291617 3131206 2559828 3077678 18542000 15184745 15595193 49333926 49249186 738113 ...
 $ ref          : chr  "G" "T" "G" "C" ...
 $ alt          : chr  "A" "C" "A" "T" ...
 $ rsid         : chr  "rs9852677" "rs3741952" "rs476

# FDR Correction

In [8]:

# Benjamini-Hochberg correction applied separately within each
# "<RBP gene_id> <type>" group of trans_filtered.
trans_fdr = local({
    tested = trans_filtered
    # Build the grouping key here instead of reading trans_filtered$rbp2type:
    # in this notebook that column is only created on trans2rbp, so relying on it
    # made the cell silently return an empty table when run top to bottom.
    tested$rbp2type = paste(tested$gene_id_rbp, tested$type)

    # split() partitions the rows in one pass (groups ordered by sorted key),
    # instead of rescanning the full table once per group.
    per_group = lapply(split(tested, tested$rbp2type), function(this)
    {
        this$fdr_gt = p.adjust(this$pval_gt, method = "BH")
        this
    })

    as.data.frame(rbindlist(per_group), stringsAsFactors = FALSE)
})

# Number of tests with group-wise FDR < 0.05. na.rm = TRUE keeps tests with a
# missing p-value from being counted, which nrow(df[cond, ]) did via NA rows.
sum(trans_fdr$fdr_gt < 0.05, na.rm = TRUE)


In [41]:
# Total number of tests.
nrow(trans_filtered)

# Global Benjamini-Hochberg correction across all tests.
trans_filtered$fdr_gt = p.adjust(trans_filtered$pval_gt, method = "BH")

# Count with sum(..., na.rm = TRUE) rather than nrow(df[cond, ]): an NA in a
# logical row index yields an all-NA row that nrow() would count as a hit, and
# subsetting the whole table only to count rows is wasteful at this size.
sum(trans_filtered$fdr_gt < 0.05, na.rm = TRUE)

# Global Bonferroni correction. NOTE: this overwrites fdr_gt, so after this cell
# the column holds Bonferroni-adjusted p-values, not BH FDR.
trans_filtered$fdr_gt = p.adjust(trans_filtered$pval_gt, method = "bonferroni")

sum(trans_filtered$fdr_gt < 0.05, na.rm = TRUE)




In [9]:
# Significant tests (fdr_gt < 0.05), most significant first. which() drops rows
# whose fdr_gt is NA; a bare logical index would insert an all-NA row for each.
a = trans_fdr[which(trans_fdr$fdr_gt < 0.05),]
a[order(a$pval_gt),]

Unnamed: 0_level_0,transcript_id_trans,gene_id_rbp,gene_name_rbp,type,beta_gt,se_gt,pval_gt,beta_exp,se_exp,pval_exp,gene_id_trans,gene_name_trans,gene2gene,rbp2type,fdr_gt
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>
1290835,ENST00000221943.13_3,ENSG00000077348.9_5,EXOSC5,0,-1.8316911,0.07005790,2.561077e-111,0.329216320,0.06715195,1.130755e-06,ENSG00000105341.18_8,DMAC2,ENSG00000077348.9_5 ENSG00000105341.18_8,ENSG00000077348.9_5 0,9.199900e-107
1305077,ENST00000438807.7_2,ENSG00000077348.9_5,EXOSC5,0,1.7660105,0.07024377,5.650790e-105,-0.353242412,0.06619657,1.214285e-07,ENSG00000105341.18_8,DMAC2,ENSG00000077348.9_5 ENSG00000105341.18_8,ENSG00000077348.9_5 0,1.014938e-100
1321819,ENST00000589970.5_4,ENSG00000077348.9_5,EXOSC5,0,1.4305681,0.06605678,2.482885e-83,-0.248140228,0.05874833,2.658543e-05,ENSG00000105341.18_8,DMAC2,ENSG00000077348.9_5 ENSG00000105341.18_8,ENSG00000077348.9_5 0,2.973007e-79
4447242,ENST00000232766.6_2,ENSG00000132153.15_4,DHX30,0,1.0885030,0.05868925,7.645252e-65,-0.030778172,0.04488684,4.930974e-01,ENSG00000114648.12_5,KLHL18,ENSG00000132153.15_4 ENSG00000114648.12_5,ENSG00000132153.15_4 0,2.740135e-60
4466322,ENST00000483201.1_1,ENSG00000132153.15_4,DHX30,0,-1.1254084,0.06119269,6.021527e-64,0.036452486,0.04668954,4.351691e-01,ENSG00000114648.12_5,KLHL18,ENSG00000132153.15_4 ENSG00000114648.12_5,ENSG00000132153.15_4 0,1.079088e-59
7308091,ENST00000545991.1_2,ENSG00000177192.14_6,PUS1,1,1.1263665,0.08303945,4.057461e-38,0.173940835,0.06592876,8.482515e-03,ENSG00000185163.10_5,DDX51,ENSG00000177192.14_6 ENSG00000185163.10_5,ENSG00000177192.14_6 1,1.457075e-33
6406449,ENST00000503548.1_1,ENSG00000163950.13_6,SLBP,0,0.8465191,0.06433221,3.697921e-36,-0.004642902,0.06630632,9.441924e-01,ENSG00000163945.18_7,UVSSA,ENSG00000163950.13_6 ENSG00000163945.18_7,ENSG00000163950.13_6 0,1.328256e-31
1422302,ENST00000520773.5_2,ENSG00000086589.12_6,RBM22,0,1.2615286,0.11152995,9.485335e-28,0.063131727,0.05746532,2.722476e-01,ENSG00000197083.11_6,ZNF300P1,ENSG00000086589.12_6 ENSG00000197083.11_6,ENSG00000086589.12_6 0,3.407038e-23
7292419,ENST00000397333.4_3,ENSG00000177192.14_6,PUS1,1,-0.7217171,0.06656053,9.186856e-26,-0.005428641,0.05113513,9.154781e-01,ENSG00000185163.10_5,DDX51,ENSG00000177192.14_6 ENSG00000185163.10_5,ENSG00000177192.14_6 1,1.649546e-21
1317503,ENST00000546362.5_1,ENSG00000077348.9_5,EXOSC5,0,-1.0221577,0.10044566,4.825409e-23,0.243054959,0.07605391,1.445184e-03,ENSG00000142046.15_5,TMEM91,ENSG00000077348.9_5 ENSG00000142046.15_5,ENSG00000077348.9_5 0,4.333459e-19


In [33]:
# Inspect the column names and types of the merged (unfiltered) test table.
str(trans)

'data.frame':	9185040 obs. of  12 variables:
 $ transcript_id_trans: chr  "ENST00000000233.10_2" "ENST00000000233.10_2" "ENST00000000233.10_2" "ENST00000000233.10_2" ...
 $ gene_id_rbp        : chr  "ENSG00000198301.12_6" "ENSG00000137656.12_6" "ENSG00000138385.16_4" "ENSG00000278053.5_5" ...
 $ gene_name_rbp      : chr  "SDAD1" "BUD13" "SSB" "DDX52" ...
 $ type               : int  0 0 1 0 0 1 1 0 1 2 ...
 $ beta_gt            : num  -0.0131 0.065 -0.0874 0.0218 -0.0293 ...
 $ se_gt              : num  0.0903 0.1167 0.0765 0.077 0.1341 ...
 $ pval_gt            : num  0.885 0.578 0.254 0.777 0.827 ...
 $ beta_exp           : num  0.1283 -0.016 0.1933 0.0125 0.0628 ...
 $ se_exp             : num  0.0734 0.0462 0.092 0.0658 0.0587 ...
 $ pval_exp           : num  0.0809 0.7288 0.036 0.8489 0.2847 ...
 $ gene_id_trans      : chr  "ENSG00000004059.11_6" "ENSG00000004059.11_6" "ENSG00000004059.11_6" "ENSG00000004059.11_6" ...
 $ gene_name_trans    : chr  "ARF5" "ARF5" "ARF5" "ARF5" ...


In [16]:
# Show the six rows of trans with the smallest pval_exp.
head(trans[order(trans$pval_exp),])

Unnamed: 0_level_0,gene_id,gene_name,type,beta_gt,se_gt,pval_gt,beta_exp,se_exp,pval_exp,transcript_id_trans
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
498561,ENSG00000021776.11_6,AQR,0,-0.02484298,0.04367786,0.569655874,0.7761448,0.01896424,4.447857000000001e-204,ENST00000559767.1_2
7296221,ENSG00000165392.11_5,WRN,0,0.26895872,0.09155295,0.003394549,-1.3214207,0.03331833,2.1176260000000003e-196,ENST00000298139.7_3
7334492,ENSG00000165392.11_5,WRN,1,0.02362748,0.05871155,0.687465268,-1.3214207,0.03331833,2.1176260000000003e-196,ENST00000298139.7_3
1303564,ENSG00000065183.16_4,WDR3,0,0.03334196,0.0607721,0.583395557,-1.3857772,0.03957107,1.104553e-167,ENST00000349139.6_2
498558,ENSG00000021776.11_6,AQR,0,-0.02747553,0.03871158,0.478052022,-0.5868621,0.01680972,5.2519889999999996e-167,ENST00000156471.10_3
4337194,ENSG00000124571.18_7,XPO5,0,-0.01217756,0.03581265,0.733913108,-0.7151746,0.02060243,9.329656999999999e-166,ENST00000265351.12_3
