# Test whether eQTLs for RNA-binding proteins (RBPs) are associated with the expression of isoforms bound by the same RBP
- RBP binding sites: eCLIP peaks from `/frazer01/home/tarthur/projects/CARDIPS/rbp_analysis/sorted_bed/` (note: the code below reads the peak files from the sibling folder `sorted_peaks/`)


In [1]:
# Work from the project root so that every relative path used below resolves.
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_eqtls")

# Load the shared project scripts, in the same order as always.
invisible(lapply(c("script/packages.R",
                   "script/input_data.R",
                   "script/functions.R",
                   "script/colors.R"), source))


In [2]:
# Input tables for this notebook. Every file is tab-separated with a header row
# and is loaded as a plain data.frame.

# Gene information table from the input phenotypes folder
geneinfo_gene_all = fread("input/phenotypes/gene_info.txt",
                          sep = "\t", header = TRUE, data.table = FALSE)

# Gene and isoform information tables from pipeline step 1.2
geneinfo_gene     = fread("pipeline/1.2.expression/gene_info.txt",
                          sep = "\t", header = TRUE, data.table = FALSE)
geneinfo_isoform  = fread("pipeline/1.2.expression/isoform_info.txt",
                          sep = "\t", header = TRUE, data.table = FALSE)

# Differential expression results from pipeline step 4.1
diffexp           = fread("pipeline/4.1.differential_expression/diffexp.txt",
                          sep = "\t", header = TRUE, data.table = FALSE)

# Gene-level eGenes table from pipeline step 3.2
eqtl_genes        = fread("pipeline/3.2.eqtls/eqtls/cardiac_eqtls.gene.egenes.txt",
                          sep = "\t", header = TRUE, data.table = FALSE)


In [3]:
# Folder with the eCLIP peak files, and the BED files actually present in it
rbp_folder = "/frazer01/home/tarthur/projects/CARDIPS/rbp_analysis/sorted_peaks"
rbpbs_list = list.files(rbp_folder, pattern = "bed$", full.names = TRUE)

# ENCODE eCLIP metadata: keep six columns and give them short names
rbp_meta = fread("/reference/public/ENCODE/rbp_eclip_2020_09_09/metadata.tsv", sep = "\t", header = TRUE, data.table = FALSE)
rbp_meta = setNames(rbp_meta[,c("File accession", "File format type", "File assembly", "Assay", "Biosample term name", "Experiment target")],
                    c("id", "filetype", "assembly", "assay", "cell_line", "gene_name"))

# Expected peak file of each accession; drop the "-human" suffix of the target name
rbp_meta$infile    = paste0(rbp_folder, "/", rbp_meta$id, ".bed")
rbp_meta$gene_name = gsub("-human", "", rbp_meta$gene_name)

# Keep only hg19 entries whose peak file exists in rbp_folder
rbp_meta = rbp_meta[rbp_meta$infile %in% rbpbs_list & rbp_meta$assembly == "hg19",]

# Attach the gene ID by gene name (one specific gene ID is excluded from the lookup)
rbp_meta = merge(x  = rbp_meta,
                 y  = geneinfo_gene_all[geneinfo_gene_all$gene_id != "ENSG00000280987.4_7",c("gene_name", "gene_id")],
                 by = "gene_name")

# Attach the gene-level iPSC-CVPC vs. heart differential expression results;
# RBPs without a differential expression entry are kept (all.x = TRUE)
rbp_meta = merge(x     = rbp_meta,
                 y     = diffexp[diffexp$type == "gene_tpm" & diffexp$tissue1 == "ipsc_cvpc" & diffexp$tissue2 == "heart", c("gene_id", "beta", "pval", "qval", "diffexp")],
                 by    = "gene_id",
                 all.x = TRUE)

# Intersect RBP binding sites with isoform positions

In [4]:
# Stack the first four columns of every eCLIP peak file listed in rbp_meta
rbp_data = as.data.frame(rbindlist(lapply(rbp_meta$infile, function(peak_file)
{
    peaks = fread(peak_file, sep = "\t", header = FALSE, data.table = FALSE)
    
    return(peaks[,1:4])
})), stringsAsFactors = FALSE)

colnames(rbp_data) = c("chrom", "from", "to", "id")

# Sort by chromosome, then start, then end
rbp_data = rbp_data[with(rbp_data, order(chrom, from, to)),]

# Write the combined peaks as a headerless BED file (absolute path, reused by the bedtools call)
rbp_bed = paste(getwd(), "pipeline/7.1.trans/rbp/input", "rbp.bed", sep = "/")

fwrite(rbp_data, rbp_bed, sep = "\t", col.names = FALSE, row.names = FALSE)


In [5]:
# Output file of the bedtools intersection
intersected_file = paste(getwd(), "pipeline/7.1.trans/rbp/input", "intersected.bed", sep = "/")

# bedtools intersect with -loj: the RBP peaks are file A, the isoform BED is file B,
# and the result is redirected to intersected_file
command          = paste(c("bedtools", "intersect", "-loj",
                           "-a", rbp_bed,
                           "-b", "/reference/private/Gencode.v34lift37/isoform.bed",
                           ">", intersected_file),
                         collapse = " ")

# The call is left commented out: the intersection is not re-run from this cell
#system(command)

In [7]:
# Load the bedtools output and keep column 8 (used as transcript ID)
# and column 4 (the name field of the RBP peak)
intersected           = fread(intersected_file, sep = "\t", header = FALSE, data.table = FALSE)
intersected           = intersected[,c(8,4)]
colnames(intersected) = c("transcript_id", "x")

# The RBP name is the part of the peak name before the first underscore.
# sub() is vectorised, so this avoids one strsplit() call per row and also
# copes with an empty name (which made the per-row version fail).
intersected$rbp_name  = sub("_.*$", "", intersected$x)
#intersected           = unique(intersected)


In [14]:
# Count the intersect rows per (RBP, transcript) pair; rows whose transcript ID is "."
# are dropped first. The count ends up in column "x".
isoform2rbp = aggregate(x ~ rbp_name + transcript_id, data = intersected[intersected$transcript_id != ".",], FUN = length)

In [17]:
# Save the RBP-to-isoform counts as a tab-separated file with a header
fwrite(isoform2rbp, "pipeline/7.1.trans/rbp/input/isoform2rbp.txt", sep = "\t", col.names = TRUE, row.names = FALSE)


# Get eQTL information for each RBP

In [24]:
# Restrict the gene-level eQTL table to genes present in the RBP metadata
eqtl_rbp = eqtl_genes[is.element(eqtl_genes$gene_id, rbp_meta$gene_id), ]

# Save the subset; the trans analysis reads it back by row index
fwrite(x         = eqtl_rbp,
       file      = "pipeline/7.1.trans/rbp/input/eqtl_rbp.txt",
       sep       = "\t",
       col.names = TRUE,
       row.names = FALSE)


In [48]:
# For every RBP gene with an eQTL, write the isoforms that overlap its binding sites
# (transcript ID and number of overlaps) to rbp2transcript/<gene_id>.txt
invisible(lapply(sort(unique(eqtl_rbp$gene_id)), function(rbp)
{
    # Gene name(s) annotated for this gene ID. Matching with %in% stays correct
    # when the lookup returns zero or several names, where `==` would silently
    # recycle the shorter vector.
    rbp_names = geneinfo_gene[geneinfo_gene$gene_id %in% rbp, "gene_name"]
    
    indata = isoform2rbp[isoform2rbp$rbp_name %in% rbp_names,]
    
    # Keep only isoforms listed in the isoform information table
    indata = indata[indata$transcript_id %in% geneinfo_isoform$transcript_id,]
    
    fwrite(indata[,c("transcript_id", "x")], paste("pipeline/7.1.trans/rbp/input/rbp2transcript", paste(rbp, "txt", sep = "."), sep = "/"), sep = "\t", col.names = TRUE, row.names = FALSE)
}))



# Run trans-analysis

In [113]:
# Write the shell wrapper for the RBP trans-eQTL analysis and submit it as an array job.
#
# Arguments:
#   n_tasks        number of array tasks (submitted as "-t 1-<n_tasks>:1");
#                  defaults to one task per row of the global eqtl_rbp table
#   max_concurrent value passed to qsub "-tc"; defaults to 500
#
# Returns (invisibly) the exit status reported by system(); 0 means the qsub
# command itself succeeded. A non-zero status raises a warning.
run_trans_eqtls_qsub = function(n_tasks = nrow(eqtl_rbp), max_concurrent = 500)
{
    # An empty table would otherwise produce the invalid task range "1-0:1"
    if (length(n_tasks) != 1 || is.na(n_tasks) || n_tasks < 1)
    {
        stop("run_trans_eqtls_qsub: n_tasks must be a single number >= 1 (is eqtl_rbp empty?)", call. = FALSE)
    }
    
    # Integers are pasted without scientific notation (e.g. 100000, not 1e+05)
    n_tasks = as.integer(n_tasks)
    
    sh_file = paste(getwd(), "pipeline/7.1.trans/script", paste("run.7.1", "rbp", "sh", sep = "."), sep = "/")
    
    # The wrapper sources the bash profile and runs the R script with the array
    # task ID, which the script receives through --taskid
    writeLines(text = c("#!/usr/bin/sh",
                        "source /frazer01/home/matteo/.bashrc",
                        paste("Rscript", paste(paste(getwd(), "script", "7.2.run_trans_eqtls_rbp.R", sep = "/"),"--taskid", "$SGE_TASK_ID"))
                       ), 
               con  = sh_file, 
               sep  = "\n\n")
    
    qsub_command = paste("qsub",
                         "-l", "short",
                         "-t", paste(1, "-", n_tasks, ":1", sep = ""),
                         "-tc", max_concurrent, 
                         "-o" , paste(getwd(), paste("logs/eqtls", "7.2.run_trans_eqtls_rbp", "out", sep = "."), sep = "/"),
                         "-e" , paste(getwd(), paste("logs/eqtls", "7.2.run_trans_eqtls_rbp", "err", sep = "."), sep = "/"),
                         sh_file
                        )
    
    message(qsub_command)
    
    # system() returns the exit status; report a failed submission explicitly
    # instead of silently carrying on
    status = system(qsub_command)
    
    if (status != 0)
    {
        warning("qsub submission failed with exit status ", status, "; the array job was not submitted", call. = FALSE)
    }
    
    invisible(status)
}

run_trans_eqtls_qsub() # run all genes


qsub -l short -t 1-240:1 -tc 500 -o /frazer01/projects/CARDIPS/analysis/cardiac_eqtls/logs/eqtls.7.2.run_trans_eqtls_rbp.out -e /frazer01/projects/CARDIPS/analysis/cardiac_eqtls/logs/eqtls.7.2.run_trans_eqtls_rbp.err /frazer01/projects/CARDIPS/analysis/cardiac_eqtls/pipeline/7.1.trans/script/run.7.1.rbp.sh

“system call failed: Cannot allocate memory”
“error in running command”


# Private: test trans analysis

In [49]:
# Row of eqtl_rbp to test interactively (stands in for the array task ID)
taskid = 2

In [88]:
# Normalized isoform expression table; below, its row names are iterated as isoform IDs
# and its column names are used as run IDs.
# NOTE(review): add_rownames is a project helper not shown here -- presumably it moves
# the first column into the row names; confirm in script/functions.R.
exp_all          = add_rownames(fread("pipeline/1.2.expression//use_isoform.normalized.txt", sep = "\t", header = TRUE , data.table = FALSE))


In [90]:
# Inputs for one RBP eQTL (row `taskid` of eqtl_rbp)

# Folder with the per-gene genotype tables
gt_folder           = paste(getwd(), "pipeline", "1.3.genotype"  , "tpm_gene"   , sep = "/")
# Annotation tables and the RBP eQTL table written earlier in this notebook
geneinfo_gene       = fread("pipeline/1.2.expression/gene_info.txt"                          , sep = "\t", header = TRUE , data.table = FALSE)
geneinfo_isoform    = fread("pipeline/1.2.expression/isoform_info.txt"                       , sep = "\t", header = TRUE , data.table = FALSE)
eqtl_rbp            = fread("pipeline/7.1.trans/rbp/input/eqtl_rbp.txt"                      , sep = "\t", header = TRUE , data.table = FALSE)
# The eQTL under test and the gene ID of its RBP
eqtl                = eqtl_rbp[taskid, c("gene_id", "gene_name", "type", "chrom", "pos", "ref", "alt", "id", "beta", "se", "pval", "fdr", "qval", "egene")]
rbp                 = eqtl$gene_id
# Isoforms overlapping this RBP's binding sites (file written per gene ID above)
totest              = fread(paste("pipeline/7.1.trans/rbp/input/rbp2transcript", paste(rbp, "txt", sep = "."), sep = "/"), sep = "\t", header = TRUE , data.table = FALSE)
# Names of the covariate columns used in the linear models (one per line in the file)
vars0               = readLines(paste("pipeline/3.2.eqtls", "vars", paste("cardiac_eqtls", "isoform", "txt", sep = "."), sep = "/"))
# Expression of the RBP gene itself; the models below use its "norm" column
exp_rbp             = add_rownames(fread(paste(paste(getwd(), "pipeline", "1.2.expression", "tpm_gene", sep = "/"), paste(rbp, "txt", sep = "."), sep = "/"), sep = "\t", header = TRUE , data.table = FALSE))
# Covariates (row names are matched to metadata$run below) and sample metadata
covariates          = add_rownames(fread("pipeline/3.1.covariates/covariates.txt", sep = "\t", header = TRUE , data.table = FALSE))
metadata            =              fread("pipeline/3.1.covariates/metadata.txt"  , sep = "\t", header = TRUE , data.table = FALSE)
# Genotype table for this gene: rows are indexed by variant ID, column names are used as wgs_id
gtdata              = add_rownames(fread(paste(gt_folder , paste("gt_data", rbp, "txt", sep = "."), sep = "/"), sep = "\t", header = TRUE , data.table = FALSE))
# One row per run: metadata joined with the covariates
covariates          =              merge(metadata, covariates, by.x = "run", by.y = "row.names")
# Add the genotype of the eQTL variant as column "gt"; no `by` is given, so the join
# uses the columns the two tables share (wgs_id, provided that "gt" is not already a column)
covariates          = merge(covariates, data.frame(wgs_id = colnames(gtdata), gt = as.numeric(gtdata[eqtl$id,])))

In [91]:
# First candidate isoform of this RBP, for interactive testing
isoform = totest[1, "transcript_id"]


In [107]:
# Regress normalized isoform expression on one predictor of interest plus covariates.
#
# Arguments:
#   x      name of the column of tolm to test (copied into a column called "totest")
#   name   suffix for the output column names
#   tolm   data.frame holding "norm_isoform", "norm", the column x and all vars0 columns
#   vars0  names of the covariate columns
#
# Returns a one-row data.frame with the estimate, standard error and p-value of
# "totest", named beta_<name>, se_<name> and pval_<name>.
calculate_trans = function(x, name, tolm, vars0)
{
    model_data        = tolm
    model_data$totest = model_data[,x]
    model_columns     = c("norm_isoform", "totest", "norm", vars0)
    
    # All selected columns except the response enter the model as predictors
    fit               = lm(norm_isoform ~ ., data = model_data[,model_columns])
    coefficient_table = as.data.frame(summary(fit)$coefficients)
    
    result            = coefficient_table["totest", c("Estimate", "Std. Error", "Pr(>|t|)")]
    colnames(result)  = paste(c("beta", "se", "pval"), name, sep = "_")
    
    return(result)
}

# For every isoform in exp_all, test its expression against (a) the genotype of the
# RBP eQTL variant and (b) the expression of the RBP, and stack the results.
out = as.data.frame(rbindlist(lapply(rownames(exp_all), function(transcript_id)
{
    # Expression of this isoform, one row per run
    isoform_expression         = data.frame(run = colnames(exp_all), norm_isoform = as.numeric(exp_all[transcript_id,]))
    
    # Covariates + isoform expression + RBP expression, joined on the run ID
    model_input                = merge(merge(covariates, isoform_expression, by = "run"),
                                       exp_rbp,
                                       by.x = "run", by.y = "row.names")
    
    result                     = cbind(eqtl[,c("gene_id", "gene_name", "type")],
                                       calculate_trans("gt"  , "gt" , model_input, vars0),
                                       calculate_trans("norm", "exp", model_input, vars0))
    result$transcript_id_trans = transcript_id
    
    return(result)
})), stringsAsFactors = FALSE)

# One result file per RBP gene ID
fwrite(out, paste("pipeline/7.1.trans/rbp/trans_eqtls_by_rbp", paste(rbp, "txt", sep = "."), sep = "/"), sep = "\t", col.names = TRUE, row.names = FALSE)



In [108]:
# Display the per-isoform association results for the tested RBP
out

gene_id,gene_name,type,beta_gt,se_gt,pval_gt,beta_exp,se_exp,pval_exp,transcript_id_trans
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
ENSG00000004478.8_4,FKBP4,0,0.009097402,0.09815838,0.92617872,-0.04602705,0.07474442,0.5381945,ENST00000373020.9_3
ENSG00000004478.8_4,FKBP4,0,-0.145590625,0.10858625,0.1803453,0.10842578,0.08277097,0.1905628,ENST00000614008.4_2
ENSG00000004478.8_4,FKBP4,0,0.156038611,0.11132992,0.16139969,-0.06932271,0.08487057,0.4142661,ENST00000496771.5_2
ENSG00000004478.8_4,FKBP4,0,0.051821544,0.09563707,0.58805805,0.05734375,0.07283661,0.4313273,ENST00000371588.9_2
ENSG00000004478.8_4,FKBP4,0,-0.182392447,0.09181826,0.04730068,-0.08345882,0.07007668,0.2339962,ENST00000371582.8_3
ENSG00000004478.8_4,FKBP4,0,-0.17579727,0.09914612,0.0765643,0.03404474,0.07563421,0.6527348,ENST00000367771.11_4
ENSG00000004478.8_4,FKBP4,0,-0.033159763,0.10191495,0.74498254,0.08132829,0.07760932,0.2949703,ENST00000367770.5_3
ENSG00000004478.8_4,FKBP4,0,0.124295003,0.0995408,0.21212087,-0.08752642,0.07586547,0.2489414,ENST00000367772.8_3
ENSG00000004478.8_4,FKBP4,0,0.110927507,0.08838833,0.20981943,0.02577185,0.06736618,0.7021379,ENST00000374005.8_3
ENSG00000004478.8_4,FKBP4,0,0.035656397,0.11061518,0.74726878,0.06417893,0.08423454,0.4463241,ENST00000399173.5_2


In [70]:
# Display the sample metadata table
metadata

run,assay_id,wgs_id,subject_id,subject_name,study,tissue,body_site
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
013eb23c-4ff0-493b-b106-9c8c9c2176d6,013eb23c-4ff0-493b-b106-9c8c9c2176d6,583d5465-f933-46d1-9fa4-307b76b97311,bd04a8cc-5d63-45bc-a2cc-91b0c7cb6e01,iPSCORE_2_4,ipscore,ipsc_cvpc,ipsc_cvpc
028df3e3-a041-4f91-9da7-517a7a33e271,028df3e3-a041-4f91-9da7-517a7a33e271,c08cc10c-44c1-4fa4-897a-636f5dfb058f,bd4643a6-9742-4a25-8029-a967cc2a84c7,iPSCORE_13_3,ipscore,ipsc_cvpc,ipsc_cvpc
030ec865-77e5-4e82-9b18-77a51e787655,030ec865-77e5-4e82-9b18-77a51e787655,b0efcf59-dea7-4f62-bf01-41bf08422001,e0793a72-9694-4711-a2c6-d2ff9071d05f,iPSCORE_8_4,ipscore,ipsc_cvpc,ipsc_cvpc
03df2c1b-bcc6-4fa3-9f47-d6f9e43be283,03df2c1b-bcc6-4fa3-9f47-d6f9e43be283,99dc8247-0416-44c2-854d-a2e31066a75b,508fb6c5-c9d5-4881-9638-cd4bd04d822f,iPSCORE_25_1,ipscore,ipsc_cvpc,ipsc_cvpc
067afe59-e81d-4176-800d-f1b111c1ec82,067afe59-e81d-4176-800d-f1b111c1ec82,78d8c099-465b-412d-bf88-e7ae54366e34,03346cae-a4f3-4481-92ff-d76db0c82468,iPSCORE_116_1,ipscore,ipsc_cvpc,ipsc_cvpc
0716c9fa-8153-477d-8e24-6abee17c9456,0716c9fa-8153-477d-8e24-6abee17c9456,c643713c-38f0-4514-b820-a566801a386a,bfee9a02-5ec3-4e11-a8e2-ebd47623caa0,iPSCORE_11_3,ipscore,ipsc_cvpc,ipsc_cvpc
0726b84c-c9ed-45a8-b50b-e1e8f484be8e,0726b84c-c9ed-45a8-b50b-e1e8f484be8e,c5b18bcb-bbb9-43ee-989a-717c14371257,f0173728-58cb-4d7d-b28d-02355b49b9ed,iPSCORE_13_1,ipscore,ipsc_cvpc,ipsc_cvpc
08f6fa0b-14ea-4ac6-85ec-67045c15d5ac,08f6fa0b-14ea-4ac6-85ec-67045c15d5ac,01fe3fd3-5392-45e6-93b7-448151107e1d,83a1b295-b907-4233-b0a8-df753b2cee16,iPSCORE_16_3,ipscore,ipsc_cvpc,ipsc_cvpc
0b111711-0130-44aa-b513-a308d13d1652,0b111711-0130-44aa-b513-a308d13d1652,90657981-484d-444a-9b2b-9b479f84c8c6,42e4583a-b1bc-4cc3-85d7-68662433b345,iPSCORE_29_1,ipscore,ipsc_cvpc,ipsc_cvpc
0c025698-b29b-4f3f-adab-c41cffc53911,0c025698-b29b-4f3f-adab-c41cffc53911,8ae6e66c-b3a2-436a-8a28-94356cd373fd,d4073658-c7fe-4f53-9e80-b95121a65475,iPSCORE_53_1,ipscore,ipsc_cvpc,ipsc_cvpc


In [16]:
# Distinct RBP gene names, and distinct (RBP, cell line) pairs, in the eCLIP metadata
str(unique(rbp_meta$gene_name))
str(unique(rbp_meta[,c("gene_name", "cell_line")]))


 chr [1:148] "RBM5" "FKBP4" "UPF1" "AKAP8L" "UTP18" "PTBP1" "MATR3" "AQR" ...
'data.frame':	222 obs. of  2 variables:
 $ gene_name: chr  "RBM5" "FKBP4" "UPF1" "UPF1" ...
 $ cell_line: chr  "HepG2" "HepG2" "HepG2" "K562" ...


In [27]:
# Inspect the columns and types of the merged eCLIP metadata
str(rbp_meta)

'data.frame':	674 obs. of  12 variables:
 $ gene_id  : chr  "ENSG00000003756.17_8" "ENSG00000003756.17_8" "ENSG00000003756.17_8" "ENSG00000004478.8_4" ...
 $ gene_name: chr  "RBM5" "RBM5" "RBM5" "FKBP4" ...
 $ id       : chr  "ENCFF980CBO" "ENCFF400CFS" "ENCFF663GBN" "ENCFF374GKD" ...
 $ filetype : chr  "narrowPeak" "narrowPeak" "narrowPeak" "narrowPeak" ...
 $ assembly : chr  "hg19" "hg19" "hg19" "hg19" ...
 $ assay    : chr  "eCLIP" "eCLIP" "eCLIP" "eCLIP" ...
 $ cell_line: chr  "HepG2" "HepG2" "HepG2" "HepG2" ...
 $ infile   : chr  "/frazer01/home/tarthur/projects/CARDIPS/rbp_analysis/sorted_peaks/ENCFF980CBO.bed" "/frazer01/home/tarthur/projects/CARDIPS/rbp_analysis/sorted_peaks/ENCFF400CFS.bed" "/frazer01/home/tarthur/projects/CARDIPS/rbp_analysis/sorted_peaks/ENCFF663GBN.bed" "/frazer01/home/tarthur/projects/CARDIPS/rbp_analysis/sorted_peaks/ENCFF374GKD.bed" ...
 $ beta     : num  -0.639 -0.639 -0.639 0.529 0.529 ...
 $ pval     : num  1.52e-06 1.52e-06 1.52e-06 5.20e-06 5.20e-06