In [1]:
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_eqtls")

source("script/packages.R"  )
source("script/input_data.R")
source("script/functions.R" )
source("script/colors.R"    )

suppressPackageStartupMessages(library(coloc))

In [15]:
# Directory used for the output files written further down.
outfolder = "pipeline/6.1.coloc_gwas/intersect_rbps"

# Only attempt the (non-recursive) creation when the directory is missing.
if (!dir.exists(outfolder)) {
    dir.create(outfolder, showWarnings = FALSE)
}

In [2]:
# Load the three input tables as plain data.frames (tab-separated, with header).
geneinfo = fread(
    "pipeline/1.2.expression/isoform_info.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)
coloc = fread(
    "pipeline/6.1.coloc_gwas/coloc.eqtls.isoform.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)
fm = fread(
    "pipeline/3.2.eqtls/eqtls_fine_map/cardiac_eqtls.isoform.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)


In [3]:
# One row per RBP: its name and the directory holding its peak files
# (<root>/<lower-case RBP name>/peakfiles).
rbp_names   = c("FMR1", "FUS", "FXR2", "HNRNPA1", "HNRNPL", "RBM5", "SRSF7")
peak_root   = "/frazer01/home/tarthur/projects/CARDIPS/fine_mapping/analysis/2020-10-08-fgwas"
rbps        = data.frame(rbp = rbp_names, folder = "")
rbps$folder = file.path(peak_root, tolower(rbps$rbp), "peakfiles")


# Write BED file with all fine mapped eVariants

In [16]:
# Attach isoform coordinates to the fine-mapped variants and keep only the
# variants located inside their own isoform (start <= pos <= end).
iso_coords = geneinfo[, c("transcript_id", "start", "end", "strand")]
fm2iso     = merge(fm, iso_coords)
in_isoform = fm2iso$pos >= fm2iso$start & fm2iso$pos <= fm2iso$end
fm2iso     = fm2iso[in_isoform, ]

# tr2id = transcript:type:variant, the label carried through the BED file.
fm2iso$tr2id = paste(fm2iso$transcript_id, fm2iso$type, fm2iso$id, sep = ":")
fm2iso$chrom = paste0("chr", fm2iso$chrom)
fm2iso       = fm2iso[order(fm2iso$chrom, fm2iso$pos), ]
fm2iso$score = 0

# Six-column BED; the start and end columns both hold `pos`.
fm_bed   = file.path(outfolder, "fm.bed")
bed_rows = unique(fm2iso[, c("chrom", "pos", "pos", "tr2id", "score", "strand")])

fwrite(bed_rows, fm_bed, sep = "\t", col.names = FALSE, row.names = FALSE)

# Intersect:
- fine mapped eVariants with RBP loci (eCLIP)
- coloc with GWAS (by eIsoform or by PPA from coloc)
- eVariants that affect RBP motifs (FIMO): all eVariants that coloc?
- find examples (hopefully for some RBPs that are enriched for GWAS traits in FGWAS)

## intersect fine mapped eVariants with RBP loci (eCLIP)

In [21]:
# Run `bedtools intersect -wa` between the variant BED file and every file in
# one RBP's peak folder, and return column 4 (the tr2id label) of the hits.
#   rbp  : RBP name, matched against rbps$rbp
#   rbps : table with columns `rbp` and `folder`
#   fm   : kept in the signature for existing callers; not used in the body
# The BED path is read from the global `fm_bed`.
intersect_rbp = function(rbp, rbps, fm)
{
    peak_glob = paste(rbps[rbps$rbp == rbp, "folder"], "*", sep = "/")
    pieces    = c("bedtools", "intersect", "-wa", "-a", fm_bed, "-b", peak_glob, "|", "cut", "-f", 4)

    system(paste(pieces, collapse = " "), intern = TRUE)
}

# One character vector of overlapping tr2id labels per RBP, named by RBP.
overlap_list = setNames(lapply(rbps$rbp, intersect_rbp, rbps = rbps, fm = fm), rbps$rbp)

In [31]:
# Working copy of the variant table, without the BED placeholder score.
intdata       = fm2iso
intdata$score = NULL

# eclip.<RBP>: TRUE when the variant label is among that RBP's overlaps.
for(rbp in rbps$rbp)
{
    flag_col            = paste("eclip", rbp, sep = ".")
    intdata[, flag_col] = intdata$tr2id %in% overlap_list[[rbp]]
}

# eclip: TRUE when the variant overlaps a peak of any RBP.
intdata$eclip = intdata$tr2id %in% unique(unlist(overlap_list))


## intersect with coloc

In [86]:
# Pair every eCLIP-overlapping variant with the colocalization results of its
# isoform/type: one row per variant x trait.
intdata$tr2type = paste(intdata$transcript_id, intdata$type)

int2coloc = intdata[intdata$eclip, c("transcript_id", "type", "id", "pp")]
names(int2coloc)[names(int2coloc) == "pp"] = "pp_fm"

coloc_cols = c("transcript_id", "type", "trait", "PP.H4.abf")
int2coloc  = merge(int2coloc, coloc[, coloc_cols], by = c("transcript_id", "type"))
names(int2coloc)[names(int2coloc) == "PP.H4.abf"] = "pp_coloc_gene"
# Optional filter, currently disabled:
#int2coloc               = int2coloc[ int2coloc$pp_coloc_gene >= 0.5,]

In [53]:
# For every transcript in int2coloc, read its per-SNP colocalization file
# (snp.<transcript_id>.txt) and attach pp_snp to the matching
# transcript/type/variant/trait rows.
snp_keys   = c("transcript_id", "type", "id", "trait")
snp_tables = lapply(sort(unique(int2coloc$transcript_id)), function(transcript)
{
    snp_file = file.path("pipeline/6.1.coloc_gwas/coloc.isoform", paste("snp", transcript, "txt", sep = "."))
    per_snp  = fread(snp_file, sep = "\t", header = TRUE, data.table = FALSE)

    merge(int2coloc, per_snp[, c(snp_keys, "pp_snp")], by = snp_keys)
})
int2coloc_var = as.data.frame(rbindlist(snp_tables), stringsAsFactors = FALSE)

fwrite(int2coloc_var, file.path(outfolder, "int2coloc_var.txt"), sep = "\t", col.names = TRUE, row.names = FALSE)

In [60]:
# Flag variants whose isoform colocalizes with a GWAS trait (PP.H4 >= 0.5).
#   coloc_gene.<trait> : per-trait flag
#   coloc_gene         : TRUE when any per-trait flag is TRUE
# The summary column is created first so it keeps its position ahead of the
# per-trait columns.
intdata  $coloc_gene = FALSE
int2coloc$tr2id      = paste(int2coloc$transcript_id, int2coloc$type, int2coloc$id, sep = ":")

gene_flag_cols = character(0)

for(trait in sort(unique(int2coloc$trait)))
{
    flag_col            = paste("coloc_gene", trait, sep = ".")
    hits                = int2coloc[int2coloc$trait == trait & int2coloc$pp_coloc_gene >= 0.5, "tr2id"]
    intdata[, flag_col] = intdata$tr2id %in% hits
    gene_flag_cols      = c(gene_flag_cols, flag_col)
}

# drop = FALSE keeps a data.frame even with a single trait, so rowSums() works
# for zero, one or many traits.
intdata$coloc_gene = rowSums(intdata[, gene_flag_cols, drop = FALSE]) > 0

In [69]:
# Flag variants that carry SNP-level colocalization support (pp_snp >= 0.01).
#   coloc_snp.<trait> : per-trait flag
#   coloc_snp         : TRUE when any per-trait flag is TRUE
# The summary column is created first so it keeps its position ahead of the
# per-trait columns.
intdata      $coloc_snp = FALSE
int2coloc_var$tr2id     = paste(int2coloc_var$transcript_id, int2coloc_var$type, int2coloc_var$id, sep = ":")

snp_flag_cols = character(0)

for(trait in sort(unique(int2coloc_var$trait)))
{
    flag_col            = paste("coloc_snp", trait, sep = ".")
    hits                = int2coloc_var[int2coloc_var$trait == trait & int2coloc_var$pp_snp >= 0.01, "tr2id"]
    intdata[, flag_col] = intdata$tr2id %in% hits
    snp_flag_cols       = c(snp_flag_cols, flag_col)
}

# Summarise over exactly the columns created above (traits of int2coloc_var).
# Using the trait list of int2coloc here would select columns that were never
# created whenever the two trait sets differ.
intdata$coloc_snp = rowSums(intdata[, snp_flag_cols, drop = FALSE]) > 0

In [75]:
# Variants supported by all three lines of evidence: eCLIP overlap,
# isoform-level colocalization and SNP-level colocalization.
top_cols = c("transcript_id", "type", "gene_name", "id", "pp")
is_top   = intdata$eclip & intdata$coloc_gene & intdata$coloc_snp
tops     = intdata[is_top, ]

fwrite(int2coloc       , file.path(outfolder, "int2coloc.txt"), sep = "\t", col.names = TRUE, row.names = FALSE)
fwrite(intdata         , file.path(outfolder, "intdata.txt"  ), sep = "\t", col.names = TRUE, row.names = FALSE)
fwrite(tops[, top_cols], file.path(outfolder, "tops.txt"     ), sep = "\t", col.names = TRUE, row.names = FALSE)

# Display the selection in the notebook.
tops[, top_cols]

Unnamed: 0_level_0,transcript_id,type,gene_name,id,pp
Unnamed: 0_level_1,<chr>,<int>,<chr>,<chr>,<dbl>
58451,ENST00000472194.6_1,0,ATAD3B,VAR_1_1415063_T_G,0.43642791
58452,ENST00000472194.6_1,0,ATAD3B,VAR_1_1415476_G_C,0.44240107
62109,ENST00000480991.1_2,0,SLC35E2B,VAR_1_1596500_C_T,0.83906331
102009,ENST00000611123.1_2,0,SLC35E2B,VAR_1_1596500_C_T,0.47855651
102013,ENST00000611123.1_2,1,SLC35E2B,VAR_1_1596500_C_T,0.01638872
103095,ENST00000617444.5_3,0,SLC35E2B,VAR_1_1596500_C_T,0.01864093
102015,ENST00000611123.1_2,2,SLC35E2B,VAR_1_1602587_C_T,0.47960877
69003,ENST00000496676.5_1,0,RNF207,VAR_1_6278414_A_G,0.33245518
30465,ENST00000377705.6_3,0,NOL9,VAR_1_6583270_C_G,0.02425345
30470,ENST00000377705.6_3,0,NOL9,VAR_1_6589641_C_T,0.01234203


In [142]:
# Second working table: same eCLIP flags as before, later extended with
# posterior probabilities instead of TRUE/FALSE colocalization flags.
intdata_pp       = fm2iso
intdata_pp$score = NULL

# eclip.<RBP>: TRUE when the variant label is among that RBP's overlaps.
for(rbp in rbps$rbp)
{
    flag_col               = paste("eclip", rbp, sep = ".")
    intdata_pp[, flag_col] = intdata_pp$tr2id %in% overlap_list[[rbp]]
}

# eclip: TRUE when the variant overlaps a peak of any RBP.
intdata_pp$eclip = intdata_pp$tr2id %in% unique(unlist(overlap_list))

# Column names before any colocalization column is added.
mycols = colnames(intdata_pp)

In [143]:
# Add one coloc_gene.<trait> column per trait holding the isoform-level
# colocalization PP, then summarise them in coloc_gene as the highest PP per
# row (0 when the row has no value for any trait).
# The summary column is created first so it keeps its position ahead of the
# per-trait columns; its content is replaced below.
intdata_pp$coloc_gene = FALSE
int2coloc $tr2id      = paste(int2coloc$transcript_id, int2coloc$type, int2coloc$id, sep = ":")

for(trait in sort(unique(int2coloc$trait)))
{
    tomerge           = int2coloc[int2coloc$trait == trait, c("tr2id", "pp_coloc_gene")]
    colnames(tomerge) = c("tr2id", paste("coloc_gene", trait, sep = "."))
    intdata_pp        = merge(intdata_pp, tomerge, all.x = TRUE)
}

# Vectorised row maximum: avoids indexing the data.frame one row at a time and
# also works for an empty table or when no trait column exists.
gene_pp_cols = grep("coloc_gene\\.", colnames(intdata_pp), value = TRUE)

if(length(gene_pp_cols) > 0)
{
    gene_pp_list                   = unname(as.list(intdata_pp[, gene_pp_cols, drop = FALSE]))
    best_gene_pp                   = do.call(pmax, c(gene_pp_list, list(na.rm = TRUE)))
    best_gene_pp[is.na(best_gene_pp)] = 0   # rows without any value get 0
}else
{
    best_gene_pp = rep(0, nrow(intdata_pp))
}

intdata_pp$coloc_gene = best_gene_pp



In [144]:
# Add one coloc_snp.<trait> column per trait holding the SNP-level
# colocalization PP (highest pp_snp per tr2id), then summarise them in
# coloc_snp as the highest PP per row (0 when the row has no value at all).
# The summary column is created first so it keeps its position ahead of the
# per-trait columns; its content is replaced below.
intdata_pp   $coloc_snp = FALSE
int2coloc_var$tr2id     = paste(int2coloc_var$transcript_id, int2coloc_var$type, int2coloc_var$id, sep = ":")

for(trait in sort(unique(int2coloc_var$trait)))
{
    tomerge           = int2coloc_var[int2coloc_var$trait == trait, c("tr2id", "pp_snp")]
    tomerge           = aggregate(pp_snp ~ tr2id, data = tomerge, FUN = max)
    colnames(tomerge) = c("tr2id", paste("coloc_snp", trait, sep = "."))
    intdata_pp        = merge(intdata_pp, tomerge, all.x = TRUE)
}

# Vectorised row maximum: avoids indexing the data.frame one row at a time and
# also works for an empty table or when no trait column exists.
snp_pp_cols = grep("coloc_snp\\.", colnames(intdata_pp), value = TRUE)

if(length(snp_pp_cols) > 0)
{
    snp_pp_list                     = unname(as.list(intdata_pp[, snp_pp_cols, drop = FALSE]))
    best_snp_pp                     = do.call(pmax, c(snp_pp_list, list(na.rm = TRUE)))
    best_snp_pp[is.na(best_snp_pp)] = 0   # rows without any value get 0
}else
{
    best_snp_pp = rep(0, nrow(intdata_pp))
}

intdata_pp$coloc_snp = best_snp_pp

fwrite(intdata_pp  , paste(outfolder, "intdata_pp.txt"  , sep = "/"), sep = "\t", col.names = TRUE, row.names = FALSE)

fwrite(intdata_pp[,c("transcript_id", "type", "gene_name", "id", "eclip", "pp", "coloc_gene", "coloc_snp")], paste(outfolder, "tops_n.txt"  , sep = "/"), sep = "\t", col.names = TRUE, row.names = FALSE)


biomarkers-30680-both_sexes-irnt 32682

biomarkers-30690-both_sexes-irnt 32682

biomarkers-30760-both_sexes-irnt 32682

biomarkers-30780-both_sexes-irnt 32682

biomarkers-30790-both_sexes-irnt 32682

biomarkers-30870-both_sexes-irnt 32682

categorical-20002-both_sexes-1074 32682

categorical-20002-both_sexes-1075 32682

categorical-20002-both_sexes-1081 32682

categorical-6150-both_sexes-100 32682

categorical-6150-both_sexes-2 32682

categorical-6150-both_sexes-3 32682

categorical-6150-both_sexes-4 32682

categorical-6153-both_sexes-1 32682

categorical-6153-both_sexes-2 32682

categorical-6177-both_sexes-1 32682

categorical-6177-both_sexes-100 32682

categorical-6177-both_sexes-2 32682

continuous-102-both_sexes-irnt 32682

continuous-12336-both_sexes-irnt 32682

continuous-12340-both_sexes-irnt 32682

continuous-21001-both_sexes-irnt 32682

continuous-3894-both_sexes-irnt 32682

continuous-4079-both_sexes-irnt 32682

continuous-4080-both_sexes-irnt 32682

continuous-5983-both_sexe

In [149]:
# Combined ranking score: product of (1 + value) over the fine-mapping PP, the
# isoform-level coloc PP and the SNP-level coloc PP.
intdata_pp$score = with(intdata_pp, (1 + pp) * (1 + coloc_gene) * (1 + coloc_snp))

# Rewrite tops_n.txt, now including tr2id and the score.
tops_n_cols = c("tr2id", "transcript_id", "type", "gene_name", "id", "eclip", "pp", "coloc_gene", "coloc_snp", "score")
fwrite(intdata_pp[, tops_n_cols], file.path(outfolder, "tops_n.txt"), sep = "\t", col.names = TRUE, row.names = FALSE)


# Get FIMO data

In [173]:
# Build a BED file with a 21-bp window around every single-nucleotide variant
# present in intdata_pp, on the strand of its isoform.
variant_cols = c("transcript_id", "chrom", "pos", "ref", "alt", "id")
myvars       = merge(unique(fm[fm$id %in% intdata_pp$id, variant_cols]), geneinfo[,c("transcript_id", "strand")])
myvars       = unique(myvars[,c(variant_cols, "strand")])
myvars$chrom = paste("chr", myvars$chrom, sep = "")
myvars       = myvars[order(myvars$chrom, myvars$pos),]
# Only single-base substitutions can be swapped into the sequence later on.
myvars       = myvars[myvars$ref %in% c("A", "C", "G", "T") & myvars$alt %in% c("A", "C", "G", "T"),]
# BED interval [pos - 11, pos + 10): 21 bases with the variant at index 11.
# Kept as integers: a double such as 100000 is rendered as "1e+05" both by
# paste() and in the BED file, which no longer matches the region names
# written by bedtools.
myvars$from  = as.integer(myvars$pos - 11)
myvars$to    = as.integer(myvars$pos + 10)
myvars$score = 0
# Same "chrom:from-to(strand)" layout that the later merge on `coord` expects
# in the first column of the getfasta output.
myvars$coord = paste(myvars$chrom, ":", myvars$from, "-", myvars$to, "(", myvars$strand, ")", sep = "")
mybed        = paste(outfolder, "variants2fasta.bed", sep = "/")
myfasta      = paste(outfolder, "variants2fasta.txt", sep = "/")

fwrite(unique(myvars[,c("chrom", "from", "to", "id", "score", "strand")]), mybed, sep = "\t", col.names = FALSE, row.names = FALSE)

In [184]:
# Extract the strand-aware reference sequence of every variant window and build
# the matching alternative-allele sequence.
command = paste("bedtools", "getfasta", "-tab", "-s", "-fi", "/frazer01/reference/public/hg19/hg19.fa", "-bed", mybed, ">", myfasta)

# Stop here when bedtools fails instead of reading a missing or partial file.
status = system(command)
if(status != 0){stop("bedtools getfasta failed with exit status ", status)}

seqdata            = fread(myfasta, sep = "\t", header = FALSE, data.table = FALSE)
colnames(seqdata)  = c("coord", "sequence")
seqdata            = merge(myvars, seqdata)
seqdata$sequence   = toupper(seqdata$sequence)

# Alleles expressed on the isoform strand: complemented for "-" strand rows.
seqdata$ref_strand = seqdata$ref
seqdata$alt_strand = seqdata$alt
on_minus           = which(seqdata$strand == "-")
seqdata$ref_strand[on_minus] = chartr("ATCG", "TAGC", seqdata$ref[on_minus])
seqdata$alt_strand[on_minus] = chartr("ATCG", "TAGC", seqdata$alt[on_minus])

# Replace the middle base (index 11 of 21) with the alternative allele.
# Vectorised over all rows; yields an empty column for an empty table.
seqdata$sequence_alt = paste0(substr(seqdata$sequence, start = 1 , stop = 10),
                              seqdata$alt_strand,
                              substr(seqdata$sequence, start = 12, stop = 21))

In [193]:
# Write one FASTA file with two records per variant window:
#   ><coord>.ref  reference sequence
#   ><coord>.alt  sequence carrying the alternative allele
tofasta = unique(seqdata[,c("coord", "sequence", "sequence_alt")])
# sprintf() is vectorised and returns character(0) for an empty table, so no
# placeholder ">NA" records are produced when there is nothing to write.
tofasta = sprintf(">%s.ref\n%s\n>%s.alt\n%s", tofasta$coord, tofasta$sequence, tofasta$coord, tofasta$sequence_alt)

fasta2fimo      = paste(outfolder, "fasta2fimo.fa"        , sep = "/")
fimo_motifs     = paste(outfolder, "motifs.all.homer.meme", sep = "/")
writeLines(tofasta, fasta2fimo, sep = "\n")

In [216]:
# FIMO command scanning the ref/alt sequences with all motifs; results go to
# fimo_out through the shell redirect.
# The former "--o <folder>" argument referred to `fimo_out_folder`, which is
# not defined anywhere in this notebook. With --text the results are sent to
# standard output, so no output folder is needed.
# NOTE(review): this cell only builds the command; it has to be executed (for
# example with system(command)) before fimo_out is read.
fimo_out = paste(outfolder, "fimo_results.txt", sep = "/")
command  = paste("/frazer01/software/meme_4.12.0/bin/fimo", "--norc", "--text", "--thresh", 1, fimo_motifs, fasta2fimo, ">", fimo_out)


In [222]:
# Load the FIMO hits and keep those whose match spans index 11, the variant
# position inside each 21-bp window.
fimo           = fread(fimo_out, sep = "\t", header = TRUE, data.table = FALSE)
spans_variant  = fimo$start <= 11 & fimo$stop >= 11
fimo           = fimo[spans_variant, ]

# Clean the header: strip the leading "# " and remove "-" from column names.
names(fimo)    = gsub("-", "", gsub("# ", "", names(fimo)))

# Sequence names are "<coord>.ref" / "<coord>.alt": keep the text before the
# first dot as coord, and record which allele the record belongs to.
fimo$coord     = sub("\\..*$", "", fimo$sequence_name)
fimo$is_ref    = grepl("ref$", fimo$sequence_name)

In [224]:
# Pair the reference and alternative hit of the same motif at the same window
# position; score and matched_sequence get the suffixes _ref and _alt.
delta_keys = c("motif_id", "motif_alt_id", "coord", "start", "stop")
delta_cols = c(delta_keys, "score", "matched_sequence")
ref_hits   = fimo[ fimo$is_ref, delta_cols]
alt_hits   = fimo[!fimo$is_ref, delta_cols]

fimo2delta = merge(ref_hits, alt_hits, by = delta_keys, suffixes = c("_ref", "_alt"))

In [229]:
# Higher of the two allele scores for each paired hit.
fimo2delta$max = pmax(fimo2delta$score_ref, fimo2delta$score_alt)

In [244]:
# Keep, for every (coord, motif_id, motif_alt_id) combination that has at
# least one hit with max > 0, the single hit with the highest `max`; ties go
# to the hit that comes first in fimo2delta.
# Done in one ordered pass instead of rescanning the whole table once per
# combination, which was quadratic in the number of hits. The per-1000
# progress messages of the old loop are therefore gone.
has_keys   = !is.na(fimo2delta$coord) & !is.na(fimo2delta$motif_id) & !is.na(fimo2delta$motif_alt_id)
candidates = fimo2delta[which(fimo2delta$max > 0 & has_keys), ]
group_key  = paste(candidates$coord, candidates$motif_id, candidates$motif_alt_id, sep = "\t")

# Groups stay in order of first appearance; within a group the best hit comes
# first, and the original row order breaks ties.
group_rank = match(group_key, unique(group_key))
best_first = order(group_rank, -candidates$max, seq_along(group_key))
fimo2motif = candidates[best_first, ]
fimo2motif = fimo2motif[!duplicated(group_key[best_first]), ]
rownames(fimo2motif) = NULL

# delta > 0: the alternative allele scores higher than the reference allele.
fimo2motif$delta = fimo2motif$score_alt - fimo2motif$score_ref
fwrite(fimo2motif, paste(outfolder, "fimo2motif.txt", sep = "/"), sep = "\t", col.names = TRUE, row.names = FALSE)


1000

2000

3000

4000

5000

6000

7000

8000

9000

10000

11000

12000

13000

14000

15000

16000

17000

18000

19000

20000

21000

22000

23000

24000

25000

26000

27000

28000

29000

30000

31000

32000

33000

34000

35000

36000

37000

38000

39000

40000

41000

42000

43000

44000

45000

46000

47000

48000

49000

50000

51000

52000

53000

54000

55000

56000

57000

58000

59000

60000

61000

62000

63000

64000

65000

66000

67000

68000

69000

70000

71000

72000

73000

74000

75000

76000

77000

78000

79000

80000

81000

82000

83000

84000

85000

86000

87000

88000

89000

90000

91000



In [261]:
# Add, for every motif, its best score (score.<motif>) and the alt - ref score
# difference (delta.<motif>) to the variant table, joined on the window
# coordinate "chrom:from-to(strand)".
intdata_pp2       = intdata_pp
# Integer window bounds: a double such as 100000 would be pasted as "1e+05"
# and never match a plainly formatted coordinate.
intdata_pp2$coord = paste(intdata_pp2$chrom, ":", as.integer(intdata_pp2$pos - 11), "-", as.integer(intdata_pp2$pos + 10), "(", intdata_pp2$strand, ")", sep = "")

for(motif in sort(unique(fimo2motif$motif_alt_id)))
{
    tomerge           = fimo2motif[fimo2motif$motif_alt_id == motif, c("coord", "max", "delta")]
    #tomerge           = aggregate(. ~ coord, data = tomerge, FUN = max)
    colnames(tomerge) = c("coord", paste(c("score", "delta"), motif, sep = "."))
    intdata_pp2        = merge(intdata_pp2, tomerge, all.x = TRUE)
}

# Full table, plus a reduced table with the summary columns and every
# eclip.*, score.* and delta.* column.
summary_cols = c("transcript_id", "type", "gene_name", "id", "eclip", "pp", "coloc_gene", "coloc_snp", "score")
keep_cols    = colnames(intdata_pp2) %in% summary_cols | grepl("^eclip\\.", colnames(intdata_pp2)) | grepl("^score\\.", colnames(intdata_pp2)) | grepl("^delta\\.", colnames(intdata_pp2))

fwrite(intdata_pp2, paste(outfolder, "intdata_pp_with_motifs.txt"  , sep = "/"), sep = "\t", col.names = TRUE, row.names = FALSE)
fwrite(intdata_pp2[, keep_cols], paste(outfolder, "tops_n_with_motifs.txt"  , sep = "/"), sep = "\t", col.names = TRUE, row.names = FALSE)


In [270]:
# Isoform interaction results, restricted to rows whose `cell` column is TRUE.
int_isoforms = fread(
    "pipeline/3.2.eqtls/eqtls/cardiac_eqtls.isoform.interactions.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)
int_isoforms = int_isoforms[int_isoforms$cell == TRUE, ]

In [271]:
# cell.<interaction>: TRUE when the isoform/type pair of the row is listed for
# that interaction in int_isoforms.
intdata_pp2$tr2type = paste(intdata_pp2$transcript_id, intdata_pp2$type)

for(cell in sort(unique(int_isoforms$interaction)))
{
    cell_rows = int_isoforms[int_isoforms$interaction == cell, c("transcript_id", "type")]
    cell_keys = paste(cell_rows$transcript_id, cell_rows$type)

    intdata_pp2[, paste("cell", cell, sep = ".")] = intdata_pp2$tr2type %in% cell_keys
}



In [272]:
# Rewrite both tables now that the cell.* columns exist: the full table, and a
# reduced one with the summary columns plus every eclip.*, score.* and delta.*
# column.
summary_cols = c("transcript_id", "type", "gene_name", "id", "eclip", "pp", "coloc_gene", "coloc_snp", "score")
keep_cols    = colnames(intdata_pp2) %in% summary_cols | grepl("^(eclip|score|delta)\\.", colnames(intdata_pp2))

fwrite(intdata_pp2             , file.path(outfolder, "intdata_pp_with_motifs.txt"), sep = "\t", col.names = TRUE, row.names = FALSE)
fwrite(intdata_pp2[, keep_cols], file.path(outfolder, "tops_n_with_motifs.txt"    ), sep = "\t", col.names = TRUE, row.names = FALSE)


In [274]:
# Overwrite the reduced table with a narrower view: summary columns, cell.*
# flags and the columns whose name contains _HNRNPL_ or _RBM5_.
summary_cols = c("transcript_id", "type", "gene_name", "id", "eclip", "pp", "coloc_gene", "coloc_snp", "score")
keep_cols    = colnames(intdata_pp2) %in% summary_cols | grepl("^cell\\.|_HNRNPL_|_RBM5_", colnames(intdata_pp2))

fwrite(intdata_pp2[, keep_cols], file.path(outfolder, "tops_n_with_motifs.txt"), sep = "\t", col.names = TRUE, row.names = FALSE)


## Try to filter
- ENST00000565571.5_1:0:VAR_16_89627460_T_G: iPSC-CVPC, overlaps eCLIP, but does not alter motifs
- ENST00000567815.5_3:0:VAR_16_89627460_T_G: adult, alters FMR1 (not strong)
- ENST00000563270.5_1:0:VAR_16_89627460_T_G: adult, alters FMR1 (not strong)


- ENST00000336783.8_1:0:VAR_16_28844284_A_G
- ENST00000513239.5_1:0:VAR_4_526848_A_G
- ENST00000513239.5_1:0:VAR_4_527176_G_C
- ENST00000523267.1_2:0:VAR_5_179264731_T_C



In [309]:
# Build the sets of tr2id labels that pass each individual filter and report
# their sizes.
intdata_pp3 = intdata_pp2

# Posterior-probability cut-offs.
pp.fm        = intdata_pp3[ intdata_pp3$pp         >= 0.1, "tr2id"]
pp.gwas.gene = intdata_pp3[ intdata_pp3$coloc_gene >= 0.5, "tr2id"]
pp.gwas.snp  = intdata_pp3[ intdata_pp3$coloc_snp  >= 0.1, "tr2id"]

# NOTE(review): these rely on columns named cell.ipsc_cvpc and cell.adult being
# present; a missing column silently gives an empty set. Confirm the names.
cell.ipsc_cvpc = intdata_pp3[ intdata_pp3$cell.ipsc_cvpc == TRUE, "tr2id"]
cell.adult     = intdata_pp3[ intdata_pp3$cell.adult     == TRUE, "tr2id"]

# Row-wise maximum of f(value) over `cols`, with 0 as floor and NA ignored.
# Works for zero, one or many columns.
row_max0 = function(df, cols, f = identity)
{
    best = rep(0, nrow(df))
    for(col in cols){best = pmax(best, f(df[[col]]), na.rm = TRUE)}
    return(best)
}

# Motif columns are looked up on the table they are read from.
pp3_cols          = colnames(intdata_pp3)
hnrnpl_score_cols = pp3_cols[grepl("^score\\.", pp3_cols) & grepl("_HNRNPL_", pp3_cols)]
hnrnpl_delta_cols = pp3_cols[grepl("^delta\\.", pp3_cols) & grepl("_HNRNPL_", pp3_cols)]

# eCLIP overlap plus a motif score >= 4 (".score") or an absolute score change
# >= 4 (".delta"). The RBM5 delta now uses abs() like the HNRNPL rule, so
# changes that lower the score are counted as well.
rbp.hnrnpl.score = intdata_pp3[ intdata_pp3$eclip.HNRNPL == TRUE & row_max0(intdata_pp3, hnrnpl_score_cols     ) >= 4, "tr2id"]
rbp.hnrnpl.delta = intdata_pp3[ intdata_pp3$eclip.HNRNPL == TRUE & row_max0(intdata_pp3, hnrnpl_delta_cols, abs) >= 4, "tr2id"]
rbp.rbm5.score   = intdata_pp3[ intdata_pp3$eclip.RBM5   == TRUE & is.na(intdata_pp3$score.RBPmap_RBM5_garggwr_human_PSSM) == FALSE &     intdata_pp3$score.RBPmap_RBM5_garggwr_human_PSSM  >= 4, "tr2id"]
rbp.rbm5.delta   = intdata_pp3[ intdata_pp3$eclip.RBM5   == TRUE & is.na(intdata_pp3$delta.RBPmap_RBM5_garggwr_human_PSSM) == FALSE & abs(intdata_pp3$delta.RBPmap_RBM5_garggwr_human_PSSM) >= 4, "tr2id"]

length(pp.fm           )
length(pp.gwas.gene    )
length(pp.gwas.snp     )
length(cell.ipsc_cvpc  )
length(cell.adult      )
length(rbp.hnrnpl.score)
length(rbp.hnrnpl.delta)
length(rbp.rbm5.score  )
length(rbp.rbm5.delta  )



In [318]:
# Assemble one row per variant/isoform with TRUE/FALSE evidence flags and a
# score counting how many flags are TRUE, then write it out.
toint = unique(intdata_pp3[,c("tr2id", "transcript_id", "type", "gene_name", "gene_id", "id", "pp", "coloc_gene", "coloc_snp")])
toint       $tr2type = paste(toint       $transcript_id, toint       $type, sep = ":")
int_isoforms$tr2type = paste(int_isoforms$transcript_id, int_isoforms$type, sep = ":")

# Posterior-probability filters (label sets computed beforehand).
toint$pp.fm        = toint$tr2id %in% pp.fm
toint$pp.gwas.gene = toint$tr2id %in% pp.gwas.gene
toint$pp.gwas.snp  = toint$tr2id %in% pp.gwas.snp

# cell.<interaction>: isoform/type pair listed for that interaction; the
# "cibersort.regular." prefix is stripped from the column name.
for(cell in sort(unique(int_isoforms$interaction)))
{
    cell_col = paste("cell", sub("^cibersort.regular.", "", cell), sep = ".")
    this     = unique(int_isoforms[int_isoforms$interaction == cell, "tr2type"])

    toint[, cell_col] = toint$tr2type %in% this
}

# Row-wise maximum of f(value) over `cols`, with 0 as floor and NA ignored.
# Works for zero, one or many columns, so no special-casing by column count
# is needed.
row_max0 = function(df, cols, f = identity)
{
    best = rep(0, nrow(df))
    for(col in cols){best = pmax(best, f(df[[col]]), na.rm = TRUE)}
    return(best)
}

# rbp.<RBP>: eCLIP overlap for that RBP AND a motif score >= 4 AND an absolute
# score change >= 3 on any of its motifs.
# Column names are taken by name from intdata_pp3 itself. Taking colnames() of
# a column subset returns NULL when exactly one column matches, which made the
# single-motif case unusable.
pp3_cols = colnames(intdata_pp3)

for(rbp in rbps$rbp)
{
    is_rbp_col   = grepl(paste("_", rbp, "_", sep = ""), pp3_cols)
    mycols_score = pp3_cols[grepl("^score\\.", pp3_cols) & is_rbp_col]
    mycols_delta = pp3_cols[grepl("^delta\\.", pp3_cols) & is_rbp_col]

    totest = intdata_pp3[intdata_pp3[, paste("eclip", rbp, sep = ".")] == TRUE &
                         row_max0(intdata_pp3, mycols_score     ) >= 4 &
                         row_max0(intdata_pp3, mycols_delta, abs) >= 3, ]

    toint[, paste("rbp", rbp, sep = ".")] = toint$tr2id %in% totest$tr2id
}

# score = number of TRUE flags; drop = FALSE keeps rowSums() valid even when a
# selection has a single column (or none).
rbp_flag_cols  = paste("rbp", rbps$rbp, sep = ".")
is_flag_col    = colnames(toint) %in% c("pp.fm", "pp.gwas.gene", "pp.gwas.snp", rbp_flag_cols) | grepl("^cell\\.", colnames(toint))
toint$score    = rowSums(toint[, is_flag_col, drop = FALSE])
toint$is_cell  = rowSums(toint[, grepl("^cell\\.", colnames(toint)), drop = FALSE]) > 0
toint$is_rbp   = rowSums(toint[, rbp_flag_cols, drop = FALSE]) > 0

fwrite(toint, paste(outfolder, "filtered_for_examples.txt", sep = "/"), sep = "\t", col.names = TRUE, row.names = FALSE)


In [316]:
# Print the structure (columns and types) of the interaction table and of the
# final flag table.
str(int_isoforms)
str(toint)

'data.frame':	1446 obs. of  40 variables:
 $ transcript_id: chr  "ENST00000190983.4_1" "ENST00000216416.9_4" "ENST00000220853.8_2" "ENST00000220931.11_3" ...
 $ gene_id      : chr  "ENSG00000064205.10_4" "ENSG00000100528.12_6" "ENSG00000104412.8_5" "ENSG00000104490.18_8" ...
 $ gene_name    : chr  "CCN5" "CNIH1" "EMC2" "NCALD" ...
 $ gene_type    : chr  "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
 $ start        : int  43343886 54890279 109455853 102698770 43678846 43678846 37321791 196678409 110210679 7281376 ...
 $ end          : int  43356452 54908092 109501425 102803197 43769140 43769140 37358944 196695931 110217908 7313432 ...
 $ strand       : chr  "+" "-" "+" "-" ...
 $ chrom        : int  20 14 8 8 7 7 6 3 1 6 ...
 $ pos          : int  43348735 54890379 109587552 102900082 43642005 43690946 37348920 196674879 110212293 7280780 ...
 $ ref          : chr  "C" "C" "G" "C" ...
 $ alt          : chr  "A" "A" "A" "T" ...
 $ rsid         : chr  "rs2296530"