In [1]:
library(RPostgreSQL)
# C-style formatted print: format the arguments with sprintf() and print the
# resulting string(s) without surrounding quotes.
printf <- function(...) {
  formatted <- sprintf(...)
  print(noquote(formatted))
}

Loading required package: DBI


In [2]:
# Genomic window of interest: chromosome name plus start and end coordinates.
chrom <- "chr13"
loc.start <- 41019200
loc.end   <- 41019360
# "chrom:start-end" label for the window. Each coordinate needs its own %d
# conversion; without the % signs the format yields the literal "chr13:d-d"
# and the coordinates are dropped.
loc.string <- sprintf("%s:%d-%d", chrom, loc.start, loc.end)

In [3]:
# Open one PostgreSQL connection per database on host "whovian":
#   db.cs    -> "chipseq" database
#   db.trena -> "trena" database
# NOTE(review): user name and password are hard-coded here.
db.cs <- dbConnect(
  PostgreSQL(),
  host = "whovian", dbname = "chipseq",
  user = "trena", password = "trena"
)

db.trena <- dbConnect(
  PostgreSQL(),
  host = "whovian", dbname = "trena",
  user = "trena", password = "trena"
)

In [4]:
# what does the hits table look like?
# Fetch the first three rows of `hits` over the chipseq connection (db.cs)
# so the column layout is displayed.
dbGetQuery(db.cs, "select * from hits limit 3")

loc,type,name,length,strand,sample_id,method,provenance,score1,score2,score3,score4,score5,score6
chr1:1677841-1677991,chipseq.peak,ATF2,151,,pooled,cusanovitch,chipseq.minid.tbd,155,,,,,
chr1:1678101-1678251,chipseq.peak,ATF2,151,,pooled,cusanovitch,chipseq.minid.tbd,131,,,,,
chr1:1828321-1828471,chipseq.peak,ATF2,151,,pooled,cusanovitch,chipseq.minid.tbd,30,,,,,


In [5]:
# Select the ChIP-seq regions on `chrom` that lie strictly inside the window
# (start > loc.start and endpos < loc.end), time the query, then display the
# resulting table.
query.regions <- sprintf(
  "select * from regions where chrom='%s' and start > %d and endpos < %d",
  chrom, loc.start, loc.end
)
system.time(tbl.regions <- dbGetQuery(db.cs, query.regions))  # 0.064 seconds
tbl.regions

   user  system elapsed 
  0.005   0.000   0.082 

loc,chrom,start,endpos
chr13:41019204-41019354,chr13,41019204,41019354


In [6]:
# Fetch the ChIP-seq hits for every region found in the window, not only the
# first row of tbl.regions. Each loc value is single-quoted and the values are
# joined into one SQL `in (...)` list.
# When no region matched, the list is the single value NULL: `loc in (NULL)`
# is valid SQL that selects no rows, so the query no longer searches for the
# literal string 'NA'.
locs.in.window <- unique(tbl.regions[["loc"]])
loc.list <- if (length(locs.in.window) > 0) {
  paste(sprintf("'%s'", locs.in.window), collapse = ", ")
} else {
  "NULL"
}
query.hits <- sprintf("select * from hits where loc in (%s)", loc.list)
system.time(tbl.hits <- dbGetQuery(db.cs, query.hits))
dim(tbl.hits)

   user  system elapsed 
  0.005   0.000   0.007 

In [7]:
# Attach the region coordinates (chrom, start, endpos) to every hit by joining
# the two tables on their shared `loc` column. merge() names its key argument
# `by`; an `on=` argument is not part of its signature and is silently ignored,
# leaving the join to depend on whichever column names happen to be shared.
tbl.out <- merge(tbl.regions, tbl.hits, by = "loc")
# Put coordinates, TF name, strand and primary score first; the remaining
# descriptive and score columns follow. `loc` is dropped because it is not listed.
preferred.column.order <- c("chrom", "start", "endpos", "name", "strand", "score1",
                            "type", "length", "sample_id", "method", "provenance",
                            "score2", "score3", "score4", "score5", "score6")
tbl.out <- tbl.out[, preferred.column.order]
# Show the first rows, limited to the first ten columns.
head(tbl.out[, 1:10])

chrom,start,endpos,name,strand,score1,type,length,sample_id,method
chr13,41019204,41019354,ATF2,,332,chipseq.peak,151,pooled,cusanovitch
chr13,41019204,41019354,BATF,,332,chipseq.peak,151,pooled,cusanovitch
chr13,41019204,41019354,BCL11A,,332,chipseq.peak,151,pooled,cusanovitch
chr13,41019204,41019354,BCL3,,332,chipseq.peak,151,pooled,cusanovitch
chr13,41019204,41019354,BCLAF1,,332,chipseq.peak,151,pooled,cusanovitch
chr13,41019204,41019354,BHLHE40,,332,chipseq.peak,151,pooled,cusanovitch


In [8]:
# remind ourselves what the fimo table looks like
# Fetch the first three rows of `fimo_hg38` over the trena connection
# (db.trena) so the column layout is displayed.
dbGetQuery(db.trena, "select * from fimo_hg38 limit 3")

motifname,chrom,start,endpos,strand,motifscore,pval,empty,sequence
MA0002.2,10,18991,19001,-,12.4655,2.73e-05,,TTCTGTGGTTC
MA0002.2,10,19478,19488,-,13.3448,1.17e-05,,TTCTGTGGTTG
MA0002.2,10,20814,20824,-,12.9483,1.69e-05,,GGCTGTGGGTT


In [9]:
# get all the fimo hits in the region of interest
# The fimo_hg38 rows shown earlier carry bare chromosome names (e.g. "10"), so
# strip the leading "chr" from `chrom` rather than hard-coding "13"; the query
# then follows `chrom` if the window is moved to another chromosome.
chrom.fimo <- sub("^chr", "", chrom)
query.fimo <- sprintf("select * from fimo_hg38 where chrom='%s' and start >= %d and endpos <= %d",
                      chrom.fimo, loc.start, loc.end)
system.time(tbl.fimo <- dbGetQuery(db.trena, query.fimo))  # 5.17 seconds; indexing needed?
dim(tbl.fimo)

   user  system elapsed 
  0.004   0.001   5.073 

In [18]:
# Which genes (TFs) does the ChIP-seq data place in chr13:41019200-41019360,
# and which of those have at least one motif listed in the tfMotifs table?
# The trailing numbers are the counts seen when this was last run.
genes.tfs <- sort(unique(tbl.out$name))  # 36
genes.allMapped <- dbGetQuery(
  db.trena,
  "select distinct gene from tfMotifs"
)[, 1]  # 847
genes.tfs.withMotifs <- intersect(genes.tfs, genes.allMapped)  # 27/36
printf(
  "%d/%d tfs with known motifs",
  length(genes.tfs.withMotifs), length(genes.tfs)
)

[1] 27/36 tfs with known motifs


In [19]:
# What motifs are associated with each of these allegedly bound TFs?

# Return the motifs recorded for one gene in the tfmotifs table
# (first column of the query result).
f <- function(gene) {
  query <- sprintf("select motif from tfmotifs where gene = '%s'", gene)
  dbGetQuery(db.trena, query)[, 1]
}

# One database lookup per TF; the result is a list of motif vectors keyed by
# gene name.
motifs.csGenes <- setNames(lapply(genes.tfs.withMotifs, f), genes.tfs.withMotifs)
# print(head(motifs.csGenes))

In [17]:
# For each TF, keep only those of its motifs that fimo actually mapped in the
# target region, and print them as a comma-separated list (blank when none).
motifs.fimo <- sort(unique(tbl.fimo$motifname))
for (tf in names(motifs.csGenes)) {
  found.by.fimo <- intersect(motifs.csGenes[[tf]], motifs.fimo)
  printf("%8s: %s", tf, paste(found.by.fimo, collapse = ","))
}


[1]     ATF2: MA0490.1,MA0491.1
[1]     BATF: MA0462.1
[1]  BHLHE40: 
[1]    CEBPB: 
[1]    EP300: 
[1]    FOXM1: 
[1]     IRF4: MA0050.2,MA0652.1
[1]     JUND: MA0491.1,MA0490.1
[1]      MAX: 
[1]      MAZ: 
[1]    MEF2A: 
[1]    MEF2C: 
[1]     MTA3: 
[1]     MXI1: 
[1]   NFATC1: 
[1]     NFIC: 
[1]    NFKB1: 
[1]     PAX5: 
[1]   POU2F2: 
[1]    RUNX3: 
[1]      SP1: 
[1]     SPI1: MA0080.4
[1]    STAT3: MA0517.1
[1]     TAF1: 
[1]      TBP: 
[1]     TCF3: 
[1]      YY1: MA0095.2
