## Ten seconds is too long for an inner join between regions & hits tables -- R version

- here we discover that doing the join (merge) in R takes ~0.2 seconds

In [1]:
 library(RPostgreSQL)

Loading required package: DBI


In [2]:
# Open the PostgreSQL connection that the queries below run against.
dbpiq <- dbConnect(PostgreSQL(), user = "pshannon", dbname = "piqTest")

In [3]:
dbListTables(dbpiq)

### what do the tables look like?

In [4]:
# Row counts of the two tables stacked into one data.frame:
# first row is `regions`, second row is `hits`.
rbind(dbGetQuery(dbpiq, "select count(*) from regions"), dbGetQuery(dbpiq, "select count(*) from hits"))

count
1158121
30072096


In [5]:
 dbGetQuery(dbpiq, "select * from regions limit 3")

loc,chrom,start,stop
chr21:5011471-5011482,chr21,5011471,5011482
chr21:5018173-5018184,chr21,5018173,5018184
chr21:5057475-5057486,chr21,5057475,5057486


In [6]:
dbGetQuery(dbpiq, "select * from hits limit 3")

loc,type,name,strand,sample_id,method,provenance,score1,score2,score3,score4,score5,score6
chr21:9650860-9650871,motif.in.footprint,MA0032.2,+,ENCSR000EJJ,piq,piq.minid.tbd,10.1972,-0.905663,3.06436,0.548674,,
chr21:9651991-9652002,motif.in.footprint,MA0032.2,+,ENCSR000EJJ,piq,piq.minid.tbd,10.2452,-0.987957,2.46424,0.53979,,
chr21:9653310-9653321,motif.in.footprint,MA0032.2,-,ENCSR000EJJ,piq,piq.minid.tbd,10.8088,-0.999619,-18.2933,0.549584,,


In [7]:
# Baseline: time a count over a 500-base window of the `regions` table alone
# (no join involved), then show the count that came back.
system.time(tbl <- dbGetQuery(dbpiq, "select count(*) from regions where chrom='chr21' and start > 5010000 and stop < 5010500"))
print(tbl)

   user  system elapsed 
  0.004   0.000   0.209 

  count
1    18


In [8]:
# Fetch the hits for every region lying strictly inside a genomic window,
# using one SQL inner join of `regions` and `hits` on `loc`.
#
#   chrom        chromosome name, compared for equality with regions.chrom
#   start, stop  window bounds: a region qualifies when regions.start > start
#                and regions.stop < stop.  Both are formatted with %d, so they
#                must be whole numbers.
#
# Returns the joined rows as a data.frame with the first column (the
# regions-side 'loc') dropped; the hits-side 'loc' carries the same value.
# Uses the global connection `dbpiq`.
getHits <- function(chrom, start, stop){
   # Double any single quote so the chromosome name cannot close the SQL string
   # literal early.  Names without quotes yield exactly the same query as before.
   chrom.sql <- gsub("'", "''", chrom, fixed=TRUE)
   query.p0 <- "select * from regions r inner join hits h on r.loc = h.loc "
   query.p1 <- sprintf("where r.chrom='%s' and r.start > %d and r.stop < %d", chrom.sql, start, stop)
   query <- paste(query.p0, query.p1)
   dbGetQuery(dbpiq, query)[, -1]  # remove the leading 'loc' column
   }

### before indexing the hits table:
<pre>
 user  system elapsed 
0.005   0.000  11.009 
 dim: 36 16   
</pre>

In [9]:
# Time the SQL-side join for a 30-base window on chr21, then report the
# dimensions of the result.
print(system.time(tbl <- getHits("chr21", 15010000, 15010030)))
dim(tbl)

   user  system elapsed 
  0.004   0.000   8.859 


In [10]:
unique(tbl$loc)

### ~10 secs for the join seems too long
Break the join up into two queries and time each of them separately.

In [11]:
# First half of the split: time the `regions` lookup on its own.
# NOTE(review): the upper bound 150100100 has one more digit than every other
# coordinate used in this notebook -- possibly a typo for 15010100.  If so, this
# timing covers a far wider window than the join it is compared with.  TODO confirm.
system.time(tbl <- dbGetQuery(dbpiq, "select loc from regions where chrom='chr21' and start > 15010014 and stop < 150100100"))

   user  system elapsed 
  2.404   0.059   2.819 

In [12]:
# Second half of the split: time the `hits` lookup for one literal loc value.
system.time(tbl<-dbGetQuery(dbpiq, "select * from hits where loc='chr21:15010014-15010020'"))

   user  system elapsed 
  0.004   0.000   0.006 

In [13]:
dim(tbl)

### Do the 'join' (aka 'merge') in R:  50x faster

In [14]:
# Fetch the hits for every region lying strictly inside a genomic window by
# running two simple queries and joining their results in R, instead of asking
# the database for an inner join.
#
#   chrom        chromosome name, compared for equality with regions.chrom
#   start, stop  window bounds: a region qualifies when regions.start > start
#                and regions.stop < stop.  Both are formatted with %d, so they
#                must be whole numbers.
#
# Returns the merge of the matching `regions` rows with their `hits` rows,
# keyed on `loc` (merge() sorts on the key by default).  When no region falls
# in the window, returns an empty zero-column data.frame without querying
# `hits`.  Uses the global connection `dbpiq`.
getHits2 <- function(chrom, start, stop)
{
   # Double embedded single quotes so a value cannot close its SQL literal early.
   sql.escape <- function(s) gsub("'", "''", s, fixed=TRUE)

   query.p0 <- "select loc, chrom, start, stop from regions"
   query.p1 <- sprintf("where chrom='%s' and start > %d and stop < %d", sql.escape(chrom), start, stop)
   query.regions <- paste(query.p0, query.p1)
   tbl.regions <- dbGetQuery(dbpiq, query.regions)
   if(nrow(tbl.regions) == 0)
      return(data.frame())   # nothing in the window, so there is nothing to look up

   # One quoted literal per distinct loc: ('a','b',...)
   loc.set <- sprintf("('%s')", paste(sql.escape(unique(tbl.regions$loc)), collapse="','"))
   query.hits <- sprintf("select * from hits where loc in %s", loc.set)
   tbl.hits <- dbGetQuery(dbpiq, query.hits)

   # merge() has no `on` argument: it is swallowed by `...` and the join key
   # silently becomes every column name the two tables share.  Name the key.
   merge(tbl.regions, tbl.hits, by="loc")
}

In [15]:
# Time the two-query, merge-in-R version over the same window passed to
# getHits(), then report the dimensions of the result.
print(system.time(tbl <- getHits2("chr21", 15010000, 15010030)))
dim(tbl)

   user  system elapsed 
  0.010   0.000   0.195 
