-
Notifications
You must be signed in to change notification settings - Fork 0
/
FootprintFinder.R
499 lines (465 loc) · 22.5 KB
/
FootprintFinder.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
#' @import methods
#'
#' @name FootprintFinder-class
#' @rdname FootprintFinder-class
#' @aliases FootprintFinder
#'
#' @slot genome.db The address of a genome database for use in filtering
#' @slot project.db The address of a project database for use in filtering
#' @slot quiet A logical argument denoting whether the FootprintFinder object should behave quietly
#----------------------------------------------------------------------------------------------------
.FootprintFinder <- setClass("FootprintFinder",
slots = c(genome.db="DBIConnection",
project.db="DBIConnection",
quiet="logical")
)
#----------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#----------------------------------------------------------------------------------------------------
setGeneric("getChromLoc", signature="obj",
function(obj, name, biotype="protein_coding",moleculetype="gene")
standardGeneric("getChromLoc"))
setGeneric("getGenePromoterRegion", signature="obj",
function(obj, gene, size.upstream=1000, size.downstream=0,
biotype="protein_coding", moleculetype="gene")
standardGeneric("getGenePromoterRegion"))
setGeneric("getFootprintsForGene", signature="obj",
function(obj, gene, size.upstream=1000, size.downstream=0,
biotype="protein_coding", moleculetype="gene")
standardGeneric("getFootprintsForGene"))
setGeneric("getFootprintsInRegion", signature="obj",
function(obj, chromosome, start, end) standardGeneric("getFootprintsInRegion"))
setGeneric("getGtfGeneBioTypes", signature="obj",
function(obj) standardGeneric("getGtfGeneBioTypes"))
setGeneric("getGtfMoleculeTypes", signature="obj",
function(obj) standardGeneric("getGtfMoleculeTypes"))
setGeneric("closeDatabaseConnections", signature="obj",
function(obj) standardGeneric("closeDatabaseConnections"))
setGeneric("getPromoterRegionsAllGenes",signature="obj",
function(obj ,size.upstream=10000 , size.downstream=10000 , use_gene_ids = TRUE )
standardGeneric("getPromoterRegionsAllGenes"))
#----------------------------------------------------------------------------------------------------
#' @title Class FootprintFinder
#' @name FootprintFinder-class
#' @rdname FootprintFinder-class
#'
#' @description
#' The FootprintFinder class is designed to query 2 supplied footprint databases (a genome database
#' and a project database) for supplied genes or regions. Within the TReNA package, the
#' FootprintFinder class is mainly used by the FootprintFilter class, but the FootprintFinder class
#' offers more flexibility in constructing queries.
#'
#' @param genome.database.uri The address of a genome database for use in filtering. This database
#' must contain the tables "gtf" and "motifsgenes" at a minimum. The URI format is as follows:
#' "dbtype://host/database" (e.g. "postgres://localhost/genomedb")
#' @param project.database.uri The address of a project database for use in filtering. This database
#' must contain the tables "regions" and "hits" at a minimum. The URI format is as follows:
#' "dbtype://host/database" (e.g. "postgres://localhost/projectdb")
#' @param quiet A logical denoting whether or not the FootprintFinder object should print output
#'
#' @return An object of the FootprintFinder class
#'
#' @export
#'
#' @seealso \code{\link{FootprintFilter}}
#'
#' @family FootprintFinder methods
FootprintFinder <- function(genome.database.uri, project.database.uri, quiet=TRUE)
{
genome.db.info <- parseDatabaseUri(genome.database.uri)
project.db.info <- parseDatabaseUri(project.database.uri)
stopifnot(genome.db.info$brand %in% c("postgres","sqlite"))
# open the genome database
if(genome.db.info$brand == "postgres"){
host <- genome.db.info$host
dbname <- genome.db.info$name
driver <- RPostgreSQL::PostgreSQL()
genome.db <- DBI::dbConnect(driver, user= "trena", password="trena", dbname=dbname, host=host)
existing.databases <- DBI::dbGetQuery(genome.db, "select datname from pg_database")[,1]
DBI::dbDisconnect(genome.db)
stopifnot(dbname %in% existing.databases)
genome.db <- DBI::dbConnect(driver, user="trena", password="trena", dbname=dbname, host=host)
expected.tables <- c("gtf", "motifsgenes")
stopifnot(all(expected.tables %in% DBI::dbListTables(genome.db)))
if(!quiet){
row.count <- DBI::dbGetQuery(genome.db, "select count(*) from gtf")[1,1]
printf("%s: %d rows", sprintf("%s/gtf", genome.database.uri), row.count)
row.count <- DBI::dbGetQuery(genome.db, "select count(*) from motifsgenes")[1,1]
printf("%s: %d rows", sprintf("%s/motifsgenes", genome.database.uri), row.count)
}
} # if postgres
# open the project database
if(project.db.info$brand == "postgres"){
host <- project.db.info$host
dbname <- project.db.info$name
driver <- RPostgreSQL::PostgreSQL()
project.db <- DBI::dbConnect(driver, user= "trena", password="trena", dbname=dbname, host=host)
existing.databases <- DBI::dbGetQuery(project.db, "select datname from pg_database")[,1]
stopifnot(dbname %in% existing.databases)
DBI::dbDisconnect(project.db)
project.db <- DBI::dbConnect(driver, user="trena", password="trena", dbname=dbname, host=host)
expected.tables <- c("regions", "hits")
stopifnot(all(expected.tables %in% DBI::dbListTables(project.db)))
if(!quiet){
row.count <- DBI::dbGetQuery(project.db, "select count(*) from regions")[1,1]
printf("%s: %d rows", sprintf("%s/regions", project.database.uri), row.count)
}
} # if postgres
# open the genome database
if(genome.db.info$brand == "sqlite"){
dbname <- paste(genome.db.info$host, genome.db.info$name, sep = "/")
driver <- RSQLite::SQLite()
genome.db <- DBI::dbConnect(driver, dbname=dbname)
stopifnot(file.exists(dbname))
expected.tables <- c("gtf", "motifsgenes")
stopifnot(all(expected.tables %in% DBI::dbListTables(genome.db)))
if(!quiet){
row.count <- DBI::dbGetQuery(genome.db, "select count(*) from gtf")[1,1]
printf("%s: %d rows", sprintf("%s/gtf", genome.database.uri), row.count)
row.count <- DBI::dbGetQuery(genome.db, "select count(*) from motifsgenes")[1,1]
printf("%s: %d rows", sprintf("%s/motifsgenes", genome.database.uri), row.count)
}
} # if sqlite
# open the project database
if(project.db.info$brand == "sqlite"){
dbname <- paste(project.db.info$host, project.db.info$name, sep = "/")
driver <- RSQLite::SQLite()
project.db <- DBI::dbConnect(driver, dbname = dbname)
stopifnot(file.exists(dbname))
expected.tables <- c("regions", "hits")
stopifnot(all(expected.tables %in% DBI::dbListTables(project.db)))
if(!quiet){
row.count <- DBI::dbGetQuery(project.db, "select count(*) from regions")[1,1]
printf("%s: %d rows", sprintf("%s/regions", project.database.uri), row.count)
}
} # if sqlite
.FootprintFinder(genome.db=genome.db, project.db=project.db, quiet=quiet)
} # FootprintFinder, the constructor
#----------------------------------------------------------------------------------------------------
#' Close a Footprint Database Connection
#'
#' This method takes a FootprintFinder object and closes connections to the footprint databases
#' if they are currently open.
#'
#' @rdname closeDatabaseConnections
#' @aliases closeDatabaseConnections
#'
#' @param obj An object of class FootprintFinder
#'
#' @export
#'
#' @family FootprintFinder methods
#'
#' @return Closes the specified database connection
setMethod("closeDatabaseConnections", "FootprintFinder",
function(obj){
if(!obj@quiet) printf("-- FootprintFinder::closeDataConnections")
if("DBIConnection" %in% is(obj@genome.db)){
if(!obj@quiet) printf("closing genome.db")
DBI::dbDisconnect(obj@genome.db)
}
if("DBIConnection" %in% is(obj@project.db)){
if(!obj@quiet) printf("closing project.db")
DBI::dbDisconnect(obj@project.db)
}
})
#----------------------------------------------------------------------------------------------------
#' Get the List of Biotypes
#'
#' Using the gtf table in the genome database contained in a FootprintFinder object, get the list of
#' different types of biological units (biotypes) contained in the table.
#'
#' @rdname getGtfGeneBioTypes
#' @aliases getGtfGeneBioTypes
#'
#' @param obj An object of class FootprintFinder
#'
#' @export
#'
#' @family FootprintFinder methods
#'
#' @return A sorted list of the types of biological units contained in the gtf table of the genome
#' database.
#'
#' @examples
#' db.address <- system.file(package="trena", "extdata")
#' genome.db.uri <- paste("sqlite:/",db.address,"mef2c.neighborhood.hg38.gtfAnnotation.db", sep = "/")
#' project.db.uri <- paste("sqlite:/",db.address,"mef2c.neigborhood.hg38.footprints.db", sep = "/")
#' fp <- FootprintFinder(genome.db.uri, project.db.uri)
#'
#' biotypes <- getGtfGeneBioTypes(fp)
setMethod("getGtfGeneBioTypes", "FootprintFinder",
function(obj){
sort(DBI::dbGetQuery(obj@genome.db, "select distinct gene_biotype from gtf")[,1])
})
#----------------------------------------------------------------------------------------------------
#' Get the List of Molecule Types
#'
#' Using the gtf table in the genome database contained in a FootprintFinder object, get the list of
#' different types of molecules contained in the table.
#'
#' @rdname getGtfMoleculeTypes
#' @aliases getGtfMoleculeTypes
#'
#' @param obj An object of class FootprintFinder
#'
#' @export
#'
#' @family FootprintFinder methods
#'
#' @return A sorted list of the types of molecules contained in the gtf table of the genome
#' database.
#'
#' @examples
#' db.address <- system.file(package="trena", "extdata")
#' genome.db.uri <- paste("sqlite:/",db.address,"mef2c.neighborhood.hg38.gtfAnnotation.db", sep = "/")
#' project.db.uri <- paste("sqlite:/",db.address,"mef2c.neigborhood.hg38.footprints.db", sep = "/")
#' fp <- FootprintFinder(genome.db.uri, project.db.uri)
#'
#' mol.types <- getGtfMoleculeTypes(fp)
setMethod("getGtfMoleculeTypes", "FootprintFinder",
function(obj){
sort(DBI::dbGetQuery(obj@genome.db, "select distinct moleculetype from gtf")[,1])
})
#----------------------------------------------------------------------------------------------------
#' Get Chromosome Location
#'
#' Using the gtf table in the genome database contained in a FootprintFinder object, get the locations
#' of chromosomes with the specified gene name, biological unit type, and molecule type
#'
#' @rdname getChromLoc
#' @aliases getChromLoc
#'
#' @param obj An object of class FootprintFinder
#' @param name A gene name or ID
#' @param biotype A type of biological unit (default="protein_coding")
#' @param moleculetype A type of molecule (default="gene")
#'
#' @export
#'
#' @family FootprintFinder methods
#'
#' @return A dataframe containing the results of a database query pertaining to the specified name,
#' biotype, and molecule type. This dataframe contains the following columns: gene_id, gene_name,
#' chr, start, endpos, strand
#'
#' @examples
#' db.address <- system.file(package="trena", "extdata")
#' genome.db.uri <- paste("sqlite:/",db.address,"mef2c.neighborhood.hg38.gtfAnnotation.db", sep = "/")
#' project.db.uri <- paste("sqlite:/",db.address,"mef2c.neigborhood.hg38.footprints.db", sep = "/")
#' fp <- FootprintFinder(genome.db.uri, project.db.uri)
#'
#' chrom.locs <- getChromLoc(fp, name = "MEF2C")
setMethod("getChromLoc", "FootprintFinder",
function(obj, name, biotype="protein_coding", moleculetype="gene"){
query <- paste("select gene_id, gene_name, chr, start, endpos, strand from gtf where ",
sprintf("(gene_name='%s' or gene_id='%s') ", name, name),
sprintf("and gene_biotype='%s' and moleculetype='%s'", biotype, moleculetype),
collapse=" ")
DBI::dbGetQuery(obj@genome.db, query)
})
#----------------------------------------------------------------------------------------------------
#' Get Gene Promoter Region
#'
#' Using the \code{\link{getChromLoc}} function in conjunction with the gtf table inside the genome
#' database specified by the FootprintFinder object, get the chromosome, starting location,
#' and ending location for gene promoter region.
#'
#' @rdname getGenePromoterRegion
#' @aliases getGenePromoterRegion
#'
#' @param obj An object of class FootprintFinder
#' @param gene A gene name of ID
#' @param size.upstream An integer denoting the distance upstream of the target gene to look for footprints
#' (default = 1000)
#' @param size.downstream An integer denoting the distance downstream of the target gene to look for footprints
#' (default = 0)
#' @param biotype A type of biological unit (default="protein_coding")
#' @param moleculetype A type of molecule (default="gene")
#'
#' @export
#'
#' @family FootprintFinder methods
#'
#' @return A list containing 3 elements:
#' 1) chr : The name of the chromosome containing the promoter region for the specified gene
#' 2) start : The starting location of the promoter region for the specified gene
#' 3) end : The ending location of the promoter region for the specified gene
#'
#' @examples
#' db.address <- system.file(package="trena", "extdata")
#' genome.db.uri <- paste("sqlite:/",db.address,"mef2c.neighborhood.hg38.gtfAnnotation.db", sep = "/")
#' project.db.uri <- paste("sqlite:/",db.address,"mef2c.neigborhood.hg38.footprints.db", sep = "/")
#' fp <- FootprintFinder(genome.db.uri, project.db.uri)
#'
#' prom.region <- getGenePromoterRegion(fp, gene = "MEF2C")
setMethod("getGenePromoterRegion", "FootprintFinder",
function(obj, gene, size.upstream=1000, size.downstream=0, biotype="protein_coding", moleculetype="gene"){
tbl.loc <- getChromLoc(obj, gene, biotype=biotype, moleculetype=moleculetype)
if(nrow(tbl.loc) < 1){
warning(sprintf("no chromosomal location for %s (%s, %s)", gene, biotype, moleculetype))
return(NA)
}
chrom <- tbl.loc$chr[1]
start.orig <- tbl.loc$start[1]
end.orig <- tbl.loc$endpos[1]
strand <- tbl.loc$strand[1]
if(strand == "-"){ # reverse (minus) strand. TSS is at "end" position
start.loc <- end.orig - size.downstream
end.loc <- end.orig + size.upstream
}
else{ # forward (plus) strand. TSS is at "start" position
start.loc <- start.orig - size.upstream
end.loc <- start.orig + size.downstream
}
return(list(chr=chrom, start=start.loc, end=end.loc))
})
#----------------------------------------------------------------------------------------------------
#' Get Footprints for Gene
#'
#' Using the \code{\link{getGenePromoterRegion}} and \code{\link{getFootprintsInRegion}} functions
#' in conjunction with the gtf table inside the genome database specified by the FootprintFinder object,
#' retrieve a dataframe containing the footprints for a specified gene
#'
#' @rdname getFootprintsForGene
#' @aliases getFootprintsForGene
#'
#' @param obj An object of class FootprintFinder
#' @param gene A gene name of ID
#' @param size.upstream An integer denoting the distance upstream of the target gene to look for footprints
#' (default = 1000)
#' @param size.downstream An integer denoting the distance downstream of the target gene to look for footprints
#' (default = 0)
#' @param biotype A type of biological unit (default="protein_coding")
#' @param moleculetype A type of molecule (default="gene")
#'
#' @return A dataframe containing all footprints for the specified gene and accompanying parameters
#'
#' @export
#'
#' @family FootprintFinder methods
#'
#' @examples
#' db.address <- system.file(package="trena", "extdata")
#' genome.db.uri <- paste("sqlite:/",db.address,"mef2c.neighborhood.hg38.gtfAnnotation.db", sep = "/")
#' project.db.uri <- paste("sqlite:/",db.address,"mef2c.neigborhood.hg38.footprints.db", sep = "/")
#' fp <- FootprintFinder(genome.db.uri, project.db.uri)
#'
#' footprints <- getFootprintsForGene(fp, gene = "MEF2C")
setMethod("getFootprintsForGene", "FootprintFinder",
function(obj, gene, size.upstream=1000, size.downstream=0,
biotype="protein_coding", moleculetype="gene"){
stopifnot(length(gene) == 1)
loc <- getGenePromoterRegion(obj, gene, size.upstream, size.downstream,
biotype=biotype, moleculetype=moleculetype)
if(!obj@quiet) print(loc)
getFootprintsInRegion(obj, loc$chr, loc$start, loc$end)
}) # getFootprintsForGene
#----------------------------------------------------------------------------------------------------
#' Get Footprints in a Region
#'
#' Using the regions and hits tables inside the project database specified by the FootprintFinder
#' object, return the location, chromosome, starting position, and ending positions of all footprints
#' for the specified region.
#'
#' @rdname getFootprintsInRegion
#' @aliases getFootprintsInRegion
#'
#' @param obj An object of class FootprintFinder
#' @param chromosome The name of the chromosome of interest
#' @param start An integer denoting the start of the desired region
#' @param end An integer denoting the end of the desired region
#'
#' @export
#'
#' @family FootprintFinder methods
#'
#' @return A dataframe containing all footprints for the specified region
#'
#' @examples
#' db.address <- system.file(package="trena", "extdata")
#' genome.db.uri <- paste("sqlite:/",db.address,"mef2c.neighborhood.hg38.gtfAnnotation.db", sep = "/")
#' project.db.uri <- paste("sqlite:/",db.address,"mef2c.neigborhood.hg38.footprints.db", sep = "/")
#' fp <- FootprintFinder(genome.db.uri, project.db.uri)
#'
#' footprints <- getFootprintsInRegion(fp, chromosome = "chr5",
#' start = 88903305, end = 88903319 )
setMethod("getFootprintsInRegion", "FootprintFinder",
function(obj, chromosome, start, end){
query.p0 <- "select loc, chrom, start, endpos from regions"
query.p1 <- sprintf("where chrom='%s' and start >= %d and endpos <= %d", chromosome, start, end)
query.regions <- paste(query.p0, query.p1)
tbl.regions <- DBI::dbGetQuery(obj@project.db, query.regions)
if(nrow(tbl.regions) == 0)
return(data.frame())
loc.set <- sprintf("('%s')", paste(tbl.regions$loc, collapse="','"))
query.hits <- sprintf("select * from hits where loc in %s", loc.set)
tbl.hits <- DBI::dbGetQuery(obj@project.db, query.hits)
tbl.out <- merge(tbl.regions, tbl.hits, on="loc")
unique(tbl.out)
}) # getFootprintsInRegion
#----------------------------------------------------------------------------------------------------
#' Get Promoter Regions for All Genes
#'
#' Using the gtf table inside the genome database specified by the FootprintFinder object, return the
#' promoter regions for every protein-coding gene in the database.
#'
#' @rdname getPromoterRegionsAllGenes
#' @aliases getPromoterRegionsAllGenes
#'
#' @param obj An object of class FootprintFinder
#' @param size.upstream An integer denoting the distance upstream of each gene's transcription start
#' site to include in the promoter region (default = 1000)
#' @param size.downstream An integer denoting the distance downstream of each gene's transcription start
#' site to include in the promoter region (default = 1000)
#' @param use_gene_ids A binary indicating whether to return gene IDs or gene names (default = T)
#'
#' @return A GRanges object containing the promoter regions for all genes
#'
#' @export
#'
#' @family FootprintFinder methods
#'
#' @examples
#' db.address <- system.file(package="trena", "extdata")
#' genome.db.uri <- paste("sqlite:/",db.address,"mef2c.neighborhood.hg38.gtfAnnotation.db", sep = "/")
#' project.db.uri <- paste("sqlite:/",db.address,"mef2c.neigborhood.hg38.footprints.db", sep = "/")
#' fp <- FootprintFinder(genome.db.uri, project.db.uri)
#'
#' footprints <- getPromoterRegionsAllGenes(fp)
setMethod("getPromoterRegionsAllGenes","FootprintFinder",
function( obj , size.upstream=10000 , size.downstream=10000 , use_gene_ids = TRUE ) {
query <-
paste( "select gene_name, gene_id, chr, start, endpos, strand from gtf where" ,
"gene_biotype='protein_coding' and moleculetype='gene'" , sep=" " )
genes = DBI::dbGetQuery( obj@genome.db , query )
# function to get each transcript's TSS
get_tss <-
function( t ) {
chrom <- genes$chr[t]
start.orig <- genes$start[t]
end.orig <- genes$endpos[t]
strand <- genes$strand[t]
if(strand == "-"){ # reverse (minus) strand. TSS is at "end" position
tss <- end.orig
}
else{ # forward (plus) strand. TSS is at "start" position
tss <- start.orig
}
return( tss )
}
# apply get_tss to all transcripts
tss = sapply( 1:nrow(genes) , get_tss )
# assemble a bed file for the TSSs
promoter_regions = unique( data.frame(
chr = genes$chr ,
start = tss - size.upstream , end = tss + size.downstream ,
gene_name = genes$gene_name ,
gene_id = genes$gene_id ))
# GRanges obj
gr = GenomicRanges::makeGRangesFromDataFrame( promoter_regions , keep.extra.columns = TRUE )
if( use_gene_ids == FALSE ) names(gr) = promoter_regions$gene_name
if( use_gene_ids == TRUE ) names(gr) = promoter_regions$gene_id
return( gr )
}) # getPromoterRegionsAllGenes
#----------------------------------------------------------------------------------------------------