-
Notifications
You must be signed in to change notification settings - Fork 1
/
canonical_junctions.R
123 lines (105 loc) · 4.17 KB
/
canonical_junctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#' Build canonical junctions from transcripts
#'
#' @param tx a \code{\link[GenomicRanges]{GRangesList}} of reference transcripts
#'
#' @return a character vector of canonical splice junction ids
#'
#'
#' We build all canonical splice junctions that are in the annotated input transcripts.
#' The following lists implementation rules for adjacent canonical exon-exon junction:
#'
#' - strand = +: \eqn{e_i}, \eqn{s_{i+1}}
#' - strand = -: \eqn{e_{i+1}}, \eqn{s_{i}}
#'
#' We also include canonical intron-retention junctions. These are 5' donor or
#' 3' acceptor sites of canonical exon-exon junctions that are not used in all
#' isoforms of the gene. They are located within an exon of other transcripts.
#' Canonical intron-retention junctions are defined by the coordinate of the
#' last exon base and the next base.
#' Therefore, we just need to check whether both bases are included in a single exon.
#'
#' @examples
#' gtf_file <- system.file("extdata","GTF_files","Aedes_aegypti.partial.gtf",
#' package="splice2neo")
#'
#' tx <- parse_gtf(gtf_file)
#'
#' canonical_junctions(tx[1:10])
#'
#' @export
canonical_junctions <- function(tx){
# full ranges of all transcript as a single GRanges object
tx_gr <- base::range(tx)
exons_gr <- unlist(tx)
# get data.frame with all transcripts
tx_df <- dplyr::tibble(
chr = GenomeInfoDb::seqnames(tx_gr) %>% as.character(),
strand = BiocGenerics::strand(tx_gr) %>% as.character(),
exon_start = BiocGenerics::start(tx) %>% as.list(),
exon_end = BiocGenerics::end(tx) %>% as.list(),
tx_name = names(tx)
) %>%
unnest(c(exon_start, exon_end))
# build exon-exon junctions ==================================================
junc_ee <- tx_df %>%
dplyr::group_by(tx_name) %>%
# get left and right junction coordinates as exon boundaries (depending on strand)
dplyr::mutate(
left = ifelse(strand == "+", exon_end, dplyr::lead(exon_end)),
right = ifelse(strand == "+", dplyr::lead(exon_start), exon_start)
) %>%
# filter out entries for first and last exon (without junctions)
dplyr::filter(!is.na(left), !is.na(right)) %>%
dplyr::ungroup() %>%
# - junc_id: `<chr>_<pos1>_<pos2>_<strand>`
dplyr::mutate(
junc_id = generate_junction_id(chr, left, right, strand)
)
# build intron-retention junctions ===========================================
junc_ir <- tx_df %>%
# get left and right junction coordinates as exon boundaries
dplyr::mutate(
start_junc_id = generate_junction_id(chr, exon_start - 1, exon_start, strand),
end_junc_id = generate_junction_id(chr, exon_end, exon_end + 1, strand),
) %>%
# combine into one column
tidyr::pivot_longer(cols = c("start_junc_id", "end_junc_id"),
names_to = "left_right", values_to = "junc_id") %>%
# distribute junc_id values in to separate columns
tidyr::separate(junc_id, into = c("chr", "start_end", "strand"), sep = ":", remove = FALSE) %>%
tidyr::separate(start_end, into = c("start", "end"), sep = "-", remove = FALSE)
# build GRanges object
junc_ir_gr <- GenomicRanges::GRanges(
junc_ir$chr,
IRanges::IRanges(
as.integer(junc_ir$start),
as.integer(junc_ir$end),
),
strand = junc_ir$strand,
junc_id = junc_ir$junc_id
)
# get subset of exon start and end junctions that are within an exon
covered_junc_gr <- junc_ir_gr %>%
IRanges::subsetByOverlaps(exons_gr, type = "within")
covered_junc_id <- unique(covered_junc_gr$junc_id)
# filter to contain only
junc_ir_covered <- junc_ir %>%
dplyr::filter(junc_id %in% covered_junc_id) %>%
dplyr::mutate(
left = as.integer(start),
right = as.integer(end)
) %>%
dplyr::select(chr, strand, tx_name, left, right, junc_id)
# merge with other exon-to-exon junctions ====================================
junc_df <- dplyr::bind_rows(
"exon_to_exon" = junc_ee,
"within_exon" = junc_ir_covered,
.id = "junction_type"
)
# use only unique junctions
junc_unique <- junc_df %>%
dplyr::distinct(junc_id, chr, strand, left, right, junction_type) %>%
dplyr::mutate(in_gencode = TRUE)
# return only a chracter vector with junctions
return(unique(junc_df$junc_id))
}