R/canonical_junctions.R


#' Build canonical junctions from transcripts
#'
#' @param tx a \code{\link[GenomicRanges]{GRangesList}} of reference transcripts
#'
#' @return a character vector of canonical splice junction ids
#'
#'
#' We build all canonical splice junctions that are in the annotated input transcripts.
#' The following lists implementation rules for adjacent canonical exon-exon junction:
#'
#' - strand = +: \eqn{e_i}, \eqn{s_{i+1}}
#' - strand = -: \eqn{e_{i+1}}, \eqn{s_{i}}
#'
#' We also include canonical intron-retention junctions. These are 5' donor or
#' 3' acceptor sites of canonical exon-exon junctions that are not used in all
#' isoforms of the gene. They are located within an exon of other transcripts.
#' Canonical intron-retention junctions are defined by the coordinate of the
#' last exon base and the next base.
#' Therefore, we just need to check whether both bases are included in a single exon.
#'
#' @examples
#' gtf_file <- system.file("extdata","GTF_files","Aedes_aegypti.partial.gtf",
#'   package="splice2neo")
#'
#' tx <- parse_gtf(gtf_file)
#'
#' canonical_junctions(tx[1:10])
#'
#' @export
canonical_junctions <- function(tx){

  # full ranges of all transcript as a single GRanges object
  tx_gr <- base::range(tx)

  exons_gr <- unlist(tx)

  # get data.frame with all transcripts
  tx_df <- dplyr::tibble(
    chr = GenomeInfoDb::seqnames(tx_gr) %>% as.character(),
    strand = BiocGenerics::strand(tx_gr) %>% as.character(),
    exon_start = BiocGenerics::start(tx) %>% as.list(),
    exon_end = BiocGenerics::end(tx) %>% as.list(),
    tx_name = names(tx)
  ) %>%
    unnest(c(exon_start, exon_end))

  # build exon-exon junctions ==================================================
  junc_ee <- tx_df %>%
    dplyr::group_by(tx_name) %>%

    # get left and right junction coordinates as exon boundaries (depending on strand)
    dplyr::mutate(
      left = ifelse(strand == "+", exon_end, dplyr::lead(exon_end)),
      right = ifelse(strand == "+", dplyr::lead(exon_start), exon_start)
    ) %>%
    # filter out entries for first and last exon (without junctions)
    dplyr::filter(!is.na(left), !is.na(right)) %>%
    dplyr::ungroup() %>%
    # - junc_id: `<chr>_<pos1>_<pos2>_<strand>`
    dplyr::mutate(
      junc_id = generate_junction_id(chr, left, right, strand)
    )

  # build intron-retention junctions ===========================================
  junc_ir <- tx_df %>%

    # get left and right junction coordinates as exon boundaries
    dplyr::mutate(

      start_junc_id = generate_junction_id(chr, exon_start - 1, exon_start, strand),
      end_junc_id = generate_junction_id(chr, exon_end, exon_end + 1, strand),

    ) %>%
    # combine into one column
    tidyr::pivot_longer(cols = c("start_junc_id", "end_junc_id"),
                 names_to = "left_right", values_to = "junc_id") %>%
    # distribute junc_id values in to separate columns
    tidyr::separate(junc_id, into = c("chr", "start_end", "strand"), sep = ":", remove = FALSE) %>%
    tidyr::separate(start_end, into = c("start", "end"), sep = "-", remove = FALSE)

  # build GRanges object
  junc_ir_gr <- GenomicRanges::GRanges(
    junc_ir$chr,
    IRanges::IRanges(
      as.integer(junc_ir$start),
      as.integer(junc_ir$end),
    ),
    strand = junc_ir$strand,
    junc_id = junc_ir$junc_id
  )

  # get subset of exon start and end junctions that are within an exon
  covered_junc_gr <- junc_ir_gr %>%
    IRanges::subsetByOverlaps(exons_gr, type = "within")

  covered_junc_id <- unique(covered_junc_gr$junc_id)

  # filter to contain only
  junc_ir_covered <- junc_ir %>%
    dplyr::filter(junc_id %in% covered_junc_id) %>%
    dplyr::mutate(
      left = as.integer(start),
      right = as.integer(end)
    ) %>%
    dplyr::select(chr, strand, tx_name, left, right, junc_id)

  # merge with other exon-to-exon junctions ====================================
  junc_df <- dplyr::bind_rows(
    "exon_to_exon" = junc_ee,
    "within_exon" = junc_ir_covered,
    .id = "junction_type"
  )

  # use only unique junctions
  junc_unique <- junc_df %>%
    dplyr::distinct(junc_id, chr, strand, left, right, junction_type) %>%
    dplyr::mutate(in_gencode = TRUE)

  # return only a chracter vector with junctions
  return(unique(junc_df$junc_id))

}