# Analysis Notebook - Count Genes and Events - `CountGenesAndEvents`

As instructed by the software, we are using the raw counts as provided by rMATS.  The raw counts we are using in the model are `ijc` and `sjc`, the sample specific raw read counts as they align to the junctions of the `included exon (ijc)` and the junctions of the `excluded or skipped exon (sjc)` respectively.


Be sure to set your GITHUB_TOKEN, prior to downloading files

One suggestion is change it to your token and then run it then immediately change it back to this:

Sys.setenv(GITHUB_TOKEN = "your-very-own-github-token")

In [1]:
# Sys.setenv(GITHUB_TOKEN="your-very-own-github-token")

In [2]:
library(dplyr)
library(ggplot2)
library(limma)
library(piggyback)
library(multtest)
library(Biobase)
library(edgeR)
library(tibble)
#install.packages('R.utils')
library(R.utils)

“package ‘dplyr’ was built under R version 3.6.2”
Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following object is masked from ‘package:limma’:

    plotMA

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, ev

## Download all the rMATS results

Each of the alternative splicing output files are downloaded here:

In [3]:
#
# refined results (FC > 1.5 and pVal < 0.05)
# for the sex*as_event coefficient result for the linear model
#
significant_results_dir = "../data/"
pattern = "DGE_sex_as_events_refined.csv"
files <- list.files(path = significant_results_dir, pattern = pattern)
as_types <- c("a3ss", "a5ss", "mxe", "ri", "se")

In [4]:
a3ss_annot <- read.table(file = "../data/fromGTF.A3SS.txt", sep = "\t", quote = "\"", header = T, stringsAsFactors = F)
a5ss_annot <- read.table(file = "../data/fromGTF.A5SS.txt", sep = "\t", quote = "\"", header = T, stringsAsFactors = F)
mxe_annot <- read.table(file = "../data/fromGTF.MXE.txt", sep = "\t", quote = "\"", header = T, stringsAsFactors = F)
ri_annot <- read.table(file = "../data/fromGTF.RI.txt", sep = "\t", quote = "\"", header = T, stringsAsFactors = F)
se_annot <- read.table(file = "../data/fromGTF.SE.txt", sep = "\t", quote = "\"", header = T, stringsAsFactors = F)

In [144]:
message ("\nReading significant events for each splicing factor\n")
gene_as = data.frame()
counts <- rep(NA, length(files))
ASE <- rep("NA", length(files))
Tissue <- rep("NA", length(files))

for (i in 1:length(files)) {
    lines <- read.csv(paste0(significant_results_dir, files[i]),header = TRUE)
    event     <- as.vector(as.character(rownames(lines)))
    tissue <- gsub("_DGE_sex_as_events_refined.csv","", files[i], fixed = TRUE)

    counts[i] <- dim(lines)[1]
    
    if (dim(lines)[1] > 0) { #has significant events
        # rownames of the significant results file are a combination of gene-symbol and junction name
        # using a regular expression - extract the last numbers that then are the index to the annotation table
        event_idx <- substring(event, regexpr("[0-9]+$", event))
        
        if (grepl("a3ss_", files[i])) {
            tissue <- gsub("a3ss_","", tissue, fixed = TRUE)
            ASE[i] <- "A3SS"
            Tissue[i] <- tissue
            idx <- match(event_idx, a3ss_annot$ID)
            res <- data.frame(GeneJunction = event,
                              ASE = "A3SS", 
                              ASE_IDX = idx,
                              Tissue = tissue, 
                              GeneSymbol = a3ss_annot$geneSymbol[idx],
                              chr = a3ss_annot$chr[idx])
        }
        if (grepl("a5ss_", files[i])) {
            tissue <- gsub("a5ss_","", tissue, fixed = TRUE)
            ASE[i] <- "A5SS"
            Tissue[i] <- tissue
            idx <- match(event_idx, a5ss_annot$ID)
            res <- data.frame(GeneJunction = event,
                              ASE = "A5SS", 
                              ASE_IDX = idx,
                              Tissue = tissue, 
                              GeneSymbol = a5ss_annot$geneSymbol[idx],
                              chr = a5ss_annot$chr[idx])
        }
        if (grepl("mxe_", files[i])) {
            ASE[i] <- "MXE"
            tissue <- gsub("mxe_","", tissue, fixed = TRUE)
            Tissue[i] <- tissue
            idx <- match(event_idx, a3ss_annot$ID)
            res <- data.frame(GeneJunction = event,
                              ASE = "MXE", 
                              ASE_IDX = idx,
                              Tissue = tissue, 
                              GeneSymbol = mxe_annot$geneSymbol[idx],
                              chr = mxe_annot$chr[idx])
        }
        if (grepl("se_", files[i])) {
            ASE[i] <- "SE"
            tissue <- gsub("se_","", tissue, fixed = TRUE)
            Tissue[i] <- tissue
            idx <- match(event_idx, se_annot$ID)
            res <- data.frame(GeneJunction = event,
                              ASE = "SE", 
                              ASE_IDX = idx,
                              Tissue = tissue, 
                              GeneSymbol = se_annot$geneSymbol[idx],
                              chr = se_annot$chr[idx])
        }
        if (grepl("ri_", files[i])){
            ASE[i] <- "RI"
            tissue <- gsub("ri_","", tissue, fixed = TRUE)
            Tissue[i] <- tissue
            idx <- match(event_idx, ri_annot$ID)
            res <- data.frame(GeneJunction = event,
                              ASE = "RI", 
                              ASE_IDX = idx,
                              Tissue = tissue, 
                              GeneSymbol = ri_annot$geneSymbol[idx],
                              chr = a3ss_annot$chr[idx])
        }
        gene_as <- rbind(gene_as, res)
        
    } #if has sig. events
    
} #for all files    
message ("\nDone reading significant events!\n")


Reading significant events for each splicing factor


Done reading significant events!



### Glimpse into what we have

In [151]:
glimpse(gene_as)
table(is.na(gene_as$Tissue))
colnames(gene_as)
write.table(gene_as, "../data/gene_as.tsv", quote=FALSE, sep="\t")
head(gene_as)

Rows: 7,077
Columns: 6
$ GeneJunction [3m[90m<fct>[39m[23m XIST-2253, XIST-2252, GREB1L-4933, RHCG-1776, XIST-2253,…
$ ASE          [3m[90m<fct>[39m[23m SE, SE, SE, SE, SE, SE, SE, SE, SE, SE, SE, SE, SE, SE, …
$ ASE_IDX      [3m[90m<int>[39m[23m 2253, 2252, 4933, 1776, 2253, 2252, 4819, 4818, 4820, 45…
$ Tissue       [3m[90m<fct>[39m[23m adiposubcutaneous, adiposubcutaneous, adiposubcutaneous,…
$ GeneSymbol   [3m[90m<fct>[39m[23m DLEU1, DLEU1, AKT1, MLF1, DLEU1, DLEU1, CNN2, KCNK7, CNN…
$ chr          [3m[90m<fct>[39m[23m chr13, chr13, chr14, chr3, chr13, chr13, chr19, chr11, c…



FALSE 
 7077 

Unnamed: 0_level_0,GeneJunction,ASE,ASE_IDX,Tissue,GeneSymbol,chr
Unnamed: 0_level_1,<fct>,<fct>,<int>,<fct>,<fct>,<fct>
1,XIST-2253,SE,2253,adiposubcutaneous,DLEU1,chr13
2,XIST-2252,SE,2252,adiposubcutaneous,DLEU1,chr13
3,GREB1L-4933,SE,4933,adiposubcutaneous,AKT1,chr14
4,RHCG-1776,SE,1776,adiposubcutaneous,MLF1,chr3
5,XIST-2253,SE,2253,adipovisceral_omentum,DLEU1,chr13
6,XIST-2252,SE,2252,adipovisceral_omentum,DLEU1,chr13


In [149]:
res <-   dplyr::group_by(gene_as, Tissue, GeneJunction, GeneSymbol)
res2 <- res %>%
        group_by(Tissue,ASE) %>%
        count(Tissue,GeneSymbol) %>%
        arrange(desc(n)) %>%
        as.data.frame()

res3 <- gene_as %>%
        group_by(GeneJunction, ASE) %>%
        count(GeneJunction, ASE) %>%
        arrange(desc(n)) %>% 
        as.data.frame()

head(res3,50)
#   count(Tissue, GeneSymbol) %>%
#   arrange(desc(n)) %>%
#dim(res2) 
#genes <- as.character(unique(gene_as$GeneSymbol))
#length(genes)
#genes.o <- genes[order(genes)]
#sum(length(unique_genes))
#sum(res2[res2$GeneSymbol == "XIST",]$n)

Unnamed: 0_level_0,GeneJunction,ASE,n
Unnamed: 0_level_1,<fct>,<fct>,<int>
1,XIST-10149,SE,39
2,XIST-10152,SE,39
3,XIST-10154,SE,39
4,XIST-10150,SE,38
5,XIST-2253,A3SS,36
6,XIST-2252,A3SS,35
7,XIST-10155,SE,34
8,XIST-10151,SE,32
9,KDM5C-22847,SE,27
10,ZFX-13700,SE,25


### Count splicing event by chromosome

In [91]:
res <- gene_as          %>% 
       group_by(chr)    %>% 
       count(chr)       %>% 
       arrange(desc(n)) %>% 
       as.data.frame()
res$chr <- factor(res$chr, levels = res$chr)
length(res$chr)
res
glimpse(res)
write.table(res, file= "../data/Totals_by_chr.tsv", sep="\t", quote = FALSE, row.names=F)

chr,n
<fct>,<int>
chrX,828
chr1,676
chr19,468
chr11,445
chr2,425
chr3,423
chr17,396
chr12,370
chr16,339
chr4,308


Rows: 23
Columns: 2
$ chr [3m[90m<fct>[39m[23m chrX, chr1, chr19, chr11, chr2, chr3, chr17, chr12, chr16, chr4, …
$ n   [3m[90m<int>[39m[23m 828, 676, 468, 445, 425, 423, 396, 370, 339, 308, 274, 269, 247, …


###  Count most frequent spliced genes 

In [90]:
res <- gene_as %>% 
       group_by(GeneSymbol) %>% count(GeneSymbol) %>% arrange(desc(n)) %>% as.data.frame()
res$GeneSymbol <- factor(res$GeneSymbol, levels = res$GeneSymbol)
length(res$GeneSymbol)
head(res)
write.table(res, file = "../data/Totals_by_geneSymbol.tsv", sep = "\t", quote=FALSE, row.names = F)

Unnamed: 0_level_0,GeneSymbol,n
Unnamed: 0_level_1,<fct>,<int>
1,XIST,338
2,DDX3X,129
3,KDM5C,49
4,ZFX,48
5,NLRP2,29
6,KDM6A,27


### Count most frequent splicing by tissue

In [89]:
res <- gene_as %>% 
       group_by(Tissue) %>% 
       count(Tissue) %>% 
       arrange(desc(n)) %>% 
       as.data.frame()
res$Tissue <- factor(res$Tissue, levels = res$Tissue)
length(res$Tissue)
res
write.table(res, file = "../data/Totals_by_tissue.tsv", sep = "\t", row.names = F)

Tissue,n
<fct>,<int>
breast_mammary_tissue,4373
brain_nucleus_accumbens_basal_ganglia,598
esophagus_muscularis,299
artery_aorta,183
cells_cultured_fibroblasts,134
thyroid,119
pituitary,92
adiposubcutaneous,88
spleen,88
muscle_skeletal,82


###  Significant Count by splicing type (significant == FC > 1.5 and pVal < 0.05)

Our starting values were the significant events, all meeting the criteria FC > 1.5 and pVal < 0.05


In [None]:
res <- gene_as %>% group_by(ASE) %>% count(ASE) %>% arrange(desc(n)) %>% as.data.frame()
res$ASE <- factor(res$ASE, levels = res$ASE)
head(res)
write.table(res, file= "../data/Totals_by_splicingtype.tsv")

###  Significant Count by splicing type (significant == FC > 1.5 and pVal < 0.05)

In [None]:
A3SS_keep <- as.character(gene_as$ASE) %in% "A3SS"
table(A3SS_keep)
A3SS.gene_as <- data.frame(A3SS.gene_as[A3SS_keep == TRUE,])

A5SS_keep <- as.character(gene_as$ASE) %in% "A5SS"
table(A5SS_keep)
A5SS.gene_as <- data.frame(A5SS.gene_as[A5SS_keep == TRUE,])

MXE_keep  <- as.character(gene_as$ASE) %in% "MXE"
table(MXE_keep)
MXE.gene_as <- data.frame(MXE.gene_as[MXE_keep == TRUE,])

SE_keep   <- as.character(gene_as$ASE) %in% "SE"
table(SE_keep)
SE.gene_as <- data.frame(SE.gene_as[SE_keep == TRUE,])

RI_keep   <- as.character(gene_as$ASE) %in% "RI"
table(RI_keep)
RI.gene_as <- data.frame(RI.gene_as[RI_keep == TRUE,])

dim(A3SS.gene_as)
dim(A5SS.gene_as)
dim(MXE.gene_as)
dim(SE.gene_as)
dim(RI.gene_as)


In [None]:
A3SS.res <- A3SS.gene_as %>% group_by(GeneSymbol) %>% count(GeneSymbol) %>% arrange(desc(n)) %>% as.data.frame()
A3SS.res$GeneSymbol <- factor(A3SS.res$GeneSymbol, levels = A3SS.res$GeneSymbol)
length(res$GeneSymbol)
head(res)

#A3SS.res <- A3SS.gene_as %>% group_by(GeneSymbol) %>% count(GeneSymbol) %>% arrange(desc(n)) %>% as.data.frame()
#A3SS.res$GeneSymbol <- factor(A3SS.res$GeneSymbol, levels = A3SS.res$GeneSymbol)
head(A3SS.res)

In [None]:
glimpse(data)

res2 <- data             %>% 
        group_by(Tissue) %>% 
        summarise(Total = sum(Counts)) %>% 
        arrange(desc(Total)) %>% as.data.frame()
glimpse(res2)

In [None]:
#Add number of tissues
nTissues <- rep(NA, length(res))
for (i in 1:nrow(res)) {
  df_gene <- gene_as %>% filter(GeneSymbol == res$GeneSymbol[i])
  nTissues[i] <- length(unique(df_gene$Tissue))
}
res$Tissues <- nTissues

In [None]:
# Genes with more than 10 splicing events
ggplot(res[res$n > 10, ], aes(x = GeneSymbol, y = n)) +
  geom_point(size = 4, aes(fill = Tissues, color = Tissues)) +
  theme_bw() +
  theme(axis.text.x = element_text(size=10, angle = 270, hjust = 0.0, vjust = 0.5),
        axis.text.y = element_text(size=16),
        axis.title.x = element_text(face="plain", colour="black", 
                                    size=14),
        axis.title.y = element_text(face="plain", colour="black", 
                                    size=14),
        legend.title=element_blank(),
        legend.text = element_text(face="plain", colour="black", 
                                   size=12)) +
  scale_fill_viridis_c(aesthetics = c("colour", "fill"),
                       option = "plasma",
                       limits = c(1, 30), breaks = c(10, 20, 30)) +
  ylab(paste("Number of sex-biased splicing events")) +
  xlab("Genes")

Pie chart

Tissue specific 1 tissue
Tissue group 2-5 tissues
Recurrent > 5 tissues

In [None]:
#Pie chart - Number of patients with 0, 1, >1 events
counts <- c(res %>% filter(Tissues == 1) %>% count() %>% as.numeric(),
            res %>% filter(Tissues > 1 & Tissues < 5) %>% count() %>% as.numeric(),
            res %>% filter(Tissues > 5) %>% count() %>% as.numeric())

In [None]:
# Define some colors ideal for black & white print
colors <- c("white","grey70","grey90","grey50","black")
colors <- c("grey90","grey50", "black")

Calculate the percentage for each category

In [None]:
counts_labels <- round(counts/sum(counts) * 100, 1)
counts_labels <- paste(counts_labels, "%", sep="")
counts_labels <- paste(counts_labels, counts, sep =" ")

In [None]:
# Create a pie chart with defined heading and custom colors
# and labels
pie(counts, main="", col=colors, labels=counts_labels,
    cex=1.2)
# Create a legend at the right   
legend(1.5, 0.5, c("1 tissue","2-5 tissues","> 5 tissues"), cex=1.2, 
       fill=colors)

### Count most frequent spliced chromosomes
To get an indication of which chromosome has the most frequent slicing event (regardless of type)
We create an index based upon the number of exons per chromosome.

get the annotation file, at this writing, gencode.v30.annotation.gtf
The information as to the number of exons within the chromosome may be found there

In [None]:
message("getting the annotation file used in the analysis\n")
setwd("../data")
system("wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz")
system("gunzip gencode.v30.annotation.gtf.gz")
setwd("../jupyter")
message("done\n")

Import the gencode annotation file and extract the exon information

In [None]:
# Number of splicing events normalized per chromosome
library(rtracklayer)
gencode <- import.gff("../data/gencode.v30.annotation.gtf")
exons <- gencode[ gencode$type == "exon", ]
exons <- as.data.frame(exons)

Obtain chromosomes we have splicing information for (recall we did not use chr Y in our analysis

In [None]:
all_chr <- as.character(unique(gene_as$chr))
chr_counts <- rep(0, length(all_chr))
exon_counts <- data.frame(chr = all_chr, counts = chr_counts)

In [None]:
for (i in 1:length(all_chr)) {
  chr_counts[i] <- nrow(exons[exons$seqnames == all_chr[i], ])
}
glimpse (chr_counts)

In [None]:
res <- gene_as %>% group_by(chr) %>% count(chr) %>% arrange(desc(n)) %>% as.data.frame()
glimpse(res)
res$chr <- factor(res$chr, levels = res$chr)
idx <- match(res$chr, exon_counts$chr)
res$ExonCounts <- exon_counts$counts[idx]
res$Index <- res$n / res$ExonCounts) * 1000
res_sorted <- res %>% arrange(desc(Index))

In [None]:
idx <- match(res$chr, exon_counts$chr)

In [None]:
res$ExonCounts <- exon_counts$counts[idx]

In [None]:
res$Index <- (res$n / res$ExonCounts) * 1000

In [None]:
res_sorted <- res %>% arrange(desc(Index))
res_sorted$chr <- factor(res_sorted$chr, levels = res_sorted$chr)

In [None]:
ggplot(res_sorted, aes(x = chr, y = Index, size = n)) +
  geom_point(color = "red") +
  theme_bw() +
  theme(axis.text.x = element_text(size=10, angle = 270, hjust = 0.0, vjust = 0.5),
        axis.text.y = element_text(size=16),
        axis.title.x = element_text(face="plain", colour="black", 
                                    size=14),
        axis.title.y = element_text(face="plain", colour="black", 
                                    size=14),
        legend.title=element_blank(),
        legend.text = element_text(face="plain", colour="black", 
                                   size=12)) +
  scale_fill_viridis_c(aesthetics = c("colour", "fill"),
                       option = "plasma",
                       limits = c(1, 650)) +
  ylab(paste("Normalized Number of sex-biased ASE")) +
  xlab("Chromosomes") +
  guides(size = guide_legend(title = "Number of ASE"))

In [None]:
write.table(data,       file = "../data/Significant_events.tsv", sep = "\t", row.names = F, quote = F)
write.table(res_sorted, file = "../data/SplicingIndex_chr.tsv", sep = "\t", quote = F, row.names = F)

## Metadata

For replicability and reproducibility purposes, we also print the following metadata:

1. Checksums of **'artefacts'**, files generated during the analysis and stored in the folder directory **`data`**
2. List of environment metadata, dependencies, versions of libraries using `utils::sessionInfo()` and [`devtools::session_info()`](https://devtools.r-lib.org/reference/session_info.html)

### 1. Checksums with the sha256 algorithm

In [None]:
rm (notebookid)
notebookid   = "countGenesAndEvents"
notebookid

message("Generating sha256 checksums of the file `../data/Totals_by_tissue.tsv` directory .. ")
system(paste0("cd ../data && find . -name SplicingIndex_chr.tsv -exec sha256sum {} \\;  >  ../metadata/", notebookid, "_sha256sums.txt"), intern = TRUE)
message("Done!\n")

message("Generating sha256 checksums of the file `../data/Significant_events.tsv` directory .. ")
system(paste0("cd ../data && find . -name SplicingIndex_chr.tsv -exec sha256sum {} \\;  >  ../metadata/", notebookid, "_sha256sums.txt"), intern = TRUE)
message("Done!\n")

message("Generating sha256 checksums of the file `../data/Significant_events.tsv` directory .. ")
system(paste0("cd ../data && find . -name SplicingIndex_chr.tsv -exec sha256sum {} \\;  >  ../metadata/", notebookid, "_sha256sums.txt"), intern = TRUE)
message("Done!\n")


paste0("../metadata/", notebookid, "_sha256sums.txt")

data.table::fread(paste0("../metadata/", notebookid, "_sha256sums.txt"), header = FALSE, col.names = c("sha256sum", "file"))

### 2. Libraries metadata

In [None]:
dev_session_info   <- devtools::session_info()
utils_session_info <- utils::sessionInfo()

message("Saving `devtools::session_info()` objects in ../metadata/devtools_session_info.rds  ..")
saveRDS(dev_session_info, file = paste0("../metadata/", notebookid, "_devtools_session_info.rds"))
message("Done!\n")

message("Saving `utils::sessionInfo()` objects in ../metadata/utils_session_info.rds  ..")
saveRDS(utils_session_info, file = paste0("../metadata/", notebookid ,"_utils_info.rds"))
message("Done!\n")

dev_session_info$platform
dev_session_info$packages[dev_session_info$packages$attached==TRUE, ]