# SplicingIndex figure

This notebook creates a single figure, **splicingIndex.pdf**, a plot showing the number of AS events per 1000 exons for each chromosome.

In [2]:
# Attach the packages used by this notebook. suppressMessages() hides the
# messages the packages emit while loading.
suppressMessages({
    # A negative `warn` makes R ignore all warnings. Because options() sets it
    # globally, this stays in effect for the rest of the session, not only
    # while the packages are being attached.
    options(warn = -1) 
    library(dplyr)
    library(devtools)
    library(ggplot2)
    library(rtracklayer)
})

### 1. Read in gene_as.tsv, the table containing all significant alternative splicing events

In [3]:
# Load the tab-separated table of significant alternative splicing events
# (first row is the header, strings stay character) and preview two rows.
filename <- "../data/gene_as.tsv"
gene_as <- read.table(
    file = filename,
    sep = "\t",
    header = TRUE,
    stringsAsFactors = FALSE,
    skipNul = FALSE
)
head(gene_as, n = 2)

Unnamed: 0_level_0,GeneJunction,ASE,ASE_IDX,Tissue,counts,Display,GeneSymbol,GeneID,chr,logFC,AveExpr,t,PValue,AdjPVal,B
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,XIST-2253,A3SS,2253,adipose_subcutaneous,3,Adipose (sc),XIST,ENSG00000229807.11,chrX,-4.410265,3.196555,-35.99306,1.037929e-151,7.98686e-148,303.0562
2,XIST-2252,A3SS,2252,adipose_subcutaneous,3,Adipose (sc),XIST,ENSG00000229807.11,chrX,-2.409615,3.647928,-21.3246,1.906043e-75,7.3335e-72,152.3585


### 2. Count most frequently spliced chromosomes
To get an indication of which chromosome has the most frequent splicing events (regardless of type),
we create an index normalized by the number of exons per chromosome.

Get the annotation file (at this writing, gencode.v30.annotation.gtf).
The number of exons on each chromosome can be found there.

In [4]:
# Fetch the GENCODE v30 annotation on first use, then import it.
#
# The cache check targets the *uncompressed* GTF: gunzip removes the .gz once
# it has been extracted, so testing for the archive would trigger a fresh
# download (and a gunzip onto an existing file) on every rerun.
gtf_path <- "../data/gencode.v30.annotation.gtf"
gz_path <- paste0(gtf_path, ".gz")

if (!file.exists(gtf_path)) {
    # Reuse an archive that is already on disk; download it otherwise.
    if (!file.exists(gz_path)) {
        message("downloading gencode v30 annotation\n")
        wget_status <- system("wget -O ../data/gencode.v30.annotation.gtf.gz ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz")
        if (wget_status != 0) {
            # wget -O leaves a (possibly empty) file behind on failure; remove
            # it so the next run does not mistake it for a valid archive.
            unlink(gz_path)
            stop("wget failed with exit status ", wget_status,
                 " while downloading ", gz_path, call. = FALSE)
        }
        message("Done!\n")
    }
    message("Unzipping compressed file gencode.v30.annotation.gtf.gz..")
    gunzip_status <- system("gunzip ../data/gencode.v30.annotation.gtf.gz")
    if (gunzip_status != 0) {
        stop("gunzip failed with exit status ", gunzip_status,
             " while extracting ", gz_path, call. = FALSE)
    }
    message("Done! gencode.v30.annotation.gtf can be found in ../data/")
}
gencode <- rtracklayer::import("../data/gencode.v30.annotation.gtf")

In [5]:
# Keep only the exon records of the GENCODE annotation, as a data frame.
exons <- gencode[gencode$type == "exon", ]
exons <- as.data.frame(exons)

# Chromosomes we have splicing information for (recall we did not use chr Y
# in our analysis), in order of first appearance in gene_as.
all_chr <- as.character(unique(gene_as$chr))

# Number of exon records per chromosome, tallied in a single pass.
# match() maps every exon to its position in all_chr (NA when the chromosome
# is not in all_chr) and tabulate() counts per position, ignoring the NAs.
# Unlike a `for (i in 1:length(all_chr))` loop this is well defined for an
# empty all_chr, and it avoids subsetting the exon table once per chromosome.
# as.numeric() keeps the counts as doubles, as before.
chr_counts <- as.numeric(
  tabulate(match(as.character(exons$seqnames), all_chr), nbins = length(all_chr))
)

exon_counts <- data.frame(chr = all_chr, counts = chr_counts)

# Count most frequent spliced chromosomes: one row per chromosome with the
# number of significant events `n`, most frequent first.
res <- gene_as %>%
  count(chr) %>%
  arrange(desc(n)) %>%
  as.data.frame()
res$chr <- factor(res$chr, levels = res$chr)

# Look up each chromosome's exon count and express the events per 1000 exons.
idx <- match(res$chr, exon_counts$chr)
res$ExonCounts <- exon_counts$counts[idx]
res$Index <- (res$n / res$ExonCounts) * 1000

# Order by decreasing index; resetting the factor levels to the sorted order
# makes ggplot draw the chromosomes in that order on a discrete axis.
res_sorted <- res %>% arrange(desc(Index))
res_sorted$chr <- factor(res_sorted$chr, levels = res_sorted$chr)
glimpse(res_sorted)
message("res_sorted has ", nrow(res_sorted), " chromosomes with data about events, exons, and the splicing index")

Rows: 23
Columns: 4
$ chr        <fct> chrX, chr19, chr22, chr4, chr1, chr16, chr11, chr17, chr6,…
$ n          <int> 586, 336, 128, 207, 460, 236, 292, 267, 194, 158, 242, 249…
$ ExonCounts <dbl> 40029, 74466, 28655, 50420, 118996, 61199, 75976, 78291, 5…
$ Index      <dbl> 14.639386, 4.512126, 4.466934, 4.105514, 3.865676, 3.85627…


res_sorted has 23 chromosomes with data about events, exons, and the splicing index



In [None]:
# Named hex colours, defined once so the figures use them uniformly.
darkblue <- "#3c5488"
red <- "#e64b35"
nearlyblack <- "#040C04"
purple <- "#790079"
orange <- "#ff9900"

In [None]:
# Splicing-index figure: one point per chromosome (x), placed at its number of
# events per 1000 exons (y) and sized by the raw number of events (n).
g2 <- ggplot(res_sorted, aes(x = chr, y = Index, size = n)) +
  geom_point(color = darkblue) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 16, angle = 270, hjust = 0.0, vjust = 0.5),
        axis.text.y = element_text(size = 32),
        axis.title.x = element_blank(),
        axis.title.y = element_text(face = "plain", colour = "black",
                                    size = 18),
        legend.title = element_text(face = "plain", colour = "black",
                                    size = 18),
        legend.text = element_text(face = "plain", colour = "black",
                                   size = 32),
        legend.position = c(0.8, 0.7)) +
  # NOTE(review): no colour or fill aesthetic is mapped in aes() (the point
  # colour is the constant `darkblue`), so this scale appears to have no
  # visible effect -- confirm and remove if so.
  scale_fill_viridis_c(aesthetics = c("colour", "fill"),
                       option = "plasma",
                       limits = c(1, 650)) +
  ylab("Normalized number of sex-biased ASE") +
  guides(size = guide_legend(title = "Unnormalized\nnumber of\nsex-biased\nASE per\nchromosome"))

# Make sure the output directory exists so ggsave() has somewhere to write;
# dir.create() is a silent no-op when it is already there.
dir.create("../pdf", showWarnings = FALSE, recursive = TRUE)
ggsave("../pdf/splicingIndex.pdf", g2)
message("Saved plot as ../pdf/splicingIndex.pdf")
g2

### Appendix - Metadata

For replicability and reproducibility purposes, we also print the following metadata:

1. Checksums of **'artefacts'**, files generated during the analysis and stored in the **`data`** directory
2. List of environment metadata, dependencies, versions of libraries using `utils::sessionInfo()` and [`devtools::session_info()`](https://devtools.r-lib.org/reference/session_info.html)

### Appendix 1. Checksums with the sha256 algorithm

In [7]:
# Identifier used as the file-name prefix for this notebook's metadata files.
notebookid <- "create_SplicingIndex_figure"

### Appendix 2. Libraries metadata

In [8]:
# Capture the session description from both devtools and base R (utils).
dev_session_info   <- devtools::session_info()
utils_session_info <- utils::sessionInfo()

# Build each output path once so the status messages report exactly the file
# that is written (the paths are prefixed with the notebook id).
dev_info_path   <- paste0("../metadata/", notebookid, "_devtools_session_info.rds")
utils_info_path <- paste0("../metadata/", notebookid, "_utils_info.rds")

# saveRDS() does not create directories; make sure the target exists.
dir.create("../metadata", showWarnings = FALSE, recursive = TRUE)

message("Saving `devtools::session_info()` objects in ", dev_info_path, "  ..")
saveRDS(dev_session_info, file = dev_info_path)
message("Done!\n")

message("Saving `utils::sessionInfo()` objects in ", utils_info_path, "  ..")
saveRDS(utils_session_info, file = utils_info_path)
message("Done!\n")

# Display the platform summary and the packages attached to the search path.
dev_session_info$platform
dev_session_info$packages[dev_session_info$packages$attached == TRUE, ]

Saving `devtools::session_info()` objects in ../metadata/devtools_session_info.rds  ..

Done!


Saving `utils::sessionInfo()` objects in ../metadata/utils_session_info.rds  ..

Done!




 setting  value                       
 version  R version 3.6.1 (2019-07-05)
 os       Ubuntu 18.04.4 LTS          
 system   x86_64, linux-gnu           
 ui       X11                         
 language en_US.UTF-8                 
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       Etc/UTC                     
 date     2020-09-02                  

Unnamed: 0_level_0,package,ondiskversion,loadedversion,path,loadedpath,attached,is_base,date,source,md5ok,library
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<lgl>,<fct>
BiocGenerics,BiocGenerics,0.32.0,0.32.0,/opt/conda/lib/R/library/BiocGenerics,/opt/conda/lib/R/library/BiocGenerics,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
devtools,devtools,2.3.0,2.3.0,/opt/conda/lib/R/library/devtools,/opt/conda/lib/R/library/devtools,True,False,2020-04-10,CRAN (R 3.6.1),,/opt/conda/lib/R/library
dplyr,dplyr,0.8.5,0.8.5,/opt/conda/lib/R/library/dplyr,/opt/conda/lib/R/library/dplyr,True,False,2020-03-07,CRAN (R 3.6.1),,/opt/conda/lib/R/library
GenomeInfoDb,GenomeInfoDb,1.22.0,1.22.0,/opt/conda/lib/R/library/GenomeInfoDb,/opt/conda/lib/R/library/GenomeInfoDb,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
GenomicRanges,GenomicRanges,1.38.0,1.38.0,/opt/conda/lib/R/library/GenomicRanges,/opt/conda/lib/R/library/GenomicRanges,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
ggplot2,ggplot2,3.3.0,3.3.0,/opt/conda/lib/R/library/ggplot2,/opt/conda/lib/R/library/ggplot2,True,False,2020-03-05,CRAN (R 3.6.1),,/opt/conda/lib/R/library
IRanges,IRanges,2.20.0,2.20.0,/opt/conda/lib/R/library/IRanges,/opt/conda/lib/R/library/IRanges,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
repr,repr,1.1.0,1.1.0,/opt/conda/lib/R/library/repr,/opt/conda/lib/R/library/repr,True,False,2020-01-28,CRAN (R 3.6.3),,/opt/conda/lib/R/library
rtracklayer,rtracklayer,1.46.0,1.46.0,/opt/conda/lib/R/library/rtracklayer,/opt/conda/lib/R/library/rtracklayer,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
S4Vectors,S4Vectors,0.24.0,0.24.0,/opt/conda/lib/R/library/S4Vectors,/opt/conda/lib/R/library/S4Vectors,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
