# Analysis Notebook - Fisher exact test

This notebook looks at specific Genes and Gene Subcategories

In [2]:
suppressWarnings({suppressMessages({
options(warn = -1) 
library(dplyr)
library(multtest)
library(R.utils)
})})

### 1  Read in all and significant alternative splicing and differential gene expression results

The summary data captured in the now saved **all_gene_as_gene_names.tsv**, **all_genes_dge_data** and significant results captured in **gene_as.tsv** and **gene_dge.tsv**

In [3]:
results_dir  <- "../data/"
all_genes_as_data  <- read.table("../assets/all_gene_as_gene_names.tsv", header=TRUE, sep="\t",
                               skipNul=FALSE, stringsAsFactors = FALSE)
names(all_genes_as_data) <- c("GeneSymbol", "ensg")
all_genes_dge_data <- read.table("../assets/all_gene_dge_gene_names.tsv", header=TRUE, sep="\t",
                               skipNul=FALSE, stringsAsFactors = FALSE)
sig_gene_as  <- read.table(file="../data/gene_as.tsv", header=TRUE, sep="\t",
                               skipNul=FALSE, stringsAsFactors = FALSE)
sig_gene_dge  <- read.table(file="../data/gene_dge.tsv", header=TRUE, sep="\t",
                               skipNul=FALSE, stringsAsFactors = FALSE)
head(sig_gene_as,2)
head(sig_gene_dge,2)
head(all_genes_as_data,2)
head(all_genes_dge_data,2)

Unnamed: 0_level_0,GeneJunction,ASE,ASE_IDX,Tissue,counts,Display,GeneSymbol,GeneID,chr,logFC,AveExpr,t,PValue,AdjPVal,B
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,XIST-2253,A3SS,2253,adipose_subcutaneous,4,Adipose (sc),XIST,ENSG00000229807.11,chrX,-4.408605,3.196317,-36.48897,4.635568e-154,3.893877e-150,310.016
2,XIST-2252,A3SS,2252,adipose_subcutaneous,4,Adipose (sc),XIST,ENSG00000229807.11,chrX,-2.414713,3.64769,-21.92106,1.444102e-78,6.065229000000001e-75,160.0282


Unnamed: 0_level_0,Tissue,ENSG_ver,ENSG_no_ver,GeneSymbol,counts,Display,logFC,AveExpr,t,PValue,AdjPVal,B,chr
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,adipose_subcutaneous,ENSG00000176728.7,ENSG00000176728,TTTY14,765,Adipose (sc),-7.982166,-0.9288129,-139.823,0,0,1107.423,chrY
2,adipose_subcutaneous,ENSG00000231535.5,ENSG00000231535,LINC00278,765,Adipose (sc),-6.09542,-2.7765638,-126.9138,0,0,1050.366,chrY


Unnamed: 0_level_0,GeneSymbol,ensg
Unnamed: 0_level_1,<chr>,<chr>
1,A1BG,ENSG00000121410.11
2,A1CF,ENSG00000148584.15


Unnamed: 0_level_0,ENSG_ver,ENSG_no_ver,GeneSymbol,chr
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,ENSG00000183878.15,ENSG00000183878,UTY,chrY
2,ENSG00000129824.15,ENSG00000129824,RPS4Y1,chrY


### 2  Count up genes
About 250 genes were found in the splicing data but not in the gene expression data, presumably related to the different processing pipelines used. We can therefore not make an assessment of whether these genes were differentially expressed or not, and thus we remove the genes prior to further analysis.


In [9]:
# all genes identified in the gene expression data
allExpressionGenes <- unique(sort(all_genes_dge_data$GeneSymbol))
# all genes identified in the splicing data
allSplicingGenes  <- unique(sort(all_genes_as_data$GeneSymbol))
# Genes found in splicing data but not in expression data
orphanSplicingGenes <- setdiff(allSplicingGenes,allExpressionGenes)
message("All expression genes n=", length(allExpressionGenes),"; all splicing genes n=", length(allSplicingGenes), "; splicing genes not represented in expression set n=", length(orphanSplicingGenes))
correctedSplicing <- setdiff(allSplicingGenes, orphanSplicingGenes)
message("Note that we expect to find genes in the expression set that are not in the splicing set")
message("After removing the orphan splicing genes, we are left with  ", length(correctedSplicing), " genes in the splicing dataset")
universe <- allExpressionGenes

All expression genes n=41705; all splicing genes n=14694; splicing genes not represented in expression set n=256

Note that we expect to find genes in the expression set that are not in the splicing set

After removing the orphan splicing genes, we are left with  14438 genes in the splicing dataset



# Create the sets of differentially expressed/spliced genes
Note that we also need to correct the set of differentially spliced genes as above

In [14]:
sigDGEGenes <- unique(sort(sig_gene_dge$GeneSymbol))
sigASGenes  <- unique(sort(sig_gene_as$GeneSymbol))
correctedSigASGenes <- setdiff(sigASGenes, orphanSplicingGenes)
message("total AS (uncorrected) n=", length(sigASGenes), "; corrected n=", length(correctedSigASGenes))
total <- length(universe)
n_dge <- length(sigDGEGenes)
n_das <- length(correctedSigASGenes)
message("significant differentially expresssed genes: n=", n_dge, "/", total, ": ", 100*n_dge/total,"%")
message("significant differentially spliced genes: n=", n_das, "/", total, ": ", 100*n_das/total,"%")

total AS (uncorrected) n=2887; corrected n=2851

significant differentially expresssed genes: n=7417/41705: 17.7844383167486%

significant differentially spliced genes: n=2851/41705: 6.83611077808416%



In [19]:
dge_but_not_das <- setdiff(sigDGEGenes, correctedSigASGenes)
das_but_not_dge <- setdiff(correctedSigASGenes, sigDGEGenes)
dge_and_das <- intersect(sigDGEGenes, correctedSigASGenes)
neither_dge_nor_das <- setdiff(setdiff(universe,sigDGEGenes), correctedSigASGenes)
n_dge_but_not_das <- length(dge_but_not_das)
n_das_but_not_dge <- length(das_but_not_dge)
n_dge_and_das <- length(dge_and_das)
n_neither_dge_nor_das <- length(neither_dge_nor_das)
message("Differentially expressed but not differentially spliced: n=", n_dge_but_not_das, ": ", 100*n_dge_but_not_das/n_dge, "% of all DGE genes")
message("Differentially spliced but not differentially spliced: n=", n_das_but_not_dge, ": ", 100*n_das_but_not_dge/n_das, "% of all DAS genes")
message("DGE and DAS: ", n_dge_and_das,"; ", 100*n_dge_and_das/total,"% of all genes")
expected_proportion <- (n_dge/total)*(n_das/total)
message("By chance we would expect ", expected_proportion*total,", or ", 100*expected_proportion, "%")
message("Number of genes with neighther DGE nor DAS ", n_neither_dge_nor_das)

Differentially expressed but not differentially spliced: n=6270: 84.5355264931913% of all DGE genes

Differentially spliced but not differentially spliced: n=1704: 59.7685022799018% of all DAS genes

DGE and DAS: 1147; 2.75026975182832% of all genes

By chance we would expect 507.034336410502, or 1.21576390459298%

Number of genes with neighther DGE nor DAS 32883



# 2.1 Check whether the increased proportion is statistically significant

Comparing differentially expressed genes with differentially alternatively spliced:

|  	|  DGE+| DGE-|
|-	|-	|-	|
| DAS+|  a|  b|
| DAS-|  c| d|

In [21]:
a <- n_dge_and_das
b <- n_das_but_not_dge
c <- n_dge_but_not_das
d <- n_neither_dge_nor_das
m <- matrix(c(a,b,c,d), nrow=2,byrow = TRUE)
m

0,1
1147,1704
6270,32883


In [22]:
fisher.test(m)


	Fisher's Exact Test for Count Data

data:  m
p-value < 2.2e-16
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 3.256777 3.825200
sample estimates:
odds ratio 
   3.53005 


### Appendix - Metadata

For replicability and reproducibility purposes, we also print the following metadata:

1. Checksums of **'artefacts'**, files generated during the analysis and stored in the folder directory **`data`**
2. List of environment metadata, dependencies, versions of libraries using `utils::sessionInfo()` and [`devtools::session_info()`](https://devtools.r-lib.org/reference/session_info.html)

### Appendix 1. Checksums with the sha256 algorithm

In [24]:
notebookid   = "FisherExactTests"

### Appendix 2. Libraries metadata

In [25]:
dev_session_info   <- devtools::session_info()
utils_session_info <- utils::sessionInfo()

message("Saving `devtools::session_info()` objects in ../metadata/devtools_session_info.rds  ..")
saveRDS(dev_session_info, file = paste0("../metadata/", notebookid, "_devtools_session_info.rds"))
message("Done!\n")

message("Saving `utils::sessionInfo()` objects in ../metadata/utils_session_info.rds  ..")
saveRDS(utils_session_info, file = paste0("../metadata/", notebookid ,"_utils_info.rds"))
message("Done!\n")

dev_session_info$platform
dev_session_info$packages[dev_session_info$packages$attached==TRUE, ]

Saving `devtools::session_info()` objects in ../metadata/devtools_session_info.rds  ..

Done!


Saving `utils::sessionInfo()` objects in ../metadata/utils_session_info.rds  ..

Done!




 setting  value                       
 version  R version 3.6.1 (2019-07-05)
 os       Ubuntu 18.04.4 LTS          
 system   x86_64, linux-gnu           
 ui       X11                         
 language en_US.UTF-8                 
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       Etc/UTC                     
 date     2020-07-23                  

Unnamed: 0_level_0,package,ondiskversion,loadedversion,path,loadedpath,attached,is_base,date,source,md5ok,library
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<lgl>,<fct>
dplyr,dplyr,1.0.0,1.0.0,/opt/conda/lib/R/library/dplyr,/opt/conda/lib/R/library/dplyr,True,False,2020-05-29,CRAN (R 3.6.1),,/opt/conda/lib/R/library
repr,repr,1.1.0,1.1.0,/opt/conda/lib/R/library/repr,/opt/conda/lib/R/library/repr,True,False,2020-01-28,CRAN (R 3.6.3),,/opt/conda/lib/R/library
