# Analysis Notebook - create all DGE files

Creating and saving two files in this notebook

 **1. chr_genes.tsv:** create a file with chromosome, ENSG (no version number) and GeneSymbols using gencode.v30.annotation.gtf

 **2. all_gene_dge.tsv:** create this file using the chr_genes from gencode.v30.annotation for the specific ENSG ids that are used in the differential gene analysis

In [None]:
# Attach the packages this notebook relies on without cluttering the output:
# options() hands back the previous settings, so the old warning level is
# captured in the same call that silences warnings for this cell.
defaultW <- options(warn = -1)[["warn"]]

library(dplyr)
library(rtracklayer)

# Put the warning level back to what it was before this cell ran.
options(warn = defaultW)

### 1  Build the all_gene_dge.tsv structure

First gather the data and add GeneSymbol, ENSG without version and chromosome

### 1.1 create a file used for statistical analysis of DGE genes
All the tissues used the same list of genes for the differential gene analysis, so reading any of the files allows these ENSG ids to be mapped to GeneSymbols and chromosomes using the gencode.v30.annotation file.

In [None]:
#
# add chr information for summary data later, use the annotation we used for rMATS
#
# Builds `chr_genes`: one row per distinct (chromosome, gene symbol, gene id)
# combination found in the GENCODE v30 GTF, with columns
#   chr        - sequence name taken from the GTF
#   GeneSymbol - gene_name attribute
#   ENSG       - gene_id attribute with the trailing ".<version>" removed
#
gtf_path    <- "../data/gencode.v30.annotation.gtf"
gtf_gz_path <- paste0(gtf_path, ".gz")

# gunzip deletes the .gz once it has been expanded, so the presence of the
# *uncompressed* file is what tells us the annotation is already available.
if (!file.exists(gtf_path)) {
    if (!file.exists(gtf_gz_path)) {
        message("downloading gencode v30 annotation\n")
        wget_status <- system(paste0("wget -O ", gtf_gz_path, " ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz"))
        if (wget_status != 0) {
            # `wget -O` leaves an empty or partial file behind on failure
            unlink(gtf_gz_path)
            stop("download of gencode.v30.annotation.gtf.gz failed (wget exit status ",
                 wget_status, ")", call. = FALSE)
        }
        message("Done!\n")
    }
    message("Unzipping compressed file gencode.v30.annotation.gtf.gz..")
    gunzip_status <- system(paste0("gunzip ", gtf_gz_path))
    if (gunzip_status != 0) {
        stop("gunzip of gencode.v30.annotation.gtf.gz failed (exit status ",
             gunzip_status, ")", call. = FALSE)
    }
    message("Done! gencode.v30.annotation.gtf can be found in ../data/")
}

gencode <- import(gtf_path)
gtf.df  <- as.data.frame(gencode)

chr_genes <- unique(gtf.df[, c("seqnames", "gene_name", "gene_id")])
colnames(chr_genes) <- c("chr", "GeneSymbol", "ENSG")
head(chr_genes)

# Strip the trailing ".<version>" from every gene id in one vectorised call
# (same regular expression as before, applied to the whole column at once).
chr_genes$ENSG <- sub("\\.\\w+$", "", as.character(chr_genes$ENSG))
head(chr_genes)

In [None]:
# Persist the chromosome / gene symbol / ENSG lookup table as a tab-separated
# file (unquoted; row names are written, as per the write.table defaults).
write.table(x = chr_genes,
            file = "../data/chr_genes.tsv",
            sep = "\t",
            quote = FALSE)

### 1.2 Create the all_gene_dge.tsv file for analysis

All of the **DGE.csv** tissue files have the same gene names

In [None]:
# Collect the per-tissue differential gene expression result files.
results_dir     <- "../data/"
# list.files() interprets `pattern` as a regular expression, not a shell glob.
# glob2rx() turns the glob into the anchored regex "^.*_DGE\\.csv$", so only
# file names that really end in "_DGE.csv" are picked up.
all_dge_pattern <- utils::glob2rx("*_DGE.csv")
all_dge_files   <- list.files(path = results_dir, pattern = all_dge_pattern)
message ("number of DGE files ", length(all_dge_files))

In [None]:
# Start from an empty data frame; the per-file gene annotations are appended
# to it further down.
all_gene_dge <- data.frame()

In [None]:
# Annotate the gene ids of every DGE result file with chromosome and gene
# symbol, and append the annotations to `all_gene_dge`.
#
# Reads : all_dge_files, results_dir, and chr_genes (columns chr, GeneSymbol,
#         ENSG).
# Writes: all_gene_dge gains one row per gene per file, with the columns
#         ENSG_ver, ENSG_no_ver, GeneSymbol and chr.
#
# The gene ids are taken from the row names of each table as read.table()
# returns them. Ids with no entry in chr_genes keep the placeholder string
# "NA" for both GeneSymbol and chr.
per_file_annotations <- vector("list", length(all_dge_files))

# seq_along() yields an empty sequence when no file was found; 1:length()
# would iterate over c(1, 0) and try to read a non-existent file.
for (file in seq_along(all_dge_files)) {

    lines <- read.table(file = paste0(results_dir, all_dge_files[file]),
                        header = TRUE, sep = ",", quote = "\"'", skipNul = FALSE)
    message("For   ", all_dge_files[file])
    # nrow(), not dim(): message() would paste rows and columns together
    message("we find the number of genes to be ", nrow(lines))

    if (nrow(lines) > 0) {
        ensg_ver    <- as.character(rownames(lines))
        ensg_no_ver <- sub("\\.\\w+$", "", ensg_ver)

        # match() gives, for every id, the index of its FIRST occurrence in
        # chr_genes (NA when absent) - i.e. "if there are multiple matches,
        # just keep the first result", for all genes in a single pass.
        hit <- match(ensg_no_ver, as.character(chr_genes$ENSG))

        chr        <- as.character(chr_genes$chr)[hit]
        ensg_genes <- as.character(chr_genes$GeneSymbol)[hit]

        # keep the "NA" string placeholder for ids without annotation
        chr[is.na(hit)]        <- "NA"
        ensg_genes[is.na(hit)] <- "NA"

        # Columns are named with `=`. Writing `chr <- chr[i]` inside
        # data.frame() would assign in this environment and overwrite the
        # vectors above instead of naming the columns.
        per_file_annotations[[file]] <- data.frame(ENSG_ver    = ensg_ver,
                                                   ENSG_no_ver = ensg_no_ver,
                                                   GeneSymbol  = ensg_genes,
                                                   chr         = chr,
                                                   stringsAsFactors = FALSE)
    } # if has events

} # for all files

# Bind once at the end instead of growing the data frame row by row.
all_gene_dge <- rbind(all_gene_dge, do.call(rbind, per_file_annotations))

In [None]:
# Name the columns, sort by versioned ENSG id and drop duplicated rows (every
# tissue file contributes the same genes, so most rows occur repeatedly).
colnames(all_gene_dge) <- c("ENSG_ver", "ENSG_no_ver", "GeneSymbol", "chr")

# Extract the column with [[ ]] so that order() receives a plain vector;
# passing the one-column data frame all_gene_dge["ENSG_ver"] makes order()
# fall back to xtfrm() on a data frame, which recent R versions warn about.
sorted_all_gene_dge <- all_gene_dge[order(all_gene_dge[["ENSG_ver"]]), ]
unique_all_gene_dge <- unique(sorted_all_gene_dge)

In [None]:
# Report how many genes were collected.
# The first figure is the number of rows left after de-duplication.
message("The universe of all genes (without ChrY) is ", length(unique_all_gene_dge$GeneSymbol))

# Count distinct gene symbols directly. summary(as.factor(x), maxsum = 50000)
# lumps everything beyond the cap into "(Other)", so its length can never
# exceed 50000 and would silently under-report a larger gene set.
n_unique_genes <- length(unique(all_gene_dge$GeneSymbol))
message("We extracted a total of ",nrow(all_gene_dge)," differential gene events (all_gene_dge)")
message("This includes ", n_unique_genes, " total genes")

In [None]:
# Show how many of the de-duplicated genes fall on each chromosome, then save
# the table as a tab-separated file (unquoted, row names included).
table(unique_all_gene_dge[["chr"]])
write.table(x = unique_all_gene_dge,
            file = "../data/all_gene_dge.tsv",
            sep = "\t",
            quote = FALSE)

### Appendix - Metadata

For replicability and reproducibility purposes, we also print the following metadata:

1. Checksums of **'artefacts'**, files generated during the analysis and stored in the folder directory **`data`**
2. List of environment metadata, dependencies, versions of libraries using `utils::sessionInfo()` and [`devtools::session_info()`](https://devtools.r-lib.org/reference/session_info.html)

### Appendix 1. Checksums with the sha256 algorithm

In [None]:
# Record sha256 checksums of the two files written by this notebook in
# ../metadata/<notebookid>_sha256sums.txt.

# rm() warns when the object does not exist (e.g. on a fresh kernel), so only
# remove a previous value if there is one.
if (exists("notebookid", inherits = FALSE)) {
    rm(notebookid)
}
notebookid <- "createAllgeneDGE"
notebookid

# Path is relative to ../data because each command below starts with `cd ../data`.
checksum_file <- paste0("../metadata/", notebookid, "_sha256sums.txt")

# The gene table is saved as all_gene_dge.tsv, so that is the file to hash.
message("Generating sha256 checksums of the file `../data/all_gene_dge.tsv` directory .. ")
system(paste0("cd ../data && find . -name all_gene_dge.tsv -exec sha256sum {} \\;  >  ", checksum_file), intern = TRUE)
message("Done!\n")

# `>>` appends to the checksum file; a second `>` would truncate it and
# discard the checksum written just above.
message("Generating sha256 checksums of the file `../data/chr_genes.tsv` directory .. ")
system(paste0("cd ../data && find . -name chr_genes.tsv -exec sha256sum {} \\;  >>  ", checksum_file), intern = TRUE)
message("Done!\n")

### Appendix 2. Libraries metadata

In [None]:
# Capture the session metadata twice: once through devtools and once through
# utils. Each snapshot is stored as an .rds file under ../metadata/, with the
# notebook id as the file name prefix.
dev_session_info   <- devtools::session_info()
utils_session_info <- utils::sessionInfo()

metadata_prefix <- paste0("../metadata/", notebookid)

message("Saving `devtools::session_info()` objects in ../metadata/devtools_session_info.rds  ..")
saveRDS(object = dev_session_info,
        file = paste0(metadata_prefix, "_devtools_session_info.rds"))
message("Done!\n")

message("Saving `utils::sessionInfo()` objects in ../metadata/utils_session_info.rds  ..")
saveRDS(object = utils_session_info,
        file = paste0(metadata_prefix, "_utils_info.rds"))
message("Done!\n")

# Display the platform summary followed by the attached packages only.
dev_session_info$platform
session_packages <- dev_session_info$packages
session_packages[session_packages$attached == TRUE, ]