# Analysis Notebook - create all DGE files

Creating and saving two files in this notebook

 **1. chr_genes.tsv:** create a file with chromosome, ENSG (no version number) and GeneSymbols using gencode.v30.annotation.gtf

 **2. all_gene_dge_gene_names.tsv:** create this file using the chr_genes from gencode.v30.annotation for the specific ENSG ids that are used in the differential gene analysis

In [1]:
defaultW <- getOption("warn")  # suppress warnings for this cell
options(warn = -1) 
library(dplyr)
library(rtracklayer)

options(warn = defaultW)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, g

### 1  Add to the all_gene_dge_names.tsv structure

First gather the data and add GeneSymbol, ENSG without version and chromosome

### 1.1 create a file used for statistical analysis of DGE genes
all the tissues used the same listing of genes for the differential gene analysis -- so reading anly of the files will allow these ENSG files to be mapped to GeneSymbols and Chromosomes using the gencode.v30.annotation file

In [2]:
#
# add chr information for summary data later, use the annotation we used for rMATS
#
if (!("gencode.v30.annotation.gtf.gz" %in% list.files("../data/"))) {
    message("downloading gencode v30 annotation\n")
    system("wget -O ../data/gencode.v30.annotation.gtf.gz ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz")
    message("Done!\n")
    message("Unzipping compressed file gencode.v30.annotation.gtf.gz..")
    system("gunzip ../data/gencode.v30.annotation.gtf.gz", intern = TRUE)
    message("Done! gencode.v30.annotation.gtf can be found in ../data/")
}
gencode <- import("../data/gencode.v30.annotation.gtf")
gtf.df <- as.data.frame (gencode)
chr_genes <- unique(gtf.df[,c("seqnames","gene_name","gene_id")])
colnames(chr_genes) <- c("chr","GeneSymbol", "ENSG")
head(chr_genes)
for (i in 1:dim(chr_genes)[1]) {
    chr_genes$ENSG[i] <- as.character(strsplit(chr_genes$ENSG[i],'\\.\\w+$'))
}
head(chr_genes)

Unnamed: 0_level_0,chr,GeneSymbol,ENSG
Unnamed: 0_level_1,<fct>,<chr>,<chr>
1,chr1,DDX11L1,ENSG00000223972.5
13,chr1,WASH7P,ENSG00000227232.5
26,chr1,MIR6859-1,ENSG00000278267.1
29,chr1,MIR1302-2HG,ENSG00000243485.5
37,chr1,MIR1302-2,ENSG00000284332.1
40,chr1,FAM138A,ENSG00000237613.2


Unnamed: 0_level_0,chr,GeneSymbol,ENSG
Unnamed: 0_level_1,<fct>,<chr>,<chr>
1,chr1,DDX11L1,ENSG00000223972
13,chr1,WASH7P,ENSG00000227232
26,chr1,MIR6859-1,ENSG00000278267
29,chr1,MIR1302-2HG,ENSG00000243485
37,chr1,MIR1302-2,ENSG00000284332
40,chr1,FAM138A,ENSG00000237613


In [3]:
write.table(chr_genes, "../data/chr_genes.tsv", quote=FALSE, sep="\t")

### 1.2 Create the all_genes_dge_names.tsv file for analysis

All of the **DGE.csv** tissue files have the same gene names

In [4]:
results_dir     <- "../data/"
all_dge_pattern <- "whole_blood_DGE.csv"
all_dge_file    <- list.files(path = results_dir, pattern = all_dge_pattern)
head(all_dge_file)

In [5]:
lines  <- read.table(file=paste0(results_dir, all_dge_file), 
                                     header = TRUE, sep = ",", quote = "\"'", skipNul = FALSE)
dim(lines)
head(lines,2)

Unnamed: 0_level_0,logFC,AveExpr,t,P.Value,adj.P.Val,B
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000183878.15,-8.901997,1.186565,-105.6978,0,0,986.5619
ENSG00000129824.15,-8.928976,3.892373,-103.1027,0,0,982.0146


In [6]:
    all_gene_dge = data.frame()

    if (dim(lines)[1] > 0) {
        ensg_ver   <- as.vector(as.character(rownames(lines)))
        chr        <- rep("NA",dim(lines)[1])
        ensg_no_ver<- rep("NA",dim(lines)[1])
        ensg_genes <- rep("NA",dim(lines)[1])
        for (i in 1:dim(lines)[1]) {
            ensg_no_ver[i] <- as.character(strsplit(ensg_ver[i],'\\.\\w+$'))
            match  <- as.character(chr_genes$ENSG) %in% as.character((ensg_no_ver[i]))
            if (sum(match==TRUE)== 1) {
                chr[i]        <- as.character(chr_genes[match,]$chr)
                ensg_genes[i] <- as.character(chr_genes[match,]$GeneSymbol)
            } else if (sum(match==TRUE)>1) {
                all_chr <- as.vector(as.character(chr_genes[match,]$chr))
                chr[i] <- as.character(all_chr[1])
                all_genes <- as.vector(as.character(chr_genes[match,]$GeneSymbol))
                ensg_genes[i] <- as.character(all_genes[1]) 
            }
            res <- data.frame(ENSG_ver     <- ensg_ver[i],
                              ENSG_no_ver  <- ensg_no_ver[i],
                              GeneSymbol   <- ensg_genes[i],
                              chr          <- chr[i])
            colnames(res) <- c("ENSG_ver","ENSG_no_ver","GeneSymbol","chr")
            
            all_gene_dge <- rbind(all_gene_dge, res)
            
        }
    } #if has events
    n_unique_genes <- length(summary(as.factor(all_gene_dge$GeneSymbol),maxsum=50000))
    message("We extracted a total of ",nrow(all_gene_dge)," differential gene events (all_gene_dge)")
    message("This includes ", n_unique_genes, " total genes")


We extracted a total of 42271 differential gene events (all_gene_dge)

This includes 41706 total genes



In [7]:
head(all_gene_dge)
table(all_gene_dge$chr)
write.table(all_gene_dge, "../data/all_gene_dge_gene_names.tsv", quote=FALSE, sep="\t")

Unnamed: 0_level_0,ENSG_ver,ENSG_no_ver,GeneSymbol,chr
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>
1,ENSG00000183878.15,ENSG00000183878,UTY,chrY
2,ENSG00000129824.15,ENSG00000129824,RPS4Y1,chrY
3,ENSG00000233864.7,ENSG00000233864,,
4,ENSG00000012817.15,ENSG00000012817,KDM5D,chrY
5,ENSG00000231535.5,ENSG00000231535,LINC00278,chrY
6,ENSG00000215580.10,ENSG00000215580,BCORP1,chrY



 chrY  chrX chr20  chr8  chr1 chr19  chr6 chr11 chr15 chr14 chr10  chr9  chr4 
  104  1419  1047  1597  3931  2359  2118  2279  1528  1510  1628  1618  1699 
 chr3  chr5  chr7  chr2 chr12 chr17 chr16 chr18 chr13 chr22  chrM chr21 
 2267  2029  2096  2875  2252  2371  1858   845   899  1026    37   544 

### Appendix - Metadata

For replicability and reproducibility purposes, we also print the following metadata:

1. Checksums of **'artefacts'**, files generated during the analysis and stored in the folder directory **`data`**
2. List of environment metadata, dependencies, versions of libraries using `utils::sessionInfo()` and [`devtools::session_info()`](https://devtools.r-lib.org/reference/session_info.html)

### Appendix 1. Checksums with the sha256 algorithm

In [8]:
rm (notebookid)
notebookid   = "createAllgeneDGE"
notebookid

message("Generating sha256 checksums of the file `../data/all_gene_dge_gene_names.tsv` directory .. ")
system(paste0("cd ../data && find . -name all_gene_dge_gene_names.tsv -exec sha256sum {} \\;  >  ../metadata/", notebookid, "_sha256sums.txt"), intern = TRUE)
message("Done!\n")

message("Generating sha256 checksums of the file `../data/chr_genes.tsv` directory .. ")
system(paste0("cd ../data && find . -name chr_genes.tsv -exec sha256sum {} \\;  >  ../metadata/", notebookid, "_sha256sums.txt"), intern = TRUE)
message("Done!\n")

“object 'notebookid' not found”


Generating sha256 checksums of the file `../data/all_gene_dge_gene_names.tsv` directory .. 



Done!


Generating sha256 checksums of the file `../data/chr_genes.tsv` directory .. 



Done!




### Appendix 2. Libraries metadata

In [9]:
dev_session_info   <- devtools::session_info()
utils_session_info <- utils::sessionInfo()

message("Saving `devtools::session_info()` objects in ../metadata/devtools_session_info.rds  ..")
saveRDS(dev_session_info, file = paste0("../metadata/", notebookid, "_devtools_session_info.rds"))
message("Done!\n")

message("Saving `utils::sessionInfo()` objects in ../metadata/utils_session_info.rds  ..")
saveRDS(utils_session_info, file = paste0("../metadata/", notebookid ,"_utils_info.rds"))
message("Done!\n")

dev_session_info$platform
dev_session_info$packages[dev_session_info$packages$attached==TRUE, ]

Saving `devtools::session_info()` objects in ../metadata/devtools_session_info.rds  ..

Done!


Saving `utils::sessionInfo()` objects in ../metadata/utils_session_info.rds  ..

Done!




 setting  value                       
 version  R version 3.6.2 (2019-12-12)
 os       Ubuntu 18.04.3 LTS          
 system   x86_64, linux-gnu           
 ui       X11                         
 language en_US.UTF-8                 
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       Etc/UTC                     
 date     2020-06-21                  

Unnamed: 0_level_0,package,ondiskversion,loadedversion,path,loadedpath,attached,is_base,date,source,md5ok,library
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<chr>,<chr>,<lgl>,<fct>
BiocGenerics,BiocGenerics,0.32.0,0.32.0,/opt/conda/lib/R/library/BiocGenerics,/opt/conda/lib/R/library/BiocGenerics,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
dplyr,dplyr,0.8.4,0.8.4,/opt/conda/lib/R/library/dplyr,/opt/conda/lib/R/library/dplyr,True,False,2020-01-31,CRAN (R 3.6.2),,/opt/conda/lib/R/library
GenomeInfoDb,GenomeInfoDb,1.22.0,1.22.0,/opt/conda/lib/R/library/GenomeInfoDb,/opt/conda/lib/R/library/GenomeInfoDb,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
GenomicRanges,GenomicRanges,1.38.0,1.38.0,/opt/conda/lib/R/library/GenomicRanges,/opt/conda/lib/R/library/GenomicRanges,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
IRanges,IRanges,2.20.0,2.20.0,/opt/conda/lib/R/library/IRanges,/opt/conda/lib/R/library/IRanges,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
rtracklayer,rtracklayer,1.46.0,1.46.0,/opt/conda/lib/R/library/rtracklayer,/opt/conda/lib/R/library/rtracklayer,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
S4Vectors,S4Vectors,0.24.0,0.24.0,/opt/conda/lib/R/library/S4Vectors,/opt/conda/lib/R/library/S4Vectors,True,False,2019-10-29,Bioconductor,,/opt/conda/lib/R/library
