# Figure 1C - YARN Normalization Version

A `heatplot` representing the similarity in fold-changes between male and female samples; each value in the heatmap is the correlation between the fold-change vectors of a pair of tissues.

We downloaded the GTEx version 8.0 RNA-seq and genotype data (phs000424.v8.v2), released 2019-08-26.
We used YARN (https://bioconductor.org/packages/release/bioc/html/yarn.html), with an updated version of its downloadGTEx function (`downloadGTExV8`)
to download this release, and used it to perform quality control, gene filtering and normalization pre-processing on the
GTEx RNA-seq data, as described in (Paulson et al, 2017). This pipeline tests for sample sex-misidentification,
merges related sub-tissues, and performs tissue-aware normalization using qsmooth (Hicks et al, 2017).

## Loading dependencies

In [None]:
library(downloader)
library(readr)
library(edgeR)
library(biomaRt)
library(DBI) # v >= 1.1.0 required for biomaRt
library(devtools)
library(yarn)
library(statmod)
Sys.setenv(TAR = "/bin/tar") # for gzfile

Begin here if you have already run this and created the `data/gtex.rds` file

Please `git clone` the repository and start working after changing to this as your working directory (`cd lifebitCloudOSDRE`). The `data` subdirectory, along with all other paths used in this Notebook are relative to the `lifebitCloudOSDRE` repository.

In [None]:
# NOTE TO USERS:
#
# Run either option 1 or option 2 (not both).
#
# Option 1. Download the data with yarn and save it to ../data/gtex.rds.
#    CAUTION! It requires some minutes to complete, and also enough memory
#    and storage.
#
obj <- yarn::downloadGTExV8(type='genes',file='../data/gtex.rds')
saveRDS(obj, file = "../data/gtex.rds")
#
# or
#
# Option 2. Reuse the already generated gtex.rds from its release location.
#
#    Launch a terminal window and change to the 'data' subdirectory:
#
#    cd lifebitCloudOSDRE/data
#
#    then download the file with this command:
#
#    wget https://github.com/lifebit-ai/lifebitCloudOSDREgtex/releases/download/gtex_archive/gtex.rds
#
#    Finally, come back to this R code, comment out the two option 1 lines
#    above, and uncomment this line:
#
#    obj <- readRDS("../data/gtex.rds")

In [None]:
# Confirm that it is an expression set.
# and check the dimensions of the objects, and the phenotype information of the objects
class(obj)
dim(phenoData(obj))
dim(obj)

In [None]:
# The phenotype data is larger than the expression data (reported to
# Joe Paulson). In the meantime, restrict the phenotype data to the samples
# that are present in the expression data so that the two sets agree.
sample_names <- as.character(colnames(exprs(obj)))
head(sample_names)
length(sample_names)

pheno_sample_names <- as.character(rownames(pData(obj)))
head(pheno_sample_names)
length(pheno_sample_names)

# TRUE for every phenotype row whose sample name also occurs in the expression data
logical_match_names <- is.element(pheno_sample_names, sample_names)
length(logical_match_names)
table(logical_match_names)
pData(obj) <- pData(obj)[logical_match_names, ]
dim(pData(obj))
dim(obj)

In [None]:
#  Now we want to replace all *dashes* with *periods* in the sample IDs

In [None]:
newSampID <- gsub('-','\\.',pData(obj)$SAMPID)

In [None]:
head (newSampID)

In [None]:
pData(obj)$SAMPID <- newSampID

In [None]:
colnames(pData(obj))

In [None]:
head(exprs(obj))

In [None]:
#  Now let us do the differential analysis - using EdgeR

In [None]:
x <- exprs(obj)

In [None]:
dim(x)

In [None]:
# To use the DGEList function from edgeR, the counts matrix (x) must have one column per sample, so that the
# length of group is equal to the number of columns in our counts (x). No transpose is performed here.
# You will get an error in DGEList (counts = x, group = group) if the length of group is not equal to the number of columns in counts

In [None]:
group <- factor(pData(obj)$SEX)

In [None]:
table(group)

In [None]:
#caution this step takes a lot of memory and time - so do one of two things again:
#
# 1. You can either run this step from scratch and save the rds object for next time.
#
#y <- DGEList(counts=x, group=group)
#y <- calcNormFactors(y)
#saveRDS(y, file = "../data/DGENormFactorsy.rds")
#
# 2. or if you have already run this step and want to retrieve it again
# you can read this from the saved rds object
#
y <- readRDS("../data/DGENormFactorsy.rds")
attributes(y)

In [None]:
dim(y$counts)

In [None]:
# For Guy -- does this do what you are expecting -- I am confused because what you get when you
#        ask for the min (table(groups)) is the smaller sized group -- which in this case is 
#        female -- it will help the reader to know what you are doing here with the statement.
#        one can read what it is doing but not understand your objective.groups <- pData(obj)$SEX
# keep.events <- rep(TRUE, nrow(y))
#for (group in c(1,2)) {
#    keep.events <- keep.events & 
#                   rowSums(cpm(y[,groups %in% group]) > 1) >= 0.25*min(table(groups))
#}
#
#  From Anne - I believe the objective in this step is to keep only those genes that are in the
#    that are above the threshold of expression for the lower quartile of all sex specific genes.
#    groups = (1,2) -- lots of confusion in logic between groups and group and male and female
#    I recommend we use male and female.
#    two errors then in the above loop - 
#     1. min(table(groups)) will always return the
#        length of the number of samples that are female 
#        (which is 5978 for this v8 GTEx)
#     2. sum(table(group)) will return the number of samples that are either male or female
#        depending upon whether you are in the loop for male or female consideration.
#
#    If my assumption is true - I recommend replacing it with the following.
#
#  this should be relaced 
#
# Expression filter: keep a gene (row of y) when it has CPM > 1 in at least
# 25% of the male samples or in at least 25% of the female samples.
# group holds the SEX code of each sample (compared against 1 and 2 below).
keep.events <- rep(TRUE, nrow(y))
nrow(y)
table(pData(obj)$SEX)
table(group)

is_male   <- group == 1
is_female <- group == 2
#
# first keep all the events for the male subset meeting our threshold criteria.
# The size of a subset is sum(mask); length(mask) is the total number of
# samples of both sexes and would make the 25% threshold far too strict.
#
keep.events  <- keep.events &
    rowSums(cpm(y$counts[, is_male, drop = FALSE]) > 1) >= 0.25 * sum(is_male)
#
# now keep the events meeting the criteria in the male subset or in the female subset
#
keep.events2 <- keep.events |
    rowSums(cpm(y$counts[, is_female, drop = FALSE]) > 1) >= 0.25 * sum(is_female)
table(keep.events)
table(keep.events2)

In [None]:
reduced_y<- y[keep.events2,]

In [None]:
reduced_obj <- obj[keep.events2==TRUE,]

In [None]:
dim(reduced_obj)

In [None]:
# Save the filtered objects so that they can be reloaded without repeating the steps above.
saveRDS(reduced_y, file = "../data/reduced_y.rds")
saveRDS(reduced_obj, file = "../data/reduced_obj.rds")
# If you have generated these files before, you can load them instead by
# uncommenting the following two lines:
#reduced_obj <- readRDS(file = "../data/reduced_obj.rds")
#reduced_y   <- readRDS(file = "../data/reduced_y.rds")


In [None]:
# separate the analysis by male and by female

In [None]:
reduced_male   <- pData(reduced_obj)$SEX==1
reduced_female <- pData(reduced_obj)$SEX==2

In [None]:
reduced_obj_male   <- reduced_obj[,reduced_male==TRUE]
reduced_obj_female <- reduced_obj[,reduced_female==TRUE]

In [None]:
dim(reduced_obj_male)
dim(reduced_obj_female)

In [None]:
# changing from SMTS to SMTSD - more granularity -- but we could produce both figures
tissue_groups_male <- factor(pData(reduced_obj_male)$SMTSD)
tissue_groups_female <- factor(pData(reduced_obj_female)$SMTSD)

In [None]:
# good sanity check, the male set does not have any vaginas or uterus
table (tissue_groups_male)

In [None]:
# and the females have no prostate or testis
table(tissue_groups_female)

In [None]:
# Reproducing Guys results using the yarn expression object
# loop through the tissues and for those tissues that are shared between the two sexes
# perform a differential gene analysis on a per tissue basis


In [None]:
tissue_groups <- factor(pData(reduced_obj)$SMTSD)

In [None]:
tissue_male_female <- tissue_groups_male %in% tissue_groups_female
table(tissue_male_female)

In [None]:
tissue_shared_male_female <- factor(tissue_groups_male[tissue_male_female])
table(tissue_shared_male_female)

In [None]:
# SEX is coded 1 == Male
#              2 == Female
sex = factor(pData(reduced_obj)$SEX)

In [None]:
library(stringr)

In [None]:
#
# fit_tissue: differential expression between the sexes for a single tissue.
#
#    tissue   value of the SMTSD phenotype column that selects the samples
#    obj      expression object supporting pData(), exprs() and column subsetting
#
#    Subsets obj to the samples of the tissue, creates the model matrix from
#    the sex of those samples, calculates normalization factors, applies voom,
#    performs the linear fit and moderates it with eBayes(robust=TRUE).
#    The topTable results for the 'tissue_sex2' coefficient (all rows) are
#    saved for posterity to ../data/<tissue>_DGE.txt and returned.
#
fit_tissue <- function (tissue, obj) {
    in_tissue  <- pData(obj)$SMTSD == tissue
    tissue_obj <- obj[, in_tissue]

    # keep the name tissue_sex: the coefficient label 'tissue_sex2' used below
    # is derived from the variable name in the model formula
    tissue_sex <- factor(pData(tissue_obj)$SEX)
    design     <- model.matrix(~tissue_sex)

    dge      <- calcNormFactors(DGEList(counts=exprs(tissue_obj), group=tissue_sex))
    voom_out <- voom(dge, design)
    fit      <- eBayes(lmFit(voom_out, design), robust=TRUE)
    results  <- topTable(fit, coef='tissue_sex2', number=nrow(dge))

    out_file <- paste("../data", paste(tissue, "DGE.txt", sep="_"), sep="/")
    write.table(results, out_file, sep='\t', quote=FALSE)
    results
}

In [None]:
# debugging with using the 'Thyroid' tissue
#thyroid_logFC <- fit_tissue('Thyroid',reduced_obj)
#thyroid_logFC
# 
all_logFC <- lapply(X=levels(tissue_shared_male_female), FUN=fit_tissue, obj=reduced_obj)

In [None]:
# List the result files in ../data whose names end in "_DGE.txt".
# `pattern` is a regular expression (not a shell glob), so the dot is escaped
# and the match is anchored at the end of the file name.
filenames <- list.files("../data", pattern="_DGE\\.txt$", all.files=FALSE,
    full.names=FALSE)

In [None]:
# Preserve the ordered rownames for later assignment to the matrix:
# read the first result file and sort its rows by row name, which is the same
# ordering make_tissue_matrix_ready applies to each file it reads.
fullfilename <- paste('../data',filenames[1],sep="/")
logFC_mat <- read.delim2(fullfilename, stringsAsFactors=FALSE)
logFC_mat <- logFC_mat[order(rownames(logFC_mat)),]
logFC_mat_rownames <- as.character(rownames(logFC_mat)) 

In [None]:
# Make a one-column matrix of log fold changes for one tissue
# from a saved result file.
#
#   file   name of a result file inside ../data
#   obj    not used; kept so that existing calls passing it keep working
#
# Returns a numeric matrix with a single column holding the logFC values,
# with the rows sorted by row name and the row names kept.
make_tissue_matrix_ready <- function (file, obj) {
    filename        <- paste('../data',file,sep="/")
    # read.delim() expects "." as the decimal mark, so numeric columns are read
    # as numbers; read.delim2() expects "," and would leave them as text
    logFC_table     <- read.delim(filename, stringsAsFactors = FALSE)
    logFC_table     <- logFC_table[order(rownames(logFC_table)), , drop = FALSE]
    # matrix() honours ncol; as.matrix() silently ignores nrow/ncol arguments
    logFC           <- matrix(as.numeric(logFC_table$logFC), ncol = 1)
    rownames(logFC) <- rownames(logFC_table)
    return(logFC)
}

In [None]:
matrix_list <- lapply(X=filenames, FUN=make_tissue_matrix_ready, obj=reduced_obj)

In [None]:
length(matrix_list)
# NOTE(review): as.matrix() on a list appears to yield a list-mode matrix and
# to ignore the nrow/ncol arguments. This value of logFC_mat is assigned again
# further down before anything reads it - confirm whether this line is needed.
logFC_mat = as.matrix(lapply(X=matrix_list, FUN=cbind),nrow=dim(reduced_obj)[2], ncol=length(matrix_list))

In [None]:
# Derive the tissue name from the name of a result file by removing the
# trailing "_DGE.txt" suffix.
#
#   tissue_name   character vector of file names
#
# Returns a character vector of the same length; names that do not end in
# "_DGE.txt" are returned unchanged.
get_tissue_name <- function (tissue_name) {
    # the dot is escaped and the pattern anchored at the end, so only a
    # literal "_DGE.txt" suffix is stripped
    tissue <- sub("_DGE\\.txt$", "", tissue_name)
    return(tissue)
}

In [None]:
# Derive the tissue labels from the result file names rather than from the
# factor levels: matrix_list is built from `filenames`, so labels taken from
# the same vector are in the same order and of the same length as the data.
tissue_list <- lapply(X=filenames, FUN=get_tissue_name)
length(tissue_list)
head(tissue_list)

In [None]:
# Assemble the matrix of log fold changes: one numeric column per element of
# matrix_list, bound in a single step. This also handles a list with a single
# element, for which a 2:length(matrix_list) loop would iterate over c(2, 1).
logFC_columns <- lapply(matrix_list, function(m) as.numeric(unlist(m)))
logFC_mat <- do.call(cbind, logFC_columns)
dim(logFC_mat)
rownames(logFC_mat) <- logFC_mat_rownames
# unlist() so that the labels are a plain character vector
colnames(logFC_mat) <- unlist(tissue_list)
head(logFC_mat)

In [None]:
logFC_mat_NQ <- normalizeQuantiles(logFC_mat)

In [None]:
head(logFC_mat_NQ)

In [None]:
dist_mat <- as.matrix(cor(logFC_mat_NQ))

In [None]:
head(dist_mat)

In [None]:
rownames(dist_mat) <- colnames(logFC_mat)
colnames(dist_mat) <- colnames(logFC_mat)

In [None]:
library(pheatmap)
# Draw the heatmap of the correlation matrix on the current graphics device.
pheatmap(as.matrix(dist_mat),  fontsize = 6)
# Draw it again with the same parameters, this time passing a filename so that
# pheatmap writes the figure to a PDF. Make sure the output directory exists
# first; otherwise the file cannot be opened for writing.
hm.parameters <- list(dist_mat, fontsize=6)
dir.create("../pdf", showWarnings = FALSE, recursive = TRUE)
do.call("pheatmap", c(hm.parameters, filename="../pdf/Figure1cv2.pdf"))