# Figure 1C - YARN Normalization Version

A heatmap representing the similarity of the male-versus-female fold changes across tissues: each value in the heatmap is the correlation between the fold-change vectors of two tissues.

We downloaded the GTEx version 8.0 RNA-seq and genotype data (phs000424.v8.v2), released 2019-08-26.
We used YARN (https://bioconductor.org/packages/release/bioc/html/yarn.html), calling its `downloadGTExV8` function
to download this release, and used it to perform quality control, gene filtering and normalization pre-processing on the
GTEx RNA-seq data, as described in (Paulson et al, 2017). This pipeline tested for sample sex-misidentification,
merged related sub-tissues, and performed tissue-aware normalization using qsmooth (Hicks et al, 2017).

## Loading dependencies

In [None]:
library(downloader)
library(readr)
library(edgeR)
library(biomaRt)
library(DBI) # v >= 1.1.0 required for biomaRt
library(devtools)
library(yarn)
Sys.setenv(TAR = "/bin/tar") # for gzfile

Begin here if you have already run this and created the `data/gtex.rds` file

Please `git clone` the repository and start working after changing to this as your working directory (`cd lifebitCloudOSDRE`). The `data` subdirectory, along with all other paths used in this Notebook are relative to the `lifebitCloudOSDRE` repository.

In [None]:
# CAUTION: this step takes several minutes and needs plenty of memory and disk space.
obj <- yarn::downloadGTExV8(file = "../data/gtex.rds", type = "genes")

This uploaded object is available by long names -- which might be nice to simplify ....

In [None]:
# Show the class of the object returned by the download step.
class(obj)

In [None]:
# Persist the downloaded object so later sessions can skip the download.
saveRDS(object = obj, file = "../data/ExpressionSetobj.rds")

In [None]:
# Print the dimensions of the phenotype annotation attached to obj.
dim(phenoData(obj))

In [None]:
# for later - not sure attributes are actually successfully being added
#ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
#attributes = listAttributes(ensembl)

In [None]:
# Print the dimensions of obj itself, for comparison with its phenotype data.
dim(obj)

In [None]:
# Sample identifiers as they appear on the columns of the expression matrix.
sample_names <- as.character(colnames(exprs(obj)))
head(sample_names)
length(sample_names)

In [None]:
# Sample identifiers as they appear on the rows of the phenotype table.
pheno_sample_names <- as.character(rownames(pData(obj)))
head(pheno_sample_names)
length(pheno_sample_names)

Okay - for some reason our phenotype data is larger than our expression data - I've written Joe Paulson about that.
In the meantime, make sure that the two sets are aligned.

In [None]:
# TRUE for every phenotype row whose sample also has an expression column.
logical_match_names <- is.element(pheno_sample_names, sample_names)
length(logical_match_names)

In [None]:
# Count phenotype rows with (TRUE) and without (FALSE) a matching expression column.
table(logical_match_names)


In [None]:
# Keep exactly one phenotype row per expression column, in the same order as
# the expression columns. Filtering with %in% alone would drop the unmatched
# rows but keep the phenotype table's own ordering, while later steps pair
# pData(obj)$SEX with the columns of exprs(obj) by position.
pheno_row_for_sample <- match(colnames(exprs(obj)), rownames(pData(obj)))
# Fail loudly if an expression sample has no phenotype record at all.
stopifnot(!anyNA(pheno_row_for_sample))
pData(obj) <- pData(obj)[pheno_row_for_sample, ]

In [None]:
# After subsetting, the phenotype row count should equal the sample (column) count of obj.
dim(pData(obj))
dim(obj)

Now we want to replace all *dashes* (`-`) in the sample IDs with *dots* (`.`)

In [None]:
# Swap every "-" in the sample IDs for a literal "."; fixed = TRUE avoids
# regex escaping in both pattern and replacement.
newSampID <- gsub("-", ".", pData(obj)$SAMPID, fixed = TRUE)

In [None]:
# Spot-check the first few rewritten sample IDs.
head (newSampID)

In [None]:
# Write the rewritten sample IDs back into the phenotype table.
pData(obj)[["SAMPID"]] <- newSampID

In [None]:
# List the phenotype columns available on obj.
colnames(pData(obj))

In [None]:
# Peek at the first rows of the expression matrix.
head(exprs(obj))

Now let us do the differential analysis - using EdgeR

In [None]:
# Pull the expression matrix out of obj; it is the count input for DGEList below.
x <- exprs(obj)

In [None]:
# Dimensions of the expression matrix.
dim(x)

To use the `DGEList` function from edgeR, the length of `group` must equal the number of columns in our
counts (`x`). `x` is used as returned by `exprs(obj)`, with one column per sample, so no transpose is applied.

You will get an error in `DGEList(counts = x, group = group)` if the length of `group` is not equal to the number of columns in `counts`.

In [None]:
# Per-sample grouping factor built from the SEX phenotype column.
group <- factor(pData(obj)[["SEX"]])

In [None]:
# Bundle the counts and the per-sample sex grouping into an edgeR DGEList.
y <- DGEList(group = group, counts = x)

I keep running out of memory on this step - so on my laptop after calculating the DGEList
I saved it and now I uploaded it to this larger memory machine

In [None]:
# Inspect the attributes of the DGEList.
attributes(y)

In [None]:
#caution this step takes a lot of memory and time

In [None]:
# Compute edgeR normalization factors for y (memory- and time-intensive on the full data set).
y <- calcNormFactors(y)

In [None]:
# Cache the normalized DGEList so the expensive step above need not be repeated.
saveRDS(object = y, file = "../data/DGENormFactorsy.rds")

In [None]:
# For Guy -- does this do what you are expecting -- I am confused because what you get when you
#        ask for the min (table(groups)) is the smaller sized group -- which in this case is 
#        female -- it will help the reader to know what you are doing here with the statement.
#        one can read what it is doing but not understand your objective.

In [None]:
# Gene filter: keep a gene only if, within EACH sex, it exceeds 1 CPM in at
# least `min_samples` samples. The threshold is 25% of the size of the smaller
# sex group and the same threshold is applied to both sexes.
groups <- pData(obj)$SEX
min_samples <- 0.25 * min(table(groups))
keep.events <- rep(TRUE, nrow(y))
# The loop variable is deliberately not named `group`: that name holds the
# per-sample sex factor used to build the DGEList and would be overwritten.
for (sex_code in c(1, 2)) {
    keep.events <- keep.events &
                   rowSums(cpm(y[, groups %in% sex_code]) > 1) >= min_samples
}


In [None]:
# Keep only the rows of the DGEList that passed the expression filter.
reduced_y<- y[keep.events,]

In [None]:
# Apply the same row filter to obj so it stays in step with reduced_y.
reduced_obj <- obj[keep.events, ]

In [None]:
# Dimensions after gene filtering.
dim(reduced_obj)

In [None]:
# Cache of the filtered objects. If both cache files exist they are loaded
# (replacing whatever is in memory, as before); otherwise the objects computed
# above are written out, so a first run no longer fails on missing files.
reduced_obj_path <- "../data/reduced_obj.rds"
reduced_y_path   <- "../data/reduced_y.rds"
if (file.exists(reduced_obj_path) && file.exists(reduced_y_path)) {
    reduced_obj <- readRDS(file = reduced_obj_path)
    reduced_y   <- readRDS(file = reduced_y_path)
} else {
    saveRDS(reduced_y, file = reduced_y_path)
    saveRDS(reduced_obj, file = reduced_obj_path)
}


In [None]:
#install.packages("statmod")
library(statmod)

In [None]:
# separate the analysis by male and by female
# then do the differential analysis regarding tissue
# then do the differential analysis male tissue vs female tissue

In [None]:
# Per-sample logical masks by SEX code (1 and 2).
reduced_male   <- pData(reduced_obj)[["SEX"]] == 1
reduced_female <- pData(reduced_obj)[["SEX"]] == 2

In [None]:
# Split the filtered object into one column subset per sex.
reduced_obj_male   <- reduced_obj[, reduced_male]
reduced_obj_female <- reduced_obj[, reduced_female]

In [None]:
# Dimensions of the male and female subsets.
dim(reduced_obj_male)
dim(reduced_obj_female)

In [None]:
# Tissue label (SMTS) of every sample, separately for each sex.
tissue_groups_male   <- factor(pData(reduced_obj_male)[["SMTS"]])
tissue_groups_female <- factor(pData(reduced_obj_female)[["SMTS"]])

In [None]:
# Sanity check: the male set should contain no vagina or uterus samples.
# Tabulates the number of male samples per tissue.
table (tissue_groups_male)

In [None]:
# Sanity check: the female set should contain no prostate or testis samples.
# Tabulates the number of female samples per tissue.
table(tissue_groups_female)

In [None]:
# Reproducing Guys results using the yarn expression object
# loop through the tissues and for those tissues that are shared between the two sexes
# perform a differential gene analysis on a per tissue basis


In [None]:
# Tissue label (SMTS) of every sample in the filtered object, both sexes together.
tissue_groups <- factor(pData(reduced_obj)[["SMTS"]])

In [None]:
# Flag each male sample whose tissue also occurs among the female samples.
tissue_male_female <- is.element(tissue_groups_male, tissue_groups_female)
table(tissue_male_female)

In [None]:
# Tissue labels of the male samples whose tissue is also present in females.
# Re-wrapping in factor() drops the levels of male-only tissues. Note this is
# still one entry per sample, not one entry per tissue.
tissue_shared_male_female <- factor(tissue_groups_male[tissue_male_female])
table(tissue_shared_male_female)

In [None]:
# SEX is coded 1 == Male
#              2 == Female
# SEX coding: 1 == Male, 2 == Female.
sex <- factor(pData(reduced_obj)$SEX)

In [None]:
# For each tissue sampled in both sexes, test every gene for a sex effect with
# voom + lmFit + eBayes and write the full results table to
# ../data/<tissue>_DGE.txt.
#
# The loop runs once per DISTINCT tissue name. Iterating over the factor
# tissue_shared_male_female directly would visit one element per sample and
# refit the same model many times over.
# Two unfinished lapply() experiments were removed from the loop body: one
# passed `factor()` (a value, not a function) and the other had unbalanced
# parentheses, which prevented the whole cell from parsing.
for (tissue in sort(unique(as.character(tissue_shared_male_female)))) {
    tissue_true   <- pData(reduced_obj)$SMTS == tissue
    # which() drops samples whose tissue label is NA.
    tissue_obj    <- reduced_obj[, which(tissue_true)]
    tissue_sex    <- factor(pData(tissue_obj)$SEX)
    # Intercept plus one sex column; with SEX levels 1 and 2 that column is
    # named "tissue_sex2", i.e. code 2 (female) relative to code 1 (male).
    tissue_design <- model.matrix(~tissue_sex)
    y_tissue      <- DGEList(counts = exprs(tissue_obj), group = tissue_sex)
    y_tissue      <- calcNormFactors(y_tissue)
    y_tissue_voom <- voom(y_tissue, tissue_design)
    fit_tissue    <- lmFit(y_tissue_voom, tissue_design)
    fit_tissue    <- eBayes(fit_tissue, robust = TRUE)
    # number = nrow(...) returns every gene, not just the top hits.
    results_tissue <- topTable(fit_tissue, coef = "tissue_sex2",
                               number = nrow(y_tissue))
    # Also keep the table in the workspace under the name results_<tissue>.
    assign(paste("results", tissue, sep = "_"), results_tissue)
    filename <- file.path("../data", paste0(tissue, "_DGE.txt"))
    write.table(results_tissue, filename, sep = "\t", quote = FALSE)
}

In [None]:
# List the per-tissue result files written by the loop above.
# `pattern` is a regular expression, not a shell glob: anchor on the literal
# suffix "_DGE.txt" instead of using a leading "*".
filenames <- list.files("../data", pattern = "_DGE\\.txt$", all.files = FALSE,
    full.names = FALSE)

In [None]:
# Read one per-tissue results table written by write.table(..., sep = "\t").
#   tissue: tissue name exactly as used in the file name (may contain spaces).
# Returns a data.frame with gene IDs as row names.
# read.delim (decimal mark ".") matches how the files were written;
# read.delim2 expects "," as the decimal mark and so reads the numeric columns
# as text.
read_dge <- function(tissue) {
    read.delim(file.path("../data", paste0(tissue, "_DGE.txt")),
               stringsAsFactors = FALSE)
}

# File names with spaces are passed as plain strings: "\ " is not a valid
# escape sequence in an R string literal and stops the code from parsing.
adipose_tissue_logFC_mat  <- read_dge("Adipose Tissue")
adrenal_gland_logFC_mat   <- read_dge("Adrenal Gland")
bladder_logFC_mat         <- read_dge("Bladder")
blood_logFC_mat           <- read_dge("Blood")
blood_vessel_logFC_mat    <- read_dge("Blood Vessel")
brain_logFC_mat           <- read_dge("Brain")
breast_logFC_mat          <- read_dge("Breast")
colon_logFC_mat           <- read_dge("Colon")
esophagus_logFC_mat       <- read_dge("Esophagus")
heart_logFC_mat           <- read_dge("Heart")
kidney_logFC_mat          <- read_dge("Kidney")
liver_logFC_mat           <- read_dge("Liver")
lung_logFC_mat            <- read_dge("Lung")
muscle_logFC_mat          <- read_dge("Muscle")
nerve_logFC_mat           <- read_dge("Nerve")
pancreas_logFC_mat        <- read_dge("Pancreas")
pituitary_logFC_mat       <- read_dge("Pituitary")
salivary_gland_logFC_mat  <- read_dge("Salivary Gland")
skin_logFC_mat            <- read_dge("Skin")
small_intestine_logFC_mat <- read_dge("Small Intestine")
spleen_logFC_mat          <- read_dge("Spleen")
stomach_logFC_mat         <- read_dge("Stomach")
thyroid_logFC_mat         <- read_dge("Thyroid")

In [None]:
# One-column numeric matrix of the adipose logFC values. The row count comes
# from the data itself.
adipose_tissue_logFC <- matrix(as.numeric(adipose_tissue_logFC_mat$logFC), ncol = 1)

# The same values reordered by gene ID (row name of the results table).
m <- adipose_tissue_logFC[order(rownames(adipose_tissue_logFC_mat)), ]

In [None]:
# Return a results table with its rows sorted by row name (gene ID), so every
# tissue lists its genes in the same order.
sort_by_gene <- function(dge) {
    dge[order(rownames(dge)), ]
}

# Return the logFC column of a results table as a one-column numeric matrix
# whose row names are the table's row names. The row count comes from the data;
# the previous hard-coded nrow/ncol arguments were ignored by as.matrix().
logfc_column <- function(dge) {
    matrix(as.numeric(dge$logFC), ncol = 1,
           dimnames = list(rownames(dge), NULL))
}

# For every tissue: sort the table by gene ID, then extract its logFC column.
adipose_tissue_logFC_mat  <- sort_by_gene(adipose_tissue_logFC_mat)
adipose_tissue_logFC      <- logfc_column(adipose_tissue_logFC_mat)

adrenal_gland_logFC_mat   <- sort_by_gene(adrenal_gland_logFC_mat)
adrenal_gland_logFC       <- logfc_column(adrenal_gland_logFC_mat)

bladder_logFC_mat         <- sort_by_gene(bladder_logFC_mat)
bladder_logFC             <- logfc_column(bladder_logFC_mat)

blood_logFC_mat           <- sort_by_gene(blood_logFC_mat)
blood_logFC               <- logfc_column(blood_logFC_mat)

blood_vessel_logFC_mat    <- sort_by_gene(blood_vessel_logFC_mat)
blood_vessel_logFC        <- logfc_column(blood_vessel_logFC_mat)

brain_logFC_mat           <- sort_by_gene(brain_logFC_mat)
brain_logFC               <- logfc_column(brain_logFC_mat)

breast_logFC_mat          <- sort_by_gene(breast_logFC_mat)
breast_logFC              <- logfc_column(breast_logFC_mat)

colon_logFC_mat           <- sort_by_gene(colon_logFC_mat)
colon_logFC               <- logfc_column(colon_logFC_mat)

esophagus_logFC_mat       <- sort_by_gene(esophagus_logFC_mat)
esophagus_logFC           <- logfc_column(esophagus_logFC_mat)

heart_logFC_mat           <- sort_by_gene(heart_logFC_mat)
heart_logFC               <- logfc_column(heart_logFC_mat)

kidney_logFC_mat          <- sort_by_gene(kidney_logFC_mat)
kidney_logFC              <- logfc_column(kidney_logFC_mat)

liver_logFC_mat           <- sort_by_gene(liver_logFC_mat)
liver_logFC               <- logfc_column(liver_logFC_mat)

lung_logFC_mat            <- sort_by_gene(lung_logFC_mat)
lung_logFC                <- logfc_column(lung_logFC_mat)

muscle_logFC_mat          <- sort_by_gene(muscle_logFC_mat)
muscle_logFC              <- logfc_column(muscle_logFC_mat)

nerve_logFC_mat           <- sort_by_gene(nerve_logFC_mat)
nerve_logFC               <- logfc_column(nerve_logFC_mat)

pancreas_logFC_mat        <- sort_by_gene(pancreas_logFC_mat)
pancreas_logFC            <- logfc_column(pancreas_logFC_mat)

pituitary_logFC_mat       <- sort_by_gene(pituitary_logFC_mat)
pituitary_logFC           <- logfc_column(pituitary_logFC_mat)

salivary_gland_logFC_mat  <- sort_by_gene(salivary_gland_logFC_mat)
salivary_gland_logFC      <- logfc_column(salivary_gland_logFC_mat)

skin_logFC_mat            <- sort_by_gene(skin_logFC_mat)
skin_logFC                <- logfc_column(skin_logFC_mat)

small_intestine_logFC_mat <- sort_by_gene(small_intestine_logFC_mat)
small_intestine_logFC     <- logfc_column(small_intestine_logFC_mat)

spleen_logFC_mat          <- sort_by_gene(spleen_logFC_mat)
spleen_logFC              <- logfc_column(spleen_logFC_mat)

stomach_logFC_mat         <- sort_by_gene(stomach_logFC_mat)
stomach_logFC             <- logfc_column(stomach_logFC_mat)

thyroid_logFC_mat         <- sort_by_gene(thyroid_logFC_mat)
thyroid_logFC             <- logfc_column(thyroid_logFC_mat)



In [None]:
# Assemble the genes x tissues matrix of logFC values, one column per tissue.
logFC_columns <- list(adipose_tissue_logFC,
                      adrenal_gland_logFC,
                      bladder_logFC,
                      blood_logFC,
                      blood_vessel_logFC,
                      brain_logFC,
                      breast_logFC,
                      colon_logFC,
                      esophagus_logFC,
                      heart_logFC,
                      kidney_logFC,
                      liver_logFC,
                      lung_logFC,
                      muscle_logFC,
                      nerve_logFC,
                      pancreas_logFC,
                      pituitary_logFC,
                      salivary_gland_logFC,
                      skin_logFC,
                      small_intestine_logFC,
                      spleen_logFC,
                      stomach_logFC,
                      thyroid_logFC)

# cbind() pairs rows by position only, so refuse to continue unless every
# tissue lists exactly the same genes in exactly the same order.
gene_ids <- rownames(logFC_columns[[1]])
stopifnot(all(vapply(logFC_columns,
                     function(column) identical(rownames(column), gene_ids),
                     logical(1))))

# The dimensions follow from the inputs; the former nrow/ncol arguments to
# as.matrix() were ignored.
logFC_mat <- do.call(cbind, logFC_columns)


In [None]:
# Label rows with gene IDs and columns with tissue names, in the same order in
# which the tissue columns were bound together.
rownames(logFC_mat) <- rownames(thyroid_logFC)
# "pituitary" was previously misspelled "pitutary"; these labels appear on the
# heatmap axes.
colnames(logFC_mat) <- c("adipose", "adrenal_gland", "bladder", "blood", "blood_vessel",
                         "brain", "breast", "colon", "esophagus", "heart", "kidney",
                         "liver", "lung", "muscle", "nerve", "pancreas", "pituitary",
                         "salivary_gland", "skin", "small_intestine",
                         "spleen", "stomach", "thyroid")
dim(logFC_mat)
head(logFC_mat)



In [None]:
# Quantile-normalize the tissue columns of the logFC matrix.
# NOTE(review): normalizeQuantiles is presumably limma's, attached via edgeR; confirm.
logFC_mat_NQ <- normalizeQuantiles(logFC_mat)

In [None]:
# Tissue-by-tissue correlation of the normalized logFC columns.
# Despite its name, dist_mat holds correlations (similarities), not distances.
dist_mat <- cor(logFC_mat_NQ)

In [None]:
# Label both axes of the correlation matrix with the tissue names.
dimnames(dist_mat) <- list(colnames(logFC_mat), colnames(logFC_mat))

In [None]:
library(pheatmap)
# Draw the heatmap of the tissue correlation matrix and keep the returned object.
hm <- pheatmap(dist_mat)


In [None]:
# Show the class of the object returned by pheatmap().
class(hm)