# Figure 1C - YARN Normalization Version

Heatplot representing similarity in the fold-changes between male and female samples, with the values in the heatmap being the correlation between the vectors of fold changes of the tissues</b>

In [1]:
rm(list = ls())

We downloaded the GTEx version 8.0 RNA-seq and genotype data (phs000424.v8.v2), released 2019-08-26.
We used YARN (https://bioconductor.org/packages/release/bioc/html/yarn.html), uploading the downloadGTEx function
to download this release, and used it to perform quality control, gene filtering and normalization pre-processing on the
GTEx RNA-seq data, as described in (Paulson et al, 2017).   This pipelines tested for sample sex-misidentification, 
merged related sub-tissues, performed tissue-aware normalization using qsmooth (Hicks et al, 2017).

In [4]:
#if (!requireNamespace("BiocManager", quietly=TRUE))
#    install.packages("BiocManager")
#BiocManager::install("yarn")

In [5]:
#BiocManager::install("downloader")

In [6]:
#BiocManager::install("readr")

In [7]:
#BiocManager::install("biomaRt")

Define a V8 of the function from YARN - wrote the author to make this perhaps version - or I guess I could update the package itself.
THere were 3 lines to change for each of the source files

In [37]:
downloadGTExV8=function (type = "genes", file = NULL, ...) 
{
    phenoFile <- "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
    pheno2File <- "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
    geneFile <- "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz"
    message("Downloading and reading files")
    pdFile <- tempfile("phenodat", fileext = ".txt")
    download(phenoFile, destfile = pdFile)
    pd <- read_tsv(pdFile)
    pd <- as.matrix(pd)
    rownames(pd) <- pd[, "SAMPID"]
    ids <- sapply(strsplit(pd[, "SAMPID"], "-"), function(i) paste(i[1:2], 
        collapse = "-"))
    pd2File <- tempfile("phenodat2", fileext = ".txt")
    download(pheno2File, destfile = pd2File)
    pd2 <- read_tsv(pd2File)
    pd2 <- as.matrix(pd2)
    rownames(pd2) <- pd2[, "SUBJID"]
    pd2 <- pd2[which(rownames(pd2) %in% unique(ids)), ]
    pd2 <- pd2[match(ids, rownames(pd2)), ]
    rownames(pd2) <- colnames(counts)
    pdfinal <- AnnotatedDataFrame(data.frame(cbind(pd, pd2)))
    if (type == "genes") {
        countsFile <- tempfile("counts", fileext = ".gz")
        download(geneFile, destfile = countsFile)
        cnts <- suppressWarnings(read_tsv(geneFile, skip = 2))
        genes <- unlist(cnts[, 1])
        geneNames <- unlist(cnts[, 2])
        counts <- cnts[, -c(1:2)]
        counts <- as.matrix(counts)
        rownames(counts) <- genes
        for (i in 1:nrow(problems(cnts))) {
            counts[problems(cnts)$row[i], problems(cnts)$col[i]] <- 1e+05
        }
        throwAway <- which(rowSums(counts) == 0)
        counts <- counts[-throwAway, ]
        genes <- sub("\\..*", "", rownames(counts))
        host <- "www.ensembl.org"
        biomart <- "ENSEMBL_MART_ENSEMBL"
        dataset <- "hsapiens_gene_ensembl"
        attributes <- c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", 
            "start_position", "end_position", "gene_biotype")
    }
    message("Creating ExpressionSet")
    pdfinal <- pdfinal[match(colnames(counts), rownames(pdfinal)), 
        ]
    es <- ExpressionSet(as.matrix(counts))
    phenoData(es) <- pdfinal
    pData(es)["GTEX-YF7O-2326-101833-SM-5CVN9", "SMTS"] <- "Skin"
    pData(es)["GTEX-YEC3-1426-101806-SM-5PNXX", "SMTS"] <- "Stomach"
    message("Annotating from biomaRt")
    es <- annotateFromBiomart(obj = es, genes = genes, host = host, 
        biomart = biomart, dataset = dataset, attributes = attributes)
    message("Cleaning up files")
    unlink(pdFile)
    unlink(pd2File)
    unlink(countsFile)
    if (!is.null(file)) 
        saveRDS(es, file = file)
    return(es)
}


In [None]:
Begin here if you have already run this and created the data/gtex.rds file

In [9]:
library(downloader)
library(readr)
library(biomaRt)
library(yarn)

In [11]:
getwd()

You may need to adjust your working directory -- the data subdirectory is relative to the lifebitCloudOSDRE working directory

In [12]:
#obj <- downloadGTExV8(type='genes',file='data/gtex.rds')

In [13]:
setwd("../")
obj<-readRDS('data/gtex.rds')

In [14]:
obj

ExpressionSet (storageMode: lockedEnvironment)
assayData: 55878 features, 17382 samples 
  element names: exprs 
protocolData: none
phenoData
  rowNames: GTEX-1117F-0226-SM-5GZZ7 GTEX-1117F-0426-SM-5EGHI ...
    GTEX-YEC3-1426-101806-SM-5PNXX (17384 total)
  varLabels: SAMPID SMATSSCR ... DTHHRDY (67 total)
  varMetadata: labelDescription
featureData
  featureNames: ENSG00000223972.5 ENSG00000227232.5 ...
    ENSG00000210196.2 (55878 total)
  fvarLabels: ensembl_gene_id hgnc_symbol ... gene_biotype (6 total)
  fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:  

In [15]:
tissues <- pData(obj)$SMTS

In [36]:
dim(pData(obj))

In [38]:
dim(obj)

In [55]:
sample_names=as.vector(as.character(colnames(exprs(obj))))
head(sample_names)
length(sample_names)

In [56]:
pheno_sample_names=as.vector(as.character(rownames(pData(obj))))
head(pheno_sample_names)
length(pheno_sample_names)

Okay - for some reason our phenotype data is larger than our expression data - I've written Joe Paulson about that.
In the meantime, make sure that the two sets are aligned.

In [69]:
logical_match_names=pheno_sample_names %in% sample_names
length(logical_match_names)

In [70]:
table(logical_match_names)


logical_match_names
 TRUE 
17382 

In [71]:
pData(obj) <- (pData(obj)[logical_match_names==TRUE,])

Now we want to replace all *dashes* with _underscores_

In [72]:
newSampID <- gsub('-','\\.',pData(obj)$SAMPID)

In [73]:
head (newSampID)

In [74]:
pData(obj)$SAMPID <- newSampID

In [75]:
tissueFactors <- factor(tissues)

In [76]:
table(tissueFactors)

tissueFactors
 Adipose Tissue   Adrenal Gland         Bladder           Blood    Blood Vessel 
           1204             258              21             929            1335 
          Brain          Breast    Cervix Uteri           Colon       Esophagus 
           2642             459              19             779            1445 
 Fallopian Tube           Heart          Kidney           Liver            Lung 
              9             861              89             226             578 
         Muscle           Nerve           Ovary        Pancreas       Pituitary 
            803             619             180             328             283 
       Prostate  Salivary Gland            Skin Small Intestine          Spleen 
            245             162            1810             187             241 
        Stomach          Testis         Thyroid          Uterus          Vagina 
            360             361             653             142             156 

In [77]:
# SEX is coded 1 == Male
#              2 == Female
sex <- pData(obj)$SEX
age <- pData(obj)$AGE
#cod <- cause of death
cod <- pData(obj)$DTHHRDY
    

In [78]:
table(sex)
table(age)
table(cod)

sex
    1     2 
11584  5798 

age
20-29 30-39 40-49 50-59 60-69 70-79 
 1320  1323  2702  5615  5821   601 

cod
   0    1    2    3    4 
8814  711 4839  868 2039 

Now let us do the differential analysis - using EdgeR

In [79]:
#BiocManager::install("edgeR")
library(edgeR)

In [80]:
x <- exprs(obj)

In [81]:
dim(x)

To use the DGEList function from EdgeR, we need to transpose our x so that the length of group is equal
to the number of columns in our counts (x).

You will get an error in DGEList (counts = x, group = group) if the length of group is not equal to the number of columns in counts

In [82]:
group <- factor(pData(obj)$SEX)

In [84]:
y <- DGEList(counts=x, group=group)

In [86]:
y <- calcNormFactors(y)

ERROR: Error: vector memory exhausted (limit reached?)


In [None]:
We only want to keep those events that are greater than the first quartile (25%),
this is done using all non-zero (>1) events >= 0.25 min(table(pData(obj)$SEX))

In [None]:
==paste(unlist(strsplit(,'\\.'))[c(1,2)],collapse='.')

In [86]:
 minSamples W min(table(pData(obj)[, groups]))/2

In [95]:
filteredObj <- filterMissingGenes(obj)

In [100]:
table(pData(obj)[,'SEX'])


    1     2 
11584  5798 

Next, we are intrested in the Subject Phenotype information.
A de-identified, open access version of the sample annotations available in dbGaP.

These are found at the same location: *https://gtexportal.org/home/datasets*


In [21]:
attributes(obj)

$.__classVersion__
            R       Biobase          eSet ExpressionSet 
      "3.6.1"      "2.46.0"       "1.3.0"       "1.0.0" 

$experimentData
Experiment data
  Experimenter name:  
  Laboratory:  
  Contact information:  
  Title:  
  URL:  
  PMIDs:  
  No abstract available.

$assayData
<environment: 0x7ffe642b6580>

$phenoData
An object of class 'AnnotatedDataFrame'
  rowNames: GTEX-1117F-0226-SM-5GZZ7 GTEX-1117F-0426-SM-5EGHI ...
    GTEX-YEC3-1426-101806-SM-5PNXX (17384 total)
  varLabels: SAMPID SMATSSCR ... DTHHRDY (67 total)
  varMetadata: labelDescription

$featureData
An object of class 'AnnotatedDataFrame'
  featureNames: ENSG00000223972.5 ENSG00000227232.5 ...
    ENSG00000210196.2 (55878 total)
  varLabels: ensembl_gene_id hgnc_symbol ... gene_biotype (6 total)
  varMetadata: labelDescription

$annotation
character(0)

$protocolData
An object of class 'AnnotatedDataFrame': none

$class
[1] "ExpressionSet"
attr(,"package")
[1] "Biobase"


ERROR: Error in tissue(obj): could not find function "tissue"


In [15]:
gtex_pheno_url <- 'https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt'
gtex_pheno <-'data/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt'
download.file(gtex_pheno_url, destfile=gtex_pheno)

In [27]:
head(tissue.sets)

In [30]:
colnames.expression.mat<-colnames(gene_counts.data)

In [32]:
# replace all the dashes with with periods in sample names in the metadata file
meta.data$biospecimen_repository_sample_id<-gsub('-','\\.',meta.data$biospecimen_repository_sample_id)


In [29]:
for (tissue.set in tissue.sets) {
#    print (tissue.set)

    # extract and create an expression matrix from the released dataset
    # skipping the header rows
    # expression.mat<-read.table(gtex_gct, nrow=1,sep='\t',header=T,skip=2)
    # meta.data <- read.csv(dfile, header=FALSE)
    
    # if there is no data, skip and move on
    if (length(table(meta.data$sex_s[meta.data$body_site %in% tissue.set]))==1)
        next

    # if there is no data, skip
    if (sum(meta.data$body_site %in% tissue.set)==0)
        next

    for (col in colnames.expression.mat)
        col.in.tissue<-c(col.in.tissue, 
                         (col %in% meta.data$biospecimen_repository_sample_id) && (meta.data$body_site[which(meta.data$biospecimen_repository_sample_id==col)] %in% tissue.set))
    
#    expression.mat<-read.table(gtex_gct, colClasses = ifelse(col.in.tissue,"numeric","NULL"),sep='\t',header=T,skip=2)

    write.table(expression.mat,paste('data/expression',tissue.set,'.txt',sep=''),sep='\t',quote = F)

#}

[1] "Adipose - Subcutaneous"
[1] "Brain - Frontal Cortex (BA9)"
[1] "Thyroid"
[1] "Lung"
[1] "Whole Blood"
[1] "Brain - Cerebellar Hemisphere"
[1] "Artery - Tibial"
[1] "Cells - EBV-transformed lymphocytes"
[1] "Heart - Left Ventricle"
[1] "Brain - Cortex"
[1] "Muscle - Skeletal"
[1] "Brain - Anterior cingulate cortex (BA24)"
[1] "Brain - Substantia nigra"
[1] "Skin - Sun Exposed (Lower leg)"
[1] "Brain - Hypothalamus"
[1] "Brain - Hippocampus"
[1] "Brain - Caudate (basal ganglia)"
[1] "Brain - Putamen (basal ganglia)"
[1] "Brain - Spinal cord (cervical c-1)"
[1] "Brain - Cerebellum"
[1] "Brain - Nucleus accumbens (basal ganglia)"
[1] "Cells - Leukemia cell line (CML)"
[1] "Brain - Amygdala"
[1] "Nerve - Tibial"
[1] "Vagina"
[1] "Stomach"
[1] "Pancreas"
[1] "Colon - Sigmoid"
[1] "Skin - Not Sun Exposed (Suprapubic)"
[1] "Heart - Atrial Appendage"
[1] "Esophagus - Mucosa"
[1] "Adrenal Gland"
[1] "Breast - Mammary Tissue"
[1] "Liver"
[1] "Esophagus - Muscularis"
[1] "Cells - Cultured fib

In [25]:
for (tissue.set in tissue.sets)
{
    print (tissue.set)
  all.genes<-read.table(gtex_gct,sep='\t',header=T,skip=2,colClasses = c(rep("character", 2), rep("NULL", 11688)))
if (file.size(paste('data/expression',tissue.set,'.txt',sep=''))<=1)

  next

x <- read.delim(paste('data/expression',tissue.set,'.txt',sep=''))

#x<-x[all.genes$Description %in% our.genes,]

#all.genes<-all.genes[all.genes$Description %in% our.genes,2]

x<-x[!duplicated(all.genes[,'Description']),]

all.genes<-all.genes[!duplicated(all.genes[,'Description']),'Description']

rownames(x)<-all.genes

#meta.data<-read.table('/Users/karleg/Dimorph/2017December8GTExRNASeqSRARunTable.txt',sep='\t',header=TRUE)

#meta.data$Sample_Name_s<-gsub('-','\\.',meta.data$Sample_Name_s)

#meta.data<-meta.data[meta.data$Sample_Name_s %in% colnames(x),]

    
pheno<-read.csv(gtex_pheno,sep='\t')

pheno$SUBJID<-gsub('-','\\.',pheno$SUBJID)

sex<-c()  #the value 2 in the phenotypic data is the values that is 1 in design.  This value correslonds to female.

age<-c()

cod<-c()

for (col in colnames(x))
{
  sex<-c(sex,pheno$SEX[which(pheno$SUBJID==paste(unlist(strsplit(col,'\\.'))[c(1,2)],collapse='.'))])

  age<-c(age,pheno$AGE[which(pheno$SUBJID==paste(unlist(strsplit(col,'\\.'))[c(1,2)],collapse='.'))])

  cod<-c(cod,pheno$DTHHRDY[which(pheno$SUBJID==paste(unlist(strsplit(col,'\\.'))[c(1,2)],collapse='.'))])

}

if (length(table(sex))==1)

  next

cod[is.na(cod)]=0
#group <- meta.data$sex_s

y <- DGEList(counts=x,group=factor(sex))

y <- calcNormFactors(y)

groups<-sex

keep.events<-rep(T,nrow(y))

for (group in c(1,2))

  keep.events<-keep.events & (rowSums(cpm(y[,groups %in% group]) > 1) >= 0.25*min(table(groups)))

if (sum(keep.events)==0)

  next

y<-y[keep.events,]  #if only using the logFC to compare with AS then do not screen

design <- model.matrix(~factor(sex))

v <- voom(y, design)

fit <- lmFit(v, design)

fit <- eBayes(fit, robust=TRUE)

res=topTable(fit, coef='factor(sex)2',number=nrow(y))

write.table(res,paste('data/DE_result_',tissue.set,'.txt',sep=''),sep='\t',quote = F)

}
#edgeR

#y <- estimateDisp(y,design)

#fit <- glmFit(y,design)

#lrt <- glmLRT(fit,coef='factor(sex)2')

#hist(lrt$table$PValue)    