# Part I: Preprocessing

## 1. Load the required packages

In [None]:
library(MultiAssayExperiment)
library(curatedTCGAData)
library(TCGAutils)
library(UpSetR)
library(DESeq2)
library(M3C)
library(tidyverse)

## 2. Load data

In [None]:
# Dry run: list every LUAD assay offered by curatedTCGAData version 2.0.1
# without downloading anything.
curatedTCGAData(
  diseaseCode = "LUAD",
  assays = "*",
  version = "2.0.1",
  dry.run = TRUE
)

# Actual download: fetch the three layers used in this analysis (RPPA protein
# arrays, normalized RNA-seq and thresholded GISTIC copy-number calls).
luad.mae <- curatedTCGAData(
  diseaseCode = "LUAD",
  assays = c("RPPAArray", "RNASeq2GeneNorm", "GISTIC_ThresholdedByGene"),
  version = "2.0.1",
  dry.run = FALSE
)

# Show which samples are shared between the downloaded assays.
upsetSamples(luad.mae)

## 3. Data Preprocessing

### 3.1 Explore and clean multi assay experiment

In [None]:
# Run TCGAutils::qreduceTCGA() on the downloaded object; keep.assay = FALSE
# means the assays it converts are not retained alongside the converted ones.
luad.updated <- qreduceTCGA(luad.mae, keep.assay = FALSE)

# Tally the available samples per assay, based on the sample-type code in the
# TCGA barcodes.
sampleTables(luad.updated)

# Lookup table for interpreting the sample-type codes; see also
# https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes
data("sampleTypes")
sampleTypes

# Keep only samples with sample-type code "01". According to the original
# note, this removes 61 samples that carry code 02 or 11.
solidtums <- TCGAsampleSelect(colnames(luad.updated), "01")
luad.processed <- luad.updated[, solidtums, ]

# Restrict to the columns returned by intersectColumns(), then merge replicate
# profiles of the same patient. Merging is only appropriate at this point,
# after the sample-type filter above: done earlier it would average
# measurements from tumor and normal tissue.
luad.final <- luad.processed %>%
  intersectColumns() %>%
  mergeReplicates()
luad.final

### 3.2 Extract expression data


In [None]:
# Extract the second experiment together with the sample annotations
# (index 2 is expected to be the RNA-seq layer — confirm with names(luad.final)).
rna.matrix <- getWithColData(luad.final, 2L)
count.dat <- assay(rna.matrix)
pheno.dat <- colData(rna.matrix)

## Reduce features of RNA layer
# Build a DESeq2 data set with an intercept-only design (~ 1). The expression
# values are rounded first, presumably because DESeq2 expects integer counts.
dds <- DESeqDataSetFromMatrix(countData = round(count.dat),
                                 colData = pheno.dat,
                                 design = ~ 1)

# Remove non-expressed genes (total count across all samples of at most 1)
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep,]

# VST transformation:
# This function calculates a variance stabilizing transformation (VST) from the fitted dispersion-mean relation(s) 
# and then transforms the count data (normalized by division by the size factors or normalization factors), 
# yielding a matrix of values which are now approximately homoskedastic (having constant variance along the range of mean values).
dds.norm <- vst(dds)

# Keep only the most variably expressed genes, ranked by median absolute
# deviation (MAD).
# NOTE(review): this comment originally read "Keep only the 50 most variably
# expressed genes". Per the M3C documentation, `percentile` is the top X
# percent of features that are kept and `topN` only sets how many of the most
# variable features are displayed, so this call most likely keeps the top 40%
# of genes rather than exactly 50 — confirm which behavior is intended.
filtered.exp <- featurefilter(assay(dds.norm), percentile=40, method='MAD', topN=50)
filtered.exp.final <- as.matrix(filtered.exp$filtered_data)


### 3.3 Extract copy number variations

In [None]:
# Copy-number matrix taken from the first experiment of the cleaned object
# (expected to be "LUAD_GISTIC_ThresholdedByGene-20160128").
cnvs.matrix <- assay(luad.final[[1L]])

# Label the rows with gene symbols. The symbols are read through the public
# rowData() accessor instead of chaining internal S4 slots, so this keeps
# working regardless of where the class stores its row annotations.
gene.symbols <- rowData(luad.final[[1L]])[["Gene.Symbol"]]
if (length(gene.symbols) != nrow(cnvs.matrix)) {
  # Fail loudly: assigning NULL or a wrong-length vector would otherwise
  # either strip the row names silently or raise a less helpful error.
  stop(
    "Row annotation 'Gene.Symbol' is missing or does not match the number ",
    "of rows of the copy-number matrix.",
    call. = FALSE
  )
}
rownames(cnvs.matrix) <- gene.symbols

# Reduce features of CNV data: keep only the genes that were retained in the
# filtered expression matrix.
keep.cnvs.final <- rownames(filtered.exp.final)
common.cnvs <- intersect(rownames(cnvs.matrix), keep.cnvs.final)
# drop = FALSE keeps the result a matrix even if only a single gene is shared.
filtered.cnvs.final <- cnvs.matrix[common.cnvs, , drop = FALSE]

### 3.4 Extract protein data

In [None]:
# Extract the assay matrix of the third experiment (expected to be the RPPA
# protein layer — confirm with names(luad.final)).
protein.dat <- assay(luad.final[[3L]])
# protein.dat contains 905 NAs which need to be removed.
# na.omit() on a matrix drops every row that contains at least one NA, so
# whole features are discarded rather than individual values.
protein.final <- na.omit(protein.dat)

### 3.5 Prepare clinical metadata

In [None]:
# Flatten the sample annotations into a plain data frame.
clin.dat <- as.data.frame(colData(luad.final))

# Clinical variables that are carried over into the final object.
clinical.filtered <- dplyr::select(
  clin.dat,
  patientID,
  years_to_birth,
  vital_status,
  pathologic_stage,
  pathology_T_stage,
  pathology_N_stage,
  gender,
  radiation_therapy,
  histological_type,
  patient.clinical_cqcf.anatomic_neoplasm_subdivision,
  patient.tobacco_smoking_history,
  days_to_death
)

# Convert the T-stage labels to integers: remove the first "t", then the first
# sub-stage letter (a-d). Labels that are still not numeric afterwards become
# NA; the resulting coercion warning is silenced on purpose.
staget <- clinical.filtered$pathology_T_stage
staget <- sub("t", "", staget)
staget <- sub("[abcd]", "", staget)
staget <- suppressWarnings(as.integer(staget))
clinical.filtered$pathology_T_stage <- staget

# Replace the sample annotations of the cleaned object with the reduced table.
colData(luad.final) <- S4Vectors::DataFrame(clinical.filtered)

### 3.6 Create new multi assay experiment

In [None]:
# Append the filtered expression matrix as a new experiment named "RNAseq",
# taking its sample mapping from the original RNA-seq experiment.
mae <- c(
  luad.final,
  list(RNAseq = filtered.exp.final),
  mapFrom = "LUAD_RNASeq2GeneNorm-20160128"
)

# Keep experiments 1, 3 and 4 only, i.e. drop experiment 2 now that its
# filtered replacement has been appended at position 4.
mae <- mae[, , c(1, 3, 4)]

# Overwrite the first two remaining experiments with their filtered versions.
mae[, , c(1, 2)] <- list(CNV = filtered.cnvs.final, RPPA = protein.final)

experiments <- assays(mae)
experiments[[1]] <- as.matrix(experiments[[1]])

# Feature names of different layers need unique names, so each layer gets its
# own prefix.
# NOTE(review): the first layer was assigned as "CNV" above but is prefixed
# "snv" here — this looks like a typo for "cnv". It is left unchanged because
# later analysis steps may match on this prefix; confirm before renaming.
rownames(experiments[[1]]) <- paste(
  "snv", rownames(experiments[[1]]),
  sep = "_"
)
rownames(experiments[[2]]) <- paste(
  "protein", rownames(experiments[[2]]),
  sep = "_"
)
rownames(experiments[[3]]) <- paste(
  "rna", rownames(experiments[[3]]),
  sep = "_"
)

# Rebuild the object from the renamed experiments, re-using the existing
# sample annotations and sample map.
mae.final <- MultiAssayExperiment(
  experiments = experiments,
  colData = colData(mae),
  sampleMap = sampleMap(mae)
)

## 4. Serialize (save) the multi-assay experiment

In [None]:
# Write the final object to "mae.rds", one directory above the current
# working directory.
serialized_mae_path <- file.path(getwd(), "../mae.rds")
saveRDS(mae.final, file = serialized_mae_path)