# DIABLO, Integration of multi-omics data

This is an adaptation of this vignette: https://bioconductor.org/packages/release/bioc/vignettes/mixOmics/inst/doc/vignette.html
mixOmics package tutorials here: http://mixomics.org/

Data were preprocessed to keep each participant's first measurement of each omic. 

There is a lot of fine tuning that could be done if we want to showcase this analysis.


In [None]:
# Package loading.
# plyr is attached BEFORE the tidyverse: attaching plyr after dplyr masks
# dplyr verbs (mutate, summarise, rename, arrange, ...) with the plyr
# versions, which plyr itself warns about at load time.
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(mixOmics))
suppressPackageStartupMessages(library(caTools))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library("BiocParallel"))

# Fix the random seed so the randomised steps run in this session are
# repeatable.
set.seed(99)

In [None]:
# Notebook display settings: plot scale, console width, and the maximum
# number of matrix/data-frame columns and rows shown in cell output.
options(
  jupyter.plot_scale = 1,
  width = 200,
  repr.matrix.max.cols = 200,
  repr.matrix.max.rows = Inf
)

In [None]:
# Load the baseline tables (proteins, metabolites, clinical labs) and the
# frailty indices.
# NOTE: DIABLO can also be run with repeated measures.
platform <- "SageMaker"

# Input directories depend on where the notebook is running.
if (platform == "SageMaker") {
  omicsDir <- "/home/sagemaker-user/Aging_Workshop_24/data"
  frailtyDir <- "/home/sagemaker-user/Aging_Workshop_24/data/frailty"
} else {
  omicsDir <- "../WGCNA/"
  frailtyDir <- "/shared-libs/useful-files/frailty_measures_kanelab/FI_workshop_040124"
}

# Small reader so every table is parsed the same way (comma-delimited).
read_table_from <- function(dir, file_name) {
  read_delim(file.path(dir, file_name), delim = ",")
}

prots <- read_table_from(omicsDir, "prot_baseline.csv")
mets <- read_table_from(omicsDir, "mets_baseline.csv")
clin <- read_table_from(omicsDir, "clinical_baseline.csv")

# Report rows x columns of each table as read.
for (tbl in list(mets, prots, clin)) {
  print(dim(tbl))
}

frailty <- read_table_from(frailtyDir, "combination_fi_040124.csv")

In [None]:
# Light filtering of missing values: first drop columns, then rows, in
# which more than 15% of the entries are NA.
keep_complete_cols <- function(df) {
  df[, colMeans(is.na(df)) <= .15]
}
keep_complete_rows <- function(df) {
  df[rowMeans(is.na(df)) <= .15, ]
}

# Column filter
mets_filt <- keep_complete_cols(mets)
prots_filt <- keep_complete_cols(prots)
clin_filt <- keep_complete_cols(clin)
for (tbl in list(mets_filt, prots_filt, clin_filt)) {
  print(dim(tbl))
}

# Row filter, applied to the column-filtered tables
mets_filt <- keep_complete_rows(mets_filt)
prots_filt <- keep_complete_rows(prots_filt)
clin_filt <- keep_complete_rows(clin_filt)
for (tbl in list(mets_filt, prots_filt, clin_filt)) {
  print(dim(tbl))
}


## DIABLO uses NIPALS for imputation.
## DIABLO centers and scales the data.

In [None]:
# Keep only participants that have every measure: inner-join all four
# tables on the participant ID.
df_list <- list(frailty, mets_filt, prots_filt, clin_filt)

# Left-fold the list with a pairwise inner join.
join_on_id <- function(left, right) {
  inner_join(left, right, by = "public_client_id")
}
combined_df <- Reduce(join_on_id, df_list)

dim(combined_df)

In [None]:
# Split the merged table back into one "block" per data type for DIABLO.
# Each block keeps the merged-table columns whose names also occur in the
# corresponding filtered table (this includes the participant ID).
columns_shared_with <- function(source_df) {
  combined_df[, colnames(combined_df) %in% colnames(source_df)]
}
metabolites <- columns_shared_with(mets_filt)
proteins <- columns_shared_with(prots_filt)
clinical <- columns_shared_with(clin_filt)

frailty_cols <- c("public_client_id", "sex", "age", "race",
                  "self_fi", "lab_fi", "merge_fi")
frailty <- combined_df[, frailty_cols]

# DIABLO needs a categorical outcome, so each frailty index is cut into
# quintiles (1 = lowest fifth, 5 = highest fifth).
frailty$lab_quantile <- dplyr::ntile(frailty$lab_fi, 5)
frailty$self_quantile <- dplyr::ntile(frailty$self_fi, 5)
frailty$merge_quantile <- dplyr::ntile(frailty$merge_fi, 5)

In [None]:
# Self-reported vs lab-based quintiles: Spearman correlation, then the
# cross-tabulation.
round(cor(frailty$self_quantile, frailty$lab_quantile, method = "spearman"), 3)
table(frailty$self_quantile, frailty$lab_quantile)

In [None]:
# Merged vs self-reported quintiles
round(cor(frailty$merge_quantile, frailty$self_quantile, method = "spearman"), 3)
table(frailty$merge_quantile, frailty$self_quantile)

In [None]:
# Merged vs lab-based quintiles
round(cor(frailty$merge_quantile, frailty$lab_quantile, method = "spearman"), 3)
table(frailty$merge_quantile, frailty$lab_quantile)

In [None]:
# Count of participants per value of `sex` in the merged cohort
table(frailty$sex)

In [None]:
# Histogram of participant age (breaks = 40 requests roughly 40 bins)
hist(frailty$age, breaks=40)

In [None]:
# Sanity check: every block must list participants in the same order as
# the frailty table (each line should display TRUE).
all(metabolites$public_client_id == frailty$public_client_id)
all(proteins$public_client_id == frailty$public_client_id)
all(clinical$public_client_id == frailty$public_client_id)

# Single 'Omics PCA analysis

Using full data for the exploratory analysis. We could consider breaking into test/train to get out-of-sample predictions, but the goal here is just to take a quick look at the data so we know what to expect. These PCA plots show that self-reported Frailty Index is hard to predict on the basis of this data; achieving good performance will be difficult.

In [None]:
# Outcome: quintile of the self-reported Frailty Index, as a factor.
Outcome <- as.factor(frailty$self_quantile)

# Feature matrices, one per block. The participant ID is removed by NAME
# rather than by position, so a block whose ID column is not the first
# column cannot silently lose a feature and keep the ID instead.
id_col <- "public_client_id"
mets_mat <- as.matrix(metabolites[, setdiff(colnames(metabolites), id_col)])
prots_mat <- as.matrix(proteins[, setdiff(colnames(proteins), id_col)])
clin_mat <- as.matrix(clinical[, setdiff(colnames(clinical), id_col)])

# as.matrix() returns a character matrix if any column is non-numeric;
# stop here with a clear failure instead of inside a later model fit.
stopifnot(is.numeric(mets_mat), is.numeric(prots_mat), is.numeric(clin_mat))



In [None]:
# Two-component PCA of the metabolite matrix (features scaled to unit
# variance), with samples coloured by outcome quintile.
pca.mets <- pca(X = mets_mat, ncomp = 2, scale = TRUE)

plotIndiv(
  pca.mets,
  group = Outcome,
  ind.names = FALSE,
  legend = TRUE,
  title = 'Metabolites, PCA comp 1 - 2'
)

In [None]:
# Distribution of the metabolite PCA scores within each frailty quintile,
# one panel per component.
quintile_cols <- c('deepskyblue2', 'gold2', 'gray80', 'seagreen', 'pink3')
par(mfrow = c(1, 2))
for (pc in c('PC1', 'PC2')) {
  boxplot(split(pca.mets$variates$X[, pc], Outcome),
          ylab = pc, xlab = "Frailty (Quintile)",
          col = quintile_cols)
  # Reference line at score 0. The scores are on the y axis, so the line
  # must be horizontal in BOTH panels; a vertical line at x = 0 falls
  # outside the plotted boxes (x = 1..5) and is never visible.
  abline(h = 0, lty = 2, lwd = 3)
}

In [None]:
# Two-component PCA of the protein matrix (features scaled to unit
# variance), with samples coloured by outcome quintile.
pca.prots <- pca(X = prots_mat, ncomp = 2, scale = TRUE)

plotIndiv(
  pca.prots,
  group = Outcome,
  ind.names = FALSE,
  legend = TRUE,
  title = 'Proteins, PCA comp 1 - 2'
)

In [None]:
# Distribution of the protein PCA scores within each frailty quintile,
# one panel per component.
quintile_cols <- c('deepskyblue2', 'gold2', 'gray80', 'seagreen', 'pink3')
par(mfrow = c(1, 2))
for (pc in c('PC1', 'PC2')) {
  boxplot(split(pca.prots$variates$X[, pc], Outcome),
          ylab = pc, xlab = "Frailty (Quintile)",
          col = quintile_cols)
  # Reference line at score 0. The scores are on the y axis, so the line
  # must be horizontal in BOTH panels; a vertical line at x = 0 falls
  # outside the plotted boxes (x = 1..5) and is never visible.
  abline(h = 0, lty = 2, lwd = 3)
}

In [None]:
# Two-component PCA of the clinical-test matrix (features scaled to unit
# variance), with samples coloured by outcome quintile.
pca.clin <- pca(X = clin_mat, ncomp = 2, scale = TRUE)

plotIndiv(
  pca.clin,
  group = Outcome,
  ind.names = FALSE,
  legend = TRUE,
  title = 'Clinical Tests, PCA comp 1 - 2'
)

In [None]:
# Distribution of the clinical-test PCA scores within each frailty
# quintile, one panel per component.
quintile_cols <- c('deepskyblue2', 'gold2', 'gray80', 'seagreen', 'pink3')
par(mfrow = c(1, 2))
for (pc in c('PC1', 'PC2')) {
  boxplot(split(pca.clin$variates$X[, pc], Outcome),
          ylab = pc, xlab = "Frailty (Quintile)",
          col = quintile_cols)
  # Reference line at score 0. The scores are on the y axis, so the line
  # must be horizontal in BOTH panels; a vertical line at x = 0 falls
  # outside the plotted boxes (x = 1..5) and is never visible.
  abline(h = 0, lty = 2, lwd = 3)
}

# Single 'Omics PLS-DA

Exploratory data analysis with PCA (above) finds the axes on which the data is most spread out; it allows us to look at the spatial pattern of the data. The outcome labeling each point (quintiles of self-reported Frailty Index, shown by color and marker), however, is not used in PCA; we look at how the outcome correlates (visually) with the spatial pattern.

PLS-DA is similar to PCA, except that it explicitly chooses axes that best separate the outcome classes, rather than the axes along which the predictive features vary the most. The outcome is used to supervise which axis is chosen first, second, etc. This is a first look at how well each of the individual 'omics datasets informs us about the outcome. When we integrate the 'omics data together, we will be looking to take advantage of any differences in what each type of data tells us about the outcome.

## Metabolomics PLS-DA

In [None]:
# Fit a 5-component PLS-DA of the outcome on the metabolite matrix.
plsda.met <- mixOmics::plsda(X = mets_mat, Y = Outcome, ncomp = 5)

# Cross-validate it: 3 folds, repeated 10 times.
# NOTE: 10 repeats is low and should be increased for a better analysis,
# but this step is slow.
perf.plsda.met <- mixOmics::perf(
  plsda.met,
  validation = 'Mfold',
  folds = 3,
  progressBar = TRUE,
  nrepeat = 10
)

# Error rate per number of components, with standard-deviation bars.
plot(perf.plsda.met, sd = TRUE, legend.position = 'horizontal')

In [None]:
# Display the cross-validation result; the balanced error rate (BER) is
# not great.
perf.plsda.met

In [None]:
# Per-class error rates, printed to 3 significant digits
print(perf.plsda.met[["error.rate.class"]], digits = 3)

In [None]:
# Sample plots for the metabolite PLS-DA model, component 1 against each
# of components 2-5. The model was fitted with plsda() (no variable
# selection), so the titles say PLS-DA rather than sPLS-DA.
plotIndiv(plsda.met, comp = c(1,2), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Metabolite PLS-DA, comp 1 & 2')

In [None]:
plotIndiv(plsda.met, comp = c(1,3), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Metabolite PLS-DA, comp 1 & 3')

In [None]:
plotIndiv(plsda.met, comp = c(1,4), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Metabolite PLS-DA, comp 1 & 4')

In [None]:
plotIndiv(plsda.met, comp = c(1,5), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Metabolite PLS-DA, comp 1 & 5')

In [None]:
# Component 4 appears to add to the ability to separate Q4 from Q3 and Q5;
# the value of component 5 is less clear. Draw the ROC curves for
# component 4 and keep the AUC results without printing them.
met.auroc <- auroc(
  plsda.met,
  roc.comp = 4,
  print = FALSE
)

## Proteomics PLS-DA

In [None]:
# Fit a 5-component PLS-DA of the outcome on the protein matrix.
plsda.prots <- mixOmics::plsda(X = prots_mat, Y = Outcome, ncomp = 5)

# Cross-validate it: 3 folds, repeated 10 times.
# NOTE: 10 repeats is low and should be increased for a better analysis,
# but this step is slow.
perf.plsda.prots <- mixOmics::perf(
  plsda.prots,
  validation = 'Mfold',
  folds = 3,
  progressBar = TRUE,
  nrepeat = 10
)

# Error rate per number of components, with standard-deviation bars.
plot(perf.plsda.prots, sd = TRUE, legend.position = 'horizontal')

###  Exercise

Evaluate the relationship of the proteomics data to self-reported frailty index as we did for the metabolomics data above.

In [None]:
# Sample plots for the protein PLS-DA model, component 1 against each of
# components 2-5. The model was fitted with plsda() (no variable
# selection), so the titles say PLS-DA rather than sPLS-DA.
plotIndiv(plsda.prots, comp = c(1,2), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Protein PLS-DA, comp 1 & 2')

In [None]:
plotIndiv(plsda.prots, comp = c(1,3), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Protein PLS-DA, comp 1 & 3')

In [None]:
plotIndiv(plsda.prots, comp = c(1,4), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Protein PLS-DA, comp 1 & 4')

In [None]:
plotIndiv(plsda.prots, comp = c(1,5), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Protein PLS-DA, comp 1 & 5')

In [None]:
# ROC curves for component 2; AUC results are kept without printing them.
prots.auroc <- auroc(plsda.prots, roc.comp = 2, print = FALSE)

### End of exercise

## Clinical Tests PLS-DA

In [None]:
# Fit a 5-component PLS-DA of the outcome on the clinical-test matrix.
plsda.clin <- mixOmics::plsda(X = clin_mat, Y = Outcome, ncomp = 5)

# Cross-validate it: 3 folds, repeated 10 times.
# NOTE: 10 repeats is low and should be increased for a better analysis,
# but this step is slow.
perf.plsda.clin <- mixOmics::perf(
  plsda.clin,
  validation = 'Mfold',
  folds = 3,
  progressBar = TRUE,
  nrepeat = 10
)

# Error rate per number of components, with standard-deviation bars.
plot(perf.plsda.clin, sd = TRUE, legend.position = 'horizontal')

###  Exercise

Evaluate the relationship of the clinical test data to self-reported frailty index as we did for the metabolomics data above.

In [None]:
# Sample plots for the clinical-test PLS-DA model, component 1 against
# each of components 2-5. The model was fitted with plsda() (no variable
# selection), so the titles say PLS-DA rather than sPLS-DA.
plotIndiv(plsda.clin, comp = c(1,2), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Clinical PLS-DA, comp 1 & 2')

In [None]:
plotIndiv(plsda.clin, comp = c(1,3), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Clinical PLS-DA, comp 1 & 3')

In [None]:
plotIndiv(plsda.clin, comp = c(1,4), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Clinical PLS-DA, comp 1 & 4')

In [None]:
plotIndiv(plsda.clin, comp = c(1,5), # components to plot
          group = Outcome, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = 'Clinical PLS-DA, comp 1 & 5')

In [None]:
# ROC curves for component 4; AUC results are kept without printing them.
clin.auroc <- auroc(plsda.clin, roc.comp = 4, print = FALSE)

### End of exercise

In [None]:
# Summary of the ability of each 'omic type to classify each quintile of
# the self-reported Frailty Index (AUC of each quintile against the rest).
# The component read for each block is the one passed as roc.comp when that
# block's ROC curves were drawn: metabolites 4, proteins 2, clinical 4.
# (The clinical column previously read Comp3 while the ROC plot showed
# component 4, so table and plot disagreed.)
# NOTE(review): if component 3 was the intended clinical choice, change
# roc.comp in the clinical auroc() call instead and restore 'Comp3' here.
auroc.table <- cbind(
    met.auroc[['Comp4']][, 'AUC'],
    prots.auroc[['Comp2']][, 'AUC'],
    clin.auroc[['Comp4']][, 'AUC'])
dimnames(auroc.table) <- list(SelfFI = c('Q1','Q2','Q3','Q4','Q5'),
             Block = c("Metabolites","Proteins","Clinical"))
cat("Area Under ROC, predicting each quintile\n")
print(auroc.table)

# Full-strength DIABLO: Multiblock sPLS-DA

In [None]:
# Named list of predictor blocks for DIABLO, one matrix per data type.
X <- list(
  metabolite = mets_mat,
  protein = prots_mat,
  clinical = clin_mat
)

In [None]:
# Initial design: every pair of distinct blocks is linked with weight 0.1;
# the diagonal (a block with itself) is 0.
n_blocks <- length(X)
design <- matrix(
  0.1,
  nrow = n_blocks, ncol = n_blocks,
  dimnames = list(names(X), names(X))
)
diag(design) <- 0
design 

In [None]:
# For reference: this is a highly correlated data set.
# Requiring lower correlation in the design leads to higher prediction.
# For each pair of blocks, fit a one-component PLS and report the
# correlation between the two blocks' first-component scores.
report_pls_cor <- function(label, fit) {
  score_cor <- round(cor(fit$variates$X, fit$variates$Y)[1, 1], 3)
  cat(noquote(paste(label, score_cor, "\n")))
}

res1.pls <- pls(mets_mat, prots_mat, ncomp = 1)
report_pls_cor("cor(PLS.Metabolomics, PLS.Proteomics) =", res1.pls)

res2.pls <- pls(mets_mat, clin_mat, ncomp = 1)
report_pls_cor("cor(PLS.Metabolomics, PLS.Clinical)   =", res2.pls)

res3.pls <- pls(prots_mat, clin_mat, ncomp = 1)
report_pls_cor("cor(PLS.Proteomics,   PLS.Clinical)   =", res3.pls)

In [None]:
# WARNING: this cell takes about 20 minutes to run!
# Non-sparse multiblock PLS-DA with 5 components, used to decide how many
# components are worth keeping.
diablo.selfFI <- block.plsda(X = X, Y = Outcome, ncomp = 5, design = design)

# 10-fold cross-validation, repeated 10 times.
perf.diablo.selfFI <- mixOmics::perf(
  diablo.selfFI,
  validation = 'Mfold',
  progressBar = TRUE,
  folds = 10,
  nrepeat = 10
)

# Plot of the error rates based on weighted vote
plot(perf.diablo.selfFI)

In [None]:
# Number of components suggested by the weighted-vote error rates
perf.diablo.selfFI[["choice.ncomp"]][["WeightedVote"]]

In [None]:
# Number of components for the final model, fixed by hand. The data-driven
# alternative is left here for reference:
# ncomp <- perf.diablo.selfFI$choice.ncomp$WeightedVote["Overall.BER", "centroids.dist"]
ncomp <- 4

In [None]:
## Tuning the sparsity of the components

In [None]:
# Variable tuning: choose how many features each block keeps on each
# component (keepX).
# Candidate values: metabolites and proteins 3, 6, 9, 12; clinical 3, 8.
# The grid is deliberately coarse to keep the run time down.

startTime <- Sys.time() 
test.keepX <- list(metabolite = seq(3, 12, 3),
                   protein = seq(3, 12, 3),
                   clinical = seq(3, 9, 5))

# Tune as many components as the final model uses (`ncomp`, set in an
# earlier cell). Tuning only 2 components while fitting `ncomp` = 4 left
# components 3 and 4 of the final model without a tuned keepX.
# nrepeat = 1 is for speed only and should be raised for a final model.
# NOTE(review): workers = 16 is hard-coded; lower it on smaller machines.
tune.diablo.selfFI <- tune.block.splsda(X, Outcome, ncomp = ncomp, 
                              test.keepX = test.keepX, design = design,
                              validation = 'Mfold', folds = 10, nrepeat = 1,
                              BPPARAM = BiocParallel::SnowParam(workers = 16),
                              dist = "centroids.dist")

# Selected number of features per block and component
list.keepX <- tune.diablo.selfFI$choice.keepX

endTime <- Sys.time() 
print(endTime - startTime)

In [None]:
# Show the keepX values chosen by the tuning step, per block
print(list.keepX)

### Final model

In [None]:
# Final sparse multiblock model: `ncomp` components, keeping the tuned
# number of features per block, with the same design matrix as before.
diablo.selfFI.final <- block.splsda(
  X = X,
  Y = Outcome,
  ncomp = ncomp,
  keepX = list.keepX,
  design = design
)

In [None]:
# Sample plots from the final model: component 1 against components 2, 3
# and 4, coloured by outcome quintile, with an ellipse per class.
plotIndiv(
  diablo.selfFI.final,
  comp = c(1, 2),
  group = Outcome,
  ind.names = FALSE,
  ellipse = TRUE,
  legend = TRUE,
  title = 'Multiomic sPLS-DA, comp 1 & 2'
)

In [None]:
plotIndiv(
  diablo.selfFI.final,
  comp = c(1, 3),
  group = Outcome,
  ind.names = FALSE,
  ellipse = TRUE,
  legend = TRUE,
  title = 'Multiomic sPLS-DA, comp 1 & 3'
)

In [None]:
plotIndiv(
  diablo.selfFI.final,
  comp = c(1, 4),
  group = Outcome,
  ind.names = FALSE,
  ellipse = TRUE,
  legend = TRUE,
  title = 'Multiomic sPLS-DA, comp 1 & 4'
)

In [None]:
# ROC curves of the final model for each block, using component 4; the
# AUC results are kept without printing them.
auc.diablo.prots <- auroc(
  diablo.selfFI.final,
  roc.block = "protein",
  roc.comp = 4,
  print = FALSE
)

In [None]:
auc.diablo.met <- auroc(
  diablo.selfFI.final,
  roc.block = "metabolite",
  roc.comp = 4,
  print = FALSE
)

In [None]:
auc.diablo.clin <- auroc(
  diablo.selfFI.final,
  roc.block = "clinical",
  roc.comp = 4,
  print = FALSE
)

In [None]:
# DIABLO diagnostic plot of the final model, drawn once for each of
# components 1 to 4 (see ?plotDiablo for what the panels show).
plotDiablo(diablo.selfFI.final, ncomp = 1)

In [None]:
plotDiablo(diablo.selfFI.final, ncomp = 2)

In [None]:
plotDiablo(diablo.selfFI.final, ncomp = 3)

In [None]:
plotDiablo(diablo.selfFI.final, ncomp = 4)

In [None]:
# Variable plots of the final model, first on components 1-2 and then on
# components 3-4; each block gets its own symbol and colour.
plotVar(
  diablo.selfFI.final,
  comp = c(1, 2),
  var.names = FALSE,
  style = 'graphics',
  legend = TRUE,
  pch = c(16, 17, 15),
  cex = c(2, 2, 2),
  col = c('darkorchid', 'brown1', 'lightgreen'),
  title = 'Self-reported Frailty Index, DIABLO comp 1 - 2'
)

In [None]:
plotVar(
  diablo.selfFI.final,
  comp = c(3, 4),
  var.names = FALSE,
  style = 'graphics',
  legend = TRUE,
  pch = c(16, 17, 15),
  cex = c(2, 2, 2),
  col = c('darkorchid', 'brown1', 'lightgreen'),
  title = 'Self-reported Frailty Index, DIABLO comp 3 - 4'
)

In [None]:
# For each predictor block of the final model, print the loadings of the
# features whose loading vector (across all components) has Euclidean
# length greater than `threshold`.
threshold <- 0.25

# Every entry of $loadings except the outcome ("Y") is a predictor block;
# selecting by name avoids hard-coding the number of blocks.
block_names <- setdiff(names(diablo.selfFI.final$loadings), "Y")

for (omic in block_names) {
    block_loadings <- diablo.selfFI.final$loadings[[omic]]
    # Row-wise Euclidean norm, computed without defining a global `norm`
    # (which would mask base::norm).
    loading_size <- sqrt(rowSums(block_loadings * block_loadings))
    # drop = FALSE keeps a one-row result as a labelled matrix instead of
    # collapsing it to a bare vector.
    print(block_loadings[loading_size > threshold, , drop = FALSE], digits = 3)
    cat("\n")
}

In [None]:
# Clustered image map (heatmap) of the features selected on component 1 of
# the final model, with one colour per block.
# `margins` is spelled out in full: the previous `margin =` only worked
# through partial argument matching.
cimDiablo(
  diablo.selfFI.final,
  color.blocks = c('darkorchid', 'brown1', 'lightgreen'),
  comp = 1,
  margins = c(8, 20),
  legend.position = "right"
)

In [None]:
# Cross-validated performance of the final model: 10 folds, repeated 10
# times, using the centroids distance.
perf.diablo.selfFI.final <- perf(
  diablo.selfFI.final,
  validation = 'Mfold',
  folds = 10,
  nrepeat = 10,
  dist = 'centroids.dist'
)