# Session 1-2 Notebook for Systems Biology of Aging 
This notebook is part of the first session of the 2024 Systems Biology of Aging Workshop. Don't worry about setup: all of the files and R packages should already be installed in the SageMaker environment.

This Notebook will use the "R" kernel. Double check that this is correct by looking in the top right corner.

> * Multi-Omic WGCNA
>   * Integrate Omics
>   * Correlate with frailty
>   * Explore results

## Setup
The next couple of code blocks will load the R packages into our environment and set some options for nicer visualizations.

In [None]:
# Load packages, one per line for clarity
suppressMessages(library("tidyverse", quietly = TRUE, warn.conflicts=FALSE))
suppressMessages(library("ggplot2", quietly = TRUE, warn.conflicts=FALSE))
suppressMessages(library("WGCNA", quietly = TRUE, warn.conflicts=FALSE))
suppressMessages(library("org.Hs.eg.db", quietly = TRUE, warn.conflicts=FALSE))
suppressMessages(library("clusterProfiler", quietly = TRUE, warn.conflicts=FALSE))
suppressMessages(library("gplots", quietly = TRUE, warn.conflicts=FALSE))
suppressMessages(library("scales", quietly = TRUE, warn.conflicts=FALSE))
suppressMessages(library("Matrix", quietly = TRUE, warn.conflicts=FALSE))
suppressMessages(library("colorspace", quietly = TRUE, warn.conflicts=FALSE))

# Notebook-wide configuration
# Helper functions for plotting
source("Scripts/Workshop_scripts.R")
options(
    stringsAsFactors = FALSE,    # Required for WGCNA
    repr.plot.width = 12,        # Plot size (default is 7x7)
    repr.plot.height = 12,
    repr.matrix.max.rows = 75,   # Limits on how much of a table is displayed
    repr.matrix.max.cols = 20,
    warn = -1                    # A negative value silences all warnings
)
# Multi-threading for WGCNA is left disabled:
# enableWGCNAThreads(nThreads=2)

In [None]:
# Baseline metabolite, protein and clinical chemistry tables
mets <- read_delim(
    "./Session1_files/metabolites_baseline.tsv",
    show_col_types = FALSE
)
prots <- read_delim(
    "./Session1_files/proteins_baseline.tsv",
    show_col_types = FALSE
)
clin <- read_delim(
    "./Session1_files/chemistries_baseline.tsv",
    show_col_types = FALSE
)

# Analyte metadata (the first 13 lines of each file are skipped)
# Proteins
meta_protein <- read_delim(
    "../data/arivale_snapshot_ISB_2019-05-10_0053/proteomics_metadata.tsv",
    delim = '\t', skip = 13, show_col_types = FALSE
)
# Metabolites
meta_metabolites <- read_delim(
    "../data/arivale_snapshot_ISB_2019-05-10_0053/metabolomics_metadata.tsv",
    delim = '\t', skip = 13, show_col_types = FALSE
)

# Frailty measures and the key listing the features behind the frailty index
fr_measures <- read_delim(
    "../data/frailty/combination_fi_040124.csv",
    show_col_types = FALSE
)
fr_measures_key <- read_delim(
    "../Useful_Files/FI_Features.txt",
    delim = "\t", show_col_types = FALSE
)

Merge the dataframes to create a multi-omics dataframe that contains only the participants shared across all data types. 

In [None]:
# Remove the clinical lab tests that are listed as frailty features
# (they were used to generate lab_fi).
keep_clin_cols <- !(colnames(clin) %in% fr_measures_key$Feature)
clin_no_frail <- clin[keep_clin_cols]

# Keep only participants present in every data type: merge() keeps the
# rows whose public_client_id occurs in both inputs.
cm_df <- merge(clin_no_frail, mets, by = "public_client_id")
pf_df <- merge(prots, fr_measures, by = "public_client_id")
in_df <- merge(cm_df, pf_df, by = "public_client_id")

# Report the dimensions of every input and of the merged table
print("Clinical labs")
dim(clin)
print("-> w/o tests that were used to generate lab_fi")
dim(clin_no_frail)
print("Metabolites")
dim(mets)
print("Proteins")
dim(prots)
print("FI measures")
dim(fr_measures)
print("Merged all")
dim(in_df)

# Keep the analyte columns only (frailty columns are left out) and move the
# participant id into the row names, the layout the WGCNA functions expect.
feature_cols <- unique(c(colnames(clin_no_frail), colnames(mets), colnames(prots)))
num_df <- in_df %>%
    dplyr::select(dplyr::all_of(feature_cols)) %>%
    column_to_rownames(var = "public_client_id")
print("Final DF used in this analysis")
dim(num_df)
head(num_df)

## Data cleaning

In [None]:
## Drop samples and features that fail the WGCNA missing-value criteria (50%)

gsg <- goodSamplesGenes(num_df, verbose = 1)
gsg$allOK
if (!gsg$allOK) {
    # Report what is about to be removed (optional, for the record)
    dropped_features <- names(num_df)[!gsg$goodGenes]
    dropped_samples <- rownames(num_df)[!gsg$goodSamples]
    if (length(dropped_features) > 0) {
        printFlush(paste("Removing genes:", paste(dropped_features, collapse = ", ")))
    }
    if (length(dropped_samples) > 0) {
        printFlush(paste("Removing samples:", paste(dropped_samples, collapse = ", ")))
    }
    # Keep only the samples (rows) and features (columns) flagged as good
    num_df <- num_df[gsg$goodSamples, gsg$goodGenes]
}

dim(num_df)

# Clustering combined 'omics data

The abundance of two arbitrarily chosen proteins in proteomics data is typically positively correlated, as are arbitrarily chosen genes in transcriptome data. We can therefore choose to use the power functions typically used in WGCNA analyses, or the more general sigmoid functions, to convert correlations into adjacencies (or distances) for clustering proteins in WGCNA.

## Correlations



In [None]:
# Names of the analytes that survived filtering, grouped by data type
cat.prots <- intersect(colnames(prots), colnames(num_df))
cat.mets <- intersect(colnames(mets), colnames(num_df))
cat.clin <- intersect(colnames(clin), colnames(num_df))
all.analytes <- c(cat.prots, cat.mets, cat.clin)

# Report how many analytes remain in each category and overall
analyte_counts <- c(
    "Proteins" = length(cat.prots),
    "Metabolites" = length(cat.mets),
    "Clinical labs" = length(cat.clin),
    "All analytes" = length(all.analytes)
)
for (count_label in names(analyte_counts)) {
    print(count_label)
    print(analyte_counts[[count_label]])
}
n.analytes <- length(all.analytes)

# Correlations need numeric features; Spearman correlation is used downstream.
# Columns are ordered proteins, then metabolites, then clinical labs.
all_df <- num_df[, all.analytes]

In [None]:
# Computing the correlations is slow, so the precomputed matrices are loaded
# from disk instead. The code that generated them is kept (commented out) below.

omic_cor <- readRDS("../Useful_Files/Omics_Cor.Rds")
Z.pp <- omic_cor$Prot_Prot
Z.mm <- omic_cor$Met_Met
Z.cc <- omic_cor$Clin_Clin

Z.pm <- omic_cor$Prot_Met
Z.pc <- omic_cor$Prot_Clin
Z.mc <- omic_cor$Met_Clin

# # # Within-category correlations
# Z.pp <- cor(all_df[,cat.prots], method = "s", use = 'pairwise.complete.obs')
# Z.mm <- cor(all_df[,cat.mets], method = "s", use = 'pairwise.complete.obs')
# Z.cc <- cor(all_df[,cat.clin], method = "s", use = 'pairwise.complete.obs')

# # Cross-category correlations
# Z.pm <- cor(all_df[,cat.prots], all_df[,cat.mets], method = "s", use = 'pairwise.complete.obs')
# Z.pc <- cor(all_df[,cat.prots], all_df[,cat.clin], method = "s", use = 'pairwise.complete.obs')
# Z.mc <- cor(all_df[,cat.mets], all_df[,cat.clin], method = "s", use = 'pairwise.complete.obs')
# # A message from cor() here indicates that at least one analyte pair did not have
# # at least 2 samples with values for both analytes, resulting in an NA correlation.
# # Such NA entries are replaced with 0.
# length(which(is.na(Z.pm)))
# length(which(is.na(Z.pc)))
# length(which(is.na(Z.mc)))
# Z.mc[is.na(Z.mc)] <- 0
# length(which(is.na(Z.mc)))

### Modeling protein-protein correlations


In [None]:
# Fit a Beta distribution to the protein-protein correlations by the method
# of moments: choose the parameters whose mean and variance match those of the
# "background" model (initially: all observed correlations).
Z.unique <- Z.pp[upper.tri(Z.pp)] # unique, non-self correlations
x <- (Z.unique + 1) / 2 # map correlations from [-1, 1] onto [0, 1]
mZ <- mean(x)
s2Z <- var(x)
# Factor shared by both shape parameters
mom_scale <- mZ * (1 - mZ) / s2Z - 1
v.pp <- mZ * mom_scale
w.pp <- (1 - mZ) * mom_scale
print(paste("Protein-protein: rho_ij ~ Beta(v =",round(v.pp,3),",w =",round(w.pp,3),")"))

In [None]:
# Compare the fitted model with the observed background distribution
fine <- 40 # number of histogram bins per unit of correlation
# Bin edges are offset by half a bin so that bins are centred on k/fine
Bs <- (seq(-fine, fine + 1) - 0.5) / fine
hist(Z.unique, breaks = Bs, prob = TRUE,
     main = "Pairwise protein correlations",
     xlab = "Correlation", ylab = "Density")
box()
abline(v = c(-1, 0, 1), lty = 3)
# Model density on the correlation scale: r = 2x - 1, hence the factor 1/2
r <- seq(-fine, fine) / fine
lines(r, dbeta((1 + r) / 2, v.pp, w.pp) / 2, lwd = 3, col = "MediumBlue")

The model doesn't fit the background distribution well enough (by eye). We will adjust the parameters until the model fits.

In [None]:
# Hand-tuned Beta parameters replacing the method-of-moments estimates
v.pp <- 38
w.pp <- 30
print(paste("Protein pairs: rho_ij ~ Beta(v =",round(v.pp,3),",w =",round(w.pp,3),")"))

fine <- 40 # number of histogram bins per unit of correlation
# Bin edges are offset by half a bin so that bins are centred on k/fine
Bs <- (seq(-fine, fine + 1) - 0.5) / fine
hist(Z.unique, breaks = Bs, prob = TRUE, ylim = c(0, 3.5),
     main = "Pairwise protein correlations",
     xlab = "Correlation", ylab = "Density")
box()
abline(v = c(-1, 0, 1), lty = 3)

# Model density on the correlation scale: r = 2x - 1, hence the factor 1/2
r <- seq(-fine, fine) / fine
lines(r, dbeta((1 + r) / 2, v.pp, w.pp) / 2, lwd = 3, col = "MediumBlue")

### Modeling by category: metabolite-metabolite

In [None]:
# Unique, non-self metabolite-metabolite correlations (upper triangle)
Z.unique <- as.vector(Z.mm[upper.tri(Z.mm)])

# Beta parameters of the background model for metabolite pairs
v.mm <- 59.31
w.mm <- 56.38
print(paste("Metabolite pairs: rho_ij ~ Beta(v =",round(v.mm,3),",w =",round(w.mm,3),")"))

In [None]:
# Histogram of metabolite pair correlations with the Beta model overlaid

fine <- 40 # number of histogram bins per unit of correlation
# Bin edges are offset by half a bin so that bins are centred on k/fine
Bs <- (seq(-fine, fine + 1) - 0.5) / fine
hist(Z.unique, breaks = Bs, prob = TRUE, ylim = c(0, 4.5),
     main = "Pairwise metabolite correlations",
     xlab = "Correlation", ylab = "Density")
box()
abline(v = c(-1, 0, 1), lty = 3)

# Model density on the correlation scale: r = 2x - 1, hence the factor 1/2
r <- seq(-fine, fine) / fine
lines(r, dbeta((1 + r) / 2, v.mm, w.mm) / 2, lwd = 3, col = "MediumBlue")

### Modeling by category: Clin-Clin

In [None]:
# Unique, non-self clinical chemistry correlations (upper triangle)
Z.unique <- as.vector(Z.cc[row(Z.cc) < col(Z.cc)])

# Beta parameters of the background model for clinical chemistry pairs
v.cc <- 38
w.cc <- 37
# The label names the clinical chemistry pairs that are modeled here
# (it previously read "Metabolite pairs", copied from the metabolite cell).
print(paste("Clinical chemistry pairs: rho_ij ~ Beta(v =",round(v.cc,3),",w =",round(w.cc,3),")"))

fine <- 20 # number of histogram bins per unit of correlation
# Bin edges are offset by half a bin so that bins are centred on k/fine
Bs <- (c(-fine:(1+fine))-0.5)/fine
hist(Z.unique, breaks=Bs, xlab="Correlation", ylab="Density", ylim=c(0,4),
     main="Pairwise clinical chemistry correlations", prob=TRUE)
box()
abline(v=c(-1:1),lty=3)

# Model density on the correlation scale: r = 2x - 1, hence the factor 1/2
r <- c(-fine:fine)/fine
lines(r, dbeta((1+r)/2, v.cc, w.cc)/2, lwd=3, col="MediumBlue")

## Modeling cross-category correlations


### Modeling cross-category correlations: protein-metabolite

In [None]:
# Building a background model:
# 1. Find the mean and variance of the observed correlations
# 2. Estimate parameters and compare the background model to the histogram
#    of observed correlations
# 3. Revise the mean and variance, recompute parameters, and compare again
#    until satisfied with the fit to the background
#
dim(Z.pm)
# A cross-category matrix has no self-comparisons and no repeats due to
# symmetry, so every entry is used.
Z.unique <- as.vector(Z.pm)
v.pm <- 88
w.pm <- 85
print(paste("Protein-metabolite: rho_ij ~ Beta(v =",round(v.pm,3),",w =",round(w.pm,3),")"))

# The distribution of these cross-correlations is
# markedly narrower than either of the contributing 'omics

fine <- 40 # number of histogram bins per unit of correlation
# Bin edges are offset by half a bin so that bins are centred on k/fine
Bs <- (seq(-fine, fine + 1) - 0.5) / fine
hist(Z.unique, breaks = Bs, prob = TRUE, ylim = c(0, 6),
     main = "Protein-metabolite correlations",
     xlab = "Correlation", ylab = "Density")
box()
abline(v = c(-1, 0, 1), lty = 3)

# Model density on the correlation scale: r = 2x - 1, hence the factor 1/2
r <- seq(-fine, fine) / fine
lines(r, dbeta((1 + r) / 2, v.pm, w.pm) / 2, lwd = 3, col = "MediumBlue")


### Modeling cross-category correlations: protein-clinical

In [None]:
# All protein-clinical correlations; the whole cross-category matrix is used
Z.unique <- as.vector(Z.pc)

# Beta parameters of the background model for protein-clinical pairs
v.pc <- 59
w.pc <- 57
print(paste("Protein-clinical: rho_ij ~ Beta(v =",round(v.pc,3),",w =",round(w.pc,3),")"))

fine <- 30 # number of histogram bins per unit of correlation

# Bin edges are offset by half a bin so that bins are centred on k/fine
Bs <- (seq(-fine, fine + 1) - 0.5) / fine
hist(Z.unique, breaks = Bs, prob = TRUE, ylim = c(0, 5),
     main = "Protein-clinical chemistry correlations",
     xlab = "Correlation", ylab = "Density")
box()
abline(v = c(-1, 0, 1), lty = 3)

# Model density on the correlation scale: r = 2x - 1, hence the factor 1/2
r <- seq(-fine, fine) / fine
lines(r, dbeta((1 + r) / 2, v.pc, w.pc) / 2, lwd = 3, col = "MediumBlue")

### Modeling cross-category correlations: metabolite-clinical

In [None]:
# All metabolite-clinical correlations; the whole cross-category matrix is used
Z.unique <- as.vector(Z.mc)

# Beta parameters of the background model for metabolite-clinical pairs
v.mc <- 62
w.mc <- 61
# Label and plot title name the metabolite-clinical pairs shown here (they
# previously read "Signed" and "Protein-clinical", copied from other cells).
print(paste("Metabolite-clinical: rho_ij ~ Beta(v =",round(v.mc,3),",w =",round(w.mc,3),")"))

fine <- 30 # number of histogram bins per unit of correlation
# Bin edges are offset by half a bin so that bins are centred on k/fine
Bs <- (c(-fine:(1+fine))-0.5)/fine
hist(Z.unique, breaks=Bs, xlab="Correlation", ylab="Density", ylim=c(0,5),
     main="Metabolite-clinical chemistry correlations", prob=TRUE)
box()
abline(v=c(-1:1),lty=3)

# Model density on the correlation scale: r = 2x - 1, hence the factor 1/2
r <- c(-fine:fine)/fine
lines(r, dbeta((1+r)/2, v.mc, w.mc)/2, lwd=3, col="MediumBlue")


## Merging correlations from disparate data subsets


In [None]:
# Centering the protein, metabolite, and clinical chemistry correlations
#
# Null model for the observed correlations:  r          ~ 2 Beta(v, w)   - 1
# Target (centered) distribution:            r_centered ~ 2 Beta(nu, nu) - 1
#
# center.beta() maps each correlation to the value that has the same
# cumulative probability under the target distribution as `r` has under the
# null model.
#   r     : correlation(s) in [-1, 1]
#   v, w  : shape parameters of the null Beta model
#   nu    : shape parameter of the symmetric target Beta(nu, nu)
# Returns the centered correlation(s), on the [-1, 1] scale.
center.beta <- function(r, v, w, nu) {
    # Cumulative probability of r under the null model (on the [0, 1] scale)
    null_prob <- pbeta((1 + r) / 2, v, w)
    # Matching quantile of the target model, mapped back to [-1, 1]
    2 * qbeta(null_prob, nu, nu) - 1
}

# Shape of the common target distribution: a little wider than the actual
# distributions, and centered at 0
nu.std <- 32

# Within-category blocks
Zc.pp <- center.beta(r = Z.pp, v = v.pp, w = w.pp, nu = nu.std)
Zc.mm <- center.beta(r = Z.mm, v = v.mm, w = w.mm, nu = nu.std)
Zc.cc <- center.beta(r = Z.cc, v = v.cc, w = w.cc, nu = nu.std)
# Cross-category blocks
Zc.pm <- center.beta(r = Z.pm, v = v.pm, w = w.pm, nu = nu.std)
Zc.pc <- center.beta(r = Z.pc, v = v.pc, w = w.pc, nu = nu.std)
Zc.mc <- center.beta(r = Z.mc, v = v.mc, w = w.mc, nu = nu.std)

## Construct a complete, centered correlation matrix

In [None]:
# Assemble all centered correlations into one square matrix

all.analytes <- c(cat.prots, cat.mets, cat.clin)
n_all <- length(all.analytes)
Zc <- matrix(0, nrow = n_all, ncol = n_all,
             dimnames = list(all.analytes, all.analytes))

###
# Block-structured correlation matrix
# Zc = [ PP     PM   PC |
#      | PM^T   MM   MC |
#      | PC^T  MC^T  CC ]
###
# NOTE(review): blocks are written by position into the named rows/columns,
# which assumes each loaded matrix is ordered like cat.prots / cat.mets /
# cat.clin -- confirm against how Omics_Cor.Rds was generated.
# Diagonal (within-category) blocks
Zc[cat.prots, cat.prots] <- Zc.pp
Zc[cat.mets, cat.mets] <- Zc.mm
Zc[cat.clin, cat.clin] <- Zc.cc

# Off-diagonal (cross-category) blocks and their transposes
Zc[cat.prots, cat.mets] <- Zc.pm
Zc[cat.mets, cat.prots] <- t(Zc.pm)

Zc[cat.prots, cat.clin] <- Zc.pc
Zc[cat.clin, cat.prots] <- t(Zc.pc)

Zc[cat.mets, cat.clin] <- Zc.mc
Zc[cat.clin, cat.mets] <- t(Zc.mc)

# Unique, non-self centered correlations (upper triangle)
Z.unique <- Zc[upper.tri(Zc)]
print(paste("Target: rho_ij ~ Beta(v =",round(nu.std,3),",w =",round(nu.std,3),")"))

# Method-of-moments Beta fit to the centered correlations
x <- (Z.unique + 1) / 2
mZ <- mean(x)
s2Z <- var(x)
mom_scale <- mZ * (1 - mZ) / s2Z - 1
v.c <- mZ * mom_scale
w.c <- (1 - mZ) * mom_scale
print(paste("Method of moments: rho_ij ~ Beta(v =",round(v.c,3),",w =",round(w.c,3),")"))


fine <- 100 # number of histogram bins per unit of correlation
Zc.unique <- as.vector(Z.unique)
# Bin edges are offset by half a bin so that bins are centred on k/fine
Bs <- (seq(-fine, fine + 1) - 0.5) / fine
hist(Zc.unique, breaks = Bs, prob = TRUE, ylim = c(0, 5),
     main = "All pairwise correlations, centered",
     xlab = "Correlation", ylab = "Density")
box()
abline(v = c(-1, 0, 1), lty = 3)

# Overlay the target model (orange-red) and the method-of-moments fit (blue)
r <- seq(-fine, fine) / fine
lines(r, dbeta((1 + r) / 2, nu.std, nu.std) / 2, lwd = 3, col = "orangered")
lines(r, dbeta((1 + r) / 2, v.c, w.c) / 2, lwd = 3, col = "MediumBlue")


These are now standardized correlations. The mean and variance of this distribution suggest a model (shown in blue) that fits less well than the standardizing model (in orange); this is a consequence of the differences between the models we fitted and the empirical distributions, and indicates that the enrichment of high correlations we observed in the individual 'omics distributions has been preserved. If we had used quantile normalization, the overabundance of high correlations would have been shifted to lower correlation values, and the fitted blue model would be identical to the standardizing model.

## Standard Signed WGCNA analysis (via a power function indicative of a scale-free network)

In [None]:
# Convert the centered correlations (in [-1, 1]) by hand into the
# signed-network similarity (in [0, 1])
Zc_signed <- 0.5 + 0.5 * Zc

print(str_c("nrow: ", nrow(Zc_signed)))
head(Zc_signed)

# Candidate soft-thresholding powers and the R^2 cut-off to reach
powers <- seq(from = 1, to = 15, by = 1)
cutoff <- 0.8

# Network topology analysis over the candidate powers
sft <- pickSoftThreshold.fromSimilarity(Zc_signed, powerVector = powers,
                                        RsquaredCut = cutoff, blockSize = NULL,
                                        verbose = 5)

# Plot the results side by side
options(repr.plot.width = 9, repr.plot.height = 5)
par(mfrow = c(1, 2))
cex1 <- 0.8
fit_tbl <- sft$fitIndices
# Signed fit index: column 2 with the sign of column 3 flipped
signed_fit <- -sign(fit_tbl[, 3]) * fit_tbl[, 2]

## Scale-free topology fit index as a function of the soft-thresholding power
plot(fit_tbl[, 1], signed_fit, type = "n",
     main = "Scale independence",
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit, signed R^2")
text(fit_tbl[, 1], signed_fit, labels = powers, cex = cex1, col = "black")
## The red line marks the chosen R^2 cut-off
abline(h = cutoff, col = "red")

## Mean connectivity as a function of the soft-thresholding power
plot(fit_tbl[, 1], fit_tbl[, 5], type = "n",
     main = "Mean connectivity",
     xlab = "Soft Threshold (power)", ylab = "Mean Connectivity")
text(fit_tbl[, 1], fit_tbl[, 5], labels = powers, cex = cex1, col = "black")

print(str_c("Estimated soft-thresholding power: ", sft$powerEstimate))

In [None]:
# Choose the power that best approximates a scale-free topology while still
# maintaining a high level of connectivity in the network
softPower <- sft$powerEstimate
print(softPower)
# No estimate is available when none of the candidate powers reaches the R^2
# cut-off; stop here with a clear message instead of failing further down.
if (length(softPower) != 1 || is.na(softPower)) {
    stop("No soft-thresholding power reached the R^2 cut-off; ",
         "set softPower manually from the scale-independence plot.",
         call. = FALSE)
}

# Generate the adjacency matrix using the chosen soft-thresholding power.
# The correlations in [-1, 1] are passed in; type="signed" selects the
# signed-network transformation.
adjacency <- adjacency.fromSimilarity(Zc, power=softPower, type="signed")

print(str_c("nrow: ", nrow(adjacency)))
#head(adjacency)

# Turn adjacency into topological overlap
## You can input whatever matrix you want here!
TOM <- TOMsimilarity(adjacency, TOMType="signed")

# Turn into distance matrix
dissTOM <- 1 - TOM

print(str_c("nrow: ", nrow(dissTOM)))
head(dissTOM)

# Average-linkage hierarchical clustering on the TOM-based dissimilarity
geneTree <- hclust(as.dist(dissTOM), method="average")

# Plot the resulting clustering tree (dendrogram)
options(repr.plot.width=12, repr.plot.height=6)
plot(geneTree, xlab="", sub="", main="Gene clustering on TOM-based dissimilarity",
     labels=FALSE, hang=0.04)

In [None]:
# Larger modules can be easier to interpret, so the minimum module size is
# set relatively high: at least 20, or 1/200 of the analytes if that is more
minModuleSize <- max(20, round(ncol(all_df) / 200))
print(str_c("minClusterSize = ", minModuleSize))

# Identify modules with the dynamic tree cut
dynamicMods <- cutreeDynamic(
    dendro = geneTree,
    distM = dissTOM,
    deepSplit = 4,
    pamStage = TRUE,
    pamRespectsDendro = FALSE,
    minClusterSize = minModuleSize
)
table(dynamicMods)

# Convert numeric labels into colors
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)

# Draw the dendrogram with the module colors underneath
options(repr.plot.width = 12, repr.plot.height = 6)
plotDendroAndColors(
    geneTree, dynamicColors, "Dynamic Tree Cut",
    main = "Gene dendrogram and module colors",
    dendroLabels = FALSE, hang = 0.03,
    addGuide = TRUE, guideHang = 0.05
)

In [None]:
# Compute the module eigengenes from the analyte matrix
MEList <- moduleEigengenes(all_df, colors = dynamicColors, impute = TRUE, nPC = 2)
MEs <- MEList$eigengenes
print(str_c("nrow: ", nrow(MEs)))
head(MEs)

# Dissimilarity between module eigengenes: 1 - correlation
ME_cor <- cor(MEs, use = "pairwise.complete.obs")
MEDiss <- 1 - ME_cor

# Average-linkage clustering of the module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average")

# Plot the tree; the red line marks the dissimilarity threshold
options(repr.plot.width = 10, repr.plot.height = 5)
plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "")
MEDissThres <- 0.3
abline(h = MEDissThres, col = "red")

## Clean data frames

In [None]:
# Clean the module eigengene table: participant id as a column, and module
# names without the "ME" prefix, in title case
eigengene_df <- MEs %>%
    rownames_to_column(var="public_client_id")
# [-1] skips the id column and stays correct even if there are no modules
names(eigengene_df)[-1] <- names(eigengene_df)[-1] %>%
    str_replace(., "^ME", "") %>%
    str_to_title(.)

## Sample metadata
# Keep the participants that have eigengenes and put them in exactly the row
# order of MEs, so the later module-trait correlation pairs up the same
# participant in both tables (rather than relying on two sorts agreeing).
sample_tbl <- fr_measures %>%
    dplyr::filter(public_client_id %in% rownames(MEs)) %>%
    dplyr::arrange(match(public_client_id, rownames(MEs)))
print("Sample metadata")
print(str_c("- nrow: ", nrow(sample_tbl)))

# Code sex and race as binary variables
phenotype_tbl <- sample_tbl %>%
    dplyr::mutate(BinarySex=ifelse(sex=="F", 0, 1),
                  BinaryRace=ifelse(race=="white", 0, 1)) %>%
    # A missing race is coded as 1, like every value other than "white"
    dplyr::mutate(BinaryRace=tidyr::replace_na(BinaryRace, 1)) %>%
    dplyr::select(public_client_id, BinarySex, BinaryRace, age, self_fi, lab_fi, merge_fi) %>%
    # Move ids to row names for easily applying to the WGCNA functions
    column_to_rownames(var="public_client_id")

# Check phenotypes
print("Sample metadata")
print(str_c("- nrow: ", nrow(phenotype_tbl)))
print("- Contingency of BinarySex")
table(phenotype_tbl$BinarySex)
print("- Contingency of BinaryRace")
table(phenotype_tbl$BinaryRace)

## Module-trait relationship

In [None]:
# Number of samples (used for the correlation p-values)
#nModules <- ncol(MEs)
nSamples <- nrow(phenotype_tbl)

# Names (colors) of the modules: column names of MEs without the "ME" prefix
modNames <- substring(names(MEs), 3)

## cor() pairs rows by position, so both tables must list the same
## participants in the same order. Report the check and stop on a mismatch
## instead of computing correlations between different participants.
ids_match <- identical(rownames(MEs), rownames(phenotype_tbl))
print(str_c("Matched IDs?: ", ids_match))
if (!ids_match) {
    stop("Row names of MEs and phenotype_tbl differ; align them before ",
         "computing module-trait correlations.", call. = FALSE)
}

# Calculate module–trait relationship
moduleTraitCor <- as.data.frame(cor(MEs, phenotype_tbl, use="p"))
rownames(moduleTraitCor) <- str_to_title(modNames)
print("Module–trait relationship table")
print(str_c("nrow: ", nrow(moduleTraitCor)))

# Calculate statistical significance of module–trait relationship
MTRpval <- as.data.frame(corPvalueStudent(as.matrix(moduleTraitCor), nSamples))
rownames(MTRpval) <- str_to_title(modNames)
print("Module–trait relationship p-value table")
print(str_c("- nrow: ", nrow(MTRpval)))

# Eliminate the dummy module (Grey)
moduleTraitCor <- moduleTraitCor[rownames(moduleTraitCor)!="Grey",]
MTRpval <- MTRpval[rownames(MTRpval)!="Grey",]

# P-value adjustment across modules (per trait) using Benjamini–Hochberg method.
# `n` is passed by name: the second positional argument of p.adjust() is `method`.
MTRpval_adj <- as.data.frame(apply(MTRpval, 2, function(x){p.adjust(x, method="BH", n=length(x))}))
print("Module–trait relationship adjusted p-value table")
print(str_c("- nrow: ", nrow(MTRpval_adj)))

# Prepare text labels (correlation and adjusted p-value) as a matrix
textMatrix <- paste("r = ",signif(as.matrix(moduleTraitCor), 3),"\n(P = ",
                    signif(as.matrix(MTRpval_adj), 2),")", sep="")
dim(textMatrix) <- dim(moduleTraitCor)
# Revert module names back to the "ME<color>" form to apply color conversion
temp_c <- rownames(moduleTraitCor) %>%
    str_to_lower(.) %>%
    str_c("ME",.)

# Visualize
options(repr.plot.width=10, repr.plot.height=10)
par(mar=c(5, 5, 3, 2))
labeledHeatmap(Matrix=moduleTraitCor,
               xLabels=colnames(moduleTraitCor),
               yLabels=temp_c,
               #ySymbols=rownames(moduleTraitCor),
               colorLabels=FALSE,
               colors=blueWhiteRed(50),
               textMatrix=textMatrix,
               setStdMargins=FALSE,
               cex.text=1,
               zlim=c(-1,1),
               main=paste("Module–trait relationships"))

In [None]:
# Protein annotations; AnalyteID is built as "name(gene_name)" to match the
# analyte IDs used within the data table
analyte_tbl_prot <- meta_protein %>%
    dplyr::transmute(AnalyteID = str_c(name, "(", gene_name, ")"),
                     Dataset = "Protein",
                     AnalyteID_original = name,
                     UniProtID = uniprot,
                     GeneSymbol = gene_name)

# Metabolite annotations; AnalyteID is built as "CHEMICAL_ID(BIOCHEMICAL_NAME)"
analyte_tbl_met <- meta_metabolites %>%
    dplyr::transmute(AnalyteID = str_c(CHEMICAL_ID, "(", BIOCHEMICAL_NAME, ")"),
                     Dataset = "Metabolite",
                     ChemID = CHEMICAL_ID,
                     ChemName = BIOCHEMICAL_NAME,
                     KEGG,
                     HMDB)

# Clinical chemistries have no extra annotation columns
analyte_tbl_chem <- tibble(AnalyteID = cat.clin, Dataset = "Chemistry")

# Stack the three annotation tables (columns missing from a table become NA)
analyte_tbl_multi <- dplyr::bind_rows(analyte_tbl_prot, analyte_tbl_met, analyte_tbl_chem)

# Module assignment table: one row per analyte with its module color
module_tbl <- tibble(AnalyteID = colnames(all_df),
                     ModuleID = str_to_title(dynamicColors))
print("Module assignment table (temp)")
print(str_c("- nrow: ", nrow(module_tbl)))
# Attach the annotations; analytes without a matching AnalyteID are dropped
module_tbl <- merge(module_tbl, analyte_tbl_multi, by = "AnalyteID")
head(module_tbl)

In [None]:
# Show the dimensions of the analyte matrix and of the module eigengene table
dim(all_df)
dim(MEs)

## Intramodular connectivity

In [None]:
# Prepare target modules (every module except the dummy grey one)
targets <- modNames[modNames!="grey"]

# Analyte names, in the same order as dynamicColors (hoisted out of the loop)
probes <- colnames(all_df)

# Build one connectivity table per module, then combine them in a single step
# (instead of growing a tibble inside a for-loop).
connectivity_list <- lapply(targets, function(module) {
    # Select the analytes of this module
    inModule <- (dynamicColors==module)
    modProbes <- probes[inModule]

    # Select the corresponding Topological Overlap
    modTOM <- TOM[inModule, inModule, drop=FALSE]
    dimnames(modTOM) <- list(modProbes, modProbes)

    # Calculate intramodular connectivity on the module's sub-adjacency.
    # The adjacency is indexed by position, exactly like the TOM above.
    IMConn <- intramodularConnectivity(adjacency[inModule, inModule, drop=FALSE],
                                       rep(module, length(modProbes)),
                                       scaleByMax=FALSE)$kWithin

    # Summary table for this module
    connectivity <- tibble(ModuleID=str_to_title(module),
                           AnalyteID=modProbes,
                           IntramodularConnectivity=IMConn,
                           TOMsimilaritySum=rowSums(modTOM))
    print(str_c(module," module: ", nrow(connectivity)))
    connectivity
})
temp_tbl <- dplyr::bind_rows(connectivity_list)
print(str_c("-> Total nrow: ", nrow(temp_tbl)))
head(temp_tbl)

# Update the module assignment table; analytes outside the target modules
# keep NA in the connectivity columns
module_tbl <- dplyr::left_join(module_tbl, temp_tbl, by=c("AnalyteID", "ModuleID"))
print("Module assignment table (updated)")
print(str_c("- nrow: ", nrow(module_tbl)))
head(module_tbl)

## Turquoise module

In [None]:
# Fraction of the module to keep as hubs (the most connected analytes)
topQ <- 0.20
turquoise_tbl <- dplyr::filter(module_tbl, ModuleID == "Turquoise")
# Connectivity value at the (1 - topQ) quantile within the module
hub_cutoff <- quantile(turquoise_tbl$IntramodularConnectivity, 1 - topQ)
hubs_turquoise <- turquoise_tbl %>%
    dplyr::filter(IntramodularConnectivity >= hub_cutoff) %>%
    arrange(desc(IntramodularConnectivity))

In [None]:
# Display the Turquoise hub table, sorted by decreasing intramodular connectivity
hubs_turquoise

## Multiomic functional enrichment

### Load and filter functional annotations

In [None]:
# Load the pre-cleaned annotations
pathways <- readRDS("../Useful_Files/KEGG_REACTOME_pathways.rds")
## Check the first 2 examples
temp_l <- lapply(names(pathways),
                 function(name) {head(pathways[[name]], 2)})
names(temp_l) <- names(pathways)
temp_l
print("Original nAnnotations:")
print(str_c("- Protein:", length(names(pathways$proteome))))
print(str_c("- Metabolite:", length(names(pathways$metabolome))))
# Union of the annotation names of both data types. (unique(a, b) would treat
# `b` as the `incomparables` argument and ignore the metabolite-only names.)
temp_vec <- union(names(pathways$proteome), names(pathways$metabolome))
print(str_c("-> Unique:", length(temp_vec)))
print("")

# Collapse the list object while combining protein and metabolite database IDs
# (UniProt and HMDB) per annotation. One table is built per annotation and all
# of them are combined in a single step.
annotation_tbl <- dplyr::bind_rows(lapply(temp_vec, function(annotation) {
    # Annotation name without its leading "(...) " prefix
    annotation_name <- str_replace(annotation, "^\\(.+\\) ", "")
    # An annotation missing from one data type contributes zero rows for it
    dplyr::bind_rows(
        tibble(AnnotationID=annotation,
               AnnotationName=annotation_name,
               Dataset="Protein",
               AnalyteDatabaseID=as.character(pathways$proteome[[annotation]])),
        tibble(AnnotationID=annotation,
               AnnotationName=annotation_name,
               Dataset="Metabolite",
               AnalyteDatabaseID=as.character(pathways$metabolome[[annotation]]))
    )
}))
print("Annotation DF")
print(str_c("- nrow: ", nrow(annotation_tbl)))
print(str_c("- nAnnotations: ", length(unique(annotation_tbl$AnnotationID))))
annotation_tbl %>%
    dplyr::group_by(Dataset) %>%
    dplyr::summarize(Count=n(), Unique=length(unique(AnalyteDatabaseID)))

# Prepare analytes that were included in the WGCNA input:
# database ID (UniProt for proteins, HMDB for metabolites) -> AnalyteID
temp_tbl1 <- module_tbl %>%
    dplyr::filter(!is.na(UniProtID)) %>%
    dplyr::rename(AnalyteDatabaseID=UniProtID) %>%
    dplyr::select(AnalyteDatabaseID, AnalyteID)
temp_tbl2 <- module_tbl %>%
    dplyr::filter(!is.na(HMDB)) %>%
    dplyr::rename(AnalyteDatabaseID=HMDB) %>%
    dplyr::select(AnalyteDatabaseID, AnalyteID)
temp_tbl <- dplyr::bind_rows(temp_tbl1, temp_tbl2)

# Take the analytes that were included in the WGCNA input
annotation_tbl <- dplyr::left_join(annotation_tbl, temp_tbl, by="AnalyteDatabaseID") %>%
    dplyr::filter(!is.na(AnalyteID))
print("Annotation DF (filtered)")
print(str_c("- nrow: ", nrow(annotation_tbl)))
print(str_c("- nAnnotations: ", length(unique(annotation_tbl$AnnotationID))))
annotation_tbl %>%
    dplyr::group_by(Dataset) %>%
    dplyr::summarize(Count=n(), Unique=length(unique(AnalyteDatabaseID)))
head(annotation_tbl)

#### Turquoise module

In [None]:
topQ <- 0.1 # Focus on the top hubs
topX <- 30 # Number to display
# Summarize results per WGCNA module
module <- "Turquoise"

# Hubs: analytes of the module at or above the (1 - topQ) quantile of
# intramodular connectivity
hubs <- module_tbl %>%
    dplyr::filter(ModuleID==module) %>%
    dplyr::filter(IntramodularConnectivity>=quantile(IntramodularConnectivity, 1-topQ)) %>%
    .$AnalyteID
# Annotation -> analyte mapping, and annotation -> display name
backgrounds <- annotation_tbl %>%
    dplyr::select(AnnotationID, AnalyteID)
labels <- annotation_tbl %>%
    dplyr::select(AnnotationID, AnnotationName) %>%
    dplyr::distinct()
print(str_c(module,": ",length(hubs)," hubs"))

# Enrichment analysis
obj <- enricher(gene=hubs,
                #universe=backgrounds,#Already managed in annotation metadata
                pvalueCutoff=1.0,#To export all
                pAdjustMethod="BH",
                qvalueCutoff=1.0,#To export all
                minGSSize=1,#Already managed in annotation metadata
                maxGSSize=1000,#Already managed in annotation metadata
                TERM2GENE=backgrounds,
                TERM2NAME=labels)

# enricher() may return no result (e.g. when no hub maps to any annotation).
# Fall back to an empty table so the summaries below still run.
has_result <- !is.null(obj) && is.data.frame(obj[])
if (has_result) {
    enrich_tbl <- tibble(obj[]) %>%
        dplyr::rename(AnnotationID=ID, AnnotationName=Description, Ratio2Hubs=GeneRatio, Ratio2BGs=BgRatio,
                      Pval=pvalue, AdjPval=p.adjust, Qval=qvalue, MappedHub=geneID, nMappedHubs=Count)
} else {
    print(str_c("- No enrichment result for the ",module," module"))
    enrich_tbl <- tibble(AnnotationID=character(), AnnotationName=character(),
                         Pval=numeric(), AdjPval=numeric())
}

# Retrieve nominal P-value < 0.05 (for display)
temp_tbl <- enrich_tbl %>%
    dplyr::filter(Pval<0.05)
n_nominal <- nrow(temp_tbl)
print(str_c("- # of annotations with nominal P-value < 0.05: ",as.character(n_nominal)))
print(str_c(" -> Top ",topX,":"))
head(temp_tbl, topX)
# Retrieve adjusted P-value < 0.05
temp_tbl <- temp_tbl %>%
    dplyr::filter(AdjPval<0.05)
print(str_c("- # of annotations with adjusted P-value < 0.05: ",as.character(nrow(temp_tbl))))


# Visualization
# Number of bars: nominally significant annotations, capped at topX
display <- min(n_nominal, topX)
if (display>0) {
    temp <- obj %>%
        dplyr::filter(pvalue<0.05) %>%#Display only nominal P-value < 0.05
        dplyr::mutate(PvalLabel=str_c("AdjPval = ",scales::scientific(p.adjust, digits=2)),
                      AdjSignif=ifelse(p.adjust<0.05, "True", "False")) %>%
        barplot(., x="Count", color="p.adjust", showCategory=display) +
        geom_text(aes(label=PvalLabel, color=AdjSignif), nudge_x=0.5, hjust=0) +
        coord_cartesian(clip="off") +
        scale_x_continuous(limits=c(0, 25.5), breaks=seq(0, 25, by=5), expand=c(0, 0)) +
        scale_y_discrete(labels=function(x) {str_wrap(x, width=60)}) +
        scale_fill_viridis_c(begin=0, end=1, direction=1, option="plasma",
                             limits=c(0, 0.2), breaks=seq(0, 0.2, by=0.05), oob=scales::squish,
                             name="AdjPval") +
        scale_color_manual(values=c("True"="#990000", "False"="gray40")) +
        guides(fill=guide_colorbar(reverse=TRUE), color="none") +
        # The annotations come from the KEGG/Reactome pathway file, not GO
        labs(x="Count of hubs",
             y="", title=str_c("Enriched KEGG/Reactome pathways in the top ",topQ*100,"% hubs of ",module," module")) +
        theme_classic(base_size=16, base_family="Helvetica") +
        theme(text=element_text(face="plain", color="black", family="Helvetica")) +
        theme(axis.text.x=element_text(face="plain", color="black", family="Helvetica"),
              axis.text.y=element_text(face="plain", color="black", family="Helvetica", lineheight=0.75),
              axis.title=element_text(face="plain", color="black", family="Helvetica")) +
        theme(plot.title=element_text(size=18, hjust=1.0)) +
        theme(legend.direction="vertical", legend.box="horizontal",
              legend.background=element_blank())
    # Plot height grows with the number of bars
    options(repr.plot.width=9, repr.plot.height=max(c(1+display*0.25, 2.5)))
    plot(temp)
}


#### Green module

In [None]:
topQ <- 0.1 # Focus on the top hubs
topX <- 30 # Number to display
# Get Module
module <- "Green"

# Hubs: analytes of the module at or above the (1 - topQ) quantile of
# intramodular connectivity
hubs <- module_tbl %>%
    dplyr::filter(ModuleID==module) %>%
    dplyr::filter(IntramodularConnectivity>=quantile(IntramodularConnectivity, 1-topQ)) %>%
    .$AnalyteID
# Annotation -> analyte mapping, and annotation -> display name
backgrounds <- annotation_tbl %>%
    dplyr::select(AnnotationID, AnalyteID)
labels <- annotation_tbl %>%
    dplyr::select(AnnotationID, AnnotationName) %>%
    dplyr::distinct()
print(str_c(module,": ",length(hubs)," hubs"))

# Enrichment analysis
obj <- enricher(gene=hubs,
                #universe=backgrounds,#Already managed in annotation metadata
                pvalueCutoff=1.0,#To export all
                pAdjustMethod="BH",
                qvalueCutoff=1.0,#To export all
                minGSSize=1,#Already managed in annotation metadata
                maxGSSize=1000,#Already managed in annotation metadata
                TERM2GENE=backgrounds,
                TERM2NAME=labels)

# enricher() may return no result (e.g. when no hub maps to any annotation).
# Fall back to an empty table so the summaries below still run.
has_result <- !is.null(obj) && is.data.frame(obj[])
if (has_result) {
    enrich_tbl <- tibble(obj[]) %>%
        dplyr::rename(AnnotationID=ID, AnnotationName=Description, Ratio2Hubs=GeneRatio, Ratio2BGs=BgRatio,
                      Pval=pvalue, AdjPval=p.adjust, Qval=qvalue, MappedHub=geneID, nMappedHubs=Count)
} else {
    print(str_c("- No enrichment result for the ",module," module"))
    enrich_tbl <- tibble(AnnotationID=character(), AnnotationName=character(),
                         Pval=numeric(), AdjPval=numeric())
}

# Retrieve nominal P-value < 0.05 (for display)
temp_tbl <- enrich_tbl %>%
    dplyr::filter(Pval<0.05)
n_nominal <- nrow(temp_tbl)
print(str_c("- # of annotations with nominal P-value < 0.05: ",as.character(n_nominal)))
print(str_c(" -> Top ",topX,":"))
head(temp_tbl, topX)
# Retrieve adjusted P-value < 0.05
temp_tbl <- temp_tbl %>%
    dplyr::filter(AdjPval<0.05)
print(str_c("- # of annotations with adjusted P-value < 0.05: ",as.character(nrow(temp_tbl))))


# Visualization
# Number of bars: nominally significant annotations, capped at topX
display <- min(n_nominal, topX)
if (display>0) {
    temp <- obj %>%
        dplyr::filter(pvalue<0.05) %>%#Display only nominal P-value < 0.05
        dplyr::mutate(PvalLabel=str_c("AdjPval = ",scales::scientific(p.adjust, digits=2)),
                      AdjSignif=ifelse(p.adjust<0.05, "True", "False")) %>%
        barplot(., x="Count", color="p.adjust", showCategory=display) +
        geom_text(aes(label=PvalLabel, color=AdjSignif), nudge_x=0.5, hjust=0) +
        coord_cartesian(clip="off") +
        scale_x_continuous(limits=c(0, 25.5), breaks=seq(0, 25, by=5), expand=c(0, 0)) +
        scale_y_discrete(labels=function(x) {str_wrap(x, width=60)}) +
        scale_fill_viridis_c(begin=0, end=1, direction=1, option="plasma",
                             limits=c(0, 0.2), breaks=seq(0, 0.2, by=0.05), oob=scales::squish,
                             name="AdjPval") +
        scale_color_manual(values=c("True"="#990000", "False"="gray40")) +
        guides(fill=guide_colorbar(reverse=TRUE), color="none") +
        # The annotations come from the KEGG/Reactome pathway file, not GO
        labs(x="Count of hubs",
             y="", title=str_c("Enriched KEGG/Reactome pathways in the top ",topQ*100,"% hubs of ",module," module")) +
        theme_classic(base_size=16, base_family="Helvetica") +
        theme(text=element_text(face="plain", color="black", family="Helvetica")) +
        theme(axis.text.x=element_text(face="plain", color="black", family="Helvetica"),
              axis.text.y=element_text(face="plain", color="black", family="Helvetica", lineheight=0.75),
              axis.title=element_text(face="plain", color="black", family="Helvetica")) +
        theme(plot.title=element_text(size=18, hjust=1.0)) +
        theme(legend.direction="vertical", legend.box="horizontal",
              legend.background=element_blank())
    # Plot height grows with the number of bars
    options(repr.plot.width=9, repr.plot.height=max(c(1+display*0.25, 2.5)))
    plot(temp)
}
