In [1]:
import rpy2
import scipy
import logging
import warnings
import anndata2ri
import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sb
import decoupler as dc
import scrublet as scr
import decoupler as dc
from scipy import sparse
from anndata import AnnData
from tabnanny import verbose
import matplotlib.pyplot as plt
from gsva_prep import prep_gsva
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from typing import Optional, Union
from matplotlib.pyplot import rcParams
from functions import pathway_analyses
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import train_test_split
from pytorch_lightning.loggers import TensorBoardLogger
from rpy2.robjects.conversion import localconverter

In [2]:
def get_sys_dpi(width, height, diag):
    '''
    obtain dpi of system
    
    w: width in pixels (if unsure, go vist `whatismyscreenresolution.net`)
    h: height in pixels
    d: diagonal in inches
    '''
    w_inches = (diag**2/ (1 + height**2/width**2))**0.5
    return round(width/w_inches)

In [3]:
# # Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# # Automatically convert rpy2 outputs to pandas dataframes
# pandas2ri.activate()
# anndata2ri.activate()
# %load_ext rpy2.ipython

warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

rcParams['figure.dpi'] = get_sys_dpi(1512, 982, 14.125)
#rcParams['figure.figsize']=(4,4) #rescale figures

sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()



-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                         9.5.0
absl                        NA
anndata2ri                  1.1
appnope                     0.1.3
asttokens                   NA
attr                        23.1.0
backcall                    0.2.0
botocore                    1.31.17
cffi                        1.15.1
comm                        0.1.3
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.7
decorator                   5.1.1
decoupler                   1.4.0
deprecate                   0.3.2
dot_parser                  NA
executing                   1.2.0
fsspec                      2023.6.0
functions                   NA
google                      NA
gsva_prep                   NA
h5py                        3.9.0
igraph                      0.10.4
ipykernel                   6.23.2
ipywidgets                  8.0.6
jedi                        0.18.2
jinja2      

# **hdWGCNA in single-cell data**

Here we use `hdWGCNA` to perform co-expression network analysis on single-cell data. 

`hdWGCNA` is an R package for performing `weighted gene co-expression network analysis (WGCNA)` in high dimensional transcriptomics data such as single-cell RNA-seq or spatial transcriptomics. hdWGCNA is highly modular and can construct co-expression networks across multi-scale cellular and spatial hierarchies. hdWGNCA identifies robust modules of inerconnected genes, and provides context for these modules through various biological knowledge sources. hdWGCNA requires data formatted as Seurat objects, one of the most ubiquitous formats for single-cell data. 

`Note: hdWGCNA is under active development, so you may run into errors and small typos. We welcome users to write GitHub issues to report bugs, ask for help, and to request potential enhancements.`

Explore the full capabilites of hWGNA in the following publications:

- [**Morabito et al. bioRxiv 2022**](https://www.biorxiv.org/content/10.1101/2022.09.22.509094v1)
- [**Morabito & Miyoshi et al. Nature Genetics 2021**](https://www.nature.com/articles/s41588-021-00894-z)


Here, apply hdWGCNA to the processed single-nucleus RNA-seq (snRNA-seq) datasets. These dataset have already been fully processed using a standard single-cell transcritpomics analysis pipelines (please see the following notebooks, `preprocessing_data_integration`, `preprocessing_quality_control`, `preprocessing_cluster_annotation`). 


If you would like to apply this notebook to your own dataset, you first need to satisfy the following prerequisites:

A single-cell or single-nucleus transcriptomics dataset in Seurat format.
Normalize the gene expression matrix NormalizeData.
Identify highly variable genes VariableFeatures.
Scale the normalized expression data ScaleData
Perform dimensionality reduction RunPCA and batch correction if needed RunHarmony.
Non-linear dimensionality reduction RunUMAP for visualizations.
Group cells into clusters (FindNeighbors and FindClusters).
An example of running the prerequisite data processing steps can be found in the Seurat Guided Clustering Tutorial.


In [4]:
%%R
suppressPackageStartupMessages({
    library(WGCNA)
    library(Matrix)
    library(viridis)
    library(harmony)
    library(ggpubr)
    library(tictoc)
    library(RColorBrewer)
    library(Hmisc)
    library(corrplot)
    library(grid)
    library(gridExtra)
    library(igraph)
    library(ggrepel)
    library(readxl)
    library(conflicted)



    # single-cell analysis package
    library(Seurat)

    # plotting and data science packages
    library(tidyverse)
    library(cowplot)
    library(patchwork)

    # co-expression network analysis packages:
    library(WGCNA)
    library(hdWGCNA)

    # gene enrichment packages
    library(enrichR)
    library(GeneOverlap)


    library(GSEABase)
    library(GSVA) 
# needs to be run every time you start R and want to use %>%
})

# using the cowplot theme for ggplot
theme_set(theme_cowplot())

# set random seed for reproducibility
set.seed(12345)

# optionally enable multithreading
# enableWGCNAThreads(nThreads = 4)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

1: package ‘IRanges’ was built under R version 4.3.1 
2: package ‘GSVA’ was built under R version 4.3.1 


## **Prepare data**

Now, we load the preprocessed and annotated data for downstream analysis.

In [5]:
save_prefix = 'leng_sfg'

adata_annot = sc.read_h5ad(f'../data/raw/{save_prefix}/{save_prefix}_raw_anndata.h5ad')
adata_annot.obs_names_make_unique()
adata_annot.var_names_make_unique()

try:
    adata_annot.X = adata_annot.layers['counts'].copy()
except KeyError:
    print('"counts" not in layers...')
    print('analysis requires unnormalized count data...')
    print('analysis proceesing with data in "adata.x"...')
    
del adata_annot.obsm, adata_annot.layers, adata_annot.varm, adata_annot.uns, adata_annot.obsp

print(adata_annot)

"counts" not in layers...
analysis requires unnormalized count data...
analysis proceesing with data in "adata.x"...
AnnData object with n_obs × n_vars = 63608 × 33694
    obs: 'SampleID', 'PatientID', 'BrainRegion', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.clusters', 'clusterAssignment', 'clusterCellType', 'cell_type'


In [6]:
get_cell_types = True # whether to  to obtain appropriate cell-type columns: True/False
map_meta = True
deg_method =  'DESeq2-Wald'
test_names = ['late_vs_early', 'early_vs_no', 'late_vs_no', 'ad_vs_no']
filter_genes = "TRUE"
subject_ids_for_study = {'leng_sfg': 'PatientID',
                        'leng_etc': 'PatientID',
                        'seaad_mtg': 'individualID'}

subject_id = subject_ids_for_study[save_prefix]     # for leng this is `PatientID` for mathys is 'Subject', and allen is 'individualID'
gene_celltype_threshold = 0.10          # determines number of cells the gene must be expressed in 
covariates = ['None']                   # list of covariates to be accounted for in regression.
gene_selection = 'custom'               # specifies the gene selection method when setting up seurat object for WGCNA. The th
celltypes = ["Excitatory", "Inhibitory", "Astrocyte", "Microglia", "Oligodendrocyte", "OPC"]
metadata = f'../data/raw/{save_prefix}/{save_prefix}_metadata.csv' 
meta = pd.read_csv(metadata, encoding_errors='ignore')

gene_selection = 'custom'   # specifies the gene selection method when setting up seurat object for WGCNA. The th
                            # Posible values are "custom", "fraction", "variable"
                            # If custom, a list of genes must be passed.

gene_set_select = 'diff_exp' # If gene_selection = 'custom'. This specifies how to obtain the list of
                             # genes to pass into `SetupForWGCNA`. # The posible values are "diff_exp", "overlap", "all"


In [7]:

mapping = {'leng_etc':
           
           {'Exc': 'Excitatory', 
            'Inh': 'Inhibitory', 
            'Astro': 'Astrocyte',
            'Endo': 'Endothelial', 
            'Micro': 'Microglia', 
            'OPC': 'OPC', 
            'Oligo': 'Oligodendrocyte'},

           'leng_sfg':
           
           {'Exc': 'Excitatory', 
            'Inh': 'Inhibitory', 
            'Astro': 'Astrocyte',
            'Endo': 'Endothelial', 
            'Micro': 'Microglia', 
            'OPC': 'OPC', 
            'Oligo': 'Oligodendrocyte'},
            
            'seaad_mtg':
           
           {'Excitatory': 'Excitatory', 
            'Inhibitory': 'Inhibitory',
            'Astrocyte': 'Astrocyte',
            'Microglia': 'Microglia', 
            'Endothelial': 'Endothelial', 
            'OPC': 'OPC',
            'Oligodendrocyte': 'Oligodendrocyte'},
            }

cell_column = {'leng_etc': 'clusterCellType',
               'leng_sfg': 'clusterCellType',
               'seaad_mtg': 'cell_type',
               }

In [8]:
if get_cell_types:
    adata_annot.obs['cell_type'] = adata_annot.obs[cell_column[save_prefix]].map(mapping[save_prefix])

In [9]:
# map the pathology group to teh subject id in .obs
adata_annot = adata_annot[adata_annot.obs.cell_type.isin(celltypes)]
adata_annot.obs['pathology.group'] = adata_annot.obs[subject_id].map(dict(zip(meta[subject_id].astype(str), meta['pathology.group'])))

In [10]:
%%R -i adata_annot -i subject_id -i gene_celltype_threshold -i celltypes -i test_names -i save_prefix -i gene_selection

print(adata_annot)
print('loaded data into memory for recursive use')

class: SingleCellExperiment 
dim: 33694 62211 
metadata(0):
assays(1): X
rownames(33694): RP11-34P13.3 FAM138A ... AC213203.1 FAM231B
rowData names(0):
colnames(62211): SFG2_AAACCTGAGATGGCGT SFG2_AAACCTGAGCGATCCC ...
  SFG10_TTTGTCATCACGGTTA SFG10_TTTGTCATCATACGGT
colData names(13): SampleID PatientID ... cell_type pathology.group
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
[1] "loaded data into memory for recursive use"


In [11]:
del adata_annot

## **Set up Seurat object for WGCNA**

Before running hdWGCNA, we first have to set up the Seurat object. Most of the information computed by hdWGCNA is stored in the Seurat object’s `@misc slot`. 

The `SetupForWGCNA` function selects the genes that will be used for WGCNA using three different approaches using the `gene_select` parameter:

- `variable`: use the genes stored in the Seurat object’s `VariableFeatures`.

- `fraction`: use genes that are expressed in a certain fraction of cells for in the whole dataset or in each group of cells, specified by group.by.

- `custom`: use genes that are specified in a custom list.

For our specific case, the `gene_selection` parameter is set to `custom`, we employ the genes present in all the overlapping pathways in a cell type-specific and study-specific manner.

To accomplish this, we load the list of overlapping pathways that was previously saved in `../results/{test_name}/{test_name}_pathway_overlap.csv` from the `pathway_meta_analysis.ipynb` notebook. Additionally, we load the genes from the `.gmt` file saved in `../results/gsva/{study}/{cur_celltype}/gs.gmt` from the `pathway_analysis.ipynb` notebook.

##### **Select Gene Set**

To obtain the gene sets required for the `SetupForWGCNA` function, we consider the `gene_set_select` variable when `gene_select` is set to `custom`. The `gene_set_select` variable determines how the `gene_list` passed into the `SetupForWGCNA` function will be obtained. 

There are three options available:

- `diff_exp`: This option utilizes the genes present in the differentially expressed pathways, previously computed and saved to the `../results/{test_name}/{save_prefix}_differentially_expressed_pathways.csv` file.

- `overlap`: Here, the genes used are those expressed in pathways that overlap across different studies. The pathway overlap information is retrieved from the `../results/{test_name}/{test_name}_pathway_overlap.csv` file.

- `all`: With this option, all genes in the pathways stored in the `../results/gsva/{save_prefix}/{celltype}/gs.gmt` file are included.

Here, we specifically set `gene_select=custom` and `gene_set_select = diff_exp`. However, you can customize the gene selection process for your hdWGCNA experiment.


In [12]:
%%R -o geneSets -i gene_set_select

# iterate over celltypes and obtain 
geneSets <- list()
for(test_name in test_names){

  geneSets[[test_name]] <- list()
  for (cur_celltype in celltypes){
    # get gene sets
    geneSets[[test_name]][[cur_celltype]] <- getGmt(file.path('../results/gsva/', save_prefix, '/', cur_celltype, 'gs.gmt'))
    geneSets[[test_name]][[cur_celltype]] <- geneIds(geneSets[[test_name]][[cur_celltype]])

    if (gene_set_select=='overlap'){
      pathways <- read.csv(paste0('../results/',test_name, '/', test_name, '_pathway_overlap.csv'))
      pathways <- pathways$pathway
    }else if (gene_set_select=='diff_exp'){
      pathways <- read.csv(paste0('../results/', test_name, '/', save_prefix, '/Data/differentially_expressed_pathways.csv'))
      pathways <- subset(pathways, P.Value < 0.05 & celltype == cur_celltype)$pathway
    }else if (gene_set_select=='all'){
      pathways <- names(geneSets[[test_name]][[cur_celltype]])
    }

    geneSets[[test_name]][[cur_celltype]] <-  geneSets[[test_name]][[cur_celltype]][names(geneSets[[test_name]][[cur_celltype]]) %in% pathways]
    geneSets[[test_name]][[cur_celltype]] <- unique(unlist(geneSets[[test_name]][[cur_celltype]], recursive = FALSE))

    print(length(geneSets[[test_name]][[cur_celltype]]))
  }
}

[1] 42
[1] 109
[1] 134
[1] 84
[1] 26
[1] 23
[1] 1449
[1] 714
[1] 382
[1] 97
[1] 86
[1] 209
[1] 1785
[1] 1405
[1] 393
[1] 107
[1] 160
[1] 359
[1] 2034
[1] 1410
[1] 564
[1] 102
[1] 196
[1] 353


In [13]:
# convert nested list of geneSets object into Rpy2 object 

geneSets =  robjects.ListVector(
                        {
                            test_name: robjects.ListVector(
                                    {cell_type: geneSets[test_name][cell_type]
                                                 
                                    for cell_type in celltypes
                                    }
                                )
                            for test_name in test_names
                        }

                        )

In [14]:
%%R -i geneSets

seurat_obj <- as.Seurat(adata_annot, counts = "X", data = "X")

print(seurat_obj)

# Perform dimensionality reduction and plot

seurat_obj <- FindVariableFeatures(seurat_obj)
seurat_obj <- ScaleData(seurat_obj)
seurat_obj <- RunPCA(seurat_obj)
seurat_obj <- RunHarmony(seurat_obj, group.by.vars = subject_id)
seurat_obj <- RunUMAP(seurat_obj, reduction='harmony', n.neighbors=15, dims=1:30, min.dist=0.1)

p <- DimPlot(seurat_obj, group.by = "cell_type", label = TRUE) +
     umap_theme() + ggtitle("leng") + NoLegend()

fig_dir = paste0("../results/hdWGCNA/UMAP/")

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

# Save plot to PDF
pdf(file = paste0(fig_dir, save_prefix, "_integrated_umap.pdf"), width = 4, height = 4, useDingbats = FALSE)
print(p)
dev.off()

# create a hdWGCNA experiment for each celltype
seurat_dat <- seurat_obj
seurat_obj <- list()


for (cur_cell_type in celltypes){

  seurat_obj[[cur_cell_type]] <- subset(seurat_dat, cell_type == cur_cell_type)

  print(paste0('Creating hdWGNA Experiment for ', toupper(cur_cell_type)))

  if (gene_selection == 'custom') {
    seurat_obj[[cur_cell_type]] <- SetupForWGCNA(
      seurat_obj[[cur_cell_type]],
      gene_select = "custom",                                            # the gene selection approach
      gene_list = as.vector(geneSets[['ad_vs_no']][[cur_cell_type]]),    # list of genes to be included
      group.by = 'cell_type',                                            # grouping parameter
      wgcna_name = toupper(substr(cur_cell_type, 1, 3))                  # the name of the hdWGCNA experiment
    )
  } else {
    seurat_obj[[cur_cell_type]] <- SetupForWGCNA(
      seurat_obj[[cur_cell_type]],
      gene_select = "fraction",                               # the gene selection approach
      fraction = gene_celltype_threshold,                     # fraction of cells for gene inclusion
      group.by = 'cell_type',                                 # grouping parameter
      wgcna_name = toupper(substr(cur_cell_type, 1, 3))       # the name of the hdWGCNA experiment
    )
  }

  print(seurat_obj[[cur_cell_type]])
  print(paste0(length(GetWGCNAGenes(seurat_obj[[cur_cell_type]])), " WGCNA Genes"))

}

rm(seurat_dat)
rm(adata_annot)

An object of class Seurat 
33694 features across 62211 samples within 1 assay 
Active assay: originalexp (33694 features, 0 variable features)
[1] "Creating hdWGNA Experiment for EXCITATORY"
An object of class Seurat 
33694 features across 20301 samples within 1 assay 
Active assay: originalexp (33694 features, 2000 variable features)
 3 dimensional reductions calculated: pca, harmony, umap
[1] "2034 WGCNA Genes"
[1] "Creating hdWGNA Experiment for INHIBITORY"
An object of class Seurat 
33694 features across 7964 samples within 1 assay 
Active assay: originalexp (33694 features, 2000 variable features)
 3 dimensional reductions calculated: pca, harmony, umap
[1] "1410 WGCNA Genes"
[1] "Creating hdWGNA Experiment for ASTROCYTE"
An object of class Seurat 
33694 features across 8025 samples within 1 assay 
Active assay: originalexp (33694 features, 2000 variable features)
 3 dimensional reductions calculated: pca, harmony, umap
[1] "564 WGCNA Genes"
[1] "Creating hdWGNA Experiment for MIC

Calculating gene variances
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Centering and scaling data matrix
PC_ 1 
Positive:  CNDP1, SLC1A3, SEPP1, MOBP, SLC5A11, EMX2, SPP1, FGFR3, PDK4, SDC4 
	   ZIC1, GJA1, CTD-2282P23.2, HIF3A, CX3CR1, SLC14A1, VSIG4, APOE, C1QC, APCDD1 
	   ITPKB-AS1, LINC00609, MS4A6A, ZFP36L1, C1QB, AGT, RP11-489O18.1, C1QA, CD74, CSF3R 
Negative:  MEG3, KALRN, PDE4D, OPCML, AGBL4, STXBP5L, RIMS2, KCNIP4, OXR1, CCSER1 
	   PRKCB, PHACTR1, KCNQ5, PLCB1, GABRB3, DCLK1, MKL2, RYR2, LDB2, SLC4A10 
	   RALYL, GABBR2, FRMPD4, HECW1, ATP2B1, CADPS, THRB, NELL2, FGF12, LRFN5 
PC_ 2 
Positive:  LSAMP-AS1, PDE5A, AC016831.7, RP11-479O9.4, SPD

## **Construct metacells**

In the hdWGCNA pipeline, the initial step involves constructing metacells from a single-cell dataset using the `MetacellsByGroups` function, which aggregates similar cells using the `k-Nearest Neighbors` algorithm. This process is essential for reducing the sparsity of the gene expression matrix and involves grouping by specific parameters like `Sample` and `cell_type` to ensure metacells originate from the same biological sample. The function allows tuning of the number of cells aggregated (k) and the maximum shared overlap, with a min_cells parameter to exclude small groups, thereby optimizing the construction of a metacell expression matrix for subsequent analysis.

In [16]:
%%R

# construct metacells in each group for each wgnca experiment

for (cell_type in celltypes){

  print(paste0('Constructing MetaCells in hdWGCNA Experiment for ', toupper(cell_type)))
  
  seurat_obj[[cell_type]] <- MetacellsByGroups(
      
    seurat_obj = seurat_obj[[cell_type]],
    group.by = c("cell_type", subject_id), # specify the columns in seurat_obj@meta.data to group by
    reduction = 'harmony',  # select the dimensionality reduction to perform KNN on
    k = 25, # nearest-neighbors parameter
    max_shared = 10, # maximum number of shared cells between two metacells
    ident.group = 'cell_type', # set the Idents of the metacell seurat object
    wgcna_name = toupper(substr(cell_type, 1, 3)),        # the name of the hdWGCNA experiment
    )

    # normalize metacell expression matrix:
  seurat_obj[[cell_type]] <- NormalizeMetacells(seurat_obj[[cell_type]],  wgcna_name = toupper(substr(cell_type, 1, 3)))
}


[1] "Constructing MetaCells in hdWGCNA Experiment for EXCITATORY"
[1] "Constructing MetaCells in hdWGCNA Experiment for INHIBITORY"
[1] "Constructing MetaCells in hdWGCNA Experiment for ASTROCYTE"
[1] "Constructing MetaCells in hdWGCNA Experiment for MICROGLIA"
[1] "Constructing MetaCells in hdWGCNA Experiment for OLIGODENDROCYTE"
[1] "Constructing MetaCells in hdWGCNA Experiment for OPC"


Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0%   10   20   30   40   50   6

## **Co-expression network analysis**

### **Set up the expression matrix**

the `SetDatExpr` function is used to set up the expression matrix for network analysis, involving subsetting the data to include only a specific cell type per run. By default, the metacell expression matrix is utilized `(use_metacells=TRUE)`, though the option to use the single-cell expression matrix is available. This function also allows users to choose the source slot for the expression matrix, accommodating different normalization methods like SCTransform or NormalizeData.

In [21]:
%%R

for (cell_type in celltypes){

    print(paste0('Setting up expression matrix in hdWGCNA Experiment for ', toupper(cell_type)))

    seurat_obj[[cell_type]] <- SetDatExpr(
      seurat_obj[[cell_type]],
      group_name = cell_type, # the name of the group of interest in the group.by column
      group.by = 'cell_type', # the metadata column containing the cell type info. This same column should have also been used in MetacellsByGroups
      assay = 'originalexp', # using RNA assay
      slot = 'data', # using normalized data
      wgcna_name = toupper(substr(cell_type, 1, 3)) # the name of the hdWGCNA experiment
    )
}

[1] "Setting up expression matrix in hdWGCNA Experiment for EXCITATORY"
[1] "Setting up expression matrix in hdWGCNA Experiment for INHIBITORY"
[1] "Setting up expression matrix in hdWGCNA Experiment for ASTROCYTE"
[1] "Setting up expression matrix in hdWGCNA Experiment for MICROGLIA"
[1] "Setting up expression matrix in hdWGCNA Experiment for OLIGODENDROCYTE"
[1] "Setting up expression matrix in hdWGCNA Experiment for OPC"


### **Select soft-power threshold**

Next we will select the “soft power threshold”. Selecting the `soft power threshold` is a crucial step, as it influences the construction of the gene-gene correlation adjacency matrix used to infer co-expression relationships between genes. The `TestSoftPowers` function is employed to conduct a parameter sweep for various soft power thresholds, aiding in choosing an appropriate value to ensure the co-expression network approximates a scale-free topology. Additionally, the `PlotSoftPowers`` function is used to visually represent the outcomes of this parameter sweep, to assess the network's structure at different thresholds.

In [23]:
%%R

fig_dir = paste0("../results/hdWGCNA/SoftPower/", save_prefix, '/')

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}


for (cell_type in celltypes){
  
  print(paste0('Estinmating Soft-Power Threshold in hdWGCNA Experiment for ', toupper(cell_type)))

  # Test different soft powers:
  seurat_obj[[cell_type]] <- TestSoftPowers(
    seurat_obj[[cell_type]],
    networkType = 'signed', # you can also use "unsigned" or "signed hybrid"
    setDatExpr = TRUE,
    group.by = 'cell_type',
    group_name = cell_type,
  )

  # plot the results:
  plot_list <- PlotSoftPowers(seurat_obj[[cell_type]], 
                              # wgcna_name = toupper(substr(cell_type, 1, 3)) # the name of the hdWGCNA experiment
                              )

  # assemble with patchwork
  pdf(paste0(fig_dir, cell_type, '_SoftPower.pdf'), width=12, height=8)
  print(wrap_plots(plot_list, ncol=2))
  dev.off()
}



[1] "Estinmating Soft-Power Threshold in hdWGCNA Experiment for EXCITATORY"
pickSoftThreshold: will use block size 2034.
 pickSoftThreshold: calculating connectivity for given powers...
   ..working on genes 1 through 2034 of 2034
   Power SFT.R.sq  slope truncated.R.sq  mean.k. median.k.   max.k.
1      1   0.7890 16.000          0.907 1.07e+03  1.07e+03 1160.000
2      2   0.5770  6.180          0.898 5.71e+02  5.73e+02  681.000
3      3   0.0697  1.520          0.798 3.09e+02  3.08e+02  414.000
4      4   0.0185 -0.692          0.908 1.69e+02  1.67e+02  260.000
5      5   0.1540 -1.750          0.931 9.35e+01  9.17e+01  167.000
6      6   0.3620 -2.570          0.960 5.24e+01  5.05e+01  110.000
7      7   0.5330 -2.970          0.942 2.97e+01  2.82e+01   73.500
8      8   0.6620 -3.270          0.956 1.70e+01  1.58e+01   49.900
9      9   0.7480 -3.400          0.966 9.89e+00  8.91e+00   34.400
10    10   0.8020 -3.370          0.958 5.81e+00  5.12e+00   24.100
11    12   0.8540 -3.



For WGCNA and hdWGCNA, the recommended approach is to choose the lowest soft power threshold that achieves a Scale Free Topology Model Fit of 0.8 or higher. If not manually specified, the ConstructNetwork function will automatically select an appropriate soft power threshold during the network construction process.

### **Construct co-expression network**

We now have everything that we need to construct our co-expression network. Here we use the hdWGCNA function `ConstructNetwork`, which internally utilizes the `WGCNA` function `blockwiseConsensusModules`. While this function offers numerous parameters for advanced customization, default settings are generally effective for many single-cell datasets. Parameters for blockwiseConsensusModules can be directly passed to ConstructNetwork using the same names, allowing for the construction of the co-expression network based on the previously selected soft power threshold.

In [26]:
%%R

  
tom_outdir <-  paste0('../results/hdWGCNA/TOM/', save_prefix, '/')

# make output dir for the TOM
if(!dir.exists(tom_outdir)){
  dir.create(tom_outdir, recursive=TRUE)
}

for (cell_type in celltypes) {

  print(paste0('Constructing co-expression network in hdWGCNA Experiment for ', toupper(cell_type)))
  
  tryCatch({
    # construct co-expression network:
    seurat_obj[[cell_type]] <- ConstructNetwork(
      seurat_obj[[cell_type]], 
      soft_power = NULL, # Set to NULL so that the ConstructNetwork function obtains it automatically (Scale Free Topology Model Fit >= 0.8)
      setDatExpr = FALSE,
      overwrite_tom = TRUE,
      tom_outdir = tom_outdir,
      tom_name = toupper(substr(cell_type, 1, 3)) # name of the topological overlap matrix written to disk set as the name of the hdWGCNA experiment 
    )
  }, error = function(e) {
    print(paste0('Error encountered while processing ', toupper(cell_type)))
    print(paste0(toupper(cell_type), ' dropped from experiment'))
    # Code to handle the error condition
    seurat_obj <- seurat_obj[names(seurat_obj) != cell_type]
    assign("seurat_obj", seurat_obj, envir = .GlobalEnv) # Update the modified seurat_obj in the global environment

  },  
  message = function(m) {
    print(paste0('Error encountered while processing ', toupper(cell_type)))
    print(paste0(toupper(cell_type), ' dropped from experiment'))
    # Code to handle the error condition
    seurat_obj <- seurat_obj[names(seurat_obj) != cell_type]
    assign("seurat_obj", seurat_obj, envir = .GlobalEnv) # Update the modified seurat_obj in the global environment

  }) 
  
}


print(names(seurat_obj))


[1] "Constructing co-expression network in hdWGCNA Experiment for EXCITATORY"
Soft power not provided. Automatically using the lowest power that meets 0.8 scale-free topology fit. Using soft_power = 10
 Calculating consensus modules and module eigengenes block-wise from all genes
 Calculating topological overlaps block-wise from all genes
   Flagging genes and samples with too many missing values...
    ..step 1
    TOM calculation: adjacency..
    ..will not use multithreading.
     Fraction of slow calculations: 0.000000
    ..connectivity..
    ..matrix multiplication (system BLAS)..
    ..normalization..
    ..done.
 ..Working on block 1 .
 ..Working on block 1 .
 ..merging consensus modules that are too close..
[1] "Constructing co-expression network in hdWGCNA Experiment for INHIBITORY"
Soft power not provided. Automatically using the lowest power that meets 0.8 scale-free topology fit. Using soft_power = 9
 Calculating consensus modules and module eigengenes block-wise from all 

1: In ConstructNetwork(seurat_obj[[cell_type]], soft_power = NULL,  :
  Overwriting TOM ../results/hdWGCNA/TOM/leng_sfg//EXC_TOM.rda
2: In ConstructNetwork(seurat_obj[[cell_type]], soft_power = NULL,  :
  Overwriting TOM ../results/hdWGCNA/TOM/leng_sfg//INH_TOM.rda
3: In ConstructNetwork(seurat_obj[[cell_type]], soft_power = NULL,  :
  Overwriting TOM ../results/hdWGCNA/TOM/leng_sfg//AST_TOM.rda
4: In ConstructNetwork(seurat_obj[[cell_type]], soft_power = NULL,  :
  Overwriting TOM ../results/hdWGCNA/TOM/leng_sfg//MIC_TOM.rda
5: In ConstructNetwork(seurat_obj[[cell_type]], soft_power = NULL,  :
  Overwriting TOM ../results/hdWGCNA/TOM/leng_sfg//OLI_TOM.rda
6: In ConstructNetwork(seurat_obj[[cell_type]], soft_power = NULL,  :
  Overwriting TOM ../results/hdWGCNA/TOM/leng_sfg//OPC_TOM.rda


hdWGCNA also includes a function PlotDendrogram function for visualizing the WGCNA dendrogram, a standard method to display the various co-expression modules derived from network analysis. In this dendrogram, each leaf represents a gene, with colors indicating their module assignments. `Notably, genes in the "grey" module, which were not grouped into any specific co-expression module, should be excluded from all subsequent analyses and interpretations.`

In [28]:
%%R

fig_dir = paste0("../results/hdWGCNA/dendrogram/", save_prefix, '/')

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

tryCatch({
    for (cell_type in names(seurat_obj)){
        # print(paste0('Constructing co-expression networkd in hdWGCNA Experiment for ', toupper(cell_type)))
        pdf(paste0(fig_dir, cell_type, "_dendro.pdf"), height=2, width=4)
        PlotDendrogram(seurat_obj[[cell_type]], main='hdWGCNA Dendrogram')
        dev.off()
    }
}, error = function(e){
    NULL
})

### **Module Eigengenes and Connectivity**

### Compute harmonized module eigengenes

`Module Eigengenes (MEs)` summarize the gene expression profile of co-expression modules in hdWGCNA, calculated by performing principal component analysis (PCA) on each module's subset of the gene expression matrix, with the first principal component as the ME. Given that single-cell data is sensitive to technical artifacts, hdWGCNA's ModuleEigengenes function not only computes MEs but also offers Harmony batch correction to produce `harmonized module eigengenes (hMEs)`. This harmonization is executed with a focus on the sample of origin, as specified by the `group.by.vars` parameter.

In [31]:
%%R

for (cell_type in names(seurat_obj)){

    modules <- GetModules(seurat_obj[[cell_type]])

    if (!(length(unique(modules$module))<=2 &  'grey'%in%modules$module)){

        print(paste0('Estimating module eigen-genes in hdWGCNA Experiment for ', toupper(cell_type)))
        
        # fneed to run ScaleData first or else harmony throws an error:
        seurat_obj[[cell_type]] <- ScaleData(seurat_obj[[cell_type]], features=VariableFeatures(seurat_obj[[cell_type]]))

        # compute all MEs in the full single-cell dataset
        seurat_obj[[cell_type]] <- ModuleEigengenes(
        seurat_obj[[cell_type]],
        group.by.vars=subject_id
        )
        print('----------------')

    }   else{    

        print(paste0('Error encountered while processing ', toupper(cell_type)))
        print(paste0(toupper(cell_type), ' dropped from experiment'))
        # Code to handle the error condition
        seurat_obj <- seurat_obj[names(seurat_obj) != cell_type]
        assign("seurat_obj", seurat_obj, envir = .GlobalEnv) # Update the modified seurat_obj in the global environment

    } 
    
}

[1] "Estimating module eigen-genes in hdWGCNA Experiment for EXCITATORY"
[1] "grey"
[1] "blue"
[1] "turquoise"
[1] "----------------"
[1] "Estimating module eigen-genes in hdWGCNA Experiment for INHIBITORY"
[1] "green"
[1] "turquoise"
[1] "grey"
[1] "blue"
[1] "yellow"
[1] "brown"
[1] "----------------"
[1] "Estimating module eigen-genes in hdWGCNA Experiment for ASTROCYTE"
[1] "turquoise"
[1] "grey"
[1] "blue"
[1] "----------------"
[1] "Error encountered while processing MICROGLIA"
[1] "MICROGLIA dropped from experiment"
[1] "Error encountered while processing OLIGODENDROCYTE"
[1] "OLIGODENDROCYTE dropped from experiment"
[1] "Error encountered while processing OPC"
[1] "OPC dropped from experiment"


Centering and scaling data matrix
Centering and scaling data matrix
pcagrey_ 1 
Positive:  APOE, GFAP, MT2A, BDNF, RAPGEF3, ATP1B2, SLC1A3, ARC, ARRDC2, SOX8 
	   FGF17, CRYAB, PLXND1, SLC6A1, DUSP7, SLC7A11, B2M, DUSP1, ATP1A2, ASIC1 
	   GRM4, FSTL3, NTNG2, ZNF358, CHRM2, ERBB4, DMKN, PLPP3, TFEB, TPD52L1 
Negative:  SYT1, CLASP2, MAP2, RBFOX2, GPM6B, MACF1, PI4KA, CAMTA1, SCN8A, SCN2A 
	   PAFAH1B1, CCDC88A, DNM3, SH3GL2, ARHGEF7, SLC24A2, SPTBN1, PAK1, RUFY3, NTRK2 
	   CADPS2, ATP8A1, RAB6A, NFASC, DLG1, SORL1, INPP4A, CAMSAP2, TAOK1, MBNL1 
pcagrey_ 2 
Positive:  TF, HHIP, HOOK2, CPQ, BCO2, NPC1, NEK1, HSPB11, SCN9A, ABHD3 
	   GRIP1, MYO10, SLC16A9, ZNF658, SGK1, DLC1, CCDC66, SCRG1, MYO6, HIP1 
	   SLC25A13, LIMA1, WWTR1, PDE8A, SLC9A9, FGF1, MACF1, BANP, ITCH, DLG1 
Negative:  DUSP6, KCNV1, ACTN4, FSCN1, LZTS1, APC2, KCNS1, NAB2, CAMKK2, MAP1S 
	   SLC8A2, ATP9A, TESK1, ABCA2, SLC30A3, RAB1B, ADGRB1, ENSA, MYADM, USP7 
	   KCNJ9, TYRO3, CSNK1D, ARC, VPS18, OCRL, SYT7, GPD1L, R

The ME matrices are stored as a matrix where each row is a cell and each column is a module. This matrix can be extracted from the Seurat object using the GetMEs function, which retrieves the hMEs by default.

In [33]:
%%R -o hMEs -o MEs

hMEs <- list()
MEs <- list()

for (cell_type in names(seurat_obj)){
    print(paste0('Getting harmonized and non-harmonized MEs in hdWGCNA Experiment for ', toupper(cell_type)))
    
    # harmonized module eigengenes:
    hMEs[[cell_type]] <- GetMEs(seurat_obj[[cell_type]])

    # module eigengenes:
    MEs[[cell_type]] <- GetMEs(seurat_obj[[cell_type]], harmonized=FALSE)
}

[1] "Getting harmonized and non-harmonized MEs in hdWGCNA Experiment for EXCITATORY"
[1] "Getting harmonized and non-harmonized MEs in hdWGCNA Experiment for INHIBITORY"
[1] "Getting harmonized and non-harmonized MEs in hdWGCNA Experiment for ASTROCYTE"


### **Compute module connectivity**

We want to focus on `hub genes`, which are highly connected within each module. To assess this, hdWGCNA includes the `ModuleConnectivity` function to calculate the `eigengene-based connectivity (kME)` of each gene in the full single-cell data. This function computes pairwise correlations between genes and module eigengenes. While kME can be calculated for all cells, it is recommended to compute it specifically in the cell type or group initially used in the ConstructNetwork function.

In [35]:
%%R

for (cell_type in names(seurat_obj)){

  print(paste0('Estimating module connectivity in hdWGCNA Experiment for ', toupper(cell_type)))

  # compute eigengene-based connectivity (kME):
  seurat_obj[[cell_type]] <- ModuleConnectivity(
    seurat_obj[[cell_type]],
    group.by = 'cell_type', 
    group_name = cell_type
  )
  
}

[1] "Estimating module connectivity in hdWGCNA Experiment for EXCITATORY"
[1] "Estimating module connectivity in hdWGCNA Experiment for INHIBITORY"
[1] "Estimating module connectivity in hdWGCNA Experiment for ASTROCYTE"


For convenience, we re-name the hdWGCNA modules to indicate that they are from a cell-type group.

In [37]:
%%R

for (cell_type in names(seurat_obj)){

  print(paste0('Renaming modules in hdWGCNA Experiment for ', toupper(cell_type)))

  # rename the modules
  seurat_obj[[cell_type]] <- ResetModuleNames(
    seurat_obj[[cell_type]],
    new_name = paste0(toupper(substr(cell_type, 1, 3)), "-M"),
    wgcna_name = toupper(substr(cell_type, 1, 3)) # the name of the hdWGCNA experiment,
  )
}

[1] "Renaming modules in hdWGCNA Experiment for EXCITATORY"
[1] "Renaming modules in hdWGCNA Experiment for INHIBITORY"
[1] "Renaming modules in hdWGCNA Experiment for ASTROCYTE"


We also reset module colors

In [39]:
%%R


library(MetBrewer)

for (cell_type in names(seurat_obj)){

  print(paste0('Resetting module colors in hdWGCNA Experiment for ', toupper(cell_type)))


  modules <- GetModules(seurat_obj[[cell_type]])
  mods <- levels(modules$module)
  mod_colors <- dplyr::select(modules, c(module, color)) %>%
    distinct %>% arrange(module) %>% .$color
  n_colors <- length(mod_colors) -1

  new_colors <- paste0(met.brewer("Signac", n=n_colors,))
  new_colors <- sample(new_colors)
  seurat_obj[[cell_type]] <- ResetModuleColors(seurat_obj[[cell_type]], new_colors)

}


[1] "Resetting module colors in hdWGCNA Experiment for EXCITATORY"
[1] "Resetting module colors in hdWGCNA Experiment for INHIBITORY"
[1] "Resetting module colors in hdWGCNA Experiment for ASTROCYTE"


We can visualize the genes in each module ranked by kME using the PlotKMEs function.

In [41]:
%%R

library(conflicted)
conflicts_prefer(dplyr::select)

fig_dir = paste0("../results/hdWGCNA/ModuleConnectivity/", save_prefix, '/')

if (!dir.exists(fig_dir)) {
  dir.create(fig_dir, recursive=TRUE)
}

for (cell_type in names(seurat_obj)){
    
    print(paste0('Visualize Module Connectivity (kMEs) in hdWGCNA Experiment for ', toupper(cell_type)))

    # plot genes ranked by kME for each module

    p <- PlotKMEs(seurat_obj[[cell_type]], ncol=5)

    pdf(paste0(fig_dir, cell_type, '_KMEs.pdf'), width=12, height=8)
    print(p)
    dev.off()
}

[conflicted] Will prefer dplyr::select over any other package.
[1] "Visualize Module Connectivity (kMEs) in hdWGCNA Experiment for EXCITATORY"
[1] "EXC-M1"
[1] "EXC-M2"
[1] "EXC-M1"
[1] "EXC-M2"
[1] "Visualize Module Connectivity (kMEs) in hdWGCNA Experiment for INHIBITORY"
[1] "INH-M1"
[1] "INH-M2"
[1] "INH-M3"
[1] "INH-M4"
[1] "INH-M5"
[1] "INH-M1"
[1] "INH-M2"
[1] "INH-M3"
[1] "INH-M4"
[1] "INH-M5"
[1] "Visualize Module Connectivity (kMEs) in hdWGCNA Experiment for ASTROCYTE"
[1] "AST-M1"
[1] "AST-M2"
[1] "AST-M1"
[1] "AST-M2"


### Getting the module assignment table

The hdWGCNA package provides the `GetModules` function for convenient access to the module assignment table. This table includes three primary columns: gene_name for the gene symbol or ID, module for the gene's module assignment, and color for the color mapping of each module, which is frequently used in downstream plotting. Additionally, if ModuleConnectivity has been used in the hdWGCNA experiment, the table will feature extra columns showing the kME values for each module.

In [43]:
%%R

for (cell_type in names(seurat_obj)){
    
    print(paste0('Obtain Module Assignment Table in hdWGCNA Experiment for ', toupper(cell_type)))

    # get the module assignment table:
    modules <- GetModules(seurat_obj[[cell_type]])

    # show the first 6 columns:
    print(head(modules[,1:6]))
}

[1] "Obtain Module Assignment Table in hdWGCNA Experiment for EXCITATORY"
         gene_name module   color  kME_grey kME_EXC-M1 kME_EXC-M2
ACADVL      ACADVL   grey    grey 0.5326983  0.4393766  0.5220664
ADD1          ADD1 EXC-M1 #d8443c 0.6120542  0.5898830  0.5640474
ARFGAP1    ARFGAP1 EXC-M1 #d8443c 0.3459760  0.4051022  0.2897552
ATP6V0D1  ATP6V0D1 EXC-M1 #d8443c 0.4817774  0.5637797  0.3959944
CTDSP2      CTDSP2   grey    grey 0.2870663  0.2668398  0.2715963
CXXC1        CXXC1   grey    grey 0.2609811  0.2441866  0.2468051
[1] "Obtain Module Assignment Table in hdWGCNA Experiment for INHIBITORY"
         gene_name module   color kME_INH-M1 kME_INH-M2  kME_grey
ACADVL      ACADVL INH-M1 #de597c  0.3727467  0.3579442 0.3734083
ADD1          ADD1 INH-M2 #d8443c  0.4691682  0.4891299 0.4844225
ARFGAP1    ARFGAP1 INH-M2 #d8443c  0.2373664  0.3069337 0.2105659
ATP6V0D1  ATP6V0D1 INH-M2 #d8443c  0.3695820  0.4323699 0.3397629
CTDSP2      CTDSP2 INH-M1 #de597c  0.1681184  0.1631429 0.16

A table of the top N hub genes sorted by kME can be extracted using the GetHubGenes function.

In [45]:
%%R

for (cell_type in names(seurat_obj)){
    
    print(paste0('Obtain Hub Genes in hdWGCNA Experiment for ', toupper(cell_type)))

    # get hub genes
    hub_df <- GetHubGenes(seurat_obj[[cell_type]], n_hubs = 10)

    print(head(hub_df))
}

[1] "Obtain Hub Genes in hdWGCNA Experiment for EXCITATORY"
  gene_name module       kME
1     GAPDH EXC-M1 0.7639032
2      ACTB EXC-M1 0.7650383
3  HSP90AB1 EXC-M1 0.7664502
4     UCHL1 EXC-M1 0.7671371
5      NRGN EXC-M1 0.7693697
6     CALM3 EXC-M1 0.7752952
[1] "Obtain Hub Genes in hdWGCNA Experiment for INHIBITORY"
  gene_name module       kME
1      CLTC INH-M1 0.5729561
2      PTK2 INH-M1 0.5753169
3     PGAM1 INH-M1 0.5781938
4      THY1 INH-M1 0.5795863
5     STMN2 INH-M1 0.6037622
6    ATP1A1 INH-M1 0.6044947
[1] "Obtain Hub Genes in hdWGCNA Experiment for ASTROCYTE"
  gene_name module       kME
1     ITM2C AST-M1 0.3993349
2     HIF3A AST-M1 0.3998826
3     NTRK3 AST-M1 0.4000449
4      APOE AST-M1 0.4185815
5     CALM1 AST-M1 0.4201640
6    ATP1B2 AST-M1 0.4342709


## **Compute hub gene signature scores**

Gene scoring analysis is a popular method in single-cell transcriptomics for computing a score for the overall signature of a set of genes.

hdWGCNA offers the `ModuleExprScore` function, which computes gene scores for a specified number of genes in each module, employing either the Seurat or UCell algorithm. This gene scoring method serves as an alternative to summarizing module expression, differing from the approach of computing module eigengenes.

In [47]:
%%R 

for (cell_type in names(seurat_obj)){
    
  print(paste0('Compute hub gene scores in hdWGCNA Experiment for ', toupper(cell_type)))
    
  # compute gene scoring for the top 25 hub genes by kME for each module
  # with Seurat method
  # seurat_obj[[cell_type]] <- ModuleExprScore(
  #   seurat_obj[[cell_type]],
  #   n_genes = 25,
  #   method='Seurat'
  # )

  # compute gene scoring for the top 25 hub genes by kME for each module
  # with UCell method
  library(UCell)
  seurat_obj[[cell_type]] <- ModuleExprScore(
    seurat_obj[[cell_type]],
    n_genes = 25,
    method='UCell'
  )
}

[1] "Compute hub gene scores in hdWGCNA Experiment for EXCITATORY"
Selecting by kME_EXC-M1
Selecting by kME_EXC-M2
[1] "Compute hub gene scores in hdWGCNA Experiment for INHIBITORY"
Selecting by kME_INH-M1
Selecting by kME_INH-M2
Selecting by kME_INH-M3
Selecting by kME_INH-M4
Selecting by kME_INH-M5
[1] "Compute hub gene scores in hdWGCNA Experiment for ASTROCYTE"
Selecting by kME_AST-M1
Selecting by kME_AST-M2




# **Save Seurat Object**

This wraps up the critical analysis steps for hdWGCNA, let's save our output.

In [49]:
%%R -i seurat_obj

dat_dir = paste0("../results/hdWGCNA/SeuratObject/", save_prefix, '/')

if (!dir.exists(dat_dir)) {
  dir.create(dat_dir, recursive=TRUE)
}

for (cell_type in names(seurat_obj)){
    
    print(paste0('Saving hdWGCNA object in hdWGCNA Experiment for ', toupper(cell_type)))

    saveRDS(seurat_obj[[cell_type]], file=paste0(dat_dir, cell_type, '_hdWGCNA_object.rds'))

}

[1] "Saving hdWGCNA object in hdWGCNA Experiment for EXCITATORY"
[1] "Saving hdWGCNA object in hdWGCNA Experiment for INHIBITORY"
[1] "Saving hdWGCNA object in hdWGCNA Experiment for ASTROCYTE"
